Repository: HKUDS/AnyTool Branch: main Commit: 506430fec133 Files: 132 Total size: 1.1 MB Directory structure: gitextract_sqfmt1l8/ ├── .gitignore ├── COMMUNICATION.md ├── LICENSE ├── README.md ├── anytool/ │ ├── __init__.py │ ├── __main__.py │ ├── agents/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── grounding_agent.py │ ├── config/ │ │ ├── __init__.py │ │ ├── config_agents.json │ │ ├── config_dev.json.example │ │ ├── config_grounding.json │ │ ├── config_mcp.json.example │ │ ├── config_security.json │ │ ├── constants.py │ │ ├── grounding.py │ │ ├── loader.py │ │ └── utils.py │ ├── grounding/ │ │ ├── backends/ │ │ │ ├── __init__.py │ │ │ ├── gui/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anthropic_client.py │ │ │ │ ├── anthropic_utils.py │ │ │ │ ├── config.py │ │ │ │ ├── provider.py │ │ │ │ ├── session.py │ │ │ │ ├── tool.py │ │ │ │ └── transport/ │ │ │ │ ├── actions.py │ │ │ │ ├── connector.py │ │ │ │ └── local_connector.py │ │ │ ├── mcp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── config.py │ │ │ │ ├── installer.py │ │ │ │ ├── provider.py │ │ │ │ ├── session.py │ │ │ │ ├── tool_cache.py │ │ │ │ ├── tool_converter.py │ │ │ │ └── transport/ │ │ │ │ ├── connectors/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── http.py │ │ │ │ │ ├── sandbox.py │ │ │ │ │ ├── stdio.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── websocket.py │ │ │ │ └── task_managers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── sse.py │ │ │ │ ├── stdio.py │ │ │ │ ├── streamable_http.py │ │ │ │ └── websocket.py │ │ │ ├── shell/ │ │ │ │ ├── __init__.py │ │ │ │ ├── provider.py │ │ │ │ ├── session.py │ │ │ │ └── transport/ │ │ │ │ ├── connector.py │ │ │ │ └── local_connector.py │ │ │ └── web/ │ │ │ ├── __init__.py │ │ │ ├── provider.py │ │ │ └── session.py │ │ └── core/ │ │ ├── exceptions.py │ │ ├── grounding_client.py │ │ ├── provider.py │ │ ├── quality/ │ │ │ ├── __init__.py │ │ │ ├── manager.py │ │ │ ├── store.py │ │ │ └── types.py │ │ ├── search_tools.py │ │ ├── security/ │ │ │ ├── __init__.py │ 
│ │ ├── e2b_sandbox.py │ │ │ ├── policies.py │ │ │ └── sandbox.py │ │ ├── session.py │ │ ├── system/ │ │ │ ├── __init__.py │ │ │ ├── provider.py │ │ │ └── tool.py │ │ ├── tool/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── local_tool.py │ │ │ └── remote_tool.py │ │ ├── transport/ │ │ │ ├── connectors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── aiohttp_connector.py │ │ │ │ └── base.py │ │ │ └── task_managers/ │ │ │ ├── __init__.py │ │ │ ├── aiohttp_connection_manager.py │ │ │ ├── async_ctx.py │ │ │ ├── base.py │ │ │ ├── noop.py │ │ │ └── placeholder.py │ │ └── types.py │ ├── llm/ │ │ ├── __init__.py │ │ └── client.py │ ├── local_server/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config.json │ │ ├── feature_checker.py │ │ ├── health_checker.py │ │ ├── main.py │ │ ├── platform_adapters/ │ │ │ ├── __init__.py │ │ │ ├── linux_adapter.py │ │ │ ├── macos_adapter.py │ │ │ ├── pyxcursor.py │ │ │ └── windows_adapter.py │ │ ├── requirements.txt │ │ ├── run.sh │ │ └── utils/ │ │ ├── __init__.py │ │ ├── accessibility.py │ │ └── screenshot.py │ ├── platform/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── recording.py │ │ ├── screenshot.py │ │ └── system_info.py │ ├── prompts/ │ │ ├── __init__.py │ │ └── grounding_agent_prompts.py │ ├── recording/ │ │ ├── __init__.py │ │ ├── action_recorder.py │ │ ├── manager.py │ │ ├── recorder.py │ │ ├── utils.py │ │ ├── video.py │ │ └── viewer.py │ ├── tool_layer.py │ └── utils/ │ ├── cli_display.py │ ├── display.py │ ├── logging.py │ ├── telemetry/ │ │ ├── __init__.py │ │ ├── events.py │ │ ├── telemetry.py │ │ └── utils.py │ ├── ui.py │ └── ui_integration.py ├── pyproject.toml └── requirements.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # OS files .DS_Store .DS_Store? 
._* .Spotlight-V100 .Trashes ehthumbs.db Thumbs.db Desktop.ini # IDE files .vscode/ .idea/ .pytest_cache/ # Distribution / packaging dist/ build/ *.egg-info/ *.egg # Environment files .env # MCP files anytool/config/config_mcp.json # Logs logs/ # Embedding cache .anytool/ embedding_cache/ tool_quality/ # MCP tool cache mcp_tool_cache.json mcp_tool_cache_sanitized.json # Config files anytool/config/config_dev.json # LLM keys anytool/llm/remote_client/ # Local server temp files anytool/local_server/temp/ examples/ ================================================ FILE: COMMUNICATION.md ================================================ We provide QR codes for joining the HKUDS discussion groups on **WeChat** and **Feishu**. You can join by scanning the QR codes below: WeChat QR Code ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2025 HKUDS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================
AnyTool Logo ## AnyTool: Universal Tool-Use Layer for AI Agents ### ✨ **One Line of Code to Supercharge any Agent with
Fast, Scalable and Powerful Tool Use** ✨ [![Platform](https://img.shields.io/badge/Platform-macOS%20%7C%20Linux%20%7C%20Windows-99C9BF.svg)](https://github.com/HKUDS/AnyTool/) [![Python](https://img.shields.io/badge/Python-3.12+-FCE7D6.svg)](https://www.python.org/) [![License](https://img.shields.io/badge/License-MIT-C1E5F5.svg)](https://opensource.org/licenses/MIT/) [![Feishu](https://img.shields.io/badge/Feishu-Group-E9DBFC?style=flat&logo=wechat&logoColor=white)](./COMMUNICATION.md) [![WeChat](https://img.shields.io/badge/WeChat-Group-C5EAB4?style=flat&logo=wechat&logoColor=white)](./COMMUNICATION.md) | ⚡ **Fast - Lightning Tool Retrieval**  |  📈 **Scalable - Self-Evolving Tool Orchestration**  |  🌍 **Powerful - Universal Tool Automation** |
## 🎯 What is AnyTool? AnyTool is a **Universal Tool-Use Layer** that transforms how AI agents interact with tools. It solves three fundamental challenges that prevent reliable agent automation: **overwhelming tool contexts**, **unreliable community tools**, and **limited capability coverage** -- delivering the first truly intelligent tool orchestration system for production AI agents. ## 💡 Research Highlights ⚡ **Fast - Lightning Tool Retrieval** - **Smart Context Management**: Progressive tool filtering delivers exact tools in milliseconds through a multi-stage pipeline, eliminating context pollution while maintaining speed. - **Zero-Waste Processing**: Pre-computed embeddings and lazy initialization eliminate redundant processing - tools are instantly ready across all executions. 📈 **Scalable - Self-Evolving Tool Orchestration** - **Adaptive MCP Tool Selection**: Smart caching and selective re-indexing maintain constant performance from 10 to 10,000 tools with optimal resource usage. - **Self-Evolving Tool Optimization**: The system continuously improves through persistent memory, becoming more efficient as your tool ecosystem expands. 🌍 **Powerful - Universal Tool Automation** - **Quality-Aware Selection**: Built-in reliability tracking and safety controls deliver production-ready automation through persistent learning and execution safeguards. - **Universal Tool-Use Capability**: Multi-backend architecture seamlessly extends beyond web APIs to system operations, GUI automation, and deep research through a unified interface. ## ⚡ Easy-to-Use and Effortless Integration One line to get intelligent tool orchestration. Zero-config setup transforms complex multi-tool workflows into a single API call. 
```python from anytool import AnyTool # One line to get intelligent tool orchestration async with AnyTool() as tool_layer: result = await tool_layer.execute( "Research trending AI coding tools from GitHub and tech news, " "collect their features and user feedback, analyze adoption patterns, " "then create a comparison report with insights" ) ``` --- ## 📋 Table of Contents - [🎯 Quick Start](#-quick-start) - [🚀 Technical Innovation & Implementation](#technical-innovation--implementation) - [🔧 Configuration Guide](#-configuration-guide) - [📖 Code Structure](#-code-structure) - [🔗 Related Projects](#-related-projects) --- ## 🎯 Quick Start ### 1. Environment Setup ```bash # Clone repository git clone https://github.com/HKUDS/AnyTool.git cd AnyTool # Create and activate conda environment (includes ffmpeg for video recording) conda create -n anytool python=3.12 ffmpeg -c conda-forge -y conda activate anytool # Install dependencies pip install -r requirements.txt ``` > [!NOTE] > Create a `.env` file and add your API keys (refer to `anytool/.env.example`). ### 2. Execution Mode: Local vs Server AnyTool's Shell and GUI backends support two execution modes. You can configure the mode in `anytool/config/config_grounding.json`: ```jsonc { "shell": { "mode": "local", ... }, // or "server" "gui": { "mode": "local", ... } // or "server" } ``` #### Local Mode (Default — no server needed) In **local mode**, Shell and GUI operations are executed directly in-process via `subprocess` / `asyncio`. This is the simplest setup — **no local server required**. Just use AnyTool as normal; see [Quick Integration](#3-quick-integration) for usage examples. > [!TIP] > **Use local mode when** you are running AnyTool on the same machine you want to control (your own laptop / desktop). This is the recommended mode for most users. #### Server Mode (for remote VMs / isolation) In **server mode**, Shell and GUI operations are sent over HTTP to a running `local_server` Flask service. 
This is required when: - **Controlling a remote VM** — the agent runs on your host, while the server runs inside the VM. - **Process isolation / sandboxing** — you want script execution in a separate process for security or stability. - **Multi-machine deployments** — the agent and the execution environment are on different machines. To use server mode, set `"mode": "server"` in `config_grounding.json`, then install platform-specific dependencies and start the server: > [!IMPORTANT] > **Platform-specific setup required**: Different operating systems need different dependencies for desktop control. Please install the required dependencies for your OS before starting the local server:
macOS Setup ```bash # Install macOS-specific dependencies pip install pyobjc-core pyobjc-framework-cocoa pyobjc-framework-quartz atomacos ``` **Permissions Required**: macOS will automatically prompt for permissions when you first run the local server. Grant the following: - **Accessibility** (for GUI control) - **Screen Recording** (for screenshots and video capture) > If prompts don't appear, manually grant permissions in System Settings → Privacy & Security.
Linux Setup ```bash # Install Linux-specific dependencies pip install python-xlib pyatspi numpy # Install system packages sudo apt install at-spi2-core python3-tk scrot ``` > [!NOTE] > **Optional dependencies:** > - Accessibility: `pyatspi` + `at-spi2-core` > - Window management: `wmctrl` > - Cursor in screenshots: `libx11-dev` + `libxfixes-dev`
Windows Setup ```bash # Install Windows-specific dependencies pip install pywinauto pywin32 PyGetWindow ```
After installing the platform-specific dependencies, start the local server: ```bash python -m anytool.local_server.main ``` > [!NOTE] > See [`anytool/local_server/README.md`](anytool/local_server/README.md) for complete API documentation and advanced configuration. #### Mode Comparison | | Local Mode (`"local"`) | Server Mode (`"server"`) | |---|---|---| | **Setup** | Zero — just run your agent | Start `local_server` first | | **Use case** | Same-machine development | Remote VMs, sandboxing, multi-machine | | **Shell execution** | `asyncio.subprocess` in-process | HTTP → Flask → `subprocess` | | **GUI execution** | Direct pyautogui / ScreenshotHelper | HTTP → Flask → pyautogui | | **Dependencies** | Only core AnyTool | Core + Flask + platform deps | | **Network** | None required | HTTP between agent ↔ server | ### 3. Quick Integration AnyTool is a **plug-and-play Universal Tool-Use Layer** for any AI agent. The task passed to `execute()` can come from your agent's planning module, user input, or any workflow system. ```python import asyncio from anytool import AnyTool from anytool.tool_layer import AnyToolConfig async def main(): config = AnyToolConfig( enable_recording=True, recording_backends=["gui", "shell", "mcp", "web"], enable_screenshot=True, enable_video=True, ) async with AnyTool(config=config) as tool_layer: result = await tool_layer.execute( "Research trending AI coding tools from GitHub and tech news, " "collect their features and user feedback, analyze adoption patterns, " "then create a comparison report with insights" ) print(result["response"]) asyncio.run(main()) ``` > [!TIP] > **MCP Server Configuration**: For tasks requiring specific tools, add relevant MCP servers to `anytool/config/config_mcp.json`. Unsure which servers to add? Simply add all potentially useful ones, AnyTool's Smart Tool RAG will automatically select the appropriate tools for your task. See [MCP Configuration](#mcp-configuration) for details. 
--- ## Technical Innovation & Implementation ### 🧩 Challenge 1: MCP Tool Context Overload **The Problem**. Current MCP agents suffer from a fundamental design flaw: they load ALL configured servers and tools at every execution step, creating an overwhelming action space. This creates three critical issues: - ⚡ **Slow Performance with Massive Context Loading**
Complete tool set from all pre-configured servers loaded simultaneously at every step, degrading execution speed - 🎯 **Poor Accuracy from Blind Tool Setup**
Users cannot preview tools before connecting, leading to over-setup "just in case" and confusing tool selection - 💸 **Resource Waste with No Memory**
Same tools reloaded at every execution step with no caching, causing redundant loading ### ✅ AnyTool's Solution: Tool Context Management Framework **Motivation**: "Load Everything" → "Retrieve What's Needed"
**Improvement**: Faster tool selection, cleaner context, and efficient resource usage through smart retrieval and memory. #### **Technical Innovation**:
**🎯 Multi-Stage Tool Retrieval Pipeline** - **Progressive MCP Tool Filtering**: server selection → tool name matching → tool semantic search → LLM ranking - **Reduces MCP Tool Search Space**: Each stage narrows down candidate tools for optimizing precision and speed **💾 Long-Term Tool Memory** - **Save Once, Use Forever**: Pre-compute tool embeddings once and save them to disk for instant reuse - **Zero Waste Processing**: No more redundant processing - tools are ready to use immediately across all execution steps **🧠 Adaptive Tool Selection** - **Adaptive MCP Tool Ranking**: LLM-based tool selection refinement triggered only when MCP tool results are large or ambiguous - **Tool Selection Efficiency**: Balances MCP tool accuracy with computational efficiency **🚀 On-Demand Resource Management** - **Lazy MCP Server Startup**: MCP server initialization triggered only when specific tools are needed - **Selective Tool Updates**: Incremental re-indexing of only changed MCP tools, not the entire tool set --- ### 🚨 Challenge 2: MCP Tool Quality Issues **The Problem**. Current MCP servers suffer from community contribution challenges that create three critical issues: - 🔍 **Poor Tool Descriptions**
Misleading claims, non-existent advertised tools, and vague capability specifications lead to wrong tool selection. - 📊 **No Reliability Signals**
Cannot assess MCP tool quality before use, causing blind selection decisions. - ⚠️ **Security and Safety Gaps**
Unvetted community tools may execute dangerous operations without proper safeguards. ### ✅ **AnyTool Solution: Self-Contained Quality Management** **Motivation**: "Blind Tool Trust" → "Smart Quality Assessment"
**Improvement**: Reliable tool selection, safe execution, and autonomous recovery through quality tracking and safety controls. #### **Technical Innovation:**
**🎯 Quality-Aware Tool Selection** - **Description Quality Check**: LLM-based evaluation of MCP tool description clarity and completeness. - **Performance-Based Ranking**: Track call/success rates for each MCP tool in persistent memory to prioritize reliable options. **💾 Learning-Based Tool Memory** - **Track Tool Performance**: Remember which MCP tools work well and which fail over time. - **Smart Tool Prioritization**: Automatically rank tools based on past success rates and description quality. **🛡️ Safety-First Execution** - **Block Dangerous Operations**: Prevent arbitrary code execution and require user approval for sensitive MCP tool operations. - **Execution Safeguards**: Built-in safety controls for all MCP tool executions. **🚀 Self-Healing Tool Management** - **Autonomous Tool Switching**: Switch failed MCP tools locally without restarting expensive planning loops. - **Local Failure Recovery**: Automatically switch to alternative MCP tools on failure without escalating to upper-level agents. --- ### 🔄 Challenge 3: Limited MCP Capability Scope **The Problem**. Current MCP ecosystem focuses primarily on Web APIs and online services, creating significant automation gaps that prevent comprehensive task completion: - **🖥️ Missing System Operations**
No native support for file manipulation, process management, or command execution on local systems. - **🖱️ No Desktop Automation**
Cannot control GUI applications that lack APIs, limiting automation to web-only scenarios. - **📊 Incomplete Tool Coverage**
Limited server categories in community and incomplete tool sets within existing servers create workflow bottlenecks. ### ✅ AnyTool Solution: Universal Capability Extension
(MCP + System Commands + GUI Control ≈ Universal Task Completion) **Motivation**: "Web-Only MCP" → "Universal Task Completion"
**Improvement**: Complete automation coverage through multi-backend architecture that seamlessly extends MCP capabilities beyond web APIs. **🏗️ Multi-Backend Architecture** - **MCP Backend**: Community servers for Web APIs and online services - **Shell Backend**: Bash/Python execution for system-level operations and file management - **GUI Backend**: Pixel-level automation for any visual application without API requirements - **Web Backend**: Deep web research and data extraction capabilities **💡 Self-Evolving Capability Discovery** - **Intelligent Gap Detection**: Planning agent identifies when MCP tools are insufficient for task requirements - **Automatic Backend Selection**: Shell/GUI backends automatically fill capability gaps without manual intervention - **Dynamic Capability Expansion**: Previously impossible tasks become achievable through backend combination **🎭 Unified Tool Orchestration** - **Uniform Tool Schema**: All backends expose identical interface for seamless agent tool selection - **Transparent Backend Switching**: Agents select optimal tools across backend types without knowing implementation details - **Intelligent Tool Routing**: Automatic routing to the most appropriate backend based on task requirements **🚀 Seamless Integration Layer** - **Single Tool Interface**: Unified API that abstracts away backend complexity from AI agents. - **Cross-Backend Coordination**: Enable complex workflows that span multiple backend capabilities. - **Consistent Safety Controls**: Apply security and safety measures uniformly across all backend types. --- ## 🔧 Configuration Guide ### Configuration Overview AnyTool uses a layered configuration system: - **`config_dev.json`** (highest priority): Local development overrides. Overrides all other configurations. 
- **`config_agents.json`**: Agent definitions and backend access control - **`config_mcp.json`**: MCP server registry - **`config_grounding.json`**: Backend-specific settings and Smart Tool RAG configuration - **`config_security.json`**: Security policies with runtime user confirmation for sensitive operations --- ### Agent Configuration **Path**: `anytool/config/config_agents.json` **Purpose**: Define agent roles, control backend access scope, and set execution limits to prevent infinite loops. **Example configuration**: ```json { "agents": [ { "name": "GroundingAgent", "class_name": "GroundingAgent", "backend_scope": ["gui", "shell", "mcp", "system", "web"], "max_iterations": 20 } ] } ``` **Key Fields**: | Field | Description | Options/Example | |-------|-------------|-----------------| | `backend_scope` | Accessible backends | `[]` or any combination of `["gui", "shell", "mcp", "system", "web"]` | | `max_iterations` | Maximum execution cycles | Any integer (e.g., `15`, `20`, `50`) or `null` (unlimited) | --- ### MCP Configuration **Path**: `anytool/config/config_mcp.json` (copy from `config_mcp.json.example`) **Purpose**: Register MCP servers with connection details. AnyTool automatically discovers tools from all registered servers and makes them available through Smart Tool RAG. **Example configuration**: ```json { "mcpServers": { "github": { "command": "npx", "args": ["-y", "@modelcontextprotocol/server-github"], "env": { "GITHUB_PERSONAL_ACCESS_TOKEN": "${GITHUB_TOKEN}" } } } } ``` ---
Runtime Configuration (AnyToolConfig) ### Runtime Configuration (AnyToolConfig) **Complete example**: ```python from anytool import AnyTool from anytool.tool_layer import AnyToolConfig config = AnyToolConfig( # LLM Configuration llm_model="anthropic/claude-sonnet-4-5", llm_enable_thinking=False, llm_timeout=120.0, llm_max_retries=3, llm_rate_limit_delay=0.0, llm_kwargs={}, # Additional LiteLLM parameters # Separate models for specific tasks (None = use llm_model) tool_retrieval_model=None, # Model for tool retrieval LLM filter visual_analysis_model=None, # Model for visual analysis # Grounding Configuration grounding_config_path=None, # Path to custom config file grounding_max_iterations=20, grounding_system_prompt=None, # Custom system prompt # Backend Configuration backend_scope=["gui", "shell", "mcp", "web", "system"], # Workspace Configuration workspace_dir=None, # Auto-create temp dir if None # Recording Configuration enable_recording=True, recording_backends=["gui", "shell", "mcp"], recording_log_dir="./logs/recordings", enable_screenshot=True, enable_video=True, enable_conversation_log=True, # Save LLM conversations to conversations.jsonl # Logging Configuration log_level="INFO", log_to_file=False, log_file_path=None, ) async with AnyTool(config=config) as tool_layer: result = await tool_layer.execute("Your task here") # Or with external task_id for benchmark integration: # result = await tool_layer.execute("Your task", task_id="my-task-001") ```
---
Other Configuration Files ### Backend Configuration **Path**: `anytool/config/config_grounding.json` **Purpose**: Configure backend-specific behaviors, timeouts, Smart Tool RAG system for efficient tool selection, and Tool Quality Tracking for self-evolving tool intelligence. **Key Fields**: | Backend | Field | Description | Options/Default | |---------|-------|-------------|-----------------| | **shell** | `timeout` | Command timeout (seconds) | Any integer (default: `60`) | | | `conda_env` | Auto-activate conda environment | Environment name or `null` (default: `"anytool"`) | | | `working_dir` | Working directory for command execution | Any valid path (default: current directory) | | | `default_shell` | Shell to use | `"/bin/bash"`, `"/bin/zsh"`, etc. | | **gui** | `timeout` | Operation timeout (seconds) | Any integer (default: `90`) | | | `screenshot_on_error` | Capture screenshot on failure | `true` or `false` (default: `true`) | | | `driver_type` | GUI automation driver | `"pyautogui"` or other supported drivers | | **mcp** | `timeout` | Request timeout (seconds) | Any integer (default: `30`) | | | `sandbox` | Run in E2B sandbox | `true` or `false` (default: `false`) | | | `eager_sessions` | Pre-connect all servers at startup | `true` or `false` (default: `false`, lazy connection) | | **tool_search** | `search_mode` | Tool retrieval strategy | `"semantic"`, `"hybrid"` (semantic + LLM filter), or `"llm"` (default: `"hybrid"`) | | | `max_tools` | Maximum tools to return from search | Any integer (default: `40`) | | | `enable_llm_filter` | Enable LLM-based tool pre-filtering | `true` or `false` (default: `true`) | | | `llm_filter_threshold` | Enable LLM filter when tools exceed this count | Any integer (default: `50`) | | | `enable_cache_persistence` | Persist embedding cache to disk | `true` or `false` (default: `true`) | | **tool_quality** | `enabled` | Enable tool quality tracking | `true` or `false` (default: `true`) | | | `enable_persistence` | Persist 
quality data to disk | `true` or `false` (default: `true`) | | | `cache_dir` | Directory for quality cache | Path string (default: `.anytool/tool_quality` in project directory) | | | `auto_evaluate_descriptions` | Automatically evaluate tool descriptions using LLM | `true` or `false` (default: `true`) | | | `enable_quality_ranking` | Incorporate quality scores in tool ranking | `true` or `false` (default: `true`) | | | `evolve_interval` | Trigger self-evolution every N tool executions | Any integer 1-100 (default: `5`) | --- ### Security Configuration **Path**: `anytool/config/config_security.json` **Purpose**: Define security policies with command filtering and access control. **Key Fields**: | Section | Field | Description | Options | |---------|-------|-------------|---------| | **global** | `allow_shell_commands` | Enable shell command execution | `true` or `false` (default: `true`) | | | `allow_network_access` | Enable network operations | `true` or `false` (default: `true`) | | | `allow_file_access` | Enable file system operations | `true` or `false` (default: `true`) | | | `blocked_commands` | Platform-specific command blacklist | Object with `common`, `linux`, `darwin`, `windows` arrays | | | `sandbox_enabled` | Enable sandboxing for all operations | `true` or `false` (default: `false`) | | **backend** | `shell`, `mcp`, `gui`, `web` | Per-backend security overrides | Same fields as global, backend-specific | **Example blocked commands**: `rm -rf`, `shutdown`, `reboot`, `mkfs`, `dd`, `format`, `iptables` **Behavior**: - Blocked commands are **rejected automatically** - Sandbox mode isolates operations in secure environments (E2B sandbox for MCP) --- ### Developer Configuration **Path**: `anytool/config/config_dev.json` (copy from `config_dev.json.example`) **Loading Priority**: `config_grounding.json` → `config_security.json` → `config_dev.json` (dev.json overrides the former ones)
--- ## 📖 Code Structure ### 📖 Quick Overview > **Legend**: ⚡ Core modules | 🔧 Supporting modules ``` AnyTool/ ├── anytool/ │ ├── __init__.py # Package exports │ ├── __main__.py # CLI entry point (python -m anytool) │ ├── tool_layer.py # AnyTool main class │ │ │ ├── ⚡ agents/ # Agent System │ ├── ⚡ grounding/ # Unified Backend System │ │ ├── core/ # Core abstractions │ │ └── backends/ # Backend implementations │ │ ├── shell/ # Shell command execution │ │ ├── gui/ # Anthropic Computer Use │ │ ├── mcp/ # Model Context Protocol │ │ └── web/ # Web search & browsing │ │ │ ├── 🔧 prompts/ # Prompt Templates │ ├── 🔧 llm/ # LLM Integration │ ├── 🔧 config/ # Configuration System │ ├── 🔧 local_server/ # GUI Backend Server │ ├── 🔧 recording/ # Execution Recording │ ├── 🔧 platform/ # Platform Integration │ └── 🔧 utils/ # Utilities │ ├── .anytool/ # Runtime cache │ ├── embedding_cache/ # Tool embeddings for Smart Tool RAG │ └── tool_quality/ # Persistent tool quality tracking data │ ├── logs/ # Execution logs │ ├── requirements.txt # Python dependencies ├── pyproject.toml # Package configuration └── README.md ``` --- ### 📂 Detailed Module Structure
⚡ agents/ - Agent System ``` agents/ ├── __init__.py ├── base.py # Base agent class with common functionality └── grounding_agent.py # Execution Agent (tool calling & iteration control) ``` **Key Responsibilities**: Task execution with intelligent tool selection and iteration control.
⚡ grounding/ - Unified Backend System (Core Integration Layer) **Key Responsibilities**: Unified tool abstraction, backend routing, session pooling, Smart Tool RAG, and Self-Evolving Quality Tracking. #### Core Abstractions ``` grounding/core/ ├── grounding_client.py # Unified interface across all backends ├── provider.py # Abstract provider base class ├── session.py # Session lifecycle management ├── search_tools.py # Smart Tool RAG for semantic search ├── exceptions.py # Custom exception definitions ├── types.py # Shared type definitions │ ├── tool/ # Tool abstraction layer │ ├── base.py # Tool base class │ ├── local_tool.py # Local tool implementation │ └── remote_tool.py # Remote tool implementation │ ├── quality/ # Self-evolving tool quality tracking │ ├── manager.py # Quality manager with adaptive ranking │ ├── store.py # Persistent quality data storage │ └── types.py # Quality record data types │ ├── security/ # Security & sandboxing 🔧 │ ├── policies.py # Security policy enforcement │ ├── sandbox.py # Sandbox abstraction │ └── e2b_sandbox.py # E2B sandbox integration │ ├── system/ # System-level provider │ ├── provider.py │ └── tool.py │ └── transport/ # Transport layer abstractions 🔧 ├── connectors/ │ ├── base.py │ └── aiohttp_connector.py └── task_managers/ ├── base.py ├── async_ctx.py ├── aiohttp_connection_manager.py └── placeholder.py ``` #### Backend Implementations
Shell Backend - Command execution via local server ``` backends/shell/ ├── provider.py # Shell provider implementation ├── session.py # Shell session management └── transport/ └── connector.py # HTTP connector to local server ```
GUI Backend - Anthropic Computer Use integration ``` backends/gui/ ├── provider.py # GUI provider implementation ├── session.py # GUI session management ├── tool.py # GUI-specific tools ├── anthropic_client.py # Anthropic API client wrapper ├── anthropic_utils.py # Utility functions ├── config.py # GUI configuration └── transport/ ├── connector.py # Computer Use API connector └── actions.py # Action execution logic ```
MCP Backend - Model Context Protocol servers ``` backends/mcp/ ├── provider.py # MCP provider implementation ├── session.py # MCP session management ├── client.py # MCP client ├── config.py # MCP configuration loader ├── installer.py # MCP server installer ├── tool_converter.py # Convert MCP tools to unified format ├── tool_cache.py # MCP tool cache for offline tool discovery └── transport/ ├── connectors/ # Multiple transport types │ ├── base.py │ ├── stdio.py # Standard I/O connector │ ├── http.py # HTTP connector │ ├── websocket.py # WebSocket connector │ ├── sandbox.py # Sandboxed connector │ └── utils.py └── task_managers/ # Protocol-specific managers ├── stdio.py ├── sse.py ├── streamable_http.py └── websocket.py ```
Web Backend - Search and browsing ``` backends/web/ ├── provider.py # Web provider implementation └── session.py # Web session management ```
🔧 prompts/ - Prompt Templates ``` prompts/ ├── __init__.py └── grounding_agent_prompts.py # Grounding agent system & tool selection prompts ```
🔧 llm/ - LLM Integration ``` llm/ ├── __init__.py └── client.py # LiteLLM wrapper with retry logic ```
🔧 config/ - Configuration System ``` config/ ├── __init__.py ├── loader.py # Configuration file loader ├── constants.py # System constants ├── grounding.py # Grounding configuration dataclasses ├── utils.py # Configuration utilities │ ├── config_grounding.json # Backend-specific settings ├── config_agents.json # Agent configurations ├── config_mcp.json.example # MCP server definitions (copy to config_mcp.json) ├── config_security.json # Security policies └── config_dev.json.example # Development config template ```
🔧 local_server/ - GUI Backend Server ``` local_server/ ├── __init__.py ├── main.py # Flask application entry point ├── config.json # Server configuration ├── feature_checker.py # Platform feature detection ├── health_checker.py # Server health monitoring ├── platform_adapters/ # OS-specific implementations │ ├── macos_adapter.py # macOS automation (atomacos, pyobjc) │ ├── linux_adapter.py # Linux automation (pyatspi, xlib) │ ├── windows_adapter.py # Windows automation (pywinauto) │ └── pyxcursor.py # Custom cursor handling ├── utils/ │ ├── accessibility.py # Accessibility tree utilities │ └── screenshot.py # Screenshot capture └── README.md ``` **Purpose**: Lightweight Flask service enabling computer control (GUI, Shell, Files, Screen capture).
🔧 recording/ - Execution Recording ``` recording/ ├── __init__.py ├── recorder.py # Main recording manager ├── manager.py # Recording lifecycle management ├── action_recorder.py # Action-level logging ├── video.py # Video capture integration ├── viewer.py # Trajectory viewer and analyzer └── utils.py # Recording utilities ``` **Purpose**: Execution audit with trajectory recording and video capture.
🔧 platform/ - Platform Integration ``` platform/ ├── __init__.py ├── config.py # Platform-specific configuration ├── recording.py # Recording integration ├── screenshot.py # Screenshot utilities └── system_info.py # System information gathering ```
🔧 utils/ - Shared Utilities ``` utils/ ├── logging.py # Structured logging system ├── ui.py # Terminal UI components ├── display.py # Display formatting utilities ├── cli_display.py # CLI-specific display ├── ui_integration.py # UI integration helpers └── telemetry/ # Usage analytics (opt-in) ├── __init__.py ├── events.py ├── telemetry.py └── utils.py ```
📊 logs/ - Execution Logs & Recordings ``` logs/ ├── / # Main application logs │ └── anytool_YYYY-MM-DD_HH-MM-SS.log # Timestamped log files │ └── recordings/ # Execution recordings └── task_/ # Individual recording session ├── trajectory.json # Complete execution trajectory ├── screenshots/ # Visual execution record (GUI backend) │ ├── tool__.png │ ├── tool__.png │ └── ... # Sequential screenshots ├── workspace/ # Task workspace │ └── [generated files] # Files created during execution └── screen_recording.mp4 # Video recording (if enabled) ``` **Recording Control**: Enable via `AnyToolConfig(enable_recording=True)`, filter backends with `recording_backends=["gui", "shell", ...]`
--- ## 🔗 Related Projects AnyTool builds upon excellent open-source projects; we sincerely thank their authors and contributors: - **[OSWorld](https://github.com/xlang-ai/OSWorld)**: Comprehensive benchmark for evaluating computer-use agents across diverse operating system tasks. - **[mcp-use](https://github.com/mcp-use/mcp-use)**: Platform that simplifies MCP agent development with client SDKs. ---
**🌟 If this project helps you, please give us a Star!** **🤖 Empower AI agents with intelligent tool orchestration!**
---

❤️ Thanks for visiting ✨ AnyTool!

Views

def __getattr__(name: str) -> _Any:
    """Dynamically import sub-modules on first attribute access.

    This keeps the *initial* package import lightweight and avoids raising
    `ModuleNotFoundError` for optional / heavy dependencies until the
    corresponding functionality is explicitly used.

    Args:
        name: Attribute requested on the ``anytool`` package.

    Returns:
        The attribute of the same name taken from the sub-module that
        ``_attr_to_module`` maps it to.

    Raises:
        AttributeError: If ``name`` is not listed in ``_attr_to_module``.
    """
    if name not in _attr_to_module:
        raise AttributeError(f"module 'anytool' has no attribute '{name}'")

    module_name = _attr_to_module[name]
    module = _imp(module_name)
    value = getattr(module, name)
    # Cache in the module namespace so later lookups bypass __getattr__.
    globals()[name] = value
    return value


def __dir__():
    """Return the sorted attribute names exposed by the ``anytool`` package.

    Combines the names already bound in the module namespace with the lazily
    importable names from ``_attr_to_module``.  Because ``__getattr__`` caches
    resolved attributes in ``globals()``, a name can appear in both sources;
    the set union reports it only once.
    """
    return sorted(set(globals()) | set(_attr_to_module))
async def _execute_task(anytool: AnyTool, query: str, ui_manager: UIManager):
    """Run one query with the live display active, then print its summary.

    Args:
        anytool: AnyTool instance whose ``execute`` coroutine runs the query.
        query: Task text to execute.
        ui_manager: Starts/stops the live display and prints the summary.

    Returns:
        The value returned by ``anytool.execute(query)``.

    Raises:
        Whatever ``anytool.execute`` raises; the live display is stopped
        before the exception propagates.
    """
    await ui_manager.start_live_display()
    try:
        result = await anytool.execute(query)
    finally:
        # Always undo start_live_display(): otherwise a failed execute() leaves
        # the display running and the loggers it suppressed stay suppressed.
        await ui_manager.stop_live_display()

    ui_manager.print_summary(result)
    return result
def _create_argument_parser() -> argparse.ArgumentParser:
    """Create command-line argument parser.

    The parser accepts an optional ``refresh-cache`` subcommand (stored in
    ``args.command``) plus the run-mode options for configuration, LLM model,
    logging, execution limits and UI behaviour.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='AnyTool - Universal Tool-Use Layer for AI Agents',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # refresh-cache subcommand
    cache_parser = subparsers.add_parser(
        'refresh-cache',
        help='Refresh MCP tool cache (starts all servers once)'
    )
    cache_parser.add_argument(
        '--config', '-c', type=str,
        help='MCP configuration file path'
    )

    # Basic arguments (for run mode)
    parser.add_argument('--config', '-c', type=str, help='Configuration file path (JSON format)')
    parser.add_argument('--query', '-q', type=str, help='Single query mode: execute query directly')

    # LLM arguments
    parser.add_argument('--model', '-m', type=str, help='LLM model name')

    # Logging arguments
    parser.add_argument('--log-level', type=str,
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        help='Log level')

    # Execution arguments
    parser.add_argument('--max-iterations', type=int, help='Maximum iteration count')
    parser.add_argument('--timeout', type=float, help='LLM API call timeout (seconds)')

    # UI arguments
    parser.add_argument('--interactive', '-i', action='store_true', help='Force interactive mode')
    parser.add_argument('--no-ui', action='store_true', help='Disable visualization UI')
    parser.add_argument('--ui-compact', action='store_true', help='Use compact UI layout')

    return parser
tool cache...") print("Servers will be started one by one (start -> get tools -> close).") print() # Load config if config_path: config = load_config(config_path) else: config = get_config() # Get MCP config mcp_config = getattr(config, 'mcp', None) or {} if hasattr(mcp_config, 'model_dump'): mcp_config = mcp_config.model_dump() # Skip dependency checks for refresh-cache (servers are pre-validated) mcp_config["check_dependencies"] = False # Create provider provider = MCPProvider(config=mcp_config) await provider.initialize() servers = provider.list_servers() total = len(servers) print(f"Found {total} MCP servers configured") print() cache = get_tool_cache() cache.set_server_order(servers) # Preserve config order when saving total_tools = 0 success_count = 0 skipped_count = 0 failed_servers = [] # Load existing cache to skip already processed servers existing_cache = cache.get_all_tools() # Timeout for each server (in seconds) SERVER_TIMEOUT = 60 # Process servers one by one for i, server_name in enumerate(servers, 1): # Skip if already cached (resume support) if server_name in existing_cache: cached_tools = existing_cache[server_name] total_tools += len(cached_tools) skipped_count += 1 print(f"[{i}/{total}] {server_name}... 
⏭ cached ({len(cached_tools)} tools)") continue print(f"[{i}/{total}] {server_name}...", end=" ", flush=True) session_id = f"mcp-{server_name}" try: # Create session and get tools with timeout protection async with asyncio.timeout(SERVER_TIMEOUT): # Create session for this server cfg = SessionConfig( session_name=session_id, backend_type=BackendType.MCP, connection_params={"server": server_name}, ) session = await provider.create_session(cfg) # Get tools from this server tools = await session.list_tools() # Convert to metadata format tool_metadata = [] for tool in tools: tool_metadata.append({ "name": tool.schema.name, "description": tool.schema.description or "", "parameters": tool.schema.parameters or {}, }) # Save to cache (incremental) cache.save_server(server_name, tool_metadata) # Close session immediately to free resources await provider.close_session(session_id) total_tools += len(tools) success_count += 1 print(f"✓ {len(tools)} tools") except asyncio.TimeoutError: error_msg = f"Timeout after {SERVER_TIMEOUT}s" failed_servers.append((server_name, error_msg)) print(f"✗ {error_msg}") # Save failed server info to cache cache.save_failed_server(server_name, error_msg) # Try to close session if it was created try: await provider.close_session(session_id) except Exception: pass except Exception as e: error_msg = str(e) failed_servers.append((server_name, error_msg)) print(f"✗ {error_msg[:50]}") # Save failed server info to cache cache.save_failed_server(server_name, error_msg) # Try to close session if it was created try: await provider.close_session(session_id) except Exception: pass print() print(f"{'='*50}") print(f"✓ Collected {total_tools} tools from {success_count + skipped_count}/{total} servers") if skipped_count > 0: print(f" (skipped {skipped_count} cached, processed {success_count} new)") print(f"✓ Cache saved to: {cache.cache_path}") if failed_servers: print(f"✗ Failed servers ({len(failed_servers)}):") for name, err in failed_servers[:10]: print(f" - 
async def _initialize_anytool(config: AnyToolConfig, args) -> AnyTool:
    """Create an AnyTool instance, initialize it, and print progress to the CLI.

    When no config file was given on the command line, the ``anytool``,
    ``anytool.grounding`` and ``anytool.agents`` loggers are raised to WARNING
    while ``initialize()`` runs, and each is put back to its own previous
    level afterwards (also when initialization fails).

    Args:
        config: Configuration used to construct the AnyTool instance.
        args: Parsed command-line arguments; only ``args.config`` is read.

    Returns:
        The initialized AnyTool instance.
    """
    anytool = AnyTool(config)

    init_steps = [("Initializing AnyTool...", "loading")]
    CLIDisplay.print_initialization_progress(init_steps, show_header=False)

    # Remember every logger's own level. Restoring all of them to the level of
    # "anytool" would overwrite a child logger's distinct level (for example
    # NOTSET, which means "inherit from the parent").
    saved_levels = []
    if not args.config:
        for log_name in ["anytool", "anytool.grounding", "anytool.agents"]:
            log = Logger.get_logger(log_name)
            saved_levels.append((log, log.level))
            log.setLevel(logging.WARNING)

    try:
        await anytool.initialize()
    finally:
        # Restore log levels even if initialization raised.
        for log, level in saved_levels:
            log.setLevel(level)

    # Print initialization results
    backends = anytool.list_backends()
    init_steps = [
        ("LLM Client", "ok"),
        (f"Grounding Backends ({len(backends)} available)", "ok"),
        ("Grounding Agent", "ok"),
    ]

    if config.enable_recording:
        init_steps.append(("Recording Manager", "ok"))

    CLIDisplay.print_initialization_progress(init_steps, show_header=True)

    return anytool
class BaseAgent(ABC):
    """Abstract base for agents that reason with an LLM client.

    Subclasses implement :meth:`process` and :meth:`construct_messages`.
    Instantiating a subclass registers that class in :class:`AgentRegistry`
    under its class name (once).
    """

    def __init__(
        self,
        name: str,
        backend_scope: Optional[List[str]] = None,
        llm_client: Optional[LLMClient] = None,
        grounding_client: Optional[GroundingClient] = None,
        recording_manager: Optional[RecordingManager] = None,
    ) -> None:
        """
        Initialize the BaseAgent.

        Args:
            name: Unique name for the agent
            backend_scope: List of backend types this agent can access
                (e.g., ["gui", "shell", "mcp", "web", "system"])
            llm_client: LLM client for agent reasoning (optional, can be set later)
            grounding_client: Reference to GroundingClient for tool execution
            recording_manager: RecordingManager for recording execution
        """
        self._name = name
        self._grounding_client: Optional[GroundingClient] = grounding_client
        self._backend_scope = backend_scope or []
        self._llm_client = llm_client
        self._recording_manager: Optional[RecordingManager] = recording_manager
        self._step = 0
        self._status = AgentStatus.ACTIVE

        self._register_self()
        logger.info(f"Initialized {self.__class__.__name__}: {name}")

    @property
    def name(self) -> str:
        """Name given to the agent at construction."""
        return self._name

    @property
    def grounding_client(self) -> Optional[GroundingClient]:
        """Get the grounding client."""
        return self._grounding_client

    @property
    def backend_scope(self) -> List[str]:
        """Backend types this agent may access (empty list when none were given)."""
        return self._backend_scope

    @property
    def llm_client(self) -> Optional[LLMClient]:
        """LLM client used by :meth:`get_llm_response`; may be ``None``."""
        return self._llm_client

    @llm_client.setter
    def llm_client(self, client: LLMClient) -> None:
        self._llm_client = client

    @property
    def recording_manager(self) -> Optional[RecordingManager]:
        """Get the recording manager."""
        return self._recording_manager

    @property
    def step(self) -> int:
        """Counter advanced by :meth:`increment_step`; starts at 0."""
        return self._step

    @property
    def status(self) -> str:
        """Current status string (one of the :class:`AgentStatus` constants)."""
        return self._status

    @abstractmethod
    async def process(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one request described by ``context`` and return a result dict."""
        pass

    @abstractmethod
    def construct_messages(self, context: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Construct messages for LLM reasoning.
        Context must contain 'instruction' key.
        """
        pass

    async def get_llm_response(
        self,
        messages: List[Dict[str, Any]],
        tools: Optional[List] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """Forward ``messages`` (and optional ``tools``) to the LLM client.

        Args:
            messages: Chat messages passed to ``llm_client.complete``.
            tools: Optional tool list passed through unchanged.
            **kwargs: Extra keyword arguments passed through unchanged.

        Returns:
            The value returned by ``llm_client.complete``.

        Raises:
            ValueError: If no LLM client has been set.
            Exception: Any error from the client is logged and re-raised.
        """
        if not self._llm_client:
            raise ValueError(f"LLM client not initialized for agent {self.name}")

        try:
            response = await self._llm_client.complete(
                messages=messages,
                tools=tools,
                **kwargs
            )
            return response
        except Exception as e:
            logger.error(f"{self.name}: LLM call failed: {e}", exc_info=True)
            raise

    def response_to_dict(self, response: str) -> Dict[str, Any]:
        """Parse an LLM text response as JSON.

        A leading Markdown code fence (```` ``` ```` or ```` ```json ````) is
        removed first, keeping the lines up to the closing fence.  If parsing
        fails only because text follows a valid JSON document, that leading
        document is returned and the trailing text is logged.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            The decoded JSON value, or
            ``{"error": "Failed to parse response", "raw": <text>}`` when the
            text cannot be parsed.
        """
        try:
            if response.strip().startswith("```"):
                lines = response.strip().split('\n')
                if lines and lines[0].startswith('```'):
                    lines = lines[1:]
                end_idx = len(lines)
                for i, line in enumerate(lines):
                    if line.strip() == '```':
                        end_idx = i
                        break
                response = '\n'.join(lines[:end_idx])

            return json.loads(response)
        except json.JSONDecodeError as e:
            # If parsing fails, try to find and extract just the JSON object/array
            if "Extra data" in str(e):
                try:
                    # json.loads() skips leading whitespace but raw_decode()
                    # does not, so strip it or the fallback fails on input
                    # that loads() had already partially accepted.
                    candidate = response.lstrip()
                    decoder = json.JSONDecoder()
                    obj, idx = decoder.raw_decode(candidate)
                    logger.warning(
                        f"{self.name}: Successfully extracted JSON but found extra text after position {idx}. "
                        f"Extra text: {candidate[idx:idx+100]}..."
                    )
                    return obj
                except Exception as e2:
                    logger.error(f"{self.name}: Failed to extract JSON even with raw_decode: {e2}")

            logger.error(f"{self.name}: Failed to parse response: {e}")
            logger.error(f"{self.name}: Response content: {response[:500]}")
            return {"error": "Failed to parse response", "raw": response}

    def increment_step(self) -> None:
        """Advance the step counter by one."""
        self._step += 1

    @classmethod
    def _register_self(cls) -> None:
        """Register the agent class in the registry upon instantiation."""
        # Get the actual instance class, not BaseAgent
        if cls.__name__ != "BaseAgent" and cls.__name__ not in AgentRegistry._registry:
            AgentRegistry.register(cls.__name__, cls)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}(name={self.name}, step={self.step}, status={self.status})>"


class AgentStatus:
    """Constants for agent status."""
    ACTIVE = "active"
    IDLE = "idle"
    WAITING = "waiting"


class AgentRegistry:
    """
    Registry for managing agent classes.
    Allows dynamic registration and retrieval of agent types.
    """

    # Maps a registration name to the agent class stored under it.
    _registry: Dict[str, Type[BaseAgent]] = {}

    @classmethod
    def register(cls, name: str, agent_cls: Type[BaseAgent]) -> None:
        """Store ``agent_cls`` under ``name``, overwriting (with a warning) any existing entry."""
        if name in cls._registry:
            logger.warning(f"Agent class '{name}' already registered, overwriting")
        cls._registry[name] = agent_cls
        logger.debug(f"Registered agent class: {name}")

    @classmethod
    def get_cls(cls, name: str) -> Type[BaseAgent]:
        """Return the class registered under ``name``.

        Raises:
            ValueError: If nothing is registered under ``name``.
        """
        if name not in cls._registry:
            raise ValueError(f"No agent class registered under '{name}'")
        return cls._registry[name]

    @classmethod
    def list_registered(cls) -> List[str]:
        """Return the registered names in insertion order."""
        return list(cls._registry.keys())

    @classmethod
    def clear(cls) -> None:
        """Remove every registered agent class."""
        cls._registry.clear()
        logger.debug("Agent registry cleared")
BackendType, ToolResult from anytool.platform.screenshot import ScreenshotClient from anytool.prompts import GroundingAgentPrompts from anytool.utils.logging import Logger if TYPE_CHECKING: from anytool.llm import LLMClient from anytool.grounding.core.grounding_client import GroundingClient from anytool.recording import RecordingManager logger = Logger.get_logger(__name__) class GroundingAgent(BaseAgent): def __init__( self, name: str = "GroundingAgent", backend_scope: Optional[List[str]] = None, llm_client: Optional[LLMClient] = None, grounding_client: Optional[GroundingClient] = None, recording_manager: Optional[RecordingManager] = None, system_prompt: Optional[str] = None, max_iterations: int = 15, visual_analysis_timeout: float = 30.0, tool_retrieval_llm: Optional[LLMClient] = None, visual_analysis_model: Optional[str] = None, ) -> None: """ Initialize the Grounding Agent. Args: name: Agent name backend_scope: List of backends this agent can access (None = all available) llm_client: LLM client for reasoning grounding_client: GroundingClient for tool execution recording_manager: RecordingManager for recording execution system_prompt: Custom system prompt max_iterations: Maximum LLM reasoning iterations for self-correction visual_analysis_timeout: Timeout for visual analysis LLM calls in seconds tool_retrieval_llm: LLM client for tool retrieval filter (None = use llm_client) visual_analysis_model: Model name for visual analysis (None = use llm_client.model) """ super().__init__( name=name, backend_scope=backend_scope or ["gui", "shell", "mcp", "web", "system"], llm_client=llm_client, grounding_client=grounding_client, recording_manager=recording_manager ) self._system_prompt = system_prompt or self._default_system_prompt() self._max_iterations = max_iterations self._visual_analysis_timeout = visual_analysis_timeout self._tool_retrieval_llm = tool_retrieval_llm self._visual_analysis_model = visual_analysis_model logger.info(f"Grounding Agent initialized: {name}") 
def _truncate_messages(
    self,
    messages: List[Dict[str, Any]],
    keep_recent: int = 8,
    max_tokens_estimate: int = 120000
) -> List[Dict[str, Any]]:
    """Shrink the message history once its estimated size exceeds a budget.

    Size is estimated as ``len(json.dumps(messages)) // 4`` tokens.  When the
    history is short or under budget the input list is returned unchanged.
    Otherwise the result holds every system message, the first user message,
    and the last ``keep_recent * 2`` remaining messages.

    Args:
        messages: Chat history as role/content dicts.
        keep_recent: Number of recent rounds to keep (two messages per round);
            ``0`` or less keeps no conversation messages.
        max_tokens_estimate: Estimated-token budget above which truncation runs.

    Returns:
        ``messages`` itself when no truncation is needed, else a new list.
    """
    if len(messages) <= keep_recent + 2:  # +2 for system and initial user
        return messages

    total_text = json.dumps(messages, ensure_ascii=False)
    estimated_tokens = len(total_text) // 4

    if estimated_tokens < max_tokens_estimate:
        return messages

    logger.info(f"Truncating message history: {len(messages)} messages, "
                f"~{estimated_tokens:,} tokens -> keeping recent {keep_recent} rounds")

    system_messages = []
    user_instruction = None
    conversation_messages = []

    for msg in messages:
        role = msg.get("role")
        if role == "system":
            system_messages.append(msg)
        elif role == "user" and user_instruction is None:
            user_instruction = msg
        else:
            conversation_messages.append(msg)

    # Guard the slice: conversation_messages[-0:] would keep *everything*.
    keep_count = keep_recent * 2
    recent_messages = conversation_messages[-keep_count:] if keep_count > 0 else []

    # The cut can land between an assistant tool-call message and its results.
    # Drop leading "tool" messages so no tool result is left without the
    # assistant message that requested it.
    first_kept = 0
    while first_kept < len(recent_messages) and recent_messages[first_kept].get("role") == "tool":
        first_kept += 1
    recent_messages = recent_messages[first_kept:]

    truncated = system_messages.copy()
    if user_instruction:
        truncated.append(user_instruction)
    truncated.extend(recent_messages)

    logger.info(f"After truncation: {len(truncated)} messages, "
                f"~{len(json.dumps(truncated, ensure_ascii=False))//4:,} tokens (estimated)")

    return truncated
""" instruction = context.get("instruction", "") if not instruction: logger.error("Grounding Agent: No instruction provided") return {"error": "No instruction provided", "status": "error"} # Store current instruction for visual analysis context self._current_instruction = instruction logger.info(f"Grounding Agent: Processing instruction at step {self.step}") # Exist workspace files check workspace_info = await self._check_workspace_artifacts(context) if workspace_info["has_files"]: context["workspace_artifacts"] = workspace_info logger.info(f"Workspace has {len(workspace_info['files'])} existing files: {workspace_info['files']}") # Get available tools (auto-search with cap) tools = await self._get_available_tools(instruction) # Get search debug info (similarity scores, LLM selections) search_debug_info = None if self.grounding_client: search_debug_info = self.grounding_client.get_last_search_debug_info() # Build retrieved tools list for return value retrieved_tools_list = [] for tool in tools: tool_info = { "name": getattr(tool, "name", str(tool)), "description": getattr(tool, "description", ""), } if hasattr(tool, "backend_type"): tool_info["backend"] = tool.backend_type.value if hasattr(tool.backend_type, "value") else str(tool.backend_type) if hasattr(tool, "_runtime_info") and tool._runtime_info: tool_info["server_name"] = tool._runtime_info.server_name # Add similarity score if available if search_debug_info and search_debug_info.get("tool_scores"): for score_info in search_debug_info["tool_scores"]: if score_info["name"] == tool_info["name"]: tool_info["similarity_score"] = score_info["score"] break retrieved_tools_list.append(tool_info) # Record retrieved tools if self._recording_manager: from anytool.recording import RecordingManager await RecordingManager.record_retrieved_tools( task_instruction=instruction, tools=tools, search_debug_info=search_debug_info, ) # Initialize iteration state max_iterations = context.get("max_iterations", self._max_iterations) 
current_iteration = 0 all_tool_results = [] iteration_contexts = [] consecutive_empty_responses = 0 # Track consecutive empty LLM responses MAX_CONSECUTIVE_EMPTY = 5 # Exit after this many empty responses # Build initial messages messages = self.construct_messages(context) try: while current_iteration < max_iterations: current_iteration += 1 logger.info(f"Grounding Agent: Iteration {current_iteration}/{max_iterations}") # Truncate message history to prevent context length issues # Start truncating after 5 iterations to keep context manageable if current_iteration >= 5: messages = self._truncate_messages( messages, keep_recent=8, # 保留最近8轮对话 max_tokens_estimate=120000 # Claude Sonnet 4.5 上下文限制是200K,保守使用120K ) messages_input_snapshot = copy.deepcopy(messages) # [DISABLED] Iteration summary generation # Tool results (including visual analysis) are already in context, # LLM can make decisions directly without separate summary. # To re-enable, uncomment below and pass iteration_summary_prompt to complete() # iteration_summary_prompt = GroundingAgentPrompts.iteration_summary( # instruction=instruction, # iteration=current_iteration, # max_iterations=max_iterations # ) if context.get("auto_execute", True) else None # Call LLMClient for single round # LLM will decide whether to call tools or finish with llm_response = await self._llm_client.complete( messages=messages, tools=tools if context.get("auto_execute", True) else None, execute_tools=context.get("auto_execute", True), summary_prompt=None, # Disabled tool_result_callback=self._visual_analysis_callback ) # Update messages with LLM response messages = llm_response["messages"] # Collect tool results tool_results_this_iteration = llm_response.get("tool_results", []) if tool_results_this_iteration: all_tool_results.extend(tool_results_this_iteration) # [DISABLED] Iteration summary logging # llm_summary = llm_response.get("iteration_summary") # if llm_summary: # logger.info(f"Iteration {current_iteration} summary: 
{llm_summary[:150]}...") assistant_message = llm_response.get("message", {}) assistant_content = assistant_message.get("content", "") has_tool_calls = llm_response.get('has_tool_calls', False) logger.info(f"Iteration {current_iteration} - Has tool calls: {has_tool_calls}, " f"Tool results: {len(tool_results_this_iteration)}, " f"Content length: {len(assistant_content)} chars") if len(assistant_content) > 0: logger.info(f"Iteration {current_iteration} - Assistant content preview: {repr(assistant_content[:300])}") consecutive_empty_responses = 0 # Reset counter on valid response else: if not has_tool_calls: consecutive_empty_responses += 1 logger.warning(f"Iteration {current_iteration} - NO tool calls and NO content " f"(empty response {consecutive_empty_responses}/{MAX_CONSECUTIVE_EMPTY})") if consecutive_empty_responses >= MAX_CONSECUTIVE_EMPTY: logger.error(f"Exiting due to {MAX_CONSECUTIVE_EMPTY} consecutive empty LLM responses. " "This may indicate API issues, rate limiting, or context too long.") break else: consecutive_empty_responses = 0 # Reset if we have tool calls # Snapshot messages after LLM call (accumulated context) messages_output_snapshot = copy.deepcopy(messages) # Record iteration context iteration_context = { "iteration": current_iteration, "messages_input": messages_input_snapshot, "messages_output": messages_output_snapshot, "llm_response_summary": { "assistant_content": assistant_content, "has_tool_calls": has_tool_calls, # "iteration_summary": llm_summary, # Disabled with iteration summary "tool_calls_count": len(tool_results_this_iteration), }, } iteration_contexts.append(iteration_context) # Real-time save to conversations.jsonl from anytool.recording import RecordingManager await RecordingManager.record_iteration_context( iteration=current_iteration, messages_input=messages_input_snapshot, messages_output=messages_output_snapshot, llm_response_summary=iteration_context["llm_response_summary"], ) # Check for completion token in assistant 
content # [DISABLED] Also check in iteration summary when enabled # is_complete = ( # GroundingAgentPrompts.TASK_COMPLETE in assistant_content or # (llm_summary and GroundingAgentPrompts.TASK_COMPLETE in llm_summary) # ) is_complete = GroundingAgentPrompts.TASK_COMPLETE in assistant_content if is_complete: # Task is complete - LLM generated completion token logger.info(f"Task completed at iteration {current_iteration} (found {GroundingAgentPrompts.TASK_COMPLETE})") break else: # LLM didn't generate , continue to next iteration if tool_results_this_iteration: logger.debug(f"Task in progress, LLM called {len(tool_results_this_iteration)} tools") else: logger.debug(f"Task in progress, LLM did not generate ") # Remove previous iteration guidance to avoid accumulation messages = [ msg for msg in messages if not (msg.get("role") == "system" and "Iteration" in msg.get("content", "") and "complete" in msg.get("content", "")) ] guidance_msg = { "role": "system", "content": f"Iteration {current_iteration} complete. " f"Check if task is finished - if yes, output {GroundingAgentPrompts.TASK_COMPLETE}. " f"If not, continue with next action." 
} messages.append(guidance_msg) # [DISABLED] Full iteration feedback with summary # self._remove_previous_guidance(messages) # feedback_msg = self._build_iteration_feedback( # iteration=current_iteration, # llm_summary=llm_summary, # add_guidance=True # ) # if feedback_msg: # messages.append(feedback_msg) # logger.debug(f"Added iteration {current_iteration} feedback with guidance") continue # Build final result result = await self._build_final_result( instruction=instruction, messages=messages, all_tool_results=all_tool_results, iterations=current_iteration, max_iterations=max_iterations, iteration_contexts=iteration_contexts, retrieved_tools_list=retrieved_tools_list, search_debug_info=search_debug_info, ) # Record agent action to recording manager if self._recording_manager: await self._record_agent_execution(result, instruction) # Increment step self.increment_step() logger.info(f"Grounding Agent: Execution completed with status: {result.get('status')}") return result except Exception as e: logger.error(f"Grounding Agent: Execution failed: {e}") result = { "error": str(e), "status": "error", "instruction": instruction, "iteration": current_iteration } self.increment_step() return result def _default_system_prompt(self) -> str: """Default system prompt for the grounding agent.""" return GroundingAgentPrompts.SYSTEM_PROMPT def construct_messages( self, context: Dict[str, Any] ) -> List[Dict[str, Any]]: messages = [{"role": "system", "content": self._system_prompt}] # Get instruction from context instruction = context.get("instruction", "") if not instruction: raise ValueError("context must contain 'instruction' field") # Add workspace directory workspace_dir = context.get("workspace_dir") if workspace_dir: messages.append({ "role": "system", "content": GroundingAgentPrompts.workspace_directory(workspace_dir) }) # Add workspace artifacts information workspace_artifacts = context.get("workspace_artifacts") if workspace_artifacts and 
workspace_artifacts.get("has_files"): files = workspace_artifacts.get("files", []) matching_files = workspace_artifacts.get("matching_files", []) recent_files = workspace_artifacts.get("recent_files", []) if matching_files: artifact_msg = GroundingAgentPrompts.workspace_matching_files(matching_files) elif len(recent_files) >= 2: artifact_msg = GroundingAgentPrompts.workspace_recent_files( total_files=len(files), recent_files=recent_files ) else: artifact_msg = GroundingAgentPrompts.workspace_file_list(files) messages.append({ "role": "system", "content": artifact_msg }) # User instruction messages.append({"role": "user", "content": instruction}) return messages async def _get_available_tools(self, task_description: Optional[str]) -> List: """ Retrieve tools with auto-search + cap to control prompt bloat. Falls back to returning all tools if search fails. """ grounding_client = self.grounding_client if not grounding_client: return [] backends = [BackendType(name) for name in self._backend_scope] try: # Use dedicated tool retrieval LLM if configured, otherwise use main LLM retrieval_llm = self._tool_retrieval_llm or self._llm_client tools = await grounding_client.get_tools_with_auto_search( task_description=task_description, backend=backends, use_cache=True, llm_callable=retrieval_llm, ) logger.info( f"GroundingAgent selected {len(tools)} tools (auto-search) from {len(backends)} backends" ) return tools except Exception as e: logger.warning(f"Auto-search tools failed, falling back to full list: {e}") # Fallback: fetch all tools (previous behaviour) all_tools = [] for backend_name in self._backend_scope: try: backend_type = BackendType(backend_name) tools = await grounding_client.list_tools(backend=backend_type) all_tools.extend(tools) logger.debug(f"Retrieved {len(tools)} tools from backend: {backend_name}") except Exception as e: logger.debug(f"Could not get tools from {backend_name}: {e}") logger.info( f"GroundingAgent fallback retrieved {len(all_tools)} tools from 
{len(self._backend_scope)} backends" ) return all_tools async def _visual_analysis_callback( self, result: ToolResult, tool_name: str, tool_call: Dict, backend: str ) -> ToolResult: """ Callback for LLMClient to handle visual analysis after tool execution. """ # 1. Check if LLM requested to skip visual analysis skip_visual_analysis = False try: arguments = tool_call.function.arguments if isinstance(arguments, str): args = json.loads(arguments.strip() or "{}") else: args = arguments if isinstance(args, dict) and args.get("skip_visual_analysis"): skip_visual_analysis = True logger.info(f"Visual analysis skipped for {tool_name} (meta-parameter set by LLM)") except Exception as e: logger.debug(f"Could not parse tool arguments: {e}") # 2. If skip requested, return original result if skip_visual_analysis: return result # 3. Check if this backend needs visual analysis if backend != "gui": return result # 4. Check if tool has visual data metadata = getattr(result, 'metadata', None) has_screenshots = metadata and (metadata.get("screenshot") or metadata.get("screenshots")) # 5. If no visual data, try to capture a screenshot if not has_screenshots: try: logger.info(f"No visual data from {tool_name}, capturing screenshot...") screenshot_client = ScreenshotClient() screenshot_bytes = await screenshot_client.capture() if screenshot_bytes: # Add screenshot to result metadata if metadata is None: result.metadata = {} metadata = result.metadata metadata["screenshot"] = screenshot_bytes has_screenshots = True logger.info(f"Screenshot captured for visual analysis") else: logger.warning("Failed to capture screenshot") except Exception as e: logger.warning(f"Error capturing screenshot: {e}") # 6. If still no screenshots, return original result if not has_screenshots: logger.debug(f"No visual data available for {tool_name}") return result # 7. 
async def _enhance_result_with_visual_context(
    self,
    result: ToolResult,
    tool_name: str
) -> ToolResult:
    """
    Append an LLM-written description of the result's screenshots to its content.

    Screenshots are read from ``result.metadata`` ("screenshots" first, then
    "screenshot"), reduced to at most three, sent to the visual model together
    with a prompt, and the model's answer is appended to the textual content
    of a new ToolResult. On timeout or any other failure the original result
    is returned unchanged.

    Args:
        result: Tool result whose metadata may carry screenshots.
        tool_name: Name of the tool that produced the result (used in the
            prompt and in log messages).

    Returns:
        A new ToolResult with the analysis attached, or ``result`` itself.
    """
    import asyncio
    import base64
    import litellm

    try:
        metadata = getattr(result, 'metadata', None)
        if not metadata:
            return result

        # A truthy "screenshots" entry wins even when it is not a list (it then
        # yields nothing); "screenshot" is only consulted when the plural key
        # is missing or falsy.
        many = metadata.get("screenshots")
        if many:
            candidates = [shot for shot in many if shot] if isinstance(many, list) else []
        elif metadata.get("screenshot"):
            candidates = [metadata["screenshot"]]
        else:
            candidates = []

        if not candidates:
            return result

        # Keep the request small when a tool produced many screenshots
        key_shots = self._select_key_screenshots(candidates, max_count=3)

        # Raw bytes are base64-encoded; anything else is passed through as-is
        encoded_shots = [
            base64.b64encode(shot).decode('utf-8') if isinstance(shot, bytes) else shot
            for shot in key_shots
        ]

        num_screenshots = len(encoded_shots)
        prompt = GroundingAgentPrompts.visual_analysis(
            tool_name=tool_name,
            num_screenshots=num_screenshots,
            task_description=getattr(self, '_current_instruction', '')
        )

        # One text part followed by one image part per screenshot
        content = [{"type": "text", "text": prompt}]
        content.extend(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{encoded}"
                }
            }
            for encoded in encoded_shots
        )

        # Dedicated visual model first, then the main LLM's model, then a fixed default
        if self._visual_analysis_model:
            visual_model = self._visual_analysis_model
        elif self._llm_client:
            visual_model = self._llm_client.model
        else:
            visual_model = "openrouter/anthropic/claude-sonnet-4.5"

        # The outer wait_for gets 5 extra seconds on top of the request timeout
        request_timeout = self._visual_analysis_timeout
        response = await asyncio.wait_for(
            litellm.acompletion(
                model=visual_model,
                messages=[{"role": "user", "content": content}],
                timeout=request_timeout
            ),
            timeout=request_timeout + 5
        )

        analysis = response.choices[0].message.content.strip()

        base_text = result.content or "(no text output)"
        enriched_metadata = dict(metadata)
        enriched_metadata.update({"visual_analyzed": True, "visual_analysis": analysis})

        enhanced_result = ToolResult(
            status=result.status,
            content=f"{base_text}\n\n**Visual content**: {analysis}",
            error=result.error,
            metadata=enriched_metadata,
            execution_time=result.execution_time
        )

        logger.info(f"Enhanced {tool_name} result with visual analysis ({num_screenshots} screenshot(s))")
        return enhanced_result

    except asyncio.TimeoutError:
        logger.warning(f"Visual analysis timed out for {tool_name}, returning original result")
        return result
    except Exception as e:
        logger.warning(f"Failed to analyze visual content for {tool_name}: {e}")
        return result
""" if len(screenshots) <= max_count: return screenshots selected_indices = set() # Always include last (final state) selected_indices.add(len(screenshots) - 1) # If room, include first (initial state) if max_count >= 2: selected_indices.add(0) # Fill remaining slots with evenly spaced middle screenshots remaining_slots = max_count - len(selected_indices) if remaining_slots > 0: # Calculate spacing available_indices = [ i for i in range(1, len(screenshots) - 1) if i not in selected_indices ] if available_indices: step = max(1, len(available_indices) // (remaining_slots + 1)) for i in range(remaining_slots): idx = min((i + 1) * step, len(available_indices) - 1) if idx < len(available_indices): selected_indices.add(available_indices[idx]) # Return screenshots in original order selected = [screenshots[i] for i in sorted(selected_indices)] logger.debug( f"Selected {len(selected)} screenshots at indices {sorted(selected_indices)} " f"from total of {len(screenshots)}" ) return selected def _get_workspace_path(self, context: Dict[str, Any]) -> Optional[str]: """ Get workspace directory path from context. """ return context.get("workspace_dir") def _scan_workspace_files( self, workspace_path: str, recent_threshold: int = 600 # seconds ) -> Dict[str, Any]: """ Scan workspace directory and collect file information. 
Args: workspace_path: Path to workspace directory recent_threshold: Threshold in seconds for recent files Returns: Dictionary with file information: - files: List of all filenames - file_details: Dict mapping filename to file info (size, modified, age_seconds) - recent_files: List of recently modified filenames """ import os import time result = { "files": [], "file_details": {}, "recent_files": [] } if not workspace_path or not os.path.exists(workspace_path): return result # Recording system files to exclude from workspace scanning excluded_files = {"metadata.json", "traj.jsonl"} try: current_time = time.time() for filename in os.listdir(workspace_path): filepath = os.path.join(workspace_path, filename) if os.path.isfile(filepath) and filename not in excluded_files: result["files"].append(filename) # Get file stats stat = os.stat(filepath) file_info = { "size": stat.st_size, "modified": stat.st_mtime, "age_seconds": current_time - stat.st_mtime } result["file_details"][filename] = file_info # Track recently created/modified files if file_info["age_seconds"] < recent_threshold: result["recent_files"].append(filename) result["files"] = sorted(result["files"]) except Exception as e: logger.debug(f"Error scanning workspace files: {e}") return result async def _check_workspace_artifacts(self, context: Dict[str, Any]) -> Dict[str, Any]: """ Check workspace directory for existing artifacts that might be relevant to the task. Enhanced to detect if task might already be completed. 
""" import re workspace_info = {"has_files": False, "files": [], "file_details": {}, "recent_files": []} try: # Get workspace path workspace_path = self._get_workspace_path(context) # Scan workspace files scan_result = self._scan_workspace_files(workspace_path, recent_threshold=600) if scan_result["files"]: workspace_info["has_files"] = True workspace_info["files"] = scan_result["files"] workspace_info["file_details"] = scan_result["file_details"] workspace_info["recent_files"] = scan_result["recent_files"] logger.info(f"Grounding Agent: Found {len(scan_result['files'])} existing files in workspace " f"({len(scan_result['recent_files'])} recent)") # Check if instruction mentions specific filenames instruction = context.get("instruction", "") if instruction: # Look for potential file references in instruction potential_outputs = [] # Match common file patterns: filename.ext, "filename", 'filename' file_patterns = re.findall(r'["\']?([a-zA-Z0-9_\-]+\.[a-zA-Z0-9]+)["\']?', instruction) for pattern in file_patterns: if pattern in scan_result["files"]: potential_outputs.append(pattern) if potential_outputs: workspace_info["matching_files"] = potential_outputs logger.info(f"Grounding Agent: Found {len(potential_outputs)} files matching task: {potential_outputs}") except Exception as e: logger.debug(f"Could not check workspace artifacts: {e}") return workspace_info def _build_iteration_feedback( self, iteration: int, llm_summary: Optional[str] = None, add_guidance: bool = True ) -> Optional[Dict[str, str]]: """ Build feedback message to add to next iteration. """ if not llm_summary: return None feedback_content = GroundingAgentPrompts.iteration_feedback( iteration=iteration, llm_summary=llm_summary, add_guidance=add_guidance ) return { "role": "system", "content": feedback_content } def _remove_previous_guidance(self, messages: List[Dict[str, Any]]) -> None: """ Remove guidance section from previous iteration feedback messages. 
""" for msg in messages: if msg.get("role") == "system": content = msg.get("content", "") # Check if this is an iteration feedback message with guidance if "## Iteration" in content and "Summary" in content and "---" in content: # Remove everything from "---" onwards (the guidance part) summary_only = content.split("---")[0].strip() msg["content"] = summary_only async def _generate_final_summary( self, instruction: str, messages: List[Dict], iterations: int ) -> tuple[str, bool, List[Dict]]: """ Generate final summary across all iterations for reporting to upper layer. Returns: tuple[str, bool, List[Dict]]: (summary_text, success_flag, context_used) - summary_text: The generated summary or error message - success_flag: True if summary was generated successfully, False otherwise - context_used: The cleaned messages used for generating summary """ final_summary_prompt = { "role": "user", "content": GroundingAgentPrompts.final_summary( instruction=instruction, iterations=iterations ) } clean_messages = [] for msg in messages: # Skip tool result messages if msg.get("role") == "tool": continue # Copy message and remove tool_calls if present clean_msg = msg.copy() if "tool_calls" in clean_msg: del clean_msg["tool_calls"] clean_messages.append(clean_msg) clean_messages.append(final_summary_prompt) # Save context for return context_for_return = copy.deepcopy(clean_messages) try: # Call LLMClient to generate final summary (without tools) summary_response = await self._llm_client.complete( messages=clean_messages, tools=None, execute_tools=False ) final_summary = summary_response.get("message", {}).get("content", "") if final_summary: logger.info(f"Generated final summary: {final_summary[:200]}...") return final_summary, True, context_for_return else: logger.warning("LLM returned empty final summary") return f"Task completed after {iterations} iteration(s). 
Check execution history for details.", True, context_for_return except Exception as e: logger.error(f"Error generating final summary: {e}") return f"Task completed after {iterations} iteration(s), but failed to generate summary: {str(e)}", False, context_for_return async def _build_final_result( self, instruction: str, messages: List[Dict], all_tool_results: List[Dict], iterations: int, max_iterations: int, iteration_contexts: List[Dict] = None, retrieved_tools_list: List[Dict] = None, search_debug_info: Dict[str, Any] = None, ) -> Dict[str, Any]: """ Build final execution result. Args: instruction: Original instruction messages: Complete conversation history (including all iteration summaries) all_tool_results: All tool execution results iterations: Number of iterations performed max_iterations: Maximum allowed iterations iteration_contexts: Context snapshots for each iteration retrieved_tools_list: List of tools retrieved for this task search_debug_info: Debug info from tool search (similarity scores, LLM selections) """ is_complete = self._check_task_completion(messages) tool_executions = self._format_tool_executions(all_tool_results) result = { "instruction": instruction, "step": self.step, "iterations": iterations, "tool_executions": tool_executions, "messages": messages, "iteration_contexts": iteration_contexts or [], "retrieved_tools_list": retrieved_tools_list or [], "search_debug_info": search_debug_info, "keep_session": True } if is_complete: logger.info("Task completed with marker") # Use LLM's own completion response directly (no extra LLM call needed) # LLM already generates a summary before outputting last_response = self._extract_last_assistant_message(messages) # Remove the token from response for cleaner output result["response"] = last_response.replace(GroundingAgentPrompts.TASK_COMPLETE, "").strip() result["status"] = "success" # [DISABLED] Extra LLM call to generate final summary # final_summary, summary_success, final_summary_context = await 
self._generate_final_summary( # instruction=instruction, # messages=messages, # iterations=iterations # ) # result["response"] = final_summary # result["final_summary_context"] = final_summary_context else: result["response"] = self._extract_last_assistant_message(messages) result["status"] = "incomplete" result["warning"] = ( f"Task reached max iterations ({max_iterations}) without completion. " f"This may indicate the task needs more steps or clarification." ) return result def _format_tool_executions(self, all_tool_results: List[Dict]) -> List[Dict]: executions = [] for tr in all_tool_results: tool_result_obj = tr.get("result") tool_call = tr.get("tool_call") status = "unknown" if hasattr(tool_result_obj, 'status'): status_obj = tool_result_obj.status status = getattr(status_obj, 'value', status_obj) # Extract tool_name and arguments from tool_call object (litellm format) tool_name = "unknown" arguments = {} if tool_call is not None: if hasattr(tool_call, 'function'): # tool_call is an object with .function attribute tool_name = getattr(tool_call.function, 'name', 'unknown') args_raw = getattr(tool_call.function, 'arguments', '{}') if isinstance(args_raw, str): try: arguments = json.loads(args_raw) if args_raw.strip() else {} except json.JSONDecodeError: arguments = {} else: arguments = args_raw if isinstance(args_raw, dict) else {} elif isinstance(tool_call, dict): # Fallback: tool_call is a dict func = tool_call.get("function", {}) tool_name = func.get("name", "unknown") args_raw = func.get("arguments", "{}") if isinstance(args_raw, str): try: arguments = json.loads(args_raw) if args_raw.strip() else {} except json.JSONDecodeError: arguments = {} else: arguments = args_raw if isinstance(args_raw, dict) else {} executions.append({ "tool_name": tool_name, "arguments": arguments, "backend": tr.get("backend"), "server_name": tr.get("server_name"), "status": status, "content": tool_result_obj.content if hasattr(tool_result_obj, 'content') else None, "error": 
tool_result_obj.error if hasattr(tool_result_obj, 'error') else None, "execution_time": tool_result_obj.execution_time if hasattr(tool_result_obj, 'execution_time') else None, "metadata": tool_result_obj.metadata if hasattr(tool_result_obj, 'metadata') else {}, }) return executions def _check_task_completion(self, messages: List[Dict]) -> bool: for msg in reversed(messages): if msg.get("role") == "assistant": content = msg.get("content", "") return GroundingAgentPrompts.TASK_COMPLETE in content return False def _extract_last_assistant_message(self, messages: List[Dict]) -> str: for msg in reversed(messages): if msg.get("role") == "assistant": return msg.get("content", "") return "" async def _record_agent_execution( self, result: Dict[str, Any], instruction: str ) -> None: """ Record agent execution to recording manager. Args: result: Execution result instruction: Original instruction """ if not self._recording_manager: return # Extract tool execution summary tool_summary = [] if result.get("tool_executions"): for exec_info in result["tool_executions"]: tool_summary.append({ "tool": exec_info.get("tool_name", "unknown"), "backend": exec_info.get("backend", "unknown"), "status": exec_info.get("status", "unknown"), }) await self._recording_manager.record_agent_action( agent_name=self.name, action_type="execute", input_data={"instruction": instruction}, reasoning={ "response": result.get("response", ""), "tools_selected": tool_summary, }, output_data={ "status": result.get("status", "unknown"), "iterations": result.get("iterations", 0), "num_tool_executions": len(result.get("tool_executions", [])), }, metadata={ "step": self.step, "instruction": instruction, } ) ================================================ FILE: anytool/config/__init__.py ================================================ from .grounding import * from .loader import * from .constants import * from .utils import * from . 
import constants __all__ = [ # Grounding Config "BackendConfig", "ShellConfig", "WebConfig", "MCPConfig", "GUIConfig", "ToolSearchConfig", "SessionConfig", "SecurityPolicy", "GroundingConfig", # Loader "CONFIG_DIR", "load_config", "get_config", "reset_config", "save_config", "load_agents_config", "get_agent_config", # Utils "get_config_value", "load_json_file", "save_json_file", ] + constants.__all__ ================================================ FILE: anytool/config/config_agents.json ================================================ { "agents": [ { "name": "GroundingAgent", "class_name": "GroundingAgent", "backend_scope": ["gui", "shell", "mcp", "system", "web"], "max_iterations": 15, "visual_analysis_timeout": 60.0 } ] } ================================================ FILE: anytool/config/config_dev.json.example ================================================ { "comment": "[Optional] Loading grounding.json → security.json → dev.json (dev.json overrides the former ones)", "debug": true, "log_level": "DEBUG", "security_policies": { "global": { "blocked_commands": [] } } } ================================================ FILE: anytool/config/config_grounding.json ================================================ { "shell": { "mode": "local", "timeout": 60, "max_retries": 3, "retry_interval": 3.0, "default_shell": "/bin/bash", "working_dir": null, "env": {}, "conda_env": null, "default_port": 5000 }, "mcp": { "timeout": 30, "max_retries": 3, "retry_interval": 2.0, "sandbox": false, "auto_initialize": true, "eager_sessions": false, "sse_read_timeout": 300.0, "check_dependencies": true, "auto_install": true }, "gui": { "mode": "local", "timeout": 90, "max_retries": 3, "retry_interval": 5.0, "driver_type": "pyautogui", "failsafe": false, "screenshot_on_error": true, "pkgs_prefix": "import pyautogui; import time; pyautogui.FAILSAFE = {failsafe}; {command}" }, "tool_search": { "embedding_model": "BAAI/bge-small-en-v1.5", "max_tools": 40, "search_mode": "hybrid", 
"enable_llm_filter": true, "llm_filter_threshold": 50, "enable_cache_persistence": true, "cache_dir": null }, "tool_quality": { "enabled": true, "enable_persistence": true, "cache_dir": null, "auto_evaluate_descriptions": true, "enable_quality_ranking": true, "evolve_interval": 5 }, "tool_cache_ttl": 600, "tool_cache_maxsize": 500, "debug": false, "log_level": "INFO", "enabled_backends": [ { "name": "shell", "provider_cls": "anytool.grounding.backends.shell.ShellProvider" }, { "name": "web", "provider_cls": "anytool.grounding.backends.web.WebProvider" }, { "name": "mcp", "provider_cls": "anytool.grounding.backends.mcp.MCPProvider" }, { "name": "gui", "provider_cls": "anytool.grounding.backends.gui.GUIProvider" } ], "_comment_system_backend": "Note: 'system' backend is automatically registered and always available. It provides meta-level tools for querying system state. Do not add it to enabled_backends as it requires special initialization." } ================================================ FILE: anytool/config/config_mcp.json.example ================================================ ================================================ FILE: anytool/config/config_security.json ================================================ { "security_policies": { "global": { "allow_shell_commands": true, "allow_network_access": true, "allow_file_access": true, "blocked_commands": { "common": ["rm", "-rf", "shutdown", "reboot", "poweroff", "halt"], "linux": ["mkfs", "dd", "iptables", "systemctl", "init", "kill", "-9", "pkill"], "darwin": ["diskutil", "dd", "pfctl", "launchctl", "killall"], "windows": ["del", "format", "rd", "rmdir", "/s", "/q", "taskkill", "/f"] }, "sandbox_enabled": false }, "backend": { "shell": { "allow_shell_commands": true, "allow_file_access": true, "blocked_commands": { "common": ["rm", "-rf", "shutdown", "reboot", "poweroff", "halt"], "linux": [ "mkfs", "mkfs.ext4", "mkfs.xfs", "dd", "iptables", "ip6tables", "nftables", "systemctl", "service", "fdisk", 
class ConfigMixin:
    """Mixin providing uniform read access to configuration values."""

    def get_value(self, key: str, default=None):
        """
        Look up a configuration value on a dict-like or attribute-based config.

        Args:
            key: Configuration key
            default: Value returned when the key is not found

        Returns:
            ``self.get(key, default)`` when the instance is a dict, otherwise
            ``getattr(self, key, default)``.
        """
        if not isinstance(self, dict):
            # Attribute-based configs (e.g. Pydantic models)
            return getattr(self, key, default)
        return self.get(key, default)
class MCPConfig(BackendConfig):
    """
    MCP backend configuration

    Attributes:
        sandbox: Whether to enable sandbox
        auto_initialize: Whether to auto initialize
        eager_sessions: Whether sessions for all servers are created eagerly
            on initialization
        retry_interval: Wait time between retries (seconds)
        servers: MCP servers configuration, loaded from config_mcp.json
        sse_read_timeout: SSE read timeout for HTTP/Sandbox connectors (seconds)
    """
    # NOTE(review): the "mcp" section of config_grounding.json also sets
    # "check_dependencies" and "auto_install", which are not declared here -
    # confirm where those two keys are consumed.
    sandbox: bool = Field(
        default=False,
        description="Whether to enable sandbox"
    )
    auto_initialize: bool = Field(
        default=True,
        description="Whether to auto initialize"
    )
    eager_sessions: bool = Field(
        default=False,
        description="Whether to eagerly create sessions for all servers on initialization"
    )
    retry_interval: float = Field(
        default=2.0,
        ge=0.1,
        le=60.0,
        description="Wait time between retries in seconds"
    )
    servers: Dict[str, Dict[str, Any]] = Field(
        default_factory=dict,
        description="MCP servers configuration, loaded from config_mcp.json"
    )
    sse_read_timeout: float = Field(
        default=300.0,
        ge=1.0,
        le=3600.0,
        description="SSE read timeout in seconds for HTTP/Sandbox connectors"
    )
class ToolSearchConfig(BaseModel):
    """
    Settings that control how tools are searched and ranked.

    Attributes:
        embedding_model: Embedding model name for semantic search
        max_tools: Maximum number of tools returned from a search (1 - 1000)
        search_mode: Default search mode: semantic, keyword, or hybrid
        enable_llm_filter: Whether an LLM filters backends/servers
        llm_filter_threshold: Tool count above which the LLM filter applies
        enable_cache_persistence: Whether embeddings are persisted to disk
        cache_dir: Directory for the embedding cache (None = default location)
    """

    embedding_model: str = Field(default="BAAI/bge-small-en-v1.5", description="Embedding model name for semantic search")
    max_tools: int = Field(default=20, ge=1, le=1000, description="Maximum number of tools to return from search")
    search_mode: str = Field(default="hybrid", description="Default search mode: semantic, keyword, or hybrid")
    enable_llm_filter: bool = Field(default=True, description="Whether to use LLM for backend/server filtering")
    llm_filter_threshold: int = Field(default=50, ge=1, le=1000, description="Only apply LLM filter when tool count exceeds this threshold")
    enable_cache_persistence: bool = Field(default=False, description="Whether to persist embeddings to disk")
    cache_dir: Optional[str] = Field(default=None, description="Directory for embedding cache. None means use default /.anytool/embedding_cache")

    @field_validator('search_mode')
    @classmethod
    def validate_search_mode(cls, v):
        """Lower-case the search mode and reject anything that is not a known mode."""
        valid_modes = ['semantic', 'keyword', 'hybrid']
        normalized = v.lower()
        if normalized in valid_modes:
            return normalized
        raise ValueError(f"Search mode must be one of {valid_modes}, got: {v}")
class GroundingConfig(BaseModel):
    """
    Main configuration for Grounding module.

    Contains configuration for all grounding backends and grounding-level settings.

    Note: Local server connection uses defaults or environment variables (LOCAL_SERVER_URL).
    """

    # Backend configurations
    shell: ShellConfig = Field(default_factory=ShellConfig)
    web: WebConfig = Field(default_factory=WebConfig)
    mcp: MCPConfig = Field(default_factory=MCPConfig)
    gui: GUIConfig = Field(default_factory=GUIConfig)
    system: BackendConfig = Field(default_factory=BackendConfig)

    # Grounding-level settings
    tool_search: ToolSearchConfig = Field(default_factory=ToolSearchConfig)
    tool_quality: ToolQualityConfig = Field(default_factory=ToolQualityConfig)

    enabled_backends: List[Dict[str, str]] = Field(
        default_factory=list,
        description="List of enabled backends, each item: {'name': str, 'provider_cls': str}"
    )

    session_defaults: SessionConfig = Field(
        default_factory=lambda: SessionConfig(
            session_name="",
            backend_type=BackendType.SHELL,
            timeout=30,
            auto_reconnect=True,
            health_check_interval=30
        )
    )

    tool_cache_ttl: int = Field(
        300,
        ge=1,
        le=3600,
        description="Tool cache time-to-live in seconds"
    )
    tool_cache_maxsize: int = Field(
        300,
        ge=1,
        le=10000,
        description="Maximum number of tool cache entries"
    )

    debug: bool = Field(False, description="Debug mode")
    log_level: str = Field("INFO", description="Log level")
    security_policies: Dict[str, Any] = Field(default_factory=dict)

    @field_validator('log_level')
    @classmethod
    def validate_log_level(cls, v):
        """Upper-case the log level and reject values that are not in LOG_LEVELS."""
        if v.upper() not in LOG_LEVELS:
            raise ValueError(f"Log level must be one of {LOG_LEVELS}, got: {v}")
        return v.upper()

    def get_backend_config(self, backend_type: str) -> BackendConfig:
        """
        Get configuration for specified backend.

        Args:
            backend_type: Backend name; matched case-insensitively against
                the attribute names of this model (e.g. "shell", "mcp").

        Returns:
            The configuration model stored under that name, or a default
            BackendConfig (with a warning logged) when the name does not
            refer to a configuration model.
        """
        name = backend_type.lower()
        candidate = getattr(self, name, None)
        # A bare hasattr() check would also accept scalar fields ("debug",
        # "log_level") and methods ("model_dump"); only nested configuration
        # models are valid results here.
        if not isinstance(candidate, BaseModel):
            from anytool.utils.logging import Logger
            logger = Logger.get_logger(__name__)
            logger.warning(f"Unknown backend type: {backend_type}")
            return BackendConfig()
        return candidate

    def get_security_policy(self, backend_type: str) -> SecurityPolicy:
        """
        Build the effective security policy for a backend.

        The "global" policy is taken as the base and the entry for
        `backend_type` under "backend" overrides it key by key.

        Args:
            backend_type: Backend name (case-insensitive).

        Returns:
            SecurityPolicy built from the merged policy dictionary.
        """
        # `or {}` also covers keys that are present but null in the JSON
        # config, which would otherwise break the dict merge below.
        global_policy = self.security_policies.get("global") or {}
        backend_policies = self.security_policies.get("backend") or {}
        backend_policy = backend_policies.get(backend_type.lower()) or {}
        merged_policy = {**global_policy, **backend_policy}
        return SecurityPolicy.from_dict(merged_policy)
def _load_multiple_files(paths: Iterable[Path]) -> Dict[str, Any]:
    """
    Load several configuration files and merge them into one dictionary.

    Files are processed in the given order, so values from later files
    override values from earlier ones. Files that yield no data are ignored.
    """
    combined = {}
    # map/filter are lazy: each file is still loaded right before it is merged.
    for loaded in filter(None, map(_load_json_file, paths)):
        combined = _deep_merge_dict(combined, loaded)
    return combined
def get_agent_config(agent_name: str) -> Optional[Dict[str, Any]]:
    """
    Look up a single agent's configuration by name.

    Args:
        agent_name: Value to match against each entry's "name" key.

    Returns:
        The first matching entry of the "agents" list, or None (with a
        warning logged) when the list or the agent is missing.
    """
    agents_config = load_agents_config()

    if "agents" not in agents_config:
        logger.warning(f"No 'agents' key found in {CONFIG_AGENTS}")
        return None

    found = next(
        (entry for entry in agents_config["agents"] if entry.get("name") == agent_name),
        None,
    )
    if found is None:
        logger.warning(f"Agent '{agent_name}' not found in {CONFIG_AGENTS}")
    return found
return getattr(config, key, default) def load_json_file(filepath: str | Path) -> dict[str, Any]: filepath = Path(filepath) if isinstance(filepath, str) else filepath with open(filepath, 'r', encoding='utf-8') as f: return json.load(f) def save_json_file(data: dict[str, Any], filepath: str | Path, indent: int = 2) -> None: filepath = Path(filepath) if isinstance(filepath, str) else filepath # Ensure directory exists filepath.parent.mkdir(parents=True, exist_ok=True) with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, indent=indent, ensure_ascii=False) __all__ = ["get_config_value", "load_json_file", "save_json_file"] ================================================ FILE: anytool/grounding/backends/__init__.py ================================================ # Use lazy imports to avoid loading all backends unconditionally def _lazy_import_provider(provider_name: str): """Lazy import provider class""" if provider_name == 'mcp': from .mcp.provider import MCPProvider return MCPProvider elif provider_name == 'shell': from .shell.provider import ShellProvider return ShellProvider elif provider_name == 'web': from .web.provider import WebProvider return WebProvider elif provider_name == 'gui': from .gui.provider import GUIProvider return GUIProvider else: raise ImportError(f"Unknown provider: {provider_name}") class _ProviderRegistry: """Lazy provider registry""" def __getitem__(self, key): return _lazy_import_provider(key) def __contains__(self, key): return key in ['mcp', 'shell', 'web', 'gui'] BACKEND_PROVIDERS = _ProviderRegistry() __all__ = [ 'BACKEND_PROVIDERS', '_lazy_import_provider' ] ================================================ FILE: anytool/grounding/backends/gui/__init__.py ================================================ from .provider import GUIProvider from .session import GUISession from .transport.connector import GUIConnector from .transport.local_connector import LocalGUIConnector try: from .anthropic_client import AnthropicGUIClient 
class AnthropicGUIClient:
    """
    Anthropic LLM Client for GUI operations.
    Uses Claude Sonnet 4.5 with computer-use-2025-01-24 API.

    Features:
    - Vision-based screen understanding
    - Automatic screenshot resizing (configurable display size)
    - Coordinate scaling between the LLM display space and PyAutoGUI space
    """

    # Click-style actions mapped to the pyautogui function that performs them.
    _CLICK_FUNCTIONS = {
        "click": "click",
        "left_click": "click",
        "right_click": "rightClick",
        "double_click": "doubleClick",
        "middle_click": "middleClick",
        "triple_click": "tripleClick",
    }

    # Key names as sent by the model mapped to pyautogui key names.
    _KEY_CONVERSION = {
        "page_down": "pagedown",
        "page_up": "pageup",
        "super_l": "win",
        "super": "command",
        "escape": "esc",
    }

    def __init__(
        self,
        model: str = "claude-sonnet-4-5",
        platform: str = "Ubuntu",
        api_key: Optional[str] = None,
        provider: str = "anthropic",
        max_tokens: int = 4096,
        screen_size: Tuple[int, int] = (1920, 1080),
        display_size: Tuple[int, int] = (1024, 768),  # Computer use display size
        pyautogui_size: Optional[Tuple[int, int]] = None,  # PyAutoGUI working size
        only_n_most_recent_images: int = 3,
        enable_prompt_caching: bool = True,
        backup_api_key: Optional[str] = None,
    ):
        """
        Initialize Anthropic GUI Client for Claude Sonnet 4.5.

        Args:
            model: Model name (only "claude-sonnet-4-5" supported; any other
                value is replaced with a warning)
            platform: Platform type (Ubuntu, Windows, or macOS)
            api_key: Anthropic API key (defaults to ANTHROPIC_API_KEY env var)
            provider: API provider (only "anthropic" supported; any other
                value is replaced with a warning)
            max_tokens: Maximum tokens for response
            screen_size: Actual screenshot resolution (width, height) - physical pixels
            display_size: Display size for computer use tool (width, height).
                Screenshots will be resized to this size before sending to API
            pyautogui_size: PyAutoGUI working size (logical pixels). If None,
                assumed same as screen_size.
            only_n_most_recent_images: Number of recent screenshots to keep in history
            enable_prompt_caching: Whether to enable prompt caching
            backup_api_key: Backup API key (defaults to ANTHROPIC_API_KEY_BACKUP env var)

        Raises:
            RuntimeError: If the Anthropic SDK is not installed.
            ValueError: If no API key is available or display_size has a
                non-positive dimension.
        """
        if not ANTHROPIC_AVAILABLE:
            raise RuntimeError("Anthropic SDK not installed. Install with: pip install anthropic")

        # Only support claude-sonnet-4-5
        if model != "claude-sonnet-4-5":
            logger.warning(f"Model '{model}' not supported. Using 'claude-sonnet-4-5'")
            model = "claude-sonnet-4-5"

        self.model = model
        self.platform = platform
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError("Anthropic API key not provided. Set ANTHROPIC_API_KEY env var or pass api_key parameter")

        # Backup API key for failover
        self.backup_api_key = backup_api_key or os.environ.get("ANTHROPIC_API_KEY_BACKUP")

        # Only support anthropic provider
        if provider != "anthropic":
            logger.warning(f"Provider '{provider}' not supported. Using 'anthropic'")
            provider = "anthropic"

        # The display size is the divisor of the scale factors below.
        if display_size[0] <= 0 or display_size[1] <= 0:
            raise ValueError(f"display_size must have positive dimensions, got: {display_size}")

        self.provider = APIProvider(provider)
        self.max_tokens = max_tokens
        self.screen_size = screen_size
        self.display_size = display_size
        self.pyautogui_size = pyautogui_size or screen_size  # Default to screen_size if not specified
        self.only_n_most_recent_images = only_n_most_recent_images
        self.enable_prompt_caching = enable_prompt_caching

        # Message history
        self.messages: List[BetaMessageParam] = []

        # Scale factors that map LLM coordinates (display_size) directly to
        # PyAutoGUI logical pixels (pyautogui_size).
        self.resize_factor = (
            self.pyautogui_size[0] / display_size[0],  # x scale factor
            self.pyautogui_size[1] / display_size[1],  # y scale factor
        )

        logger.info(
            f"Initialized AnthropicGUIClient:\n"
            f"  Model: {model}\n"
            f"  Platform: {platform}\n"
            f"  Screen Size (physical): {screen_size}\n"
            f"  PyAutoGUI Size (logical): {self.pyautogui_size}\n"
            f"  Display Size (LLM): {display_size}\n"
            f"  Resize Factor (LLM->PyAutoGUI): {self.resize_factor}\n"
            f"  Prompt Caching: {enable_prompt_caching}"
        )

    def _create_client(self, api_key: Optional[str] = None):
        """Create an Anthropic client using `api_key`, or the primary key when omitted."""
        key = api_key or self.api_key
        return Anthropic(api_key=key, max_retries=4)

    def _resize_screenshot(self, screenshot_bytes: bytes) -> bytes:
        """
        Resize a screenshot to `display_size` and re-encode it as PNG.

        The screenshot sent to the API has to match the
        display_width_px x display_height_px declared in the tool definition.
        """
        screenshot_image = Image.open(io.BytesIO(screenshot_bytes))
        resized_image = screenshot_image.resize(self.display_size, Image.Resampling.LANCZOS)

        output_buffer = io.BytesIO()
        resized_image.save(output_buffer, format='PNG')
        return output_buffer.getvalue()

    def _scale_coordinates(self, x: int, y: int) -> Tuple[int, int]:
        """
        Scale coordinates from display size to PyAutoGUI space.

        The API returns coordinates in display_size (e.g., 1024x768); they are
        multiplied by `resize_factor` (pyautogui_size / display_size) and
        truncated to integers.

        Args:
            x, y: Coordinates in display size space

        Returns:
            Scaled (x, y) coordinates
        """
        scaled_x = int(x * self.resize_factor[0])
        scaled_y = int(y * self.resize_factor[1])
        return scaled_x, scaled_y

    def _scale_point(self, raw: Any) -> Optional[Tuple[int, int]]:
        """
        Scale an optional [x, y] pair taken from a tool input.

        Returns None when no point was supplied and raises ValueError when a
        point was supplied but cannot be interpreted.
        """
        if not raw:
            return None
        try:
            return self._scale_coordinates(raw[0], raw[1])
        except (TypeError, ValueError, IndexError, KeyError, OverflowError) as exc:
            raise ValueError(f"Invalid coordinate: {raw!r}") from exc

    @staticmethod
    def _finite_number(value: Any, default: Any) -> Any:
        """Return `value` if it is a finite int/float (not bool), otherwise `default`."""
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return default
        if value != value or value in (float("inf"), float("-inf")):
            return default
        return value

    def _split_key_combo(self, text: str) -> List[str]:
        """
        Split a '+'-separated key combination into pyautogui key names.

        Empty tokens are dropped; a trailing '+' (as in "ctrl++") is read as
        the plus key itself.
        """
        tokens = [token.strip().lower() for token in text.split('+')]
        keys = [self._KEY_CONVERSION.get(token, token) for token in tokens if token]
        if text.strip().endswith('+'):
            keys.append('+')
        return keys

    def _pending_tool_use_ids(self) -> List[str]:
        """Return the ids of all tool_use blocks in the last message if it is an assistant turn."""
        if not self.messages or self.messages[-1]["role"] != "assistant":
            return []
        last_content = self.messages[-1]["content"]
        if not isinstance(last_content, list):
            return []
        return [
            block["id"]
            for block in last_content
            if isinstance(block, dict) and block.get("type") == "tool_use"
        ]

    @staticmethod
    def _build_tool_result(
        tool_use_id: str,
        result: str,
        screenshot_bytes: Optional[bytes] = None
    ) -> Dict[str, Any]:
        """Build one tool_result block: optional screenshot first, then the text result."""
        content_list: List[Dict[str, Any]] = []
        if screenshot_bytes is not None:
            content_list.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": base64.b64encode(screenshot_bytes).decode('utf-8')
                }
            })
        content_list.append({"type": "text", "text": result})
        return {
            "type": "tool_result",
            "tool_use_id": tool_use_id,
            "content": content_list
        }

    def _count_images(self) -> Tuple[int, int]:
        """Return (total images, images inside tool_result blocks) in the message history."""
        top_level = 0
        in_tool_results = 0
        for message in self.messages:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for item in content:
                if not isinstance(item, dict):
                    continue
                if item.get("type") == "image":
                    top_level += 1
                elif item.get("type") == "tool_result":
                    in_tool_results += sum(
                        1 for nested in item.get("content", [])
                        if isinstance(nested, dict) and nested.get("type") == "image"
                    )
        return top_level + in_tool_results, in_tool_results

    async def plan_action(
        self,
        task_description: str,
        screenshot: bytes,
        action_history: Optional[List[Dict[str, Any]]] = None,
    ) -> Tuple[Optional[str], List[str]]:
        """
        Plan next action based on task and current screenshot.

        Includes prompt caching, error handling, and backup API key support.

        Args:
            task_description: Task to accomplish
            screenshot: Current screenshot (PNG bytes)
            action_history: Previous actions (accepted for interface
                compatibility; not read by this method)

        Returns:
            Tuple of (reasoning, list of pyautogui commands). On API failure
            the result is (None, ["FAIL"]).
        """
        # Function-scope import: only needed for the non-blocking retry delay.
        import asyncio

        resized_screenshot = self._resize_screenshot(screenshot)

        # Initialize messages with first task + screenshot
        if not self.messages:
            # Image comes BEFORE text in the first user message.
            self.messages.append({
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": base64.b64encode(resized_screenshot).decode('utf-8'),
                        },
                    },
                    {"type": "text", "text": task_description},
                ]
            })

        # Filter images BEFORE adding the new screenshot to control message size
        if self.only_n_most_recent_images and len(self.messages) > 1:
            # Reserve 1 slot for the screenshot we're about to add
            maybe_filter_to_n_most_recent_images(
                self.messages,
                max(1, self.only_n_most_recent_images - 1),
                min_removal_threshold=1,
            )

        # Answer EVERY tool_use block of the previous assistant turn in one
        # user message. The screenshot is attached to the last result only,
        # to keep the payload small.
        pending_ids = self._pending_tool_use_ids()
        if pending_ids:
            last_index = len(pending_ids) - 1
            self.messages.append({
                "role": "user",
                "content": [
                    self._build_tool_result(
                        tool_use_id,
                        "Success",
                        resized_screenshot if index == last_index else None,
                    )
                    for index, tool_use_id in enumerate(pending_ids)
                ],
            })

        # Define tools and betas for claude-sonnet-4-5 with computer-use-2025-01-24
        tools = [{
            'name': 'computer',
            'type': 'computer_20250124',
            'display_width_px': self.display_size[0],
            'display_height_px': self.display_size[1],
            'display_number': 1
        }]
        betas = [COMPUTER_USE_BETA_FLAG]

        # Prepare system prompt with optional caching
        system = BetaTextBlockParam(
            type="text",
            text=get_system_prompt(self.platform)
        )

        if self.enable_prompt_caching:
            betas.append(PROMPT_CACHING_BETA_FLAG)
            inject_prompt_caching(self.messages)
            system["cache_control"] = {"type": "ephemeral"}  # type: ignore

        model_name = "claude-sonnet-4-5"

        # Enable thinking for complex computer use tasks
        extra_body = {"thinking": {"type": "enabled", "budget_tokens": 2048}}

        total_images, tool_result_images = self._count_images()
        logger.info(
            f"Anthropic API request:\n"
            f"  Model: {model_name}\n"
            f"  Display Size: {self.display_size}\n"
            f"  Betas: {betas}\n"
            f"  Images: {total_images} ({tool_result_images} in tool_results)\n"
            f"  Messages: {len(self.messages)}"
        )

        def send(api_client):
            # self.messages is read at call time, so in-place image filtering
            # between retries is picked up automatically.
            return api_client.beta.messages.create(
                max_tokens=self.max_tokens,
                messages=self.messages,
                model=model_name,
                system=[system],
                tools=tools,
                betas=betas,
                extra_body=extra_body
            )

        client = self._create_client()
        response = None

        try:
            # Retry loop with automatic image count reduction on 25MB error
            for attempt in range(API_RETRY_TIMES):
                try:
                    response = send(client)
                    logger.info(f"API call succeeded on attempt {attempt + 1}")
                    break
                except (APIError, APIStatusError, APIResponseValidationError) as e:
                    error_msg = str(e)
                    logger.warning(f"Anthropic API error (attempt {attempt+1}/{API_RETRY_TIMES}): {error_msg}")

                    # Handle 25MB payload limit error (including HTTP 413)
                    if ("25000000" in error_msg or
                            "Member must have length less than or equal to" in error_msg or
                            "request_too_large" in error_msg or
                            "413" in error_msg):
                        logger.warning("Detected 25MB limit error, reducing image count")
                        # `or 0` keeps this path working when image filtering
                        # was disabled with None.
                        current_count = self.only_n_most_recent_images or 0
                        new_count = max(1, current_count // 2)
                        self.only_n_most_recent_images = new_count

                        maybe_filter_to_n_most_recent_images(
                            self.messages,
                            new_count,
                            min_removal_threshold=1,
                        )
                        logger.info(f"Image count reduced from {current_count} to {new_count}")

                    if attempt < API_RETRY_TIMES - 1:
                        # Non-blocking delay: time.sleep() would stall the event loop.
                        await asyncio.sleep(API_RETRY_INTERVAL)
                    else:
                        raise

        except (APIError, APIStatusError, APIResponseValidationError) as e:
            logger.error(f"Primary API key failed: {e}")

            # Try backup API key if available
            if self.backup_api_key:
                logger.warning("Retrying with backup API key...")
                try:
                    response = send(self._create_client(self.backup_api_key))
                    logger.info("Successfully used backup API key")
                except Exception as backup_e:
                    logger.error(f"Backup API key also failed: {backup_e}")
                    return None, ["FAIL"]
            else:
                return None, ["FAIL"]

        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return None, ["FAIL"]

        if not response:
            return None, ["FAIL"]

        # Parse response using utility function
        response_params = response_to_params(response)

        # Extract reasoning and commands; the last text/thinking block wins.
        reasoning = ""
        commands = []

        for block in response_params:
            block_type = block.get("type")

            if block_type == "text":
                reasoning = block.get("text", "")
            elif block_type == "thinking":
                reasoning = block.get("thinking", "")
            elif block_type == "tool_use":
                tool_input = block.get("input", {})
                command = self._parse_computer_tool_use(tool_input)
                if command:
                    commands.append(command)
                else:
                    logger.warning(f"Failed to parse tool_use: {tool_input}")

        # Store assistant response
        self.messages.append({
            "role": "assistant",
            "content": response_params
        })

        logger.info(f"Parsed {len(commands)} commands from response")
        return reasoning, commands

    def _add_tool_result(
        self,
        tool_use_id: str,
        result: str,
        screenshot_bytes: Optional[bytes] = None
    ):
        """
        Add a single tool result to message history as a new user message.

        The screenshot (if provided) is placed BEFORE the text, consistent
        with the ordering of the initial message.
        """
        self.messages.append({
            "role": "user",
            "content": [self._build_tool_result(tool_use_id, result, screenshot_bytes)]
        })

    def _parse_computer_tool_use(self, tool_input: Dict[str, Any]) -> Optional[str]:
        """
        Parse Anthropic computer tool use to pyautogui command.

        All model-provided values are either validated as numbers or emitted
        through repr(), so they cannot break out of the generated call
        expressions.

        Args:
            tool_input: Tool input from Anthropic (action, coordinate, text, etc.)

        Returns:
            PyAutoGUI command string, a control marker (SCREENSHOT, DONE,
            FAIL), or None when the input cannot be turned into a command.
        """
        action = tool_input.get("action")
        if not action:
            return None

        # Action conversion
        action = {"left click": "click", "right click": "right_click"}.get(action, action)

        # Control markers take no arguments.
        if action == "screenshot":
            return "SCREENSHOT"
        if action == "done":
            return "DONE"
        if action == "fail":
            return "FAIL"

        # A supplied-but-malformed coordinate invalidates the whole action;
        # falling back to "act at the current pointer position" would be unsafe.
        try:
            coordinate = self._scale_point(tool_input.get("coordinate"))
            start_coordinate = self._scale_point(tool_input.get("start_coordinate"))
        except ValueError:
            return None

        text = tool_input.get("text")
        command = ""

        if action == "mouse_move":
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.moveTo({x}, {y}, duration=0.5)"

        elif action in self._CLICK_FUNCTIONS:
            function_name = self._CLICK_FUNCTIONS[action]
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.{function_name}({x}, {y})"
            else:
                command = f"pyautogui.{function_name}()"

        elif action in ("left_mouse_down", "left_mouse_up"):
            function_name = "mouseDown" if action == "left_mouse_down" else "mouseUp"
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.{function_name}({x}, {y})"
            else:
                command = f"pyautogui.{function_name}()"

        elif action == "left_click_drag":
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.dragTo({x}, {y}, duration=0.5)"
                # When a drag origin is supplied, move there first.
                if start_coordinate:
                    start_x, start_y = start_coordinate
                    command = f"pyautogui.moveTo({start_x}, {start_y}); {command}"

        elif action == "key":
            if text:
                keys = self._split_key_combo(str(text))
                # Press in order, release in reverse order.
                presses = [f"pyautogui.keyDown({key!r})" for key in keys]
                releases = [f"pyautogui.keyUp({key!r})" for key in reversed(keys)]
                command = "; ".join(presses + releases)

        elif action == "type":
            if text:
                command = f"pyautogui.typewrite({repr(text)}, interval=0.01)"

        elif action == "scroll":
            scroll_direction = tool_input.get("scroll_direction")
            scroll_amount = int(self._finite_number(tool_input.get("scroll_amount", 5), 5))
            if scroll_direction in ("up", "down"):
                function_name = "scroll"
                scroll_value = scroll_amount if scroll_direction == "up" else -scroll_amount
            elif scroll_direction in ("left", "right"):
                function_name = "hscroll"
                scroll_value = scroll_amount if scroll_direction == "right" else -scroll_amount
            else:
                function_name = None
                scroll_value = 0
            if function_name:
                if coordinate:
                    x, y = coordinate
                    command = f"pyautogui.{function_name}({scroll_value}, {x}, {y})"
                else:
                    command = f"pyautogui.{function_name}({scroll_value})"

        elif action == "wait":
            # Non-numeric durations fall back to the default of 1 second.
            duration = self._finite_number(tool_input.get("duration", 1), 1)
            command = f"pyautogui.sleep({duration})"

        return command if command else None

    def reset(self):
        """Reset message history."""
        self.messages = []
        logger.info("Reset AnthropicGUIClient message history")
"anthropic" # BEDROCK = "bedrock" # VERTEX = "vertex" # Provider to model name mapping (simplified for claude-sonnet-4-5 only) PROVIDER_TO_DEFAULT_MODEL_NAME: dict = { (APIProvider.ANTHROPIC, "claude-sonnet-4-5"): "claude-sonnet-4-5", # (APIProvider.BEDROCK, "claude-sonnet-4-5"): "us.anthropic.claude-sonnet-4-5-v1:0", # (APIProvider.VERTEX, "claude-sonnet-4-5"): "claude-sonnet-4-5-v1", } def get_system_prompt(platform: str = "Ubuntu") -> str: """ Get system prompt based on platform. Args: platform: Platform type (Ubuntu, Windows, macOS, or Darwin) Returns: System prompt string """ # Normalize platform name platform_lower = platform.lower() if platform_lower in ["windows", "win32"]: return f""" * You are utilising a Windows virtual machine using x86_64 architecture with internet access. * You can use the computer tool to interact with the desktop: take screenshots, click, type, and control applications. * To accomplish tasks, you MUST use the computer tool to see the screen and take actions. * To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. * DO NOT ask users for clarification during task execution. DO NOT stop to request more information from users. Always take action using available tools. * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is {datetime.today().strftime('%A, %B %d, %Y')}. * Home directory of this Windows system is 'C:\\Users\\user'. * When you want to open some applications on Windows, please use Double Click on it instead of clicking once. * After each action, the system will provide you with a new screenshot showing the result. 
* Continue taking actions until the task is complete. """ elif platform_lower in ["macos", "darwin", "mac"]: return f""" * You are utilising a macOS system with internet access. * You can use the computer tool to interact with the desktop: take screenshots, click, type, and control applications. * To accomplish tasks, you MUST use the computer tool to see the screen and take actions. * To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. * DO NOT ask users for clarification during task execution. DO NOT stop to request more information from users. Always take action using available tools. * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is {datetime.today().strftime('%A, %B %d, %Y')}. * Home directory of this macOS system is typically '/Users/[username]' or can be accessed via '~'. * On macOS, use Command (⌘) key combinations instead of Ctrl (e.g., Command+C for copy). * After each action, the system will provide you with a new screenshot showing the result. * Continue taking actions until the task is complete. * When the task is completed, simply describe what you've done in your response WITHOUT using the tool again. """ else: # Ubuntu/Linux return f""" * You are utilising an Ubuntu virtual machine using x86_64 architecture with internet access. * You can use the computer tool to interact with the desktop: take screenshots, click, type, and control applications. * To accomplish tasks, you MUST use the computer tool to see the screen and take actions. * To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system. 
def maybe_filter_to_n_most_recent_images(
    messages: List[BetaMessageParam],
    images_to_keep: int,
    min_removal_threshold: int,
) -> None:
    """
    Drop the oldest tool_result images from the message history, in place.

    Images are assumed to be screenshots whose value diminishes as the
    conversation progresses, so all but the final `images_to_keep` of them are
    removed. Removal happens in multiples of `min_removal_threshold` so that the
    history prefix changes less often, which reduces how much the implicit
    prompt cache is broken.

    Args:
        messages: Message history (modified in place)
        images_to_keep: Number of recent images to keep; None disables filtering
        min_removal_threshold: Minimum number of images to remove at once
            (for cache efficiency). Values of 0, None or below disable chunking
            instead of raising ZeroDivisionError.
    """
    if not ANTHROPIC_AVAILABLE or images_to_keep is None:
        return

    def _is_image(part) -> bool:
        return isinstance(part, dict) and part.get("type") == "image"

    # Only tool_result blocks with list content can carry image parts; string,
    # missing or None content is skipped instead of being iterated.
    tool_result_blocks = cast(
        "list[BetaToolResultBlockParam]",
        [
            item
            for message in messages
            if isinstance(message.get("content"), list)
            for item in message["content"]
            if isinstance(item, dict)
            and item.get("type") == "tool_result"
            and isinstance(item.get("content"), list)
        ],
    )

    total_images = sum(
        1
        for tool_result in tool_result_blocks
        for part in tool_result["content"]
        if _is_image(part)
    )

    images_to_remove = total_images - images_to_keep
    # For better cache behavior, remove in whole chunks only.
    if min_removal_threshold and min_removal_threshold > 0:
        images_to_remove -= images_to_remove % min_removal_threshold

    if images_to_remove <= 0:
        # Nothing to drop: leave every content list untouched.
        return

    # Messages are in chronological order, so walking forward removes the
    # oldest images first.
    for tool_result in tool_result_blocks:
        kept = []
        for part in tool_result["content"]:
            if images_to_remove > 0 and _is_image(part):
                images_to_remove -= 1
                continue
            kept.append(part)
        tool_result["content"] = kept
        if images_to_remove <= 0:
            break
def build_llm_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Build complete LLM configuration with auto-detection and environment variables.

    Auto-detects:
    - API key from environment variables (ANTHROPIC_API_KEY)
    - Platform from system (macOS/Windows/Ubuntu)
    - Provider defaults to 'anthropic'

    User-provided config values override auto-detected values. A missing,
    None or empty user `api_key` falls back to the environment variable.

    Args:
        user_config: User-provided configuration (optional, not modified)

    Returns:
        Complete LLM configuration dict. `screen_size` is only present when
        the user supplied it.

    Example:
        >>> # Auto-detect everything
        >>> config = build_llm_config()

        >>> # Override specific values
        >>> config = build_llm_config({
        ...     "model": "claude-3-5-sonnet-20241022",
        ...     "max_tokens": 8192
        ... })
    """
    user_config = user_config or {}

    # Map platform.system() onto the platform labels used in this config;
    # anything that is not Darwin/Windows is treated as Ubuntu.
    system = platform_module.system()
    detected_platform = {"Darwin": "macOS", "Windows": "Windows"}.get(system, "Ubuntu")

    # Resolve the key first so the warning reflects the final outcome: an
    # explicit user key must not trigger a "not found in environment" warning.
    user_api_key = user_config.get("api_key")
    api_key = user_api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        logger.warning(
            "ANTHROPIC_API_KEY not found in environment. "
            "Please set it: export ANTHROPIC_API_KEY='your-key'"
        )

    # Build configuration with precedence: user_config > auto-detected > defaults
    config = {
        "type": user_config.get("type", "anthropic"),
        "model": user_config.get("model", "claude-sonnet-4-5"),
        "platform": user_config.get("platform", detected_platform),
        "api_key": api_key,
        "provider": user_config.get("provider", "anthropic"),
        "max_tokens": user_config.get("max_tokens", 4096),
        "only_n_most_recent_images": user_config.get("only_n_most_recent_images", 3),
        "enable_prompt_caching": user_config.get("enable_prompt_caching", True),
    }

    # Optional: screen_size is passed through only when explicitly configured.
    if "screen_size" in user_config:
        config["screen_size"] = user_config["screen_size"]

    logger.info(f"Built LLM config - Platform: {config['platform']}, Model: {config['model']}")
    if api_key:
        # Never write key material to the logs, not even a prefix.
        key_source = "llm_config" if user_api_key else "environment"
        logger.info(f"API key loaded from {key_source}")

    return config
async def create_session(self, session_config: SessionConfig) -> BaseSession:
    """
    Build a GUI session together with the connector that backs it.

    Connector settings come from the "gui" backend configuration; any value
    present in `session_config.connection_params` takes precedence. The
    backend's `mode` attribute (default "local") selects the connector:
    "local" builds a LocalGUIConnector, anything else builds a GUIConnector
    pointed at the local server host/port.

    Args:
        session_config: Session configuration

    Returns:
        GUISession instance (also registered under its session name)
    """
    backend_cfg = get_config().get_backend_config("gui")
    mode = getattr(backend_cfg, "mode", "local")
    overrides = session_config.connection_params

    # Settings shared by both connector flavours.
    shared_kwargs = {
        "timeout": get_config_value(overrides, 'timeout', backend_cfg.timeout),
        "retry_times": get_config_value(overrides, 'retry_times', backend_cfg.max_retries),
        "retry_interval": get_config_value(overrides, 'retry_interval', backend_cfg.retry_interval),
    }

    # Bake the failsafe flag into the prefix template but keep the
    # "{command}" placeholder intact for later substitution.
    failsafe_flag = "True" if backend_cfg.failsafe else "False"
    default_prefix = backend_cfg.pkgs_prefix.format(failsafe=failsafe_flag, command="{command}")
    shared_kwargs["pkgs_prefix"] = get_config_value(overrides, 'pkgs_prefix', default_prefix)

    if mode != "local":
        # ---------- SERVER MODE ----------
        logger.info("GUI backend using SERVER mode (connecting to local_server)")
        server_defaults = get_local_server_config()
        connector = GUIConnector(
            vm_ip=get_config_value(overrides, 'vm_ip', server_defaults['host']),
            server_port=get_config_value(overrides, 'server_port', server_defaults['port']),
            **shared_kwargs,
        )
    else:
        # ---------- LOCAL MODE ----------
        logger.info("GUI backend using LOCAL mode (no server required)")
        connector = LocalGUIConnector(**shared_kwargs)

    session = GUISession(
        connector=connector,
        session_id=session_config.session_name,
        backend_type=BackendType.GUI,
        config=session_config,
    )

    # Register both so close_session can find and release them later.
    self.connectors[session_config.session_name] = connector
    self._sessions[session_config.session_name] = session

    logger.info(f"Created GUI session: {session_config.session_name} (mode={mode})")
    return session
class GUISession(BaseSession):
    """
    Session for GUI desktop environment.

    Manages the connection to the desktop (through a GUIConnector or a
    LocalGUIConnector) and, once initialized, exposes a single GUIAgentTool.
    An Anthropic client is attached to that tool when an `llm_config` is
    supplied in the connection params or ANTHROPIC_API_KEY is set.
    """

    def __init__(
        self,
        connector: Union[GUIConnector, LocalGUIConnector],
        session_id: str,
        backend_type: BackendType,
        config: SessionConfig,
        auto_connect: bool = True,
        auto_initialize: bool = True,
    ):
        """
        Initialize GUI session.

        Args:
            connector: GUI connector (HTTP or local in-process)
            session_id: Unique session identifier
            backend_type: Backend type (GUI)
            config: Session configuration
            auto_connect: Auto-connect on context enter
            auto_initialize: Auto-initialize on context enter
        """
        super().__init__(
            connector=connector,
            session_id=session_id,
            backend_type=backend_type,
            auto_connect=auto_connect,
            auto_initialize=auto_initialize,
        )
        self.config = config
        self.gui_connector = connector

    async def initialize(self) -> Dict[str, Any]:
        """
        Initialize session: connect and discover tools.

        Returns:
            Session information dict
        """
        logger.info(f"Initializing GUI session: {self.session_id}")

        # Ensure connected
        if not self.connector.is_connected:
            await self.connect()

        # Create LLM client if configured (None when unavailable)
        llm_client = await self._build_llm_client()

        # Get recording_manager from connection_params if available
        recording_manager = self.config.connection_params.get("recording_manager")

        # Create GUI Agent Tool
        self.tools = [
            GUIAgentTool(
                connector=self.gui_connector,
                llm_client=llm_client,
                recording_manager=recording_manager,
            )
        ]

        logger.info(f"Initialized GUI session with {len(self.tools)} tool(s)")

        # NOTE(review): the provider builds LocalGUIConnector without
        # vm_ip/server_port, so these are read defensively - confirm whether
        # the local connector exposes them.
        return {
            "session_id": self.session_id,
            "backend_type": self.backend_type.value,
            "vm_ip": getattr(self.gui_connector, "vm_ip", None),
            "server_port": getattr(self.gui_connector, "server_port", None),
            "num_tools": len(self.tools),
            "tools": [tool.name for tool in self.tools],
            "llm_client": "anthropic" if llm_client else "none",
        }

    async def _build_llm_client(self):
        """
        Create the Anthropic GUI client, or return None when it is unavailable.

        A client is only attempted when the user supplied `llm_config` in the
        connection params or ANTHROPIC_API_KEY is set. Any failure while
        building it is logged as a warning and results in None, so the
        session still initializes without an LLM.
        """
        user_llm_config = self.config.connection_params.get("llm_config")
        if not (user_llm_config or os.environ.get("ANTHROPIC_API_KEY")):
            return None

        # Merge user values with auto-detected ones
        llm_config = build_llm_config(user_llm_config)
        if llm_config.get("type") != "anthropic":
            return None

        if not llm_config.get("api_key"):
            logger.warning(
                "Anthropic API key not found. Skipping LLM client initialization. "
                "Set ANTHROPIC_API_KEY environment variable or provide api_key in llm_config."
            )
            return None

        try:
            from .anthropic_client import AnthropicGUIClient

            screen_size, pyautogui_size = await self._detect_screen_sizes(llm_config)

            llm_client = AnthropicGUIClient(
                model=llm_config["model"],
                platform=llm_config["platform"],
                api_key=llm_config["api_key"],
                provider=llm_config["provider"],
                screen_size=screen_size,
                pyautogui_size=pyautogui_size,
                max_tokens=llm_config["max_tokens"],
                only_n_most_recent_images=llm_config["only_n_most_recent_images"],
            )
            logger.info(
                f"Initialized Anthropic LLM client - "
                f"Model: {llm_config['model']}, Platform: {llm_config['platform']}"
            )
            return llm_client
        except Exception as e:
            logger.warning(f"Failed to initialize Anthropic client: {e}")
            return None

    async def _detect_screen_sizes(self, llm_config: Dict[str, Any]) -> tuple:
        """
        Return `(screen_size, pyautogui_size)` for the LLM client.

        `screen_size` is taken from an actual screenshot when possible
        (PyAutoGUI may report logical resolution), then from the connector's
        reported screen size, then from the configured `screen_size`
        (default 1920x1080). `pyautogui_size` is the connector's reported
        size, falling back to `screen_size`.
        """
        screen_size = None
        try:
            screenshot_bytes = await self.gui_connector.get_screenshot()
            if screenshot_bytes:
                from PIL import Image
                import io

                screen_size = Image.open(io.BytesIO(screenshot_bytes)).size
                logger.info(f"Auto-detected screen size from screenshot: {screen_size}")
        except Exception as e:
            # Best effort only: fall through to the other detection methods,
            # but leave a trace of why the screenshot path was skipped.
            logger.debug(f"Screenshot-based screen size detection failed: {e}")

        # Queried once and reused for both the fallback and the logical size.
        pyautogui_size = await self.gui_connector.get_screen_size()

        if screen_size is None:
            if pyautogui_size:
                logger.info(f"Auto-detected screen size from pyautogui: {pyautogui_size}")
                screen_size = pyautogui_size
            else:
                # Final fallback to configured value
                screen_size = llm_config.get("screen_size", (1920, 1080))
                logger.warning(f"Could not auto-detect screen size, using configured: {screen_size}")

        if pyautogui_size:
            logger.info(f"PyAutoGUI working size (logical): {pyautogui_size}")
        else:
            # If we can't detect PyAutoGUI size, assume it's the same as screen size
            pyautogui_size = screen_size
            logger.warning(f"Could not detect PyAutoGUI size, assuming same as screen: {pyautogui_size}")

        return screen_size, pyautogui_size

    async def connect(self) -> None:
        """Connect to GUI desktop environment"""
        if self.connector.is_connected:
            return

        self.status = SessionStatus.CONNECTING
        # NOTE(review): base_url is read defensively because the local
        # connector is built without host/port - confirm it exposes base_url.
        target = getattr(self.gui_connector, "base_url", "local (in-process)")
        logger.info(f"Connecting to desktop_env at {target}")

        await self.connector.connect()

        self.status = SessionStatus.CONNECTED
        logger.info("Connected to desktop environment")

    async def disconnect(self) -> None:
        """Disconnect from GUI desktop environment"""
        if not self.connector.is_connected:
            return

        logger.info("Disconnecting from desktop environment")
        await self.connector.disconnect()

        self.status = SessionStatus.DISCONNECTED
        logger.info("Disconnected from desktop environment")

    @property
    def is_connected(self) -> bool:
        """Check if session is connected"""
        return self.connector.is_connected
async def _arun(
    self,
    task_description: str,
    max_steps: int = 50,
) -> ToolResult:
    """
    Run a GUI automation task through the LLM planning loop.

    Resets the action history, then delegates to
    `_execute_task_with_planning`, which repeatedly takes a screenshot, asks
    the LLM for the next action and executes it until the task is reported
    done/failed or `max_steps` is reached.

    Args:
        task_description: Natural language description of the task
        max_steps: Maximum number of planning iterations (default 50)
            Recommended values based on task complexity:
            - Simple (1-2 actions): 15-20
            - Medium (3-5 actions): 25-35
            - Complex (6+ actions, web navigation, multi-app): 35-50
            When in doubt, use higher values to avoid premature termination

    Returns:
        ToolResult with task execution status. An empty description or an
        exception raised by the planning loop yields an ERROR result rather
        than propagating.
    """
    if not task_description:
        return ToolResult(
            status=ToolStatus.ERROR,
            error="task_description is required",
        )

    logger.info(f"Starting GUI task: {task_description}")
    # Each task starts with a clean history.
    self.action_history = []

    try:
        return await self._execute_task_with_planning(
            task_description=task_description,
            max_steps=max_steps,
        )
    except Exception as exc:
        logger.error(f"Task execution failed: {exc}")
        # Report how far the task got before it blew up.
        failure_context = {
            "task_description": task_description,
            "actions_executed": len(self.action_history),
            "action_history": self.action_history,
        }
        return ToolResult(
            status=ToolStatus.ERROR,
            error=str(exc),
            metadata=failure_context,
        )
Repeat until done or max_steps Args: task_description: Task to complete max_steps: Maximum planning iterations Returns: ToolResult with execution details """ # Collect all screenshots for visual analysis all_screenshots = [] # Collect intermediate steps intermediate_steps = [] for step in range(max_steps): logger.info(f"Planning step {step + 1}/{max_steps}") # Step 1: Observe current state screenshot = await self.connector.get_screenshot() if not screenshot: return ToolResult( status=ToolStatus.ERROR, error="Failed to get screenshot for planning", metadata={"step": step, "action_history": self.action_history} ) # Collect screenshot for visual analysis all_screenshots.append(screenshot) # Step 2: Plan next action using LLM planned_action = await self._plan_next_action( task_description=task_description, screenshot=screenshot, action_history=self.action_history, ) # Check if task is complete if planned_action["action_type"] == "DONE": logger.info("Task marked as complete by LLM") reasoning = planned_action.get("reasoning", "Task completed successfully") intermediate_steps.append({ "step_number": step + 1, "action": "DONE", "reasoning": reasoning, "status": "done", }) return ToolResult( status=ToolStatus.SUCCESS, content=f"Task completed: {task_description}\n\nFinal state: {reasoning}", metadata={ "steps_taken": step + 1, "action_history": self.action_history, "screenshots": all_screenshots, "intermediate_steps": intermediate_steps, "final_reasoning": reasoning, } ) # Check if task failed if planned_action["action_type"] == "FAIL": logger.warning("Task marked as failed by LLM") reason = planned_action.get("reason", "Task cannot be completed") intermediate_steps.append({ "step_number": step + 1, "action": "FAIL", "reasoning": planned_action.get("reasoning", ""), "status": "failed", }) return ToolResult( status=ToolStatus.ERROR, error=reason, metadata={ "steps_taken": step + 1, "action_history": self.action_history, "screenshots": all_screenshots, "intermediate_steps": 
async def _plan_next_action(
    self,
    task_description: str,
    screenshot: bytes,
    action_history: list,
) -> Dict[str, Any]:
    """
    Use LLM/VLM to plan the next action.

    With an AnthropicGUIClient, `plan_action` is expected to return a
    `(reasoning, commands)` pair, which is mapped as follows:
    - ["FAIL"]                 -> FAIL (reasoning preserved)
    - ["DONE"]                 -> DONE
    - ["SCREENSHOT"]           -> WAIT (observation step)
    - no commands + reasoning  -> DONE (text-only reply means completion)
    - no commands, no reasoning -> FAIL
    - otherwise                -> PYAUTOGUI_COMMAND with the FIRST command only

    With any other client, a prompt plus the base64 screenshot, ACTION_SPACE
    and KEYBOARD_KEYS are sent and the reply is parsed as a JSON action.

    Args:
        task_description: The task to accomplish
        screenshot: Current desktop screenshot (PNG/JPEG bytes)
        action_history: Previously executed actions

    Returns:
        Dict with action_type and parameters. Planning errors never raise;
        they are reported as a FAIL action with a "reason".
    """
    if self.llm_client is None:
        # Fallback: no planner available
        logger.warning("No LLM client configured, using fallback mode")
        return {
            "action_type": "FAIL",
            "reason": "LLM client not configured"
        }

    # Check if using Anthropic client
    try:
        from .anthropic_client import AnthropicGUIClient
        is_anthropic = isinstance(self.llm_client, AnthropicGUIClient)
    except ImportError:
        is_anthropic = False

    if is_anthropic:
        try:
            reasoning, commands = await self.llm_client.plan_action(
                task_description=task_description,
                screenshot=screenshot,
                action_history=action_history,
            )

            if commands == ["FAIL"]:
                # Keep the model's explanation: the planning loop records
                # planned_action["reasoning"] for failed steps.
                return {
                    "action_type": "FAIL",
                    "reason": "Anthropic planning failed",
                    "reasoning": reasoning or "",
                }

            if commands == ["DONE"]:
                return {
                    "action_type": "DONE",
                    "reasoning": reasoning
                }

            if commands == ["SCREENSHOT"]:
                # Screenshot is automatically handled by system
                # Continue to next planning step
                logger.info("LLM requested screenshot (observation step)")
                return {
                    "action_type": "WAIT",
                    "reasoning": reasoning or "Observing screen state"
                }

            # If no commands but has reasoning, task is complete
            # (Anthropic returns text-only when task is done)
            if not commands and reasoning:
                logger.info("LLM returned text-only response, interpreting as task completion")
                return {
                    "action_type": "DONE",
                    "reasoning": reasoning
                }

            # No commands and no reasoning = error
            if not commands:
                return {
                    "action_type": "FAIL",
                    "reason": "No commands generated and no completion message"
                }

            if len(commands) > 1:
                # Only one action is executed per planning step; make the
                # discarded remainder visible instead of dropping it silently.
                logger.warning(
                    f"LLM returned {len(commands)} commands; executing only the first, "
                    f"the remaining {len(commands) - 1} will be re-planned"
                )

            # Return first command (Anthropic returns pyautogui commands directly)
            return {
                "action_type": "PYAUTOGUI_COMMAND",
                "command": commands[0],
                "reasoning": reasoning
            }

        except Exception as e:
            logger.error(f"Anthropic planning failed: {e}")
            return {
                "action_type": "FAIL",
                "reason": f"Planning error: {str(e)}"
            }

    # Generic LLM client (for future integration with other LLMs)
    # Encode screenshot to base64 for LLM
    screenshot_b64 = base64.b64encode(screenshot).decode('utf-8')

    # Prepare prompt for LLM
    prompt = self._build_planning_prompt(
        task_description=task_description,
        action_history=action_history,
    )

    # Call LLM with vision input
    try:
        response = await self.llm_client.plan_action(
            prompt=prompt,
            image_base64=screenshot_b64,
            action_space=ACTION_SPACE,
            keyboard_keys=KEYBOARD_KEYS,
        )

        # Parse LLM response to action dict
        action = self._parse_llm_response(response)

        logger.info(f"LLM planned action: {action['action_type']}")
        return action

    except Exception as e:
        logger.error(f"LLM planning failed: {e}")
        return {
            "action_type": "FAIL",
            "reason": f"Planning error: {str(e)}"
        }
def _parse_llm_response(self, response: str) -> Dict[str, Any]:
    """
    Parse LLM response to extract action.

    The response must be a JSON object containing "action_type". A reply
    wrapped in a Markdown code fence (``` or ```json) is unwrapped first.

    Args:
        response: LLM response (should be JSON)

    Returns:
        Action dict with action_type and parameters, or a FAIL action when
        the text is not valid JSON.

    Raises:
        ValueError: If the JSON is valid but is not an object or has no
            "action_type" key.
    """
    import json
    import re

    payload = response
    if isinstance(payload, str):
        # Tolerate a fenced reply such as ```json { ... } ```.
        fenced = re.fullmatch(
            r"```(?:json)?\s*(.*?)\s*```",
            payload.strip(),
            flags=re.DOTALL | re.IGNORECASE,
        )
        if fenced:
            payload = fenced.group(1)

    try:
        action = json.loads(payload)
    except json.JSONDecodeError:
        logger.error(f"Failed to parse LLM response as JSON: {response[:200]}")
        return {
            "action_type": "FAIL",
            "reason": "Invalid LLM response format"
        }

    # Validate shape before the membership test so scalars such as `5` raise
    # the same ValueError as other invalid shapes instead of a TypeError.
    if not isinstance(action, dict):
        raise ValueError("LLM response must be a JSON object with an action_type")
    if "action_type" not in action:
        raise ValueError("Missing action_type in LLM response")

    return action
# Helper methods for direct action execution

async def execute_action(
    self,
    action_type: str,
    parameters: Dict[str, Any]
) -> ToolResult:
    """
    Run a single action through the connector without any LLM planning.

    Args:
        action_type: Action type from ACTION_SPACE
        parameters: Action parameters

    Returns:
        ToolResult: SUCCESS when the connector reports status "success",
        otherwise ERROR carrying the connector's "error" text. The raw
        connector result is attached as metadata in both cases.
    """
    outcome = await self.connector.execute_action(action_type, parameters)

    # Guard clause: anything other than an explicit success is an error.
    if outcome.get("status") != "success":
        return ToolResult(
            status=ToolStatus.ERROR,
            error=outcome.get("error", "Unknown error"),
            metadata=outcome
        )

    return ToolResult(
        status=ToolStatus.SUCCESS,
        content=f"Executed {action_type}",
        metadata=outcome
    )
Args: step_number: Step number in the execution sequence planned_action: Action planned by LLM execution_result: Result of executing the action screenshot: Screenshot before executing the action task_description: Overall task description """ # Try to get recording_manager dynamically if not set at initialization recording_manager = self.recording_manager if not recording_manager and hasattr(self, '_runtime_info') and self._runtime_info: # Try to get from grounding_client grounding_client = self._runtime_info.grounding_client if grounding_client and hasattr(grounding_client, 'recording_manager'): recording_manager = grounding_client.recording_manager logger.debug(f"Step {step_number}: Dynamically retrieved recording_manager from grounding_client") if not recording_manager: logger.debug(f"Step {step_number}: No recording_manager available, skipping intermediate step recording") return # Check if recording is active try: from anytool.recording.manager import RecordingManager if not RecordingManager.is_recording(): logger.debug(f"Step {step_number}: RecordingManager not started") return except Exception as e: logger.debug(f"Step {step_number}: Failed to check recording status: {e}") return # Check if recorder is initialized if not hasattr(recording_manager, '_recorder') or not recording_manager._recorder: logger.warning(f"Step {step_number}: recording_manager._recorder not initialized") return # Build command string for display action_type = planned_action.get("action_type", "unknown") command = self._format_action_command(planned_action) # Build result summary status = execution_result.get("status", "unknown") is_success = status in ("success", "done", "observation") # Build result content if status == "done": result_content = f"Task completed at step {step_number}" elif status == "failed": result_content = execution_result.get("message", "Task failed") elif status == "observation": result_content = execution_result.get("message", "Screenshot observation") else: 
result_content = execution_result.get("result", execution_result.get("message", str(execution_result))) # Build parameters for recording parameters = { "task_description": task_description, "step_number": step_number, "action_type": action_type, "planned_action": planned_action, } # Record to trajectory recorder (handles screenshot saving) try: await recording_manager._recorder.record_step( backend="gui", tool="gui_agent_step", command=command, result={ "status": "success" if is_success else "error", "output": str(result_content)[:200], # Truncate long outputs }, parameters=parameters, screenshot=screenshot, extra={ "gui_step_number": step_number, "reasoning": planned_action.get("reasoning", ""), } ) logger.info(f"✓ Recorded GUI intermediate step {step_number}: {command}") except Exception as e: logger.error(f"✗ Failed to record intermediate step {step_number}: {e}", exc_info=True) def _format_action_command(self, planned_action: Dict[str, Any]) -> str: """ Format planned action into a human-readable command string. Args: planned_action: Action dictionary from LLM planning Returns: Formatted command string """ action_type = planned_action.get("action_type", "unknown") # Handle special action types if action_type == "DONE": return "DONE (task completed)" elif action_type == "FAIL": reason = planned_action.get("reason", "unknown") return f"FAIL ({reason})" elif action_type == "WAIT": return "WAIT (screenshot observation)" # Handle PyAutoGUI commands elif action_type == "PYAUTOGUI_COMMAND": command = planned_action.get("command", "") # Truncate long commands if len(command) > 100: return command[:100] + "..." 
"""
GUI Action Space Definitions.
"""
from typing import Any, Dict, Optional

# Screen resolution constants
X_MAX = 1920
Y_MAX = 1080

# Keyboard keys constants
KEYBOARD_KEYS = [
    '\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@',
    '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '{', '|', '}', '~',
    'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace',
    'browserback', 'browserfavorites', 'browserforward', 'browserhome',
    'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear',
    'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete',
    'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute',
    'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
    'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
    'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja',
    'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail',
    'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack',
    'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6',
    'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn',
    'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn',
    'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator',
    'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab',
    'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen',
    'command', 'option', 'optionleft', 'optionright'
]

# Action Space Definition
ACTION_SPACE = [
    {
        "action_type": "MOVE_TO",
        "note": "move the cursor to the specified position",
        "parameters": {
            "x": {"type": float, "range": [0, X_MAX], "optional": False},
            "y": {"type": float, "range": [0, Y_MAX], "optional": False},
        }
    },
    {
        "action_type": "CLICK",
        "note": "click the left button if button not specified, otherwise click the specified button",
        "parameters": {
            "button": {"type": str, "range": ["left", "right", "middle"], "optional": True},
            "x": {"type": float, "range": [0, X_MAX], "optional": True},
            "y": {"type": float, "range": [0, Y_MAX], "optional": True},
            "num_clicks": {"type": int, "range": [1, 2, 3], "optional": True},
        }
    },
    {
        "action_type": "MOUSE_DOWN",
        "note": "press the mouse button",
        "parameters": {
            "button": {"type": str, "range": ["left", "right", "middle"], "optional": True}
        }
    },
    {
        "action_type": "MOUSE_UP",
        "note": "release the mouse button",
        "parameters": {
            "button": {"type": str, "range": ["left", "right", "middle"], "optional": True}
        }
    },
    {
        "action_type": "RIGHT_CLICK",
        "note": "right click at position",
        "parameters": {
            "x": {"type": float, "range": [0, X_MAX], "optional": True},
            "y": {"type": float, "range": [0, Y_MAX], "optional": True}
        }
    },
    {
        "action_type": "DOUBLE_CLICK",
        "note": "double click at position",
        "parameters": {
            "x": {"type": float, "range": [0, X_MAX], "optional": True},
            "y": {"type": float, "range": [0, Y_MAX], "optional": True}
        }
    },
    {
        "action_type": "DRAG_TO",
        "note": "drag the cursor to position",
        "parameters": {
            "x": {"type": float, "range": [0, X_MAX], "optional": False},
            "y": {"type": float, "range": [0, Y_MAX], "optional": False}
        }
    },
    {
        "action_type": "SCROLL",
        "note": "scroll the mouse wheel",
        "parameters": {
            "dx": {"type": int, "range": None, "optional": False},
            "dy": {"type": int, "range": None, "optional": False}
        }
    },
    {
        "action_type": "TYPING",
        "note": "type the specified text",
        "parameters": {
            "text": {"type": str, "range": None, "optional": False}
        }
    },
    {
        "action_type": "PRESS",
        "note": "press the specified key",
        "parameters": {
            "key": {"type": str, "range": KEYBOARD_KEYS, "optional": False}
        }
    },
    {
        "action_type": "KEY_DOWN",
        "note": "press down the specified key",
        "parameters": {
            "key": {"type": str, "range": KEYBOARD_KEYS, "optional": False}
        }
    },
    {
        "action_type": "KEY_UP",
        "note": "release the specified key",
        "parameters": {
            "key": {"type": str, "range": KEYBOARD_KEYS, "optional": False}
        }
    },
    {
        "action_type": "HOTKEY",
        "note": "press key combination",
        "parameters": {
            "keys": {"type": list, "range": [KEYBOARD_KEYS], "optional": False}
        }
    },
    {
        "action_type": "WAIT",
        "note": "wait until next action",
    },
    {
        "action_type": "FAIL",
        "note": "mark task as failed",
    },
    {
        "action_type": "DONE",
        "note": "mark task as done",
    }
]


def _py_str(value: Any) -> str:
    """Render *value* as a correctly escaped Python string literal."""
    # repr() picks a safe quote style and escapes backslashes / control
    # characters; for ordinary keys such as 'enter' the output is unchanged
    # compared with wrapping the value in single quotes by hand.
    return repr(str(value))


def build_pyautogui_command(action_type: str, parameters: Dict[str, Any]) -> Optional[str]:
    """
    Build pyautogui command from action type and parameters.

    String arguments (mouse button, keys, typed text) are emitted through
    ``repr`` so that keys such as ``'``, ``\\`` or ``\\n`` -- all of which are
    listed in KEYBOARD_KEYS -- yield syntactically valid Python.

    Args:
        action_type: Type of action (e.g., 'CLICK', 'TYPING')
        parameters: Action parameters (None is treated as empty)

    Returns:
        Python command string, or None when the action type is unknown or
        its required parameters are missing (DRAG_TO without x/y, HOTKEY
        without keys)
    """
    parameters = parameters or {}
    has_position = "x" in parameters and "y" in parameters
    x, y = parameters.get("x"), parameters.get("y")

    if action_type == "MOVE_TO":
        if has_position:
            return f"pyautogui.moveTo({x}, {y}, 0.5, pyautogui.easeInQuad)"
        return "pyautogui.moveTo()"

    if action_type == "CLICK":
        button = _py_str(parameters.get("button", "left"))
        num_clicks = parameters.get("num_clicks", 1)
        if has_position:
            return f"pyautogui.click(button={button}, x={x}, y={y}, clicks={num_clicks})"
        return f"pyautogui.click(button={button}, clicks={num_clicks})"

    if action_type == "MOUSE_DOWN":
        return f"pyautogui.mouseDown(button={_py_str(parameters.get('button', 'left'))})"

    if action_type == "MOUSE_UP":
        return f"pyautogui.mouseUp(button={_py_str(parameters.get('button', 'left'))})"

    if action_type == "RIGHT_CLICK":
        if has_position:
            return f"pyautogui.rightClick(x={x}, y={y})"
        return "pyautogui.rightClick()"

    if action_type == "DOUBLE_CLICK":
        if has_position:
            return f"pyautogui.doubleClick(x={x}, y={y})"
        return "pyautogui.doubleClick()"

    if action_type == "DRAG_TO":
        if has_position:
            return f"pyautogui.dragTo({x}, {y}, 1.0, button='left')"
        # A drag has no meaning without a target position.
        return None

    if action_type == "SCROLL":
        dx = parameters.get("dx", 0)
        dy = parameters.get("dy", 0)
        command = f"pyautogui.scroll({dy})"
        # Honour the horizontal component instead of silently dropping it;
        # the output is unchanged when dx is 0 or absent.
        if dx:
            command += f"; pyautogui.hscroll({dx})"
        return command

    if action_type == "TYPING":
        # Use repr() for proper string escaping
        return f"pyautogui.typewrite({repr(parameters.get('text', ''))})"

    if action_type == "PRESS":
        return f"pyautogui.press({_py_str(parameters.get('key', ''))})"

    if action_type == "KEY_DOWN":
        return f"pyautogui.keyDown({_py_str(parameters.get('key', ''))})"

    if action_type == "KEY_UP":
        return f"pyautogui.keyUp({_py_str(parameters.get('key', ''))})"

    if action_type == "HOTKEY":
        keys = parameters.get("keys", [])
        if keys:
            return f"pyautogui.hotkey({', '.join(_py_str(k) for k in keys)})"
        return None

    return None
Args: operation_name: Name of operation for logging operation_func: Async function to execute *args: Positional arguments for operation_func **kwargs: Keyword arguments for operation_func Returns: Operation result Raises: Exception: Last exception after all retries fail """ last_exc: Exception | None = None for attempt in range(1, self.retry_times + 1): try: result = await operation_func(*args, **kwargs) logger.debug("%s executed successfully (attempt %d/%d)", operation_name, attempt, self.retry_times) return result except asyncio.TimeoutError as exc: logger.error("%s timed out", operation_name) raise RuntimeError(f"{operation_name} timed out after {self.timeout} seconds") from exc except Exception as exc: last_exc = exc if attempt == self.retry_times: break logger.warning( "%s failed (attempt %d/%d): %s, retrying in %.1f seconds...", operation_name, attempt, self.retry_times, exc, self.retry_interval ) await asyncio.sleep(self.retry_interval) error_msg = f"{operation_name} failed after {self.retry_times} retries" logger.error(error_msg) raise last_exc or RuntimeError(error_msg) @staticmethod def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool: """Validate image response using magic bytes.""" if not isinstance(data, (bytes, bytearray)) or not data: return False # PNG magic if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n": return True # JPEG magic if len(data) >= 3 and data[:3] == b"\xff\xd8\xff": return True # Fallback to content-type if content_type and ("image/png" in content_type or "image/jpeg" in content_type): return True return False @staticmethod def _fix_pyautogui_less_than_bug(command: str) -> str: """ Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls. This fixes the known PyAutoGUI issue where typing '<' produces '>' instead. 
References: - https://github.com/asweigart/pyautogui/issues/198 - https://github.com/xlang-ai/OSWorld/issues/257 Args: command (str): The original pyautogui command Returns: str: The fixed command with '<' characters handled properly """ # Pattern to match press('<') or press('\u003c') calls press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)' # Handle press('<') calls def replace_press_less_than(match): return 'pyautogui.hotkey("shift", ",")' # First handle press('<') calls command = re.sub(press_pattern, replace_press_less_than, command) # Pattern to match typewrite calls with quoted strings typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)' # Then handle typewrite calls def process_typewrite_match(match): quote_char = match.group(1) content = match.group(2) # Preprocess: Try to decode Unicode escapes like \u003c to actual '<' # This handles cases where '<' is represented as escaped Unicode try: # Attempt to decode unicode escapes decoded_content = content.encode('utf-8').decode('unicode_escape') content = decoded_content except UnicodeDecodeError: # If decoding fails, proceed with original content to avoid breaking existing logic pass # Graceful degradation - fall back to original content if decoding fails # Check if content contains '<' if '<' not in content: return match.group(0) # Split by '<' and rebuild parts = content.split('<') result_parts = [] for i, part in enumerate(parts): if i == 0: # First part if part: result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})") else: # Add hotkey for '<' and then typewrite for the rest result_parts.append('pyautogui.hotkey("shift", ",")') if part: result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})") return '; '.join(result_parts) command = re.sub(typewrite_pattern, process_typewrite_match, command) return command async def get_screen_size(self) -> Optional[tuple[int, int]]: """ Get actual screen size from desktop environment using pyautogui. 
async def get_screenshot(self) -> Optional[bytes]:
    """
    Fetch the current screen image from the ``/screenshot`` endpoint.

    The request is routed through ``_retry_invoke``. A reply is accepted only
    when its HTTP status is 200 and ``_is_valid_image_response`` approves the
    payload.

    Returns:
        Screenshot image bytes, or None on failure (the error is logged)
    """
    async def fetch_image():
        reply = await self._request("GET", "/screenshot", timeout=10)
        if reply.status != 200:
            raise RuntimeError(f"HTTP {reply.status}")

        mime_type = reply.headers.get("Content-Type", "")
        payload = await reply.read()
        if not self._is_valid_image_response(mime_type, payload):
            raise ValueError("Invalid screenshot format")
        return payload

    try:
        return await self._retry_invoke("get_screenshot", fetch_image)
    except Exception as err:
        logger.error(f"Failed to get screenshot: {err}")
        return None
async def execute_action(self, action_type: str, parameters: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Execute a desktop action.
    This is the main method for action space execution.

    Control actions (WAIT / FAIL / DONE) are acknowledged without touching
    the desktop. Keyboard actions are validated against KEYBOARD_KEYS before
    a pyautogui command is built and executed.

    Args:
        action_type: Action type (e.g., 'CLICK', 'TYPING')
        parameters: Action parameters (None is treated as empty)

    Returns:
        Result dict with execution status. "status" is "error" when
        validation fails, when the action type is unsupported, when the
        command could not be executed at all, or when the executed command
        itself reported an error status.
    """
    parameters = parameters or {}

    # Handle control actions
    if action_type in ('WAIT', 'FAIL', 'DONE'):
        return {
            "status": "success",
            "action_type": action_type,
            "message": f"Control action {action_type} acknowledged"
        }

    # Validate keyboard keys
    if action_type in ('PRESS', 'KEY_DOWN', 'KEY_UP'):
        key = parameters.get('key')
        if key and key not in KEYBOARD_KEYS:
            return {
                "status": "error",
                "action_type": action_type,
                "error": f"Invalid key: {key}. Must be in supported keyboard keys."
            }

    if action_type == 'HOTKEY':
        keys = parameters.get('keys', [])
        invalid_keys = [k for k in keys if k not in KEYBOARD_KEYS]
        if invalid_keys:
            return {
                "status": "error",
                "action_type": action_type,
                "error": f"Invalid keys: {invalid_keys}"
            }

    # Build pyautogui command
    command = build_pyautogui_command(action_type, parameters)
    if command is None:
        return {
            "status": "error",
            "action_type": action_type,
            "error": f"Unsupported action type: {action_type}"
        }

    # Execute command
    result = await self.execute_python_command(command)
    if not result:
        return {
            "status": "error",
            "action_type": action_type,
            "parameters": parameters,
            "error": "Command execution failed"
        }

    # A response came back, but the command itself may have failed (e.g. the
    # Python snippet raised). Do not report that as a successful action.
    if isinstance(result, dict) and result.get("status") == "error":
        return {
            "status": "error",
            "action_type": action_type,
            "parameters": parameters,
            "error": result.get("error") or result.get("output") or "Command execution failed",
            "result": result
        }

    return {
        "status": "success",
        "action_type": action_type,
        "parameters": parameters,
        "result": result
    }
Args: name: Operation name (action_type or observation method) params: Operation parameters Returns: Operation result """ # Handle observation methods if name == "screenshot": return await self.get_screenshot() elif name == "accessibility_tree": max_depth = params.get("max_depth", 5) if params else 5 return await self.get_accessibility_tree(max_depth) elif name == "cursor_position": return await self.get_cursor_position() else: # Treat as action return await self.execute_action(name.upper(), params or {}) ================================================ FILE: anytool/grounding/backends/gui/transport/local_connector.py ================================================ """ Local GUI Connector — execute GUI operations directly in-process. This connector has the **same public API** as GUIConnector (HTTP version) but uses local pyautogui / ScreenshotHelper / AccessibilityHelper, removing the need for a local_server. Return format is kept identical so that GUISession / GUIAgentTool work without any changes. """ import asyncio import os import platform import re import tempfile import uuid from typing import Any, Dict, Optional from anytool.grounding.core.transport.connectors.base import BaseConnector from anytool.grounding.core.transport.task_managers.noop import NoOpConnectionManager from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) platform_name = platform.system() class LocalGUIConnector(BaseConnector[Any]): """ GUI connector that runs desktop automation **locally** using pyautogui / ScreenshotHelper / AccessibilityHelper, bypassing the Flask local_server. Public API is compatible with ``GUIConnector`` so that ``GUISession`` works without modification. 
""" def __init__( self, timeout: int = 90, retry_times: int = 3, retry_interval: float = 5.0, pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}", ): super().__init__(NoOpConnectionManager()) self.timeout = timeout self.retry_times = retry_times self.retry_interval = retry_interval self.pkgs_prefix = pkgs_prefix # Compatibility attributes expected by GUISession self.vm_ip = "localhost" self.server_port = 0 self.base_url = "local://localhost" # Lazy-initialized helpers (avoid import side effects at class load) self._screenshot_helper = None self._accessibility_helper = None def _get_screenshot_helper(self): if self._screenshot_helper is None: from anytool.local_server.utils import ScreenshotHelper self._screenshot_helper = ScreenshotHelper() return self._screenshot_helper def _get_accessibility_helper(self): if self._accessibility_helper is None: from anytool.local_server.utils import AccessibilityHelper self._accessibility_helper = AccessibilityHelper() return self._accessibility_helper # ------------------------------------------------------------------ # connect / disconnect # ------------------------------------------------------------------ async def connect(self) -> None: """No real connection for local mode.""" if self._connected: return await super().connect() logger.info("LocalGUIConnector: ready (local mode, no server required)") # ------------------------------------------------------------------ # Retry wrapper (same interface as GUIConnector._retry_invoke) # ------------------------------------------------------------------ async def _retry_invoke( self, operation_name: str, operation_func, *args, **kwargs, ): last_exc: Exception | None = None for attempt in range(1, self.retry_times + 1): try: result = await operation_func(*args, **kwargs) logger.debug( "%s executed successfully (attempt %d/%d)", operation_name, attempt, self.retry_times, ) return result except asyncio.TimeoutError as exc: logger.error("%s timed 
out", operation_name) raise RuntimeError( f"{operation_name} timed out after {self.timeout} seconds" ) from exc except Exception as exc: last_exc = exc if attempt == self.retry_times: break logger.warning( "%s failed (attempt %d/%d): %s, retrying in %.1f seconds...", operation_name, attempt, self.retry_times, exc, self.retry_interval, ) await asyncio.sleep(self.retry_interval) error_msg = f"{operation_name} failed after {self.retry_times} retries" logger.error(error_msg) raise last_exc or RuntimeError(error_msg) # ------------------------------------------------------------------ # PyAutoGUI '<' bug fix (same as GUIConnector) # ------------------------------------------------------------------ @staticmethod def _fix_pyautogui_less_than_bug(command: str) -> str: """Fix PyAutoGUI '<' character bug.""" press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)' def replace_press_less_than(match): return 'pyautogui.hotkey("shift", ",")' command = re.sub(press_pattern, replace_press_less_than, command) typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)' def process_typewrite_match(match): quote_char = match.group(1) content = match.group(2) try: decoded_content = content.encode("utf-8").decode("unicode_escape") content = decoded_content except UnicodeDecodeError: pass if "<" not in content: return match.group(0) parts = content.split("<") result_parts = [] for i, part in enumerate(parts): if i == 0: if part: result_parts.append( f"pyautogui.typewrite({quote_char}{part}{quote_char})" ) else: result_parts.append('pyautogui.hotkey("shift", ",")') if part: result_parts.append( f"pyautogui.typewrite({quote_char}{part}{quote_char})" ) return "; ".join(result_parts) command = re.sub(typewrite_pattern, process_typewrite_match, command) return command # ------------------------------------------------------------------ # Image response validation (same as GUIConnector) # ------------------------------------------------------------------ @staticmethod def 
async def get_screenshot(self) -> Optional[bytes]:
    """
    Capture screenshot locally using ScreenshotHelper.

    The helper writes the image to a uniquely named file in the system temp
    directory. The file is read back and then always removed, including when
    capture or reading fails, so failed attempts and retries do not leave
    files behind.

    Returns:
        Screenshot image bytes, or None on failure (the error is logged)
    """
    try:
        async def _get():
            helper = self._get_screenshot_helper()
            tmp_path = os.path.join(
                tempfile.gettempdir(), f"screenshot_{uuid.uuid4().hex}.png"
            )
            try:
                if not helper.capture(tmp_path, with_cursor=True):
                    raise RuntimeError("Screenshot capture failed")
                with open(tmp_path, "rb") as f:
                    return f.read()
            finally:
                # Best-effort cleanup: the file may never have been created,
                # and a failed removal must not mask the capture outcome.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

        return await self._retry_invoke("get_screenshot", _get)
    except Exception as e:
        logger.error("Failed to get screenshot: %s", e)
        return None
self._fix_pyautogui_less_than_bug(command) full_command = self.pkgs_prefix.format(command=fixed_command) async def _execute(): python_cmd = "python" if platform_name == "Windows" else "python3" proc = await asyncio.create_subprocess_exec( python_cmd, "-c", full_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout_b, stderr_b = await asyncio.wait_for( proc.communicate(), timeout=self.timeout ) stdout = stdout_b.decode("utf-8", errors="replace") if stdout_b else "" stderr = stderr_b.decode("utf-8", errors="replace") if stderr_b else "" returncode = proc.returncode or 0 return { "status": "success" if returncode == 0 else "error", "output": stdout + stderr, "error": stderr if returncode != 0 else "", "returncode": returncode, } return await self._retry_invoke("execute_python_command", _execute) except Exception as e: logger.error("Failed to execute command: %s", e) return None async def execute_action( self, action_type: str, parameters: Dict[str, Any] | None = None ) -> Dict[str, Any]: """Execute a desktop action (same logic as GUIConnector).""" parameters = parameters or {} if action_type in ["WAIT", "FAIL", "DONE"]: return { "status": "success", "action_type": action_type, "message": f"Control action {action_type} acknowledged", } # Import action builder (same module used by GUIConnector) from anytool.grounding.backends.gui.transport.actions import ( build_pyautogui_command, KEYBOARD_KEYS, ) if action_type in ["PRESS", "KEY_DOWN", "KEY_UP"]: key = parameters.get("key") if key and key not in KEYBOARD_KEYS: return { "status": "error", "action_type": action_type, "error": f"Invalid key: {key}. 
Must be in supported keyboard keys.", } if action_type == "HOTKEY": keys = parameters.get("keys", []) invalid_keys = [k for k in keys if k not in KEYBOARD_KEYS] if invalid_keys: return { "status": "error", "action_type": action_type, "error": f"Invalid keys: {invalid_keys}", } command = build_pyautogui_command(action_type, parameters) if command is None: return { "status": "error", "action_type": action_type, "error": f"Unsupported action type: {action_type}", } result = await self.execute_python_command(command) if result: return { "status": "success", "action_type": action_type, "parameters": parameters, "result": result, } else: return { "status": "error", "action_type": action_type, "parameters": parameters, "error": "Command execution failed", } async def get_accessibility_tree( self, max_depth: int = 5 ) -> Optional[Dict[str, Any]]: """Get accessibility tree locally.""" try: async def _get(): helper = self._get_accessibility_helper() return helper.get_tree(max_depth=max_depth) return await self._retry_invoke("get_accessibility_tree", _get) except Exception as e: logger.error("Failed to get accessibility tree: %s", e) return None async def get_cursor_position(self) -> Optional[tuple[int, int]]: """Get cursor position locally.""" try: async def _get(): helper = self._get_screenshot_helper() return helper.get_cursor_position() return await self._retry_invoke("get_cursor_position", _get) except Exception as e: logger.error("Failed to get cursor position: %s", e) return None # ------------------------------------------------------------------ # BaseConnector abstract methods # ------------------------------------------------------------------ async def invoke(self, name: str, params: dict[str, Any]) -> Any: if name == "screenshot": return await self.get_screenshot() elif name == "accessibility_tree": max_depth = params.get("max_depth", 5) if params else 5 return await self.get_accessibility_tree(max_depth) elif name == "cursor_position": return await 
async def request(self, *args: Any, **kwargs: Any) -> Any:
    """Reject raw requests: this connector does not support them.

    Raises:
        NotImplementedError: Always, regardless of the arguments passed.
    """
    reason = "LocalGUIConnector does not support raw HTTP requests"
    raise NotImplementedError(reason)
class MCPClient:
    """Client for managing MCP servers and sessions.

    This class provides a unified interface for working with MCP servers,
    handling configuration, connector creation, and session management.

    Server definitions are read from the ``mcpServers`` key of the
    configuration; the ``servers`` key is used for compatibility when
    ``mcpServers`` is missing or ``None``.
    """

    def __init__(
        self,
        config: str | dict[str, Any] | None = None,
        sandbox: bool = False,
        sandbox_options: SandboxOptions | None = None,
        timeout: float = 30.0,
        sse_read_timeout: float = 300.0,
        max_retries: int = 3,
        retry_interval: float = 2.0,
        installer: Optional[MCPInstallerManager] = None,
        check_dependencies: bool = True,
        tool_call_max_retries: int = 3,
        tool_call_retry_delay: float = 1.0,
    ) -> None:
        """Initialize a new MCP client.

        Args:
            config: Either a dict containing configuration or a path to a JSON config file.
                   If None, an empty configuration is used.
            sandbox: Whether to use sandboxed execution mode for running MCP servers.
            sandbox_options: Optional sandbox configuration options.
            timeout: Timeout for operations in seconds (default: 30.0)
            sse_read_timeout: SSE read timeout in seconds (default: 300.0)
            max_retries: Maximum number of retry attempts for failed operations (default: 3)
            retry_interval: Wait time between retries in seconds (default: 2.0)
            installer: Optional installer manager for dependency installation
            check_dependencies: Whether to check and install dependencies (default: True)
            tool_call_max_retries: Maximum number of retries for tool calls (default: 3)
            tool_call_retry_delay: Initial delay between tool call retries in seconds (default: 1.0)
        """
        self.config: dict[str, Any] = {}
        self.sandbox = sandbox
        self.sandbox_options = sandbox_options
        self.timeout = timeout
        self.sse_read_timeout = sse_read_timeout
        self.max_retries = max_retries
        self.retry_interval = retry_interval
        self.installer = installer
        self.check_dependencies = check_dependencies
        self.tool_call_max_retries = tool_call_max_retries
        self.tool_call_retry_delay = tool_call_retry_delay
        self.sessions: dict[str, MCPSession] = {}
        self.active_sessions: list[str] = []

        # Load configuration if provided
        if config is not None:
            if isinstance(config, str):
                self.config = load_json_file(config)
            else:
                self.config = config

    def _get_mcp_servers(self) -> dict[str, Any]:
        """Internal helper to get mcpServers configuration.

        Tries both 'mcpServers' and 'servers' keys for compatibility.

        Returns:
            Dictionary of MCP server configurations, empty dict if none found.
        """
        servers = get_config_value(self.config, "mcpServers", None)
        if servers is None:
            servers = get_config_value(self.config, "servers", {})
        return servers or {}

    def _get_servers_key(self) -> str:
        """Return the config key that ``_get_mcp_servers`` currently reads from.

        Mirrors the lookup order of ``_get_mcp_servers`` so that writes land
        in the same section that reads come from. Defaults to ``"mcpServers"``
        when neither key holds a value.
        """
        if get_config_value(self.config, "mcpServers", None) is not None:
            return "mcpServers"
        if get_config_value(self.config, "servers", None) is not None:
            return "servers"
        return "mcpServers"

    @classmethod
    def from_dict(
        cls,
        config: dict[str, Any],
        sandbox: bool = False,
        sandbox_options: SandboxOptions | None = None,
        timeout: float = 30.0,
        sse_read_timeout: float = 300.0,
        max_retries: int = 3,
        retry_interval: float = 2.0,
        installer: Optional[MCPInstallerManager] = None,
        check_dependencies: bool = True,
        tool_call_max_retries: int = 3,
        tool_call_retry_delay: float = 1.0,
    ) -> "MCPClient":
        """Create a MCPClient from a dictionary.

        Args:
            config: The configuration dictionary.
            sandbox: Whether to use sandboxed execution mode for running MCP servers.
            sandbox_options: Optional sandbox configuration options.
            timeout: Timeout for operations in seconds (default: 30.0)
            sse_read_timeout: SSE read timeout in seconds (default: 300.0)
            max_retries: Maximum number of retry attempts (default: 3)
            retry_interval: Wait time between retries in seconds (default: 2.0)
            installer: Optional installer manager for dependency installation
            check_dependencies: Whether to check and install dependencies (default: True)
            tool_call_max_retries: Maximum number of retries for tool calls (default: 3)
            tool_call_retry_delay: Initial delay between tool call retries in seconds (default: 1.0)
        """
        return cls(
            config=config,
            sandbox=sandbox,
            sandbox_options=sandbox_options,
            timeout=timeout,
            sse_read_timeout=sse_read_timeout,
            max_retries=max_retries,
            retry_interval=retry_interval,
            installer=installer,
            check_dependencies=check_dependencies,
            tool_call_max_retries=tool_call_max_retries,
            tool_call_retry_delay=tool_call_retry_delay,
        )

    @classmethod
    def from_config_file(
        cls,
        filepath: str,
        sandbox: bool = False,
        sandbox_options: SandboxOptions | None = None,
        timeout: float = 30.0,
        sse_read_timeout: float = 300.0,
        max_retries: int = 3,
        retry_interval: float = 2.0,
        installer: Optional[MCPInstallerManager] = None,
        check_dependencies: bool = True,
        tool_call_max_retries: int = 3,
        tool_call_retry_delay: float = 1.0,
    ) -> "MCPClient":
        """Create a MCPClient from a configuration file.

        Args:
            filepath: The path to the configuration file.
            sandbox: Whether to use sandboxed execution mode for running MCP servers.
            sandbox_options: Optional sandbox configuration options.
            timeout: Timeout for operations in seconds (default: 30.0)
            sse_read_timeout: SSE read timeout in seconds (default: 300.0)
            max_retries: Maximum number of retry attempts (default: 3)
            retry_interval: Wait time between retries in seconds (default: 2.0)
            installer: Optional installer manager for dependency installation
            check_dependencies: Whether to check and install dependencies (default: True)
            tool_call_max_retries: Maximum number of retries for tool calls (default: 3)
            tool_call_retry_delay: Initial delay between tool call retries in seconds (default: 1.0)
        """
        return cls(
            config=load_json_file(filepath),
            sandbox=sandbox,
            sandbox_options=sandbox_options,
            timeout=timeout,
            sse_read_timeout=sse_read_timeout,
            max_retries=max_retries,
            retry_interval=retry_interval,
            installer=installer,
            check_dependencies=check_dependencies,
            tool_call_max_retries=tool_call_max_retries,
            tool_call_retry_delay=tool_call_retry_delay,
        )

    def add_server(
        self,
        name: str,
        server_config: dict[str, Any],
    ) -> None:
        """Add a server configuration.

        The entry is written to the section that ``_get_mcp_servers`` reads
        from. Writing to a fresh ``mcpServers`` key while the configuration
        uses ``servers`` would make the new key take precedence and hide
        every previously configured server.

        Args:
            name: The name to identify this server.
            server_config: The server configuration.
        """
        key = self._get_servers_key()
        section = self.config.get(key)
        if not isinstance(section, dict):
            # Key missing (or holding None): start a fresh section.
            section = {}
            self.config[key] = section

        section[name] = server_config
        logger.debug(f"Added MCP server configuration: {name}")

    def remove_server(self, name: str) -> None:
        """Remove a server configuration.

        Only the configuration entry and the ``active_sessions`` marker are
        removed; an existing entry in ``self.sessions`` is left untouched
        (use ``close_session`` to disconnect it).

        Args:
            name: The name of the server to remove.
        """
        if name not in self._get_mcp_servers():
            logger.warning(f"Server '{name}' not found in configuration")
            return

        # Remove from the same section the lookup above read from
        section = self.config.get(self._get_servers_key())
        if isinstance(section, dict):
            section.pop(name, None)

        # If we removed an active session, remove it from active_sessions
        if name in self.active_sessions:
            self.active_sessions.remove(name)

        logger.debug(f"Removed MCP server configuration: {name}")

    def get_server_names(self) -> list[str]:
        """Get the list of configured server names.

        Returns:
            List of server names.
        """
        return list(self._get_mcp_servers().keys())

    def save_config(self, filepath: str) -> None:
        """Save the current configuration to a file.

        Args:
            filepath: The path to save the configuration to.
        """
        save_json_file(self.config, filepath)

    async def create_session(self, server_name: str, auto_initialize: bool = True) -> MCPSession | None:
        """Create a session for the specified server with retry logic.

        An already existing session for ``server_name`` is returned as is.

        Args:
            server_name: The name of the server to create a session for.
            auto_initialize: Whether to automatically initialize the session.

        Returns:
            The created (or already existing) MCPSession, or ``None`` when the
            configuration defines no servers at all (a ``UserWarning`` is
            emitted in that case).

        Raises:
            ValueError: If servers are configured but ``server_name`` is not one of them.
            MCPDependencyError: If dependency installation fails; never retried.
            Exception: If session creation fails after all retries.
        """
        # Check if session already exists
        if server_name in self.sessions:
            logger.debug(f"Session for server '{server_name}' already exists, returning existing session")
            return self.sessions[server_name]

        # Get server config
        servers = self._get_mcp_servers()

        if not servers:
            warnings.warn("No MCP servers defined in config", UserWarning, stacklevel=2)
            return None

        if server_name not in servers:
            raise ValueError(f"Server '{server_name}' not found in config. Available: {list(servers.keys())}")

        server_config = servers[server_name]

        # Retry logic for session creation
        last_exc: Exception | None = None

        for attempt in range(1, self.max_retries + 1):
            try:
                # Create connector with options (now async)
                connector = await create_connector_from_config(
                    server_config,
                    server_name=server_name,
                    sandbox=self.sandbox,
                    sandbox_options=self.sandbox_options,
                    timeout=self.timeout,
                    sse_read_timeout=self.sse_read_timeout,
                    installer=self.installer,
                    check_dependencies=self.check_dependencies,
                    tool_call_max_retries=self.tool_call_max_retries,
                    tool_call_retry_delay=self.tool_call_retry_delay,
                )

                # Create the session with proper initialization parameters
                session = MCPSession(
                    connector=connector,
                    session_id=f"mcp-{server_name}",
                    auto_connect=True,
                    auto_initialize=False,  # We'll handle initialization explicitly below
                )

                # Initialize if requested
                if auto_initialize:
                    await session.initialize()
                    logger.debug(f"Initialized session for server '{server_name}'")

                # Store session
                self.sessions[server_name] = session

                # Add to active sessions
                if server_name not in self.active_sessions:
                    self.active_sessions.append(server_name)

                logger.info(f"Created session for MCP server '{server_name}' (attempt {attempt}/{self.max_retries})")
                return session

            except MCPDependencyError as e:
                # Don't retry dependency errors - they won't succeed on retry
                # Error already shown to user by installer, just re-raise
                logger.debug(f"Dependency error for server '{server_name}': {type(e).__name__}")
                raise
            except Exception as e:
                last_exc = e
                if attempt == self.max_retries:
                    break

                # Use info level for first attempt (common after fresh install), warning for subsequent
                log_level = logger.info if attempt == 1 else logger.warning
                log_level(
                    f"Failed to create session for server '{server_name}' (attempt {attempt}/{self.max_retries}): {e}, "
                    f"retrying in {self.retry_interval} seconds..."
                )
                await asyncio.sleep(self.retry_interval)

        # All retries failed
        error_msg = f"Failed to create session for server '{server_name}' after {self.max_retries} retries"
        logger.error(error_msg)
        raise last_exc or RuntimeError(error_msg)

    async def create_all_sessions(
        self,
        auto_initialize: bool = True,
    ) -> dict[str, MCPSession]:
        """Create sessions for all configured servers.

        Failures for individual servers are logged and skipped so that one
        broken server does not prevent the others from starting.

        Args:
            auto_initialize: Whether to automatically initialize the sessions.

        Returns:
            Dictionary mapping server names to their MCPSession instances.

        Warns:
            UserWarning: If no servers are configured.
        """
        servers = self._get_mcp_servers()

        if not servers:
            warnings.warn("No MCP servers defined in config", UserWarning, stacklevel=2)
            return {}

        # Create sessions for all servers (create_session already handles initialization)
        logger.debug(f"Creating sessions for {len(servers)} servers")
        for name in servers:
            try:
                await self.create_session(name, auto_initialize)
            except Exception as e:
                logger.error(f"Failed to create session for server '{name}': {e}")

        logger.info(f"Created {len(self.sessions)} MCP sessions")
        return self.sessions

    def get_session(self, server_name: str) -> MCPSession:
        """Get an existing session.

        Args:
            server_name: The name of the server to get the session for.

        Returns:
            The MCPSession for the specified server.

        Raises:
            ValueError: If no session exists for ``server_name``.
        """
        if server_name not in self.sessions:
            raise ValueError(f"No session exists for server '{server_name}'")

        return self.sessions[server_name]

    def get_all_active_sessions(self) -> dict[str, MCPSession]:
        """Get all active sessions.

        Returns:
            Dictionary mapping server names to their MCPSession instances.
        """
        return {name: self.sessions[name] for name in self.active_sessions if name in self.sessions}

    async def close_session(self, server_name: str) -> None:
        """Close a session.

        A missing session is only logged as a warning. Errors raised while
        disconnecting are logged, and the session is dropped from tracking
        either way.

        Args:
            server_name: The name of the server to close the session for.
        """
        # Check if the session exists
        if server_name not in self.sessions:
            logger.warning(f"No session exists for server '{server_name}', nothing to close")
            return

        # Get the session
        session = self.sessions[server_name]
        error_occurred = False

        try:
            # Disconnect from the session
            logger.debug(f"Closing session for server '{server_name}'")
            await session.disconnect()
            logger.info(f"Successfully closed session for server '{server_name}'")
        except Exception as e:
            error_occurred = True
            logger.error(f"Error closing session for server '{server_name}': {e}")
        finally:
            # Remove the session regardless of whether disconnect succeeded
            self.sessions.pop(server_name, None)

            # Remove from active_sessions
            if server_name in self.active_sessions:
                self.active_sessions.remove(server_name)

            if error_occurred:
                logger.warning(f"Session for '{server_name}' removed from tracking despite disconnect error")

    async def close_all_sessions(self) -> None:
        """Close all active sessions.

        This method ensures all sessions are closed even if some fail.
        """
        # Get a list of all session names first to avoid modification during iteration
        server_names = list(self.sessions.keys())
        errors = []

        for server_name in server_names:
            try:
                logger.debug(f"Closing session for server '{server_name}'")
                await self.close_session(server_name)
            except Exception as e:
                error_msg = f"Failed to close session for server '{server_name}': {e}"
                logger.error(error_msg)
                errors.append(error_msg)

        # Log summary if there were errors
        if errors:
            logger.error(f"Encountered {len(errors)} errors while closing sessions")
        else:
            logger.debug("All sessions closed successfully")
async def create_connector_from_config(
    server_config: dict[str, Any],
    server_name: str = "unknown",
    sandbox: bool = False,
    sandbox_options: SandboxOptions | None = None,
    timeout: float = 30.0,
    sse_read_timeout: float = 300.0,
    installer: Optional[MCPInstallerManager] = None,
    check_dependencies: bool = True,
    tool_call_max_retries: int = 3,
    tool_call_retry_delay: float = 1.0,
) -> MCPBaseConnector:
    """Build the connector that matches a server configuration section.

    Selection order: stdio (command-based) servers become a ``StdioConnector``
    or, when ``sandbox`` is set, a ``SandboxConnector``; otherwise a ``url``
    entry yields an ``HttpConnector`` and a ``ws_url`` entry a
    ``WebSocketConnector``.

    Args:
        server_config: The server configuration section
        server_name: Name of the MCP server (for display purposes)
        sandbox: Whether to use sandboxed execution mode for running MCP servers.
        sandbox_options: Optional sandbox configuration options.
        timeout: Timeout for operations in seconds (default: 30.0)
        sse_read_timeout: SSE read timeout in seconds (default: 300.0)
        installer: Optional installer manager for dependency installation
        check_dependencies: Whether to check and install dependencies (default: True)
        tool_call_max_retries: Maximum number of retries for tool calls (default: 3)
        tool_call_retry_delay: Initial delay between retries in seconds (default: 1.0)

    Returns:
        A configured connector instance

    Raises:
        ImportError: If sandbox mode is requested but E2B support is unavailable.
        ValueError: If no connector type can be derived from the config.
        Exception: Anything raised by ``installer.ensure_dependencies`` propagates.
    """
    # Command and args exactly as written in the config (used for the dependency check)
    configured_command = get_config_value(server_config, "command")
    configured_args = get_config_value(server_config, "args", [])

    stdio_server = is_stdio_server(server_config)

    # Dependencies are only checked/installed for stdio servers
    if stdio_server and check_dependencies:
        if installer is None:
            # No installer supplied: fall back to the global instance
            from .installer import get_global_installer
            installer = get_global_installer()

        await installer.ensure_dependencies(server_name, configured_command, configured_args)

    if stdio_server:
        if not sandbox:
            # Plain stdio connector (command-based)
            return StdioConnector(
                command=get_config_value(server_config, "command"),
                args=get_config_value(server_config, "args"),
                env=get_config_value(server_config, "env", None),
            )

        # Sandboxed connector
        if not E2B_AVAILABLE:
            raise ImportError(
                "E2B sandbox support not available. Please install e2b-code-interpreter: "
                "'pip install e2b-code-interpreter'"
            )

        options = sandbox_options or {}
        e2b_sandbox = E2BSandbox(options)

        # Timeouts from sandbox_options take precedence over the function arguments
        return SandboxConnector(
            sandbox=e2b_sandbox,
            command=get_config_value(server_config, "command"),
            args=get_config_value(server_config, "args"),
            env=get_config_value(server_config, "env", None),
            supergateway_command=options.get("supergateway_command", "npx -y supergateway"),
            port=options.get("port", 3000),
            timeout=options.get("timeout", timeout),
            sse_read_timeout=options.get("sse_read_timeout", sse_read_timeout),
        )

    # HTTP connector
    if "url" in server_config:
        return HttpConnector(
            base_url=get_config_value(server_config, "url"),
            headers=get_config_value(server_config, "headers", None),
            auth_token=get_config_value(server_config, "auth_token", None),
            timeout=timeout,
            sse_read_timeout=sse_read_timeout,
            tool_call_max_retries=tool_call_max_retries,
            tool_call_retry_delay=tool_call_retry_delay,
        )

    # WebSocket connector
    if "ws_url" in server_config:
        return WebSocketConnector(
            url=get_config_value(server_config, "ws_url"),
            headers=get_config_value(server_config, "headers", None),
            auth_token=get_config_value(server_config, "auth_token", None),
        )

    raise ValueError("Cannot determine connector type from config")
def __init__(self, prompt: PromptFunc | None = None, auto_install: bool = False, verbose: bool = False):
    """Set up the installer manager and its per-instance caches.

    Args:
        prompt: Custom user prompt function, if None, the default CLI prompt is used
        auto_install: If True, automatically install dependencies without asking the user
        verbose: If True, show detailed installation logs; if False, only show progress indicator
    """
    # Results of earlier package checks
    self._installed_cache: Dict[str, bool] = {}
    # Error messages of installations that already failed, so they are not retried
    self._failed_installations: Dict[str, str] = {}

    self._verbose = verbose
    self._auto_install = auto_install

    # Fall back to the built-in CLI prompt when no custom prompt is supplied
    if prompt:
        self._prompt: PromptFunc | None = prompt
    else:
        self._prompt = self._default_cli_prompt
print_separator(70, 'gr', 2) print(f" {colorize('Your choice:', bold=True)} ", end="", flush=True) answer = await asyncio.get_running_loop().run_in_executor(None, sys.stdin.readline) response = answer.strip().lower() in {"y", "yes"} if response: print(f"{Colors.GREEN}✓ Installation confirmed{Colors.RESET}\n") else: print(f"{Colors.RED}✗ Installation cancelled{Colors.RESET}\n") return response async def _ask_user(self, message: str) -> bool: """Ask the user whether to install.""" if self._auto_install: logger.info("Automatic installation mode enabled, will automatically install dependencies") return True if self._prompt: try: return await self._prompt(message) except Exception as e: logger.error(f"Error asking user: {e}") return False return False def _check_command_available(self, command: str) -> bool: """Check if the command is available. Args: command: The command to check (e.g. "npx", "uvx") Returns: bool: Whether the command is available """ return shutil.which(command) is not None async def _check_package_installed(self, command: str, args: List[str]) -> bool: """Check if the package is installed. Args: command: The command to check (e.g. 
"npx", "uvx") args: The arguments list Returns: bool: Whether the package is installed """ # Build cache key cache_key = f"{command}:{':'.join(args)}" # Check cache if cache_key in self._installed_cache: return self._installed_cache[cache_key] # For different types of commands, use different check methods try: if command == "npx": # For npx, check if the npm package exists package_name = self._extract_npm_package(args) if package_name: result = await self._check_npm_package(package_name) self._installed_cache[cache_key] = result return result elif command == "uvx": # For uvx, check if the Python package exists package_name = self._extract_python_package(args) if package_name: result = await self._check_python_package(package_name) self._installed_cache[cache_key] = result return result elif command == "uv": # For "uv run --with package ...", check if the Python package exists package_name = self._extract_uv_package(args) if package_name: result = await self._check_uv_pip_package(package_name) self._installed_cache[cache_key] = result return result except Exception as e: logger.debug(f"Error checking package installation status: {e}") # Default to assuming not installed return False def _extract_npm_package(self, args: List[str]) -> Optional[str]: """Extract package name from npx arguments. Args: args: npx arguments list, e.g. 
["-y", "mcp-excalidraw-server"] or ["bazi-mcp"] Returns: Package name (without version tag) or None """ for i, arg in enumerate(args): # Skip option parameters if arg.startswith("-"): continue # Found package name, now strip version tag package_name = arg # Handle scoped packages: @scope/package@version -> @scope/package if package_name.startswith("@"): # Scoped package like @rtuin/mcp-mermaid-validator@latest parts = package_name.split("/", 1) if len(parts) == 2: scope = parts[0] name_with_version = parts[1] # Remove version tag from name part (e.g., "pkg@latest" -> "pkg") name = name_with_version.split("@")[0] if "@" in name_with_version else name_with_version return f"{scope}/{name}" return package_name else: # Regular package like mcp-deepwiki@latest -> mcp-deepwiki return package_name.split("@")[0] if "@" in package_name else package_name return None def _extract_python_package(self, args: List[str]) -> Optional[str]: """Extract package name from uvx arguments. Args: args: uvx arguments list, e.g. ["--from", "office-powerpoint-mcp-server", "ppt_mcp_server"] or ["--with", "mcp==1.9.0", "sitemap-mcp-server"] or ["arxiv-mcp-server", "--storage-path", "./path"] Returns: Package name or None """ # Find --from parameter (this is the package to install) for i, arg in enumerate(args): if arg == "--from" and i + 1 < len(args): return args[i + 1] # Skip option flags and their values, find the main package (FIRST positional arg) # Options that take a value: --with, --python, --from, --storage-path, etc. 
options_with_value = {"--with", "--from", "--python", "-p", "--storage-path"} skip_next = False for arg in args: if skip_next: skip_next = False continue if arg in options_with_value: skip_next = True continue if arg.startswith("-"): # Other flags without values (or unknown options with values) # Also skip the next arg if it looks like an option value (doesn't start with -) continue # First non-option argument is the package name return arg return None def _extract_uv_package(self, args: List[str]) -> Optional[str]: """Extract package name from uv run arguments. Args: args: uv arguments list, e.g. ["run", "--with", "biomcp-python", "biomcp", "run"] Returns: Package name or None """ # Find --with parameter (this specifies the package to install) for i, arg in enumerate(args): if arg == "--with" and i + 1 < len(args): package_name = args[i + 1] # Remove version specifier if present (e.g., "mcp==1.9.0" -> "mcp") if "==" in package_name: return package_name.split("==")[0] if ">=" in package_name: return package_name.split(">=")[0] return package_name return None async def _check_npm_package(self, package_name: str) -> bool: """Check if the npm package is globally installed. Args: package_name: npm package name Returns: bool: Whether the npm package is installed """ try: process = await asyncio.create_subprocess_exec( "npm", "list", "-g", package_name, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() # npm list returns 0 if the package is installed return process.returncode == 0 except Exception as e: logger.debug(f"Error checking npm package {package_name}: {e}") return False async def _check_python_package(self, package_name: str) -> bool: """Check if the Python package is installed as a uvx tool. uvx tools are installed in ~/.local/share/uv/tools/ directory, not in the current pip environment. 
Args: package_name: Python package/tool name Returns: bool: Whether the uvx tool is installed """ import os from pathlib import Path # Strip version specifier if present (e.g., "mcp==1.9.0" -> "mcp") clean_name = package_name.split("==")[0].split(">=")[0].split("<=")[0].split(">")[0].split("<")[0] # Check if uvx tool exists in the standard uv tools directory uv_tools_dir = Path.home() / ".local" / "share" / "uv" / "tools" tool_dir = uv_tools_dir / clean_name if tool_dir.exists(): logger.debug(f"uvx tool '{clean_name}' found at {tool_dir}") return True # Fallback: try running uvx with --help to check if it's available try: process = await asyncio.create_subprocess_exec( "uvx", clean_name, "--help", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) # Just wait briefly, don't need the full output try: await asyncio.wait_for(process.communicate(), timeout=5.0) except asyncio.TimeoutError: process.kill() await process.wait() # If it didn't error immediately, the tool likely exists return process.returncode == 0 except Exception as e: logger.debug(f"Error checking uvx tool {clean_name}: {e}") return False async def _check_uv_pip_package(self, package_name: str) -> bool: """Check if a Python package is installed via uv pip. 
Args: package_name: Python package name Returns: bool: Whether the package is installed """ # Strip version specifier if present clean_name = package_name.split("==")[0].split(">=")[0].split("<=")[0].split(">")[0].split("<")[0] try: # Try using uv pip show to check if package is installed process = await asyncio.create_subprocess_exec( "uv", "pip", "show", clean_name, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() if process.returncode == 0: logger.debug(f"uv pip package '{clean_name}' found") return True except Exception as e: logger.debug(f"Error checking uv pip package {clean_name}: {e}") # Fallback: check with regular pip try: process = await asyncio.create_subprocess_exec( "pip", "show", clean_name, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() return process.returncode == 0 except Exception as e: logger.debug(f"Error checking pip package {clean_name}: {e}") return False async def _install_package(self, command: str, args: List[str], use_sudo: bool = False) -> bool: """Execute the install command. Args: command: The command to execute (e.g. 
"npx", "uvx") args: The arguments list use_sudo: Whether to use sudo for installation Returns: bool: Whether the installation is successful """ install_command = self._get_install_command(command, args) if not install_command: logger.error("Cannot determine install command") return False # Add sudo if requested if use_sudo: install_command = ["sudo"] + install_command logger.info(f"Executing install command: {' '.join(install_command)}") try: # For sudo commands, always show verbose output so password prompt is visible if self._verbose or use_sudo: # Verbose mode: show all installation logs from anytool.utils.display import print_separator, colorize print_separator(70, 'c', 2) if use_sudo: print(f" {colorize('Installing with administrator privileges...', color=Colors.BLUE)}") print(f" {colorize('>> You will be prompted for your password below <<', color=Colors.YELLOW)}") else: print(f" {colorize('Installing dependencies...', color=Colors.BLUE)}") print(f" {colorize('Command: ' + ' '.join(install_command), color=Colors.GRAY)}") print_separator(70, 'c', 2) print() # For sudo, don't redirect stdin so password prompt works if use_sudo: process = await asyncio.create_subprocess_exec( *install_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, stdin=None # Let sudo use terminal for password ) else: process = await asyncio.create_subprocess_exec( *install_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT ) # Real-time output of installation logs output_lines = [] while True: line = await process.stdout.readline() if not line: break line_str = line.decode().rstrip() output_lines.append(line_str) print(f"{Colors.GRAY}{line_str}{Colors.RESET}") await process.wait() full_output = '\n'.join(output_lines) else: # Quiet mode: only show progress indicator print(f"\n{Colors.BLUE}Installing dependencies...{Colors.RESET} ", end="", flush=True) process = await asyncio.create_subprocess_exec( *install_command, 
stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) # Show spinner animation while installing spinner = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] spinner_idx = 0 while True: try: await asyncio.wait_for(process.wait(), timeout=0.1) break except asyncio.TimeoutError: print(f"\r{Colors.BLUE}Installing dependencies...{Colors.RESET} {Colors.CYAN}{spinner[spinner_idx]}{Colors.RESET}", end="", flush=True) spinner_idx = (spinner_idx + 1) % len(spinner) # Clear the spinner line print(f"\r{' ' * 100}\r", end="", flush=True) # Collect output stdout, stderr = await process.communicate() full_output = (stdout or stderr).decode() if (stdout or stderr) else "" if process.returncode == 0: print(f"{Colors.GREEN}✓ Dependencies installed successfully{Colors.RESET}") if not use_sudo: print(f"{Colors.GRAY}(Note: First connection may take a moment to initialize){Colors.RESET}") # Update cache cache_key = f"{command}:{':'.join(args)}" self._installed_cache[cache_key] = True return True else: # Check if it's a permission error is_permission_error = "EACCES" in full_output or "permission denied" in full_output.lower() if is_permission_error and not use_sudo: print(f"\n{Colors.YELLOW}Permission denied{Colors.RESET}") print(f"{Colors.GRAY}The installation requires administrator privileges.{Colors.RESET}\n") # Ask user if they want to use sudo message = ( f"\n{Colors.WHITE}Administrator privileges required{Colors.RESET}\n\n" f"Command: {Colors.GRAY}{' '.join(install_command)}{Colors.RESET}\n\n" f"{Colors.YELLOW}Do you want to retry with sudo (requires password)?{Colors.RESET}" ) if await self._ask_user(message): # No extra print needed, the verbose mode will show clear instructions return await self._install_package(command, args, use_sudo=True) else: print(f"\n{Colors.RED}✗ Installation cancelled{Colors.RESET}") return False else: print(f"{Colors.RED}✗ Dependencies installation failed (return code: {process.returncode}){Colors.RESET}") # Show error output if not already 
def _get_install_command(self, command: str, args: List[str]) -> Optional[List[str]]:
    """Build the package-manager invocation that installs a server's package.

    Args:
        command: Launcher used by the MCP server ("npx", "uvx" or "uv").
        args: Arguments originally passed to that launcher.

    Returns:
        The install command as an argv list, or None when the launcher is
        not one of the supported ones or no package name could be extracted.
    """
    # launcher -> (name of the extractor method on self, install argv prefix)
    recipes = {
        "npx": ("_extract_npm_package", ["npm", "install", "-g"]),
        "uvx": ("_extract_python_package", ["pip", "install"]),
        # Covers the "uv run --with package_name ..." form.
        "uv": ("_extract_uv_package", ["uv", "pip", "install"]),
    }

    recipe = recipes.get(command)
    if recipe is None:
        return None

    extractor_name, argv_prefix = recipe
    # Look the extractor up lazily so only the relevant one is touched.
    package = getattr(self, extractor_name)(args)
    if not package:
        return None
    return [*argv_prefix, package]
async def _ensure_dependencies_impl(
    self,
    server_name: str,
    command: str,
    args: List[str]
) -> bool:
    """Check, and if necessary install, the dependencies of one MCP server.

    This is the body of ``ensure_dependencies``, which calls it while
    holding the prompt lock.

    Args:
        server_name: MCP server name (used for messages and the failure cache key).
        command: Launcher command of the server (e.g. "npx", "uvx", "uv").
        args: Arguments passed to the launcher.

    Returns:
        True when nothing needs installing or the installation succeeded.

    Raises:
        MCPDependencyError: A failure was already recorded for this
            server/command/args combination, or no install command can be derived.
        MCPCommandNotFoundError: The launcher (or ``uv`` for ``uvx``) is missing.
        MCPInstallationCancelledError: The user declined the installation.
        MCPInstallationFailedError: The installation ran and failed.
    """
    # Launchers that execute a script directly have no package to install.
    direct_runners = frozenset({"node", "python", "python3", "bash", "sh", "deno", "bun"})
    if command.lower() in direct_runners:
        logger.debug(f"Skipping dependency check for direct script execution command: {command}")
        return True

    # GitHub-based npx packages are downloaded, built and run by npx itself;
    # "npm install -g" does not work properly for packages that need building.
    if command == "npx":
        npx_target = self._extract_npm_package(args)
        if npx_target and npx_target.startswith("github:"):
            logger.debug(f"Skipping dependency check for GitHub-based npx package: {npx_target}")
            return True

    cache_key = f"{server_name}:{command}:{':'.join(args)}"
    failures = self._failed_installations

    def remember(reason: str) -> str:
        # Record the failure so later calls fail fast without re-prompting.
        failures[cache_key] = reason
        return reason

    # A previously failed installation is not retried.
    if cache_key in failures:
        remembered = failures[cache_key]
        logger.debug(f"Skipping installation for '{server_name}' - previously failed")
        raise MCPDependencyError(remembered)

    # uvx is provided by uv, so uv itself has to be present.
    if command == "uvx" and not self._check_command_available("uv"):
        # Shown once to the user; intentionally not logged verbosely.
        print(f"\n{Colors.RED}✗ Server '{server_name}' requires 'uv' to be installed{Colors.RESET}")
        print(f"{Colors.YELLOW}Please install uv first:")
        print(" • macOS/Linux: curl -LsSf https://astral.sh/uv/install.sh | sh")
        print(" • Or with pip: pip install uv")
        print(f" • Or with brew: brew install uv{Colors.RESET}\n")
        raise MCPCommandNotFoundError(
            remember(f"uvx requires 'uv' to be installed (server: {server_name})")
        )

    # The launcher itself must be available.
    if not self._check_command_available(command):
        reason = (
            f"Command '{command}' is not available.\n"
            f"Please install the necessary tools first."
        )
        logger.error(reason)
        raise MCPCommandNotFoundError(remember(reason))

    # Nothing to do when the package is already installed.
    if await self._check_package_installed(command, args):
        logger.debug(f"The dependencies of the MCP server '{server_name}' are installed")
        return True

    # Work out what to show the user: launcher -> (extractor method, label).
    display_sources = {
        "npx": ("_extract_npm_package", "npm"),
        "uvx": ("_extract_python_package", "Python"),
        "uv": ("_extract_uv_package", "Python"),
    }
    if command in display_sources:
        extractor_name, package_type = display_sources[command]
        package_name = getattr(self, extractor_name)(args)
    else:
        package_name = f"{command} {' '.join(args)}"
        package_type = "package"

    install_cmd = self._get_install_command(command, args)

    # Without an install command the user has to install manually.
    if not install_cmd:
        print(f"\n{Colors.YELLOW}Cannot automatically install dependencies for '{server_name}'{Colors.RESET}")
        print(f"{Colors.GRAY}Command: {command} {' '.join(args)}{Colors.RESET}")
        print(f"\n{Colors.WHITE}This MCP server may require manual installation or configuration.{Colors.RESET}")
        print(f"{Colors.GRAY}Please refer to the MCP server's official documentation for installation instructions.{Colors.RESET}\n")
        raise MCPDependencyError(
            remember(f"Manual installation required for '{server_name}' (command: {command})")
        )

    install_cmd_str = ' '.join(install_cmd)
    prompt = (
        f"\n{Colors.WHITE}The MCP server needs to install dependencies{Colors.RESET}\n\n"
        f"Server name: {Colors.CYAN}{server_name}{Colors.RESET}\n"
        f"Package type: {Colors.YELLOW}{package_type}{Colors.RESET}\n"
        f"Package name: {Colors.YELLOW}{package_name or 'Unknown'}{Colors.RESET}\n"
        f"Install command: {Colors.GRAY}{install_cmd_str}{Colors.RESET}\n\n"
        f"{Colors.YELLOW}Whether to install this dependency package?{Colors.RESET}"
    )

    # Ask for confirmation before installing anything.
    approved = await self._ask_user(prompt)
    if not approved:
        reason = f"User cancelled the dependency installation for '{server_name}'"
        logger.warning(reason)
        raise MCPInstallationCancelledError(remember(reason))

    installed = await self._install_package(command, args)
    if installed:
        return True

    reason = f"Dependency installation failed for '{server_name}'"
    logger.error(reason)
    raise MCPInstallationFailedError(remember(reason))
class MCPProvider(Provider[MCPSession]):
    """
    MCP Provider manages multiple MCP server sessions.

    Each MCP server defined in config corresponds to one session. Sessions are
    tracked by server name in ``_server_sessions`` and by session id
    (backend prefix, a dash, then the server name) in ``_sessions``.
    The provider handles lazy/eager session creation and tool aggregation,
    and can serve tool metadata from the on-disk tool cache without starting
    any server.
    """

    def __init__(self, config: Dict | None = None, installer: Optional[MCPInstallerManager] = None):
        """Initialize MCP Provider.

        Args:
            config: Configuration dict with MCP server definitions.
                Example: {"mcpServers": {"server1": {...}, "server2": {...}}}
            installer: Optional installer manager for dependency installation.
                When omitted and ``auto_install`` is enabled in config, an
                auto-installing manager is created.
        """
        super().__init__(BackendType.MCP, config)

        # Extract MCP-specific configuration
        sandbox = get_config_value(config, "sandbox", False)
        timeout = get_config_value(config, "timeout", 30)
        sse_read_timeout = get_config_value(config, "sse_read_timeout", 300.0)
        max_retries = get_config_value(config, "max_retries", 3)
        retry_interval = get_config_value(config, "retry_interval", 2.0)
        check_dependencies = get_config_value(config, "check_dependencies", True)
        auto_install = get_config_value(config, "auto_install", False)

        # Tool call retry settings (for transient errors like 400, 500, etc.)
        tool_call_max_retries = get_config_value(config, "tool_call_max_retries", 3)
        tool_call_retry_delay = get_config_value(config, "tool_call_retry_delay", 1.0)

        # Create sandbox options if sandbox is enabled
        sandbox_options = None
        if sandbox:
            sandbox_options = {
                "timeout": timeout,
                "sse_read_timeout": sse_read_timeout,
            }

        # Create installer with auto_install setting if not provided
        if installer is None and auto_install:
            installer = MCPInstallerManager(auto_install=True)

        # Initialize MCPClient with configuration
        self._client = MCPClient(
            config=config or {},
            sandbox=sandbox,
            sandbox_options=sandbox_options,
            timeout=timeout,
            sse_read_timeout=sse_read_timeout,
            max_retries=max_retries,
            retry_interval=retry_interval,
            installer=installer,
            check_dependencies=check_dependencies,
            tool_call_max_retries=tool_call_max_retries,
            tool_call_retry_delay=tool_call_retry_delay,
        )

        # Map server name to session for quick lookup
        self._server_sessions: Dict[str, MCPSession] = {}

    def _session_config(self, server: str) -> SessionConfig:
        """Build the SessionConfig used to create the session of ``server``.

        The session name uses the same backend prefix as the session id
        generated in ``create_session`` so both always agree.
        """
        return SessionConfig(
            session_name=f"{self.backend_type.value}-{server}",
            backend_type=BackendType.MCP,
            connection_params={"server": server},
        )

    async def initialize(self) -> None:
        """Initialize the MCP provider.

        If config["eager_sessions"] is True, creates sessions for all configured servers.
        Otherwise, sessions are created lazily on first access.
        """
        if self.is_initialized:
            return

        # config can be dict or Pydantic model, use utility function
        eager = get_config_value(self.config, "eager_sessions", False)
        if eager:
            servers = self.list_servers()
            logger.debug(f"Eagerly initializing {len(servers)} MCP server sessions")
            for srv in servers:
                if srv not in self._server_sessions:
                    await self.create_session(self._session_config(srv))

        self.is_initialized = True
        logger.info(
            f"MCPProvider initialized with {len(self.list_servers())} servers (eager={eager})"
        )

    def list_servers(self) -> List[str]:
        """Return all configured MCP server names from MCPClient config.

        Returns:
            List of server names
        """
        return self._client.get_server_names()

    async def create_session(self, session_config: SessionConfig) -> MCPSession:
        """Create a new MCP session for a specific server.

        If a session for the server already exists it is returned unchanged.

        Args:
            session_config: Must contain 'server' in connection_params

        Returns:
            MCPSession instance

        Raises:
            ValueError: If 'server' not in connection_params
            Exception: If session creation or initialization fails
        """
        server = get_config_value(session_config.connection_params, "server")
        if not server:
            raise ValueError("MCPProvider.create_session requires 'server' in connection_params")

        # Session id is the backend prefix, a dash, then the server name
        session_id = f"{self.backend_type.value}-{server}"

        # Check if session already exists
        if server in self._server_sessions:
            logger.debug(f"Session for server '{server}' already exists, returning existing session")
            return self._server_sessions[server]

        # Create session through MCPClient
        try:
            logger.debug(f"Creating new session for MCP server: {server}")
            session = await self._client.create_session(server, auto_initialize=True)
            session.session_id = session_id

            # Store in both maps
            self._server_sessions[server] = session
            self._sessions[session_id] = session

            logger.info(f"Created MCP session '{session_id}' for server '{server}'")
            return session
        except MCPDependencyError as e:
            # Dependency errors already shown to user, just debug log
            logger.debug(f"Dependency error for server '{server}': {type(e).__name__}")
            raise
        except Exception as e:
            logger.error(f"Failed to create session for server '{server}': {e}")
            raise

    async def close_session(self, session_name: str) -> None:
        """Close an MCP session by session name.

        Invalid or unknown session names are logged and ignored. The session
        is removed from both tracking maps even if closing it fails.

        Args:
            session_name: Backend prefix, a dash, then the server name
        """
        # Parse server name from session_name
        try:
            prefix, server_name = session_name.split("-", 1)
            if prefix != self.backend_type.value:
                raise ValueError(f"Invalid MCP session name format: {session_name}, expected 'mcp-'")
        except ValueError as e:
            logger.warning(f"Invalid session_name format: {session_name} - {e}")
            return

        # Check if session exists
        if session_name not in self._sessions and server_name not in self._server_sessions:
            logger.warning(f"Session '{session_name}' not found, nothing to close")
            return

        error_occurred = False
        try:
            logger.debug(f"Closing MCP session '{session_name}' (server: {server_name})")
            await self._client.close_session(server_name)
            logger.info(f"Successfully closed MCP session '{session_name}'")
        except Exception as e:
            error_occurred = True
            logger.error(f"Error closing MCP session '{session_name}': {e}")
        finally:
            # Clean up both maps regardless of errors
            self._server_sessions.pop(server_name, None)
            self._sessions.pop(session_name, None)

            if error_occurred:
                logger.warning(f"Session '{session_name}' removed from tracking despite close error")

    async def list_tools(self, session_name: str | None = None, use_cache: bool = True) -> List[BaseTool]:
        """List tools from MCP sessions.

        Args:
            session_name: If provided, only list tools from that session.
                         If None, list tools from all sessions.
            use_cache: If True, try to load from cache first (no server startup).
                      If False, start servers and get live tools.

        Returns:
            List of BaseTool instances (empty on error or unknown session)
        """
        await self.ensure_initialized()

        # Case 1: List tools from specific session (always live, no cache)
        if session_name:
            sess = self._sessions.get(session_name)
            if not sess:
                logger.warning(f"Session '{session_name}' not found")
                return []
            try:
                tools = await sess.list_tools()
                server_name = session_name.removeprefix(f"{self.backend_type.value}-")
                for tool in tools:
                    tool.bind_runtime_info(
                        backend=self.backend_type,
                        session_name=session_name,
                        server_name=server_name,
                    )
                return tools
            except Exception as e:
                logger.error(f"Error listing tools from session '{session_name}': {e}")
                return []

        # Case 2: List tools from all servers
        # Try cache first if enabled
        if use_cache:
            cache = get_tool_cache()
            if cache.has_cache():
                tools = self._load_tools_from_cache()
                if tools:
                    logger.info(f"Loaded {len(tools)} tools from cache (no server startup)")
                    return tools

        # No cache or cache disabled, start servers
        return await self._list_tools_live()

    def _load_tools_from_cache(self) -> List[BaseTool]:
        """Load tools from cache file without starting servers.

        Priority:
        1. Try to load from sanitized cache (mcp_tool_cache_sanitized.json)
        2. If not exists, load from raw cache and sanitize, then save sanitized version
        """
        cache = get_tool_cache()
        config_servers = self.list_servers()

        # Try sanitized cache first
        if cache.has_sanitized_cache():
            logger.debug("Loading from sanitized cache")
            all_cached_tools = cache.get_all_sanitized_tools()
            return self._build_tools_from_cache(all_cached_tools, config_servers)

        # Fall back to raw cache, sanitize and save
        if cache.has_cache():
            logger.info("Sanitized cache not found, building from raw cache...")
            all_cached_tools = cache.get_all_tools()
            sanitized_servers = self._sanitize_and_save_cache(all_cached_tools, cache)
            return self._build_tools_from_cache(sanitized_servers, config_servers)

        return []

    def _sanitize_and_save_cache(
        self,
        raw_tools: Dict[str, List[Dict]],
        cache
    ) -> Dict[str, List[Dict]]:
        """Sanitize raw cache and save to sanitized cache file.

        Args:
            raw_tools: Mapping of server name to raw tool metadata dicts.
            cache: Tool cache object whose ``save_sanitized`` is called.

        Returns:
            Mapping of server name to sanitized tool metadata dicts.
        """
        sanitized_servers: Dict[str, List[Dict]] = {
            server_name: [
                {
                    "name": tool_meta["name"],
                    "description": tool_meta.get("description", ""),
                    "parameters": _sanitize_mcp_schema(tool_meta.get("parameters", {})),
                }
                for tool_meta in tool_list
            ]
            for server_name, tool_list in raw_tools.items()
        }

        # Save sanitized cache for future use
        cache.save_sanitized(sanitized_servers)
        logger.info(f"Created sanitized cache with {len(sanitized_servers)} servers")

        return sanitized_servers

    def _build_tools_from_cache(
        self,
        all_cached_tools: Dict[str, List[Dict]],
        config_servers: List[str]
    ) -> List[BaseTool]:
        """Build BaseTool instances from cached tool metadata.

        Only servers present in ``config_servers`` are considered. The tools
        are created without a connector (``connector=None``).
        """
        tools: List[BaseTool] = []

        for server_name in config_servers:
            tool_list = all_cached_tools.get(server_name)
            if not tool_list:
                continue

            session_name = f"{self.backend_type.value}-{server_name}"

            for tool_meta in tool_list:
                schema = ToolSchema(
                    name=tool_meta["name"],
                    description=tool_meta.get("description", ""),
                    parameters=tool_meta.get("parameters", {}),
                    backend_type=BackendType.MCP,
                )
                tool = RemoteTool(schema=schema, connector=None)
                tool.bind_runtime_info(
                    backend=self.backend_type,
                    session_name=session_name,
                    server_name=server_name,
                )
                tools.append(tool)

        return tools

    async def _list_tools_live(self) -> List[BaseTool]:
        """List tools by starting all servers.

        Uses a semaphore to serialize session creation, avoiding TaskGroup race
        conditions that occur when multiple MCP connections are initialized
        concurrently. Servers whose session cannot be created or whose tools
        cannot be listed are logged and skipped. The result is written to the
        tool cache.
        """
        servers = self.list_servers()
        if not servers:
            logger.warning("No MCP servers configured")
            return []

        # Find servers that don't have sessions yet
        to_create = [s for s in servers if s not in self._server_sessions]

        # Create missing sessions with serialized execution using semaphore
        if to_create:
            logger.info(f"Creating {len(to_create)} MCP sessions (serialized to avoid race conditions)")

            # Use semaphore with limit=1 to serialize session creation
            # This avoids TaskGroup race conditions in concurrent HTTP connection setup
            semaphore = asyncio.Semaphore(1)

            async def _create_with_semaphore(server: str):
                async with semaphore:
                    logger.debug(f"Creating session for '{server}'")
                    return await self._lazy_create(server)

            tasks = [_create_with_semaphore(s) for s in to_create]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Log errors
            for server, result in zip(to_create, results):
                if isinstance(result, MCPDependencyError):
                    logger.debug(f"Dependency error for '{server}': {type(result).__name__}")
                elif isinstance(result, Exception):
                    logger.error(f"Failed to create session for '{server}': {result}")

        # Aggregate tools from all sessions
        uniq: Dict[tuple[str, str], BaseTool] = {}
        failed_servers = []

        logger.debug(f"Listing tools from {len(self._server_sessions)} sessions")
        # Iterate over a snapshot: the loop awaits, and a session created or
        # closed meanwhile would otherwise change the dict during iteration.
        for server, sess in list(self._server_sessions.items()):
            try:
                tools = await sess.list_tools()
                session_name = f"{self.backend_type.value}-{server}"
                for tool in tools:
                    key = (server, tool.schema.name)
                    if key not in uniq:
                        tool.bind_runtime_info(
                            backend=self.backend_type,
                            session_name=session_name,
                            server_name=server,
                        )
                        uniq[key] = tool
            except Exception as e:
                failed_servers.append(server)
                logger.error(f"Error listing tools from server '{server}': {e}")

        if failed_servers:
            logger.warning(f"Failed to list tools from {len(failed_servers)} server(s): {failed_servers}")

        tools_list = list(uniq.values())
        logger.debug(f"Listed {len(tools_list)} unique tools from {len(self._server_sessions)} MCP servers")

        # Save to cache for next time
        await self._save_tools_to_cache(tools_list)

        return tools_list

    async def _save_tools_to_cache(self, tools: List[BaseTool]) -> None:
        """Save tools metadata to the raw cache and refresh the sanitized cache.

        Cached loads prefer the sanitized cache, so it has to be rewritten
        together with the raw cache; otherwise a live refresh would never be
        visible to later cached loads.
        """
        cache = get_tool_cache()

        # Group tools by server
        servers: Dict[str, List[Dict]] = {}
        for tool in tools:
            server_name = tool.runtime_info.server_name if tool.is_bound else "unknown"
            servers.setdefault(server_name, []).append({
                "name": tool.schema.name,
                "description": tool.schema.description or "",
                "parameters": tool.schema.parameters or {},
            })

        cache.save(servers)

        # Caching is best-effort: a sanitizing problem must not fail the listing.
        try:
            self._sanitize_and_save_cache(servers, cache)
        except Exception as e:
            logger.warning(f"Failed to refresh sanitized tool cache: {e}")

    async def ensure_server_session(self, server_name: str) -> Optional[MCPSession]:
        """Ensure a server session exists, creating it if needed.

        This is used for on-demand server startup when executing tools.

        Returns:
            The session, or None if the server could not be started.
        """
        if server_name in self._server_sessions:
            return self._server_sessions[server_name]

        # Server not running, start it
        logger.info(f"Starting MCP server on-demand: {server_name}")
        try:
            return await self.create_session(self._session_config(server_name))
        except Exception as e:
            logger.error(f"Failed to start server '{server_name}': {e}")
            return None

    async def _lazy_create(self, server: str) -> None:
        """Internal helper for lazy session creation.

        Args:
            server: Server name to create session for

        Raises:
            Exception: Re-raises any exception from session creation for error tracking
        """
        # Double-check to avoid race conditions
        if server in self._server_sessions:
            logger.debug(f"Session for server '{server}' already exists, skipping lazy creation")
            return

        try:
            await self.create_session(self._session_config(server))
            logger.debug(f"Lazily created session for server '{server}'")
        except MCPDependencyError as e:
            # Dependency errors already shown to user
            logger.debug(f"Dependency error for server '{server}': {type(e).__name__}")
            # Re-raise so that asyncio.gather can track the error
            raise
        except Exception as e:
            logger.error(f"Failed to lazily create session for server '{server}': {e}")
            # Re-raise so that asyncio.gather can track the error
            raise
async def initialize(self) -> Dict[str, Any]:
    """Connect if needed, initialize the connector and convert its tools.

    The connection is only opened here when the session is not connected
    and ``auto_connect`` is set. After the connector has been initialized,
    its ``tools`` are converted and stored on ``self.tools``.

    Returns:
        Whatever the connector's ``initialize()`` returned.
    """
    needs_connection = not self.is_connected and self.auto_connect
    if needs_connection:
        await self.connect()

    # Hand initialization over to the connector.
    logger.debug(f"Initializing MCP session {self.session_id}")
    info = await self.connector.initialize()

    # The connector exposes the server's tools after initialize().
    raw_tools = self.connector.tools
    logger.debug(f"Converting {len(raw_tools)} MCP tools to BaseTool")

    converted = []
    for raw_tool in raw_tools:
        converted.append(convert_mcp_tool_to_base_tool(raw_tool, self.connector))
    self.tools = converted

    logger.debug(f"MCP session {self.session_id} initialized with {len(self.tools)} tools")
    return info
class MCPToolCache:
    """Simple file-based cache for MCP tool metadata.

    Two JSON files are maintained: the raw cache (``cache_path``) and a
    sanitized variant (``sanitized_cache_path``). The content of each file is
    kept in memory after the first load or save until the matching ``clear``
    method is called. Write failures are logged, not raised.
    """

    CACHE_VERSION = 1

    def __init__(self, cache_path: Optional[Path] = None, sanitized_cache_path: Optional[Path] = None):
        """Create a cache bound to the given files (module defaults when omitted)."""
        self.cache_path = cache_path or DEFAULT_CACHE_PATH
        self.sanitized_cache_path = sanitized_cache_path or DEFAULT_SANITIZED_CACHE_PATH
        self._cache: Optional[Dict] = None
        self._sanitized_cache: Optional[Dict] = None
        self._server_order: Optional[List[str]] = None

    def set_server_order(self, order: List[str]):
        """Set expected server order (from config). Used when saving to disk."""
        self._server_order = order

    def _reorder_servers(self, servers: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
        """Reorder servers dict according to _server_order.

        Servers named in the order come first, in that order; all remaining
        servers keep their relative order after them. Without a configured
        order the input is returned unchanged.
        """
        if not self._server_order:
            return servers
        ordered = {name: servers[name] for name in self._server_order if name in servers}
        for name, tools in servers.items():
            ordered.setdefault(name, tools)
        return ordered

    def _empty_cache(self) -> Dict[str, Any]:
        """Return the structure used when no usable cache file exists."""
        return {"version": self.CACHE_VERSION, "servers": {}}

    def _ensure_dir(self, path: Optional[Path] = None):
        """Ensure the directory of ``path`` exists (raw cache path by default).

        The sanitized cache may live in a different directory than the raw
        cache, so callers pass the file they are about to write.
        """
        target = self.cache_path if path is None else path
        target.parent.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _write_json(path: Path, payload: Dict[str, Any]) -> None:
        """Serialize ``payload`` to ``path`` as indented UTF-8 JSON."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    @staticmethod
    def _read_json(path: Path) -> Dict[str, Any]:
        """Read ``path`` and return its JSON object; raise if it is not an object."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError("cache file does not contain a JSON object")
        return data

    def load(self) -> Dict[str, Any]:
        """Load cache from disk. Returns an empty cache structure if not exists or unreadable."""
        if self._cache is not None:
            return self._cache

        if not self.cache_path.exists():
            self._cache = self._empty_cache()
            return self._cache

        try:
            self._cache = self._read_json(self.cache_path)
            logger.info(f"Loaded MCP tool cache: {len(self._cache.get('servers', {}))} servers")
        except Exception as e:
            logger.warning(f"Failed to load cache: {e}")
            self._cache = self._empty_cache()
        return self._cache

    def save(self, servers: Dict[str, List[Dict]]):
        """
        Save tool metadata to disk (overwrites existing cache).

        Servers are written in the order given to ``set_server_order``, if any.

        Args:
            servers: Dict mapping server_name -> list of tool metadata dicts
                    Each tool dict should have: name, description, parameters
        """
        self._ensure_dir(self.cache_path)

        cache_data = {
            "version": self.CACHE_VERSION,
            "updated_at": datetime.now().isoformat(),
            "servers": self._reorder_servers(servers),
        }

        try:
            self._write_json(self.cache_path, cache_data)
            self._cache = cache_data
            logger.info(f"Saved MCP tool cache: {len(servers)} servers")
        except Exception as e:
            logger.error(f"Failed to save cache: {e}")

    def save_server(self, server_name: str, tools: List[Dict]):
        """
        Save/update a single server's tools to cache (incremental append).

        Args:
            server_name: Name of the MCP server
            tools: List of tool metadata dicts for this server
        """
        self._ensure_dir(self.cache_path)

        # Load existing cache
        cache = self.load()

        # Update server entry
        servers = cache.setdefault("servers", {})
        servers[server_name] = tools
        cache["servers"] = self._reorder_servers(servers)
        cache["updated_at"] = datetime.now().isoformat()

        # Save back
        try:
            self._write_json(self.cache_path, cache)
            self._cache = cache
            logger.debug(f"Saved {len(tools)} tools for server '{server_name}'")
        except Exception as e:
            logger.error(f"Failed to save cache for server '{server_name}': {e}")

    def get_server_tools(self, server_name: str) -> Optional[List[Dict]]:
        """Get cached tools for a specific server (None if the server is not cached)."""
        return self.load().get("servers", {}).get(server_name)

    def get_all_tools(self) -> Dict[str, List[Dict]]:
        """Get all cached tools, grouped by server."""
        return self.load().get("servers", {})

    def has_cache(self) -> bool:
        """Check if cache exists and has data."""
        return bool(self.load().get("servers"))

    def clear(self):
        """Clear the cache (delete the file and drop the in-memory copy)."""
        self.cache_path.unlink(missing_ok=True)
        self._cache = None
        logger.info("MCP tool cache cleared")

    def save_failed_server(self, server_name: str, error: str):
        """
        Record a failed server to cache.

        Args:
            server_name: Name of the failed MCP server
            error: Error message
        """
        self._ensure_dir(self.cache_path)

        # Load existing cache
        cache = self.load()

        # Add to failed_servers mapping
        now = datetime.now().isoformat()
        cache.setdefault("failed_servers", {})[server_name] = {
            "error": error,
            "failed_at": now,
        }
        cache["updated_at"] = datetime.now().isoformat()

        # Save back
        try:
            self._write_json(self.cache_path, cache)
            self._cache = cache
        except Exception as e:
            logger.error(f"Failed to save failed server '{server_name}': {e}")

    def get_failed_servers(self) -> Dict[str, Dict]:
        """Get the failed servers recorded in the cache, keyed by server name."""
        return self.load().get("failed_servers", {})

    def load_sanitized(self) -> Dict[str, Any]:
        """Load sanitized cache from disk. Returns an empty cache structure if not exists or unreadable."""
        if self._sanitized_cache is not None:
            return self._sanitized_cache

        if not self.sanitized_cache_path.exists():
            self._sanitized_cache = self._empty_cache()
            return self._sanitized_cache

        try:
            self._sanitized_cache = self._read_json(self.sanitized_cache_path)
            logger.info(f"Loaded sanitized MCP tool cache: {len(self._sanitized_cache.get('servers', {}))} servers")
        except Exception as e:
            logger.warning(f"Failed to load sanitized cache: {e}")
            self._sanitized_cache = self._empty_cache()
        return self._sanitized_cache

    def save_sanitized(self, servers: Dict[str, List[Dict]]):
        """
        Save sanitized tool metadata to disk.

        Servers are written in the order given to ``set_server_order``, if any.

        Args:
            servers: Dict mapping server_name -> list of sanitized tool metadata dicts
        """
        # The sanitized file can live elsewhere than the raw cache file.
        self._ensure_dir(self.sanitized_cache_path)

        cache_data = {
            "version": self.CACHE_VERSION,
            "updated_at": datetime.now().isoformat(),
            "sanitized": True,
            "servers": self._reorder_servers(servers),
        }

        try:
            self._write_json(self.sanitized_cache_path, cache_data)
            self._sanitized_cache = cache_data
            logger.info(f"Saved sanitized MCP tool cache: {len(servers)} servers")
        except Exception as e:
            logger.error(f"Failed to save sanitized cache: {e}")

    def get_all_sanitized_tools(self) -> Dict[str, List[Dict]]:
        """Get all sanitized cached tools, grouped by server."""
        return self.load_sanitized().get("servers", {})

    def has_sanitized_cache(self) -> bool:
        """Check if sanitized cache exists and has data."""
        return bool(self.load_sanitized().get("servers"))

    def clear_sanitized(self):
        """Clear the sanitized cache (delete the file and drop the in-memory copy)."""
        self.sanitized_cache_path.unlink(missing_ok=True)
        self._sanitized_cache = None
        logger.info("Sanitized MCP tool cache cleared")
""" import copy from typing import Any, Dict from mcp.types import Tool as MCPTool from anytool.grounding.core.tool import BaseTool, RemoteTool from anytool.grounding.core.types import BackendType, ToolSchema from anytool.grounding.core.transport.connectors import BaseConnector from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) def _sanitize_mcp_schema(params: Dict[str, Any]) -> Dict[str, Any]: """ Sanitize MCP tool schema to ensure Claude API compatibility (JSON Schema draft 2020-12). Fixes: - Empty schemas -> valid object schema - Missing required fields (type, properties, required) - Removes non-standard fields (title, examples, nullable, default, etc.) - Recursively cleans nested properties and items - Ensures every property has a valid type - Ensures top-level type is 'object' (Anthropic API requirement) """ if not params: return {"type": "object", "properties": {}, "required": []} sanitized = copy.deepcopy(params) sanitized = _deep_sanitize(sanitized) # Anthropic API requires top-level type to be 'object' # If it's not an object, wrap the schema as a property of an object top_level_type = sanitized.get("type") if top_level_type and top_level_type != "object": logger.debug(f"[MCP_SCHEMA_SANITIZE] Wrapping non-object schema (type={top_level_type}) into object") wrapped = { "type": "object", "properties": { "value": sanitized # The original schema becomes a property }, "required": ["value"] # Make it required } sanitized = wrapped return sanitized def _deep_sanitize(schema: Dict[str, Any]) -> Dict[str, Any]: """ Recursively sanitize a JSON schema to conform to JSON Schema draft 2020-12. Removes non-standard fields and ensures valid structure. 
""" if not isinstance(schema, dict): return {"type": "string"} # Allowed top-level keys for Claude API compatibility allowed_keys = { "type", "properties", "required", "items", "description", "enum", "const", "minimum", "maximum", "minLength", "maxLength", "minItems", "maxItems", "pattern", "additionalProperties", "anyOf", "oneOf", "allOf" } # Remove disallowed keys keys_to_remove = [k for k in schema if k not in allowed_keys] for k in keys_to_remove: schema.pop(k, None) # Ensure type exists if "type" not in schema: # Type is defined via anyOf/oneOf/allOf - don't add default type # These combination keywords define the type themselves if "anyOf" in schema or "oneOf" in schema or "allOf" in schema: pass # Type is defined through combination keywords, do not add default type # Try to infer type elif "properties" in schema: schema["type"] = "object" elif "items" in schema: schema["type"] = "array" elif "enum" in schema: # For enum, try to infer from values enum_vals = schema.get("enum", []) if enum_vals and all(isinstance(v, str) for v in enum_vals): schema["type"] = "string" elif enum_vals and all(isinstance(v, (int, float)) for v in enum_vals): schema["type"] = "number" else: schema["type"] = "string" elif not schema: # Empty schema (e.g., only had $schema which was removed) -> no parameters needed schema["type"] = "object" schema["properties"] = {} schema["required"] = [] else: schema["type"] = "object" # Handle object type if schema.get("type") == "object": if "properties" not in schema: schema["properties"] = {} if "required" not in schema: schema["required"] = [] # Recursively sanitize properties if isinstance(schema.get("properties"), dict): for prop_name, prop_schema in list(schema["properties"].items()): if isinstance(prop_schema, dict): schema["properties"][prop_name] = _deep_sanitize(prop_schema) else: # Invalid property schema, replace with string schema["properties"][prop_name] = {"type": "string"} # Sanitize additionalProperties if present if 
"additionalProperties" in schema and isinstance(schema["additionalProperties"], dict): schema["additionalProperties"] = _deep_sanitize(schema["additionalProperties"]) # Handle array type elif schema.get("type") == "array": if "items" in schema: if isinstance(schema["items"], dict): schema["items"] = _deep_sanitize(schema["items"]) elif isinstance(schema["items"], list): # Tuple validation - sanitize each item schema["items"] = [_deep_sanitize(item) if isinstance(item, dict) else {"type": "string"} for item in schema["items"]] else: schema["items"] = {"type": "string"} else: # Default items to string if not specified schema["items"] = {"type": "string"} # Handle anyOf/oneOf/allOf for combo_key in ["anyOf", "oneOf", "allOf"]: if combo_key in schema and isinstance(schema[combo_key], list): schema[combo_key] = [ _deep_sanitize(sub) if isinstance(sub, dict) else {"type": "string"} for sub in schema[combo_key] ] return schema def convert_mcp_tool_to_base_tool( mcp_tool: MCPTool, connector: BaseConnector ) -> BaseTool: """ Convert an MCP Tool to a BaseTool (RemoteTool) instance. This function extracts the tool schema from an MCP tool object and creates a RemoteTool that can be used within the grounding framework. 
Args: mcp_tool: MCP Tool object from the MCP SDK connector: Connector instance for communicating with the MCP server Returns: RemoteTool instance wrapping the MCP tool """ # Extract tool metadata tool_name = mcp_tool.name tool_description = getattr(mcp_tool, 'description', None) or "" # Convert MCP input schema to our parameter schema format (with sanitization) input_schema: Dict[str, Any] = {} if hasattr(mcp_tool, 'inputSchema') and mcp_tool.inputSchema: input_schema = _sanitize_mcp_schema(mcp_tool.inputSchema) else: input_schema = {"type": "object", "properties": {}, "required": []} # Create ToolSchema schema = ToolSchema( name=tool_name, description=tool_description, parameters=input_schema, backend_type=BackendType.MCP, ) # Create and return RemoteTool remote_tool = RemoteTool( connector=connector, remote_name=tool_name, schema=schema, backend=BackendType.MCP, ) logger.debug(f"Converted MCP tool '{tool_name}' to RemoteTool") return remote_tool ================================================ FILE: anytool/grounding/backends/mcp/transport/connectors/__init__.py ================================================ """ Connectors for various MCP transports. This module provides interfaces for connecting to MCP implementations through different transport mechanisms. """ from .base import MCPBaseConnector # noqa: F401 from .http import HttpConnector # noqa: F401 from .sandbox import SandboxConnector # noqa: F401 from .stdio import StdioConnector # noqa: F401 from .websocket import WebSocketConnector # noqa: F401 __all__ = [ "MCPBaseConnector", "StdioConnector", "HttpConnector", "WebSocketConnector", "SandboxConnector", ] ================================================ FILE: anytool/grounding/backends/mcp/transport/connectors/base.py ================================================ """ Base connector for MCP implementations. This module provides the base connector interface that all MCP connectors must implement. 
class MCPBaseConnector(BaseConnector[ClientSession]):
    """Base class for MCP connectors.

    This class defines the interface that all MCP connectors must implement.
    It owns the MCP ``ClientSession``, caches the tools/resources/prompts
    listed during ``initialize()``, and wraps requests with reconnection and
    (for tool calls) retry handling.
    """

    # Substrings looked for in the lower-cased text of a failed tool call;
    # a match marks the failure as transient and worth retrying.
    _RETRYABLE_ERROR_MARKERS = (
        '400', '500', '502', '503', '504',
        'bad request', 'internal server error',
        'service unavailable', 'gateway timeout',
    )

    def __init__(
        self,
        connection_manager: BaseConnectionManager[ClientSession],
        tool_call_max_retries: int = DEFAULT_TOOL_CALL_MAX_RETRIES,
        tool_call_retry_delay: float = DEFAULT_TOOL_CALL_RETRY_DELAY,
    ):
        """Initialize base connector with common attributes.

        Args:
            connection_manager: The connection manager to use for the connection.
            tool_call_max_retries: Maximum number of attempts for tool calls (default: 3)
            tool_call_retry_delay: Initial delay between retries in seconds (default: 1.0)
        """
        super().__init__(connection_manager)
        self.client_session: ClientSession | None = None
        # None means "not initialized yet"; initialize() replaces these with lists.
        self._tools: list[Tool] | None = None
        self._resources: list[Resource] | None = None
        self._prompts: list[Prompt] | None = None
        self.auto_reconnect = True  # Whether to automatically reconnect on connection loss (not configurable for now)
        self.tool_call_max_retries = tool_call_max_retries
        self.tool_call_retry_delay = tool_call_retry_delay

    @property
    @abstractmethod
    def public_identifier(self) -> str:
        """Get the identifier for the connector."""
        pass

    async def _get_streams_from_connection(self):
        """Get read and write streams from the connection. Override in subclasses if needed."""
        # Default implementation for most MCP connectors (stdio, HTTP)
        # Returns the connection directly as it should be a tuple of (read_stream, write_stream)
        return self._connection

    async def _after_connect(self) -> None:
        """Create ClientSession after connection is established.

        Some connectors (like WebSocket) don't use ClientSession and may override this method.

        Raises:
            RuntimeError: If the streams are neither None nor a 2-tuple.
        """
        streams = await self._get_streams_from_connection()

        if streams is None:
            # Some connectors (like WebSocket) don't use ClientSession
            # They should override this method to set up their own resources
            logger.debug("No streams returned, ClientSession creation skipped")
            return

        if isinstance(streams, tuple) and len(streams) == 2:
            read_stream, write_stream = streams
            self.client_session = ClientSession(read_stream, write_stream, sampling_callback=None)
            await self.client_session.__aenter__()
            logger.debug("MCP ClientSession created successfully")
        else:
            raise RuntimeError(f"Invalid streams format: expected tuple of 2 elements, got {type(streams)}")

    async def _before_disconnect(self) -> None:
        """Clean up MCP-specific resources before disconnection."""
        errors = []

        # Close the client session; failures are logged, never raised.
        if self.client_session:
            try:
                logger.debug("Closing MCP client session")
                await self.client_session.__aexit__(None, None, None)
            except Exception as e:
                error_msg = f"Error closing client session: {e}"
                logger.warning(error_msg)
                errors.append(error_msg)
            finally:
                self.client_session = None

        # Reset tools, resources, and prompts
        self._tools = None
        self._resources = None
        self._prompts = None

        if errors:
            logger.warning(f"Encountered {len(errors)} errors during MCP resource cleanup")

    async def _cleanup_on_connect_failure(self) -> None:
        """Override to add MCP-specific cleanup on connection failure."""
        # Clean up client session if it was created
        if self.client_session:
            try:
                await self.client_session.__aexit__(None, None, None)
            except Exception as e:
                # Best effort: the connect failure is the error that matters,
                # so this one is only recorded for debugging.
                logger.debug(f"Ignoring error while closing client session after failed connect: {e}")
            finally:
                self.client_session = None

        # Call parent cleanup
        await super()._cleanup_on_connect_failure()

    async def initialize(self) -> dict[str, Any]:
        """Initialize the MCP session and return session information.

        Lists tools, resources and prompts for each capability the server
        advertises and caches them on the connector.

        Returns:
            The value returned by ``ClientSession.initialize()``.

        Raises:
            RuntimeError: If no client session exists.
        """
        if not self.client_session:
            raise RuntimeError("MCP client is not connected")

        logger.debug("Initializing MCP session")

        result = await self.client_session.initialize()
        server_capabilities = result.capabilities

        if server_capabilities.tools:
            tools_result = await self.list_tools()
            self._tools = tools_result or []
        else:
            self._tools = []

        if server_capabilities.resources:
            resources_result = await self.list_resources()
            self._resources = resources_result or []
        else:
            self._resources = []

        if server_capabilities.prompts:
            prompts_result = await self.list_prompts()
            self._prompts = prompts_result or []
        else:
            self._prompts = []

        logger.debug(
            f"MCP session initialized with {len(self._tools)} tools, "
            f"{len(self._resources)} resources, "
            f"and {len(self._prompts)} prompts"
        )

        return result

    @property
    def tools(self) -> list[Tool]:
        """Get the list of available tools.

        Raises:
            RuntimeError: If ``initialize()`` has not populated the list yet.
        """
        if self._tools is None:
            raise RuntimeError("MCP client is not initialized")
        return self._tools

    @property
    def resources(self) -> list[Resource]:
        """Get the list of available resources.

        Raises:
            RuntimeError: If ``initialize()`` has not populated the list yet.
        """
        if self._resources is None:
            raise RuntimeError("MCP client is not initialized")
        return self._resources

    @property
    def prompts(self) -> list[Prompt]:
        """Get the list of available prompts.

        Raises:
            RuntimeError: If ``initialize()`` has not populated the list yet.
        """
        if self._prompts is None:
            raise RuntimeError("MCP client is not initialized")
        return self._prompts

    @property
    def is_connected(self) -> bool:
        """Check if the connector is actually connected and the connection is alive.

        This property checks not only the connected flag but also verifies that
        the client session exists and the connection manager task (when there
        is one) has not finished. It clears the connected flag as a side
        effect when it detects a dead connection.

        Returns:
            True if the connector is connected and the connection is alive, False otherwise.
        """
        if not self._connected:
            return False

        if not self.client_session:
            self._connected = False
            return False

        # Check if connection manager task is still running (if applicable)
        if self._connection_manager and hasattr(self._connection_manager, "_task"):
            task = self._connection_manager._task
            if task and task.done():
                logger.debug("Connection manager task is done, marking as disconnected")
                self._connected = False
                return False

        return True

    async def _ensure_connected(self) -> None:
        """Ensure the connector is connected, reconnecting if necessary.

        Raises:
            RuntimeError: If there is no client session, if reconnecting fails,
                or if the connection is lost and auto_reconnect is False.
        """
        if not self.client_session:
            raise RuntimeError("MCP client is not connected")

        if not self.is_connected:
            if self.auto_reconnect:
                logger.debug("Connection lost, attempting to reconnect...")
                try:
                    await self.connect()
                    logger.debug("Reconnection successful")
                except Exception as e:
                    raise RuntimeError(f"Failed to reconnect to MCP server: {e}") from e
            else:
                raise RuntimeError(
                    "Connection to MCP server has been lost. Auto-reconnection is disabled. Please reconnect manually."
                )

    async def call_tool(self, name: str, arguments: dict[str, Any]) -> CallToolResult:
        """Call an MCP tool with automatic reconnection handling and retry logic.

        A failure is retried when the connection was lost, or when the error
        text contains one of ``_RETRYABLE_ERROR_MARKERS``; the latter waits
        with exponential backoff starting at ``tool_call_retry_delay``.

        Args:
            name: The name of the tool to call.
            arguments: The arguments to pass to the tool.

        Returns:
            The result of the tool call.

        Raises:
            RuntimeError: If the connection is lost and cannot be reestablished,
                or if every attempt failed because of connection loss.
            Exception: If the tool call fails with a non-retryable error or on
                the last attempt.
        """
        last_error: Exception | None = None

        for attempt in range(self.tool_call_max_retries):
            await self._ensure_connected()

            logger.debug(f"Calling tool '{name}' with arguments: {arguments} (attempt {attempt + 1}/{self.tool_call_max_retries})")
            try:
                result = await self.client_session.call_tool(name, arguments)
                logger.debug(f"Tool '{name}' called successfully")
                return result
            except Exception as e:
                last_error = e
                error_str = str(e).lower()

                # Check if the error might be due to connection loss
                if not self.is_connected:
                    logger.warning(f"Tool call '{name}' failed due to connection loss: {e}")
                    # _ensure_connected() reconnects at the top of the next iteration
                    continue

                # Check for retryable HTTP errors (400, 500, 502, 503, 504)
                is_retryable = any(marker in error_str for marker in self._RETRYABLE_ERROR_MARKERS)

                if is_retryable and attempt < self.tool_call_max_retries - 1:
                    delay = self.tool_call_retry_delay * (2 ** attempt)  # Exponential backoff
                    logger.warning(
                        f"Tool call '{name}' failed with retryable error: {e}, "
                        f"retrying in {delay:.1f}s (attempt {attempt + 1}/{self.tool_call_max_retries})"
                    )
                    await asyncio.sleep(delay)
                    continue

                # Non-retryable error or max retries reached, re-raise
                raise

        # All retries exhausted
        error_msg = f"Tool call '{name}' failed after {self.tool_call_max_retries} retries"
        logger.error(error_msg)
        raise RuntimeError(error_msg) from last_error

    async def list_tools(self) -> list[Tool]:
        """List all available tools from the MCP implementation.

        Returns an empty list when the server answers with an MCP error.
        """
        await self._ensure_connected()

        logger.debug("Listing tools")
        try:
            result = await self.client_session.list_tools()
            return result.tools
        except McpError as e:
            logger.error(f"Error listing tools: {e}")
            return []

    async def list_resources(self) -> list[Resource]:
        """List all available resources from the MCP implementation.

        Returns an empty list when the server answers with an MCP error.
        """
        await self._ensure_connected()

        logger.debug("Listing resources")
        try:
            result = await self.client_session.list_resources()
            return result.resources
        except McpError as e:
            logger.error(f"Error listing resources: {e}")
            return []

    async def read_resource(self, uri: str) -> ReadResourceResult:
        """Read a resource by URI.

        Raises:
            RuntimeError: If the connector is not connected and cannot reconnect.
        """
        # Same connection check (and auto-reconnect) as the other request
        # methods; it raises "MCP client is not connected" when no session exists.
        await self._ensure_connected()

        logger.debug(f"Reading resource: {uri}")
        result = await self.client_session.read_resource(uri)
        return result

    async def list_prompts(self) -> list[Prompt]:
        """List all available prompts from the MCP implementation.

        Returns an empty list when the server answers with an MCP error.
        """
        await self._ensure_connected()

        logger.debug("Listing prompts")
        try:
            result = await self.client_session.list_prompts()
            return result.prompts
        except McpError as e:
            logger.error(f"Error listing prompts: {e}")
            return []

    async def get_prompt(self, name: str, arguments: dict[str, Any] | None = None) -> GetPromptResult:
        """Get a prompt by name."""
        await self._ensure_connected()

        logger.debug(f"Getting prompt: {name}")
        result = await self.client_session.get_prompt(name, arguments)
        return result

    async def request(self, method: str, params: dict[str, Any] | None = None) -> Any:
        """Send a raw request to the MCP implementation."""
        await self._ensure_connected()

        logger.debug(f"Sending request: {method} with params: {params}")
        return await self.client_session.request({"method": method, "params": params or {}})

    async def invoke(self, name: str, params: dict[str, Any]) -> Any:
        """Dispatch a call by name.

        Names that do not start with ``__`` are tool calls. The reserved names
        ``__read_resource__``, ``__list_prompts__`` and ``__get_prompt__`` map
        to the corresponding connector methods.

        Raises:
            ValueError: If a ``__``-prefixed name is not one of the reserved names.
        """
        await self._ensure_connected()

        if not name.startswith("__"):
            return await self.call_tool(name, params)

        if name == "__read_resource__":
            return await self.read_resource(params["uri"])
        if name == "__list_prompts__":
            return await self.list_prompts()
        if name == "__get_prompt__":
            return await self.get_prompt(params["name"], params.get("args"))

        raise ValueError(f"Unsupported MCP invoke name: {name}")
anytool/grounding/backends/mcp/transport/connectors/http.py ================================================ """ HTTP connector for MCP implementations. This module provides a connector for communicating with MCP implementations through HTTP APIs with SSE, Streamable HTTP, or simple JSON-RPC for transport. """ import asyncio import anyio import httpx from typing import Any, Dict, List from mcp import ClientSession from mcp.types import ( CallToolResult, TextContent, ImageContent, EmbeddedResource, Tool, Resource, Prompt, GetPromptResult, ReadResourceResult, ) from anytool.utils.logging import Logger from anytool.grounding.core.transport.task_managers.base import BaseConnectionManager from anytool.grounding.backends.mcp.transport.task_managers import SseConnectionManager, StreamableHttpConnectionManager from anytool.grounding.backends.mcp.transport.connectors.base import MCPBaseConnector, DEFAULT_TOOL_CALL_MAX_RETRIES, DEFAULT_TOOL_CALL_RETRY_DELAY logger = Logger.get_logger(__name__) class HttpConnector(MCPBaseConnector): """Connector for MCP implementations using HTTP transport. This connector uses HTTP/SSE or streamable HTTP to communicate with remote MCP implementations, using a connection manager to handle the proper lifecycle management. """ def __init__( self, base_url: str, auth_token: str | None = None, headers: dict[str, str] | None = None, timeout: float = 5, sse_read_timeout: float = 60 * 5, tool_call_max_retries: int = DEFAULT_TOOL_CALL_MAX_RETRIES, tool_call_retry_delay: float = DEFAULT_TOOL_CALL_RETRY_DELAY, ): """Initialize a new HTTP connector. Args: base_url: The base URL of the MCP HTTP API. auth_token: Optional authentication token. headers: Optional additional headers. timeout: Timeout for HTTP operations in seconds. sse_read_timeout: Timeout for SSE read operations in seconds. 
tool_call_max_retries: Maximum number of retries for tool calls (default: 3) tool_call_retry_delay: Initial delay between retries in seconds (default: 1.0) """ self.base_url = base_url.rstrip("/") self.auth_token = auth_token self.headers = headers or {} if auth_token: self.headers["Authorization"] = f"Bearer {auth_token}" self.timeout = timeout self.sse_read_timeout = sse_read_timeout # JSON-RPC HTTP mode fields self._use_jsonrpc = False self._jsonrpc_client: httpx.AsyncClient | None = None self._jsonrpc_request_id = 0 # Create a placeholder connection manager (will be set up later in connect()) # We use a placeholder here because the actual transport type (SSE vs Streamable HTTP) # can only be determined at runtime through server negotiation as per MCP specification from anytool.grounding.core.transport.task_managers import PlaceholderConnectionManager connection_manager = PlaceholderConnectionManager() super().__init__( connection_manager, tool_call_max_retries=tool_call_max_retries, tool_call_retry_delay=tool_call_retry_delay, ) async def connect(self) -> None: """Create the underlying session/connection. For JSON-RPC mode, we don't use a connection manager. 
""" if self._connected: return try: # Hook: before connection - this sets up transport type await self._before_connect() if self._use_jsonrpc: # JSON-RPC mode doesn't use connection manager # Just call _after_connect to set up the HTTP client await self._after_connect() self._connected = True else: # Use normal connection flow with connection manager # If _before_connect() already established a connection, reuse it if self._connection is None: self._connection = await self._connection_manager.start() await self._after_connect() self._connected = True except Exception: await self._cleanup_on_connect_failure() raise async def disconnect(self) -> None: """Close the session/connection and reset state.""" if not self._connected: return # Hook: before disconnection await self._before_disconnect() if not self._use_jsonrpc: # Stop the connection manager only for non-JSON-RPC modes if self._connection_manager: await self._connection_manager.stop() self._connection = None # Hook: after disconnection await self._after_disconnect() self._connected = False async def _before_connect(self) -> None: """Negotiate transport type and set up the appropriate connection manager. Tries transports in order: 1. Streamable HTTP (new MCP transport) 2. SSE (legacy MCP transport) 3. Simple JSON-RPC HTTP (for custom servers) This implements backwards compatibility per MCP specification. 
""" self.transport_type = None self._use_jsonrpc = False connection_manager = None streamable_error = None sse_error = None # First, try the new streamable HTTP transport try: logger.debug(f"Attempting streamable HTTP connection to: {self.base_url}") connection_manager = StreamableHttpConnectionManager( self.base_url, self.headers, self.timeout, self.sse_read_timeout ) # Test the connection by starting it with built-in timeout read_stream, write_stream = await connection_manager.start(timeout=self.timeout) # Create and verify ClientSession test_client = ClientSession(read_stream, write_stream, sampling_callback=None) # Add timeout to __aenter__ - use asyncio.wait_for instead of anyio.fail_after # to avoid cancel scope conflicts with background tasks try: await asyncio.wait_for(test_client.__aenter__(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"ClientSession enter timed out after {self.timeout}s") try: # Add timeout to initialize() using asyncio.wait_for to prevent hanging try: await asyncio.wait_for(test_client.initialize(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"initialize() timed out after {self.timeout}s") try: await asyncio.wait_for(test_client.list_tools(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"list_tools() timed out after {self.timeout}s") # SUCCESS! 
Keep the client session (don't close it, closing destroys the streams) # Store it directly as the client_session for later use self.transport_type = "streamable HTTP" self._connection_manager = connection_manager self._connection = connection_manager.get_streams() self.client_session = test_client # Reuse the working session logger.debug("Streamable HTTP transport selected") return except TimeoutError: try: await asyncio.wait_for(test_client.__aexit__(None, None, None), timeout=2) except (asyncio.TimeoutError, Exception): pass raise except Exception as init_error: # Clean up the test client only on error try: await asyncio.wait_for(test_client.__aexit__(None, None, None), timeout=2) except (asyncio.TimeoutError, Exception): pass raise init_error except Exception as e: streamable_error = e logger.debug(f"Streamable HTTP failed: {e}") # Clean up the failed connection manager if connection_manager: try: await asyncio.wait_for(connection_manager.stop(), timeout=2) except (asyncio.TimeoutError, Exception): pass # Try SSE fallback try: logger.debug(f"Attempting SSE fallback connection to: {self.base_url}") connection_manager = SseConnectionManager( self.base_url, self.headers, self.timeout, self.sse_read_timeout ) # Test the connection by starting it with built-in timeout read_stream, write_stream = await connection_manager.start(timeout=self.timeout) # Create and verify ClientSession test_client = ClientSession(read_stream, write_stream, sampling_callback=None) # Add timeout to __aenter__ - use asyncio.wait_for instead of anyio.fail_after # to avoid cancel scope conflicts with background tasks try: await asyncio.wait_for(test_client.__aenter__(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"ClientSession enter timed out after {self.timeout}s") try: try: await asyncio.wait_for(test_client.initialize(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"initialize() timed out after {self.timeout}s") try: await 
asyncio.wait_for(test_client.list_tools(), timeout=self.timeout) except asyncio.TimeoutError: raise TimeoutError(f"list_tools() timed out after {self.timeout}s") # SUCCESS! Keep the client session (don't close it, closing destroys the streams) # Store it directly as the client_session for later use self.transport_type = "SSE" self._connection_manager = connection_manager self._connection = connection_manager.get_streams() self.client_session = test_client # Reuse the working session logger.debug("SSE transport selected") return except TimeoutError: try: await asyncio.wait_for(test_client.__aexit__(None, None, None), timeout=2) except (asyncio.TimeoutError, Exception): pass raise except Exception as init_error: # Clean up the test client only on error try: await asyncio.wait_for(test_client.__aexit__(None, None, None), timeout=2) except (asyncio.TimeoutError, Exception): pass raise init_error except Exception as e: sse_error = e logger.debug(f"SSE failed: {e}") # Clean up the failed connection manager if connection_manager: try: await asyncio.wait_for(connection_manager.stop(), timeout=2) except (asyncio.TimeoutError, Exception): pass # Both MCP transports failed, try simple JSON-RPC HTTP as last resort # This is useful for custom MCP servers that don't implement proper MCP transports logger.debug(f"Attempting JSON-RPC HTTP fallback to: {self.base_url}") try: # Test JSON-RPC connection await self._try_jsonrpc_connection() self.transport_type = "JSON-RPC HTTP" self._use_jsonrpc = True logger.info(f"JSON-RPC HTTP transport selected for: {self.base_url}") return except Exception as jsonrpc_error: # All transports failed logger.error( f"All transport methods failed for {self.base_url}. 
" f"Streamable HTTP: {streamable_error}, SSE: {sse_error}, JSON-RPC: {jsonrpc_error}" ) # Raise the most relevant error - prefer the original streamable error raise streamable_error or sse_error or jsonrpc_error async def _try_jsonrpc_connection(self) -> None: """Test JSON-RPC HTTP connection by sending an initialize request.""" headers = {**self.headers, "Content-Type": "application/json"} async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout), headers=headers) as client: payload = { "jsonrpc": "2.0", "id": 1, "method": "initialize", "params": { "protocolVersion": "2024-11-05", "capabilities": {}, "clientInfo": {"name": "AnyTool", "version": "1.0.0"}, } } response = await client.post(self.base_url, json=payload) response.raise_for_status() data = response.json() # Check for JSON-RPC error if "error" in data: error = data["error"] raise RuntimeError(f"JSON-RPC error: {error.get('message', str(error))}") # Success - server supports JSON-RPC logger.debug(f"JSON-RPC test succeeded: {data.get('result', {})}") async def _after_connect(self) -> None: """Create ClientSession (or set up JSON-RPC client) and log success.""" if self._use_jsonrpc: # Set up JSON-RPC HTTP client headers = {**self.headers, "Content-Type": "application/json"} self._jsonrpc_client = httpx.AsyncClient( timeout=httpx.Timeout(self.timeout), headers=headers, ) logger.debug(f"JSON-RPC HTTP client set up for: {self.base_url}") else: # Skip creating ClientSession if _before_connect() already created one if self.client_session is None: await super()._after_connect() else: logger.debug("Reusing ClientSession from _before_connect()") logger.debug(f"Successfully connected to MCP implementation via {self.transport_type}: {self.base_url}") async def _before_disconnect(self) -> None: """Clean up resources before disconnection.""" # Clean up JSON-RPC client if used if self._jsonrpc_client: try: await self._jsonrpc_client.aclose() except Exception as e: logger.warning(f"Error closing JSON-RPC client: 
{e}") finally: self._jsonrpc_client = None # Call parent cleanup for MCP resources await super()._before_disconnect() @property def public_identifier(self) -> str: """Get the identifier for the connector.""" return {"type": self.transport_type, "base_url": self.base_url} # ===================== # JSON-RPC HTTP Methods # ===================== def _next_jsonrpc_id(self) -> int: """Get next JSON-RPC request ID.""" self._jsonrpc_request_id += 1 return self._jsonrpc_request_id async def _send_jsonrpc_request( self, method: str, params: Dict[str, Any] = None, max_retries: int = 3, retry_delay: float = 1.0, ) -> Any: """Send a JSON-RPC request and return the result. Args: method: The JSON-RPC method name (e.g., "tools/list", "tools/call") params: The method parameters max_retries: Maximum number of retries for transient errors (400, 503, etc.) retry_delay: Initial delay between retries (doubles each retry) Returns: The result field from the JSON-RPC response """ if not self._jsonrpc_client: raise RuntimeError("JSON-RPC client not initialized") last_error = None for attempt in range(max_retries): request_id = self._next_jsonrpc_id() payload = { "jsonrpc": "2.0", "id": request_id, "method": method, "params": params or {}, } logger.debug(f"Sending JSON-RPC request: {method} (id={request_id}, attempt {attempt + 1}/{max_retries})") try: response = await self._jsonrpc_client.post(self.base_url, json=payload) response.raise_for_status() data = response.json() if "error" in data: error = data["error"] error_msg = error.get("message", str(error)) raise RuntimeError(f"JSON-RPC error: {error_msg}") return data.get("result", {}) except httpx.HTTPStatusError as e: last_error = e status_code = e.response.status_code # Retry on 400 (Bad Request) and 5xx errors # 400 can happen when MCP server is temporarily not ready if status_code in (400, 500, 502, 503, 504) and attempt < max_retries - 1: delay = retry_delay * (2 ** attempt) logger.warning( f"HTTP {status_code} error on {method}, 
retrying in {delay:.1f}s " f"(attempt {attempt + 1}/{max_retries})" ) await asyncio.sleep(delay) continue raise RuntimeError(f"HTTP error: {status_code}") from e except httpx.RequestError as e: last_error = e # Retry on connection errors if attempt < max_retries - 1: delay = retry_delay * (2 ** attempt) logger.warning( f"Request error on {method}: {e}, retrying in {delay:.1f}s " f"(attempt {attempt + 1}/{max_retries})" ) await asyncio.sleep(delay) continue raise RuntimeError(f"Request error: {e}") from e # Should not reach here, but just in case raise RuntimeError(f"Max retries exceeded for {method}") from last_error def _parse_tools_from_json(self, tools_data: List[Dict]) -> List[Tool]: """Parse tool data into Tool objects.""" tools = [] for tool_dict in tools_data: try: tool = Tool( name=tool_dict.get("name", ""), description=tool_dict.get("description", ""), inputSchema=tool_dict.get("inputSchema", {}), ) tools.append(tool) except Exception as e: logger.warning(f"Failed to parse tool: {e}") return tools def _parse_resources_from_json(self, resources_data: List[Dict]) -> List[Resource]: """Parse resource data into Resource objects.""" resources = [] for res_dict in resources_data: try: resource = Resource( uri=res_dict.get("uri", ""), name=res_dict.get("name", ""), description=res_dict.get("description"), mimeType=res_dict.get("mimeType"), ) resources.append(resource) except Exception as e: logger.warning(f"Failed to parse resource: {e}") return resources def _parse_prompts_from_json(self, prompts_data: List[Dict]) -> List[Prompt]: """Parse prompt data into Prompt objects.""" prompts = [] for prompt_dict in prompts_data: try: prompt = Prompt( name=prompt_dict.get("name", ""), description=prompt_dict.get("description"), arguments=prompt_dict.get("arguments"), ) prompts.append(prompt) except Exception as e: logger.warning(f"Failed to parse prompt: {e}") return prompts # ===================== # Override MCP Methods for JSON-RPC Support # ===================== async 
def initialize(self) -> Dict[str, Any]: """Initialize the MCP session.""" if not self._use_jsonrpc: return await super().initialize() # JSON-RPC mode logger.debug("Initializing JSON-RPC HTTP MCP session") result = await self._send_jsonrpc_request("initialize", { "protocolVersion": "2024-11-05", "capabilities": {}, "clientInfo": {"name": "AnyTool", "version": "1.0.0"}, }) capabilities = result.get("capabilities", {}) # List tools if capabilities.get("tools"): try: tools_result = await self._send_jsonrpc_request("tools/list", {}) self._tools = self._parse_tools_from_json(tools_result.get("tools", [])) except Exception: self._tools = [] else: # Try anyway - some servers don't advertise capabilities correctly try: tools_result = await self._send_jsonrpc_request("tools/list", {}) self._tools = self._parse_tools_from_json(tools_result.get("tools", [])) except Exception: self._tools = [] # List resources if capabilities.get("resources"): try: resources_result = await self._send_jsonrpc_request("resources/list", {}) self._resources = self._parse_resources_from_json(resources_result.get("resources", [])) except Exception: self._resources = [] else: self._resources = [] # List prompts if capabilities.get("prompts"): try: prompts_result = await self._send_jsonrpc_request("prompts/list", {}) self._prompts = self._parse_prompts_from_json(prompts_result.get("prompts", [])) except Exception: self._prompts = [] else: self._prompts = [] logger.info( f"JSON-RPC HTTP MCP session initialized with {len(self._tools)} tools, " f"{len(self._resources)} resources, {len(self._prompts)} prompts" ) return result @property def is_connected(self) -> bool: """Check if the connector is connected.""" if self._use_jsonrpc: return self._connected and self._jsonrpc_client is not None return super().is_connected async def _ensure_connected(self) -> None: """Ensure the connector is connected.""" if self._use_jsonrpc: if not self._connected or not self._jsonrpc_client: raise RuntimeError("JSON-RPC HTTP 
connector is not connected") else: await super()._ensure_connected() async def list_tools(self) -> List[Tool]: """List all available tools.""" if not self._use_jsonrpc: return await super().list_tools() await self._ensure_connected() try: tools_result = await self._send_jsonrpc_request("tools/list", {}) self._tools = self._parse_tools_from_json(tools_result.get("tools", [])) return self._tools except Exception as e: logger.error(f"Error listing tools: {e}") return [] async def call_tool(self, name: str, arguments: Dict[str, Any]) -> CallToolResult: """Call an MCP tool.""" if not self._use_jsonrpc: return await super().call_tool(name, arguments) await self._ensure_connected() logger.debug(f"Calling tool '{name}' with arguments: {arguments}") result = await self._send_jsonrpc_request("tools/call", { "name": name, "arguments": arguments, }) # Parse the result into CallToolResult content = [] for item in result.get("content", []): item_type = item.get("type", "text") if item_type == "text": content.append(TextContent(type="text", text=item.get("text", ""))) elif item_type == "image": content.append(ImageContent( type="image", data=item.get("data", ""), mimeType=item.get("mimeType", "image/png"), )) elif item_type == "resource": content.append(EmbeddedResource( type="resource", resource=item.get("resource", {}), )) if not content and result: content.append(TextContent(type="text", text=str(result))) return CallToolResult( content=content, isError=result.get("isError", False), ) async def list_resources(self) -> List[Resource]: """List all available resources.""" if not self._use_jsonrpc: return await super().list_resources() await self._ensure_connected() try: resources_result = await self._send_jsonrpc_request("resources/list", {}) self._resources = self._parse_resources_from_json(resources_result.get("resources", [])) return self._resources except Exception as e: logger.error(f"Error listing resources: {e}") return [] async def read_resource(self, uri: str) -> 
ReadResourceResult: """Read a resource by URI.""" if not self._use_jsonrpc: return await super().read_resource(uri) await self._ensure_connected() result = await self._send_jsonrpc_request("resources/read", {"uri": uri}) return ReadResourceResult(**result) async def list_prompts(self) -> List[Prompt]: """List all available prompts.""" if not self._use_jsonrpc: return await super().list_prompts() await self._ensure_connected() try: prompts_result = await self._send_jsonrpc_request("prompts/list", {}) self._prompts = self._parse_prompts_from_json(prompts_result.get("prompts", [])) return self._prompts except Exception as e: logger.error(f"Error listing prompts: {e}") return [] async def get_prompt(self, name: str, arguments: Dict[str, Any] | None = None) -> GetPromptResult: """Get a prompt by name.""" if not self._use_jsonrpc: return await super().get_prompt(name, arguments) await self._ensure_connected() result = await self._send_jsonrpc_request("prompts/get", { "name": name, "arguments": arguments or {}, }) return GetPromptResult(**result) async def request(self, method: str, params: Dict[str, Any] | None = None) -> Any: """Send a raw request to the MCP implementation.""" if not self._use_jsonrpc: return await super().request(method, params) await self._ensure_connected() return await self._send_jsonrpc_request(method, params or {}) async def invoke(self, name: str, params: Dict[str, Any]) -> Any: """Invoke a tool or special method.""" if not self._use_jsonrpc: return await super().invoke(name, params) await self._ensure_connected() if not name.startswith("__"): return await self.call_tool(name, params) if name == "__read_resource__": return await self.read_resource(params["uri"]) if name == "__list_prompts__": return await self.list_prompts() if name == "__get_prompt__": return await self.get_prompt(params["name"], params.get("args")) raise ValueError(f"Unsupported MCP invoke name: {name}") ================================================ FILE: 
anytool/grounding/backends/mcp/transport/connectors/sandbox.py ================================================ """ Sandbox connector for MCP implementations. This module provides a connector for communicating with MCP implementations that are executed inside a sandbox environment (supports any BaseSandbox implementation). """ import asyncio import sys import time import aiohttp from mcp import ClientSession from anytool.utils.logging import Logger from anytool.grounding.backends.mcp.transport.task_managers import SseConnectionManager from anytool.grounding.core.security import BaseSandbox from anytool.grounding.backends.mcp.transport.connectors.base import MCPBaseConnector logger = Logger.get_logger(__name__) class SandboxConnector(MCPBaseConnector): """Connector for MCP implementations running in a sandbox environment. This connector runs a user-defined stdio command within a sandbox environment through a BaseSandbox implementation (e.g., E2BSandbox), potentially wrapped by a utility like 'supergateway' to expose its stdio. """ def __init__( self, sandbox: BaseSandbox, command: str, args: list[str], env: dict[str, str] | None = None, supergateway_command: str = "npx -y supergateway", port: int = 3000, timeout: float = 5, sse_read_timeout: float = 60 * 5, ): """Initialize a new sandbox connector. Args: sandbox: A BaseSandbox implementation (e.g., E2BSandbox) to run commands in. command: The user's MCP server command to execute in the sandbox. args: Command line arguments for the user's MCP server command. env: Environment variables for the user's MCP server command. supergateway_command: Command to run supergateway (default: "npx -y supergateway"). port: Port number for the sandbox server (default: 3000). timeout: Timeout for the sandbox process in seconds. sse_read_timeout: Timeout for the SSE connection in seconds. 
""" # Store user command configuration self.user_command = command self.user_args = args or [] self.user_env = env or {} self.port = port # Create a placeholder connection manager (will be set up in connect()) # We need the sandbox to start first to get the base_url, so we can't create # the real SseConnectionManager until connect() is called from anytool.grounding.core.transport.task_managers import PlaceholderConnectionManager connection_manager = PlaceholderConnectionManager() super().__init__(connection_manager) # Sandbox configuration self._sandbox = sandbox self.supergateway_cmd_parts = supergateway_command # Runtime state self.process = None self.client_session: ClientSession | None = None self.errlog = sys.stderr self.base_url: str | None = None self._connected = False self._connection_manager: SseConnectionManager | None = None # SSE connection parameters self.headers = {} self.timeout = timeout self.sse_read_timeout = sse_read_timeout self.stdout_lines: list[str] = [] self.stderr_lines: list[str] = [] self._server_ready = asyncio.Event() def _handle_stdout(self, data: str) -> None: """Handle stdout data from the sandbox process.""" self.stdout_lines.append(data) logger.debug(f"[SANDBOX STDOUT] {data}", end="", flush=True) def _handle_stderr(self, data: str) -> None: """Handle stderr data from the sandbox process.""" self.stderr_lines.append(data) logger.debug(f"[SANDBOX STDERR] {data}", file=self.errlog, end="", flush=True) async def wait_for_server_response(self, base_url: str, timeout: int = 30) -> bool: """Wait for the server to respond to HTTP requests. 
Args: base_url: The base URL to check for server readiness timeout: Maximum time to wait in seconds Returns: True if server is responding, raises TimeoutError otherwise """ logger.info(f"Waiting for server at {base_url} to respond...") sys.stdout.flush() start_time = time.time() ping_url = f"{base_url}/sse" # Try to connect to the server while time.time() - start_time < timeout: try: async with aiohttp.ClientSession() as session: try: # First try the endpoint async with session.get(ping_url, timeout=2) as response: if response.status == 200: elapsed = time.time() - start_time logger.info(f"Server is ready! SSE endpoint responded with 200 after {elapsed:.1f}s") return True except Exception: # If sse endpoint doesn't work, try the base URL async with session.get(base_url, timeout=2) as response: if response.status < 500: # Accept any non-server error elapsed = time.time() - start_time logger.info( f"Server is ready! Base URL responded with {response.status} after {elapsed:.1f}s" ) return True except Exception: # Wait a bit before trying again await asyncio.sleep(0.5) continue # If we get here, the request failed await asyncio.sleep(0.5) # Log status every 5 seconds elapsed = time.time() - start_time if int(elapsed) % 5 == 0: logger.info(f"Still waiting for server to respond... 
({elapsed:.1f}s elapsed)") sys.stdout.flush() # If we get here, we timed out raise TimeoutError(f"Timeout waiting for server to respond (waited {timeout} seconds)") async def _before_connect(self) -> None: """Set up the sandbox and prepare the connection manager.""" logger.debug("Connecting to MCP implementation in sandbox") # Start the sandbox if not already active if not self._sandbox.is_active: logger.debug("Starting sandbox...") await self._sandbox.start() # Get the host for the sandbox # Note: This assumes the sandbox implementation has a get_host method # For E2BSandbox, this is available host = self._sandbox.get_host(self.port) self.base_url = f"https://{host}".rstrip("/") # Append command with args command = f"{self.user_command} {' '.join(self.user_args)}" # Construct the full command with supergateway full_command = f'{self.supergateway_cmd_parts} \ --base-url {self.base_url} \ --port {self.port} \ --cors \ --stdio "{command}"' logger.debug(f"Full command: {full_command}") # Execute the command in the sandbox self.process = await self._sandbox.execute_safe( full_command, envs=self.user_env, timeout=1000 * 60 * 10, # 10 minutes timeout background=True, on_stdout=self._handle_stdout, on_stderr=self._handle_stderr, ) # Wait for the server to be ready await self.wait_for_server_response(self.base_url, timeout=30) logger.debug("Initializing connection manager...") # Create the SSE connection URL sse_url = f"{self.base_url}/sse" # Create and set up the connection manager self._connection_manager = SseConnectionManager(sse_url, self.headers, self.timeout, self.sse_read_timeout) async def _after_connect(self) -> None: """Create ClientSession and log success.""" await super()._after_connect() logger.debug(f"Successfully connected to MCP implementation via HTTP/SSE in sandbox: {self.base_url}") async def _before_disconnect(self) -> None: """Clean up sandbox-specific resources before disconnection.""" logger.debug("Cleaning up sandbox resources") # Stop the sandbox 
(which will clean up processes) if self._sandbox and self._sandbox.is_active: try: logger.debug("Stopping sandbox instance") await self._sandbox.stop() logger.debug("Sandbox instance stopped successfully") except Exception as e: logger.warning(f"Error stopping sandbox: {e}") self.process = None # Call the parent method to clean up MCP resources await super()._before_disconnect() # Clear any collected output self.stdout_lines = [] self.stderr_lines = [] self.base_url = None async def _cleanup_on_connect_failure(self) -> None: """Clean up sandbox resources on connection failure.""" # Stop the sandbox if it was started if self._sandbox and self._sandbox.is_active: try: await self._sandbox.stop() except Exception as e: logger.warning(f"Error stopping sandbox during cleanup: {e}") self.process = None self.stdout_lines = [] self.stderr_lines = [] self.base_url = None # Call parent cleanup await super()._cleanup_on_connect_failure() @property def sandbox(self) -> BaseSandbox: """Get the underlying sandbox instance.""" return self._sandbox @property def public_identifier(self) -> str: """Get the identifier for the connector.""" return {"type": "sandbox", "command": self.user_command, "args": self.user_args} ================================================ FILE: anytool/grounding/backends/mcp/transport/connectors/stdio.py ================================================ """ StdIO connector for MCP implementations. This module provides a connector for communicating with MCP implementations through the standard input/output streams. """ import sys from mcp import ClientSession, StdioServerParameters from anytool.utils.logging import Logger from ..task_managers import StdioConnectionManager from .base import MCPBaseConnector logger = Logger.get_logger(__name__) class StdioConnector(MCPBaseConnector): """Connector for MCP implementations using stdio transport. This connector uses the stdio transport to communicate with MCP implementations that are executed as child processes. 
It uses a connection manager to handle the proper lifecycle management of the stdio client. """ def __init__( self, command: str = "npx", args: list[str] | None = None, env: dict[str, str] | None = None, errlog=None, ): """Initialize a new stdio connector. Args: command: The command to execute. args: Optional command line arguments. env: Optional environment variables. errlog: Stream to write error output to (defaults to filtered stderr). StdioConnectionManager will wrap this to filter harmless errors. """ self.command = command self.args = args or [] # Ensure args is never None # Ensure env is not None and add settings to suppress non-JSON output from servers self.env = env or {} # Add environment variables to encourage MCP servers to suppress non-JSON output # Many Node.js-based servers respect NODE_ENV=production if "NODE_ENV" not in self.env: self.env["NODE_ENV"] = "production" # Add flag to suppress informational messages (some servers respect this) if "MCP_SILENT" not in self.env: self.env["MCP_SILENT"] = "true" self.errlog = errlog # Create server parameters and connection manager # StdioConnectionManager will wrap errlog in FilteredStderrWrapper server_params = StdioServerParameters(command=self.command, args=self.args, env=self.env) connection_manager = StdioConnectionManager(server_params, self.errlog) super().__init__(connection_manager) async def _before_connect(self) -> None: """Log connection attempt.""" logger.debug(f"Connecting to MCP implementation: {self.command}") async def _after_connect(self) -> None: """Create ClientSession and log success.""" # Call parent's _after_connect to create the ClientSession await super()._after_connect() logger.debug(f"Successfully connected to MCP implementation: {self.command}") @property def public_identifier(self) -> dict[str, str]: return {"type": "stdio", "command&args": f"{self.command} {' '.join(self.args)}"} ================================================ FILE: 
anytool/grounding/backends/mcp/transport/connectors/utils.py ================================================ from typing import Any def is_stdio_server(server_config: dict[str, Any]) -> bool: """Check if the server configuration is for a stdio server. Args: server_config: The server configuration section Returns: True if the server is a stdio server, False otherwise """ return "command" in server_config and "args" in server_config ================================================ FILE: anytool/grounding/backends/mcp/transport/connectors/websocket.py ================================================ """ WebSocket connector for MCP implementations. This module provides a connector for communicating with MCP implementations through WebSocket connections. """ import asyncio import json import uuid from typing import Any from mcp.types import Tool from websockets import ClientConnection from anytool.utils.logging import Logger from anytool.grounding.core.transport.task_managers.base import BaseConnectionManager from ..task_managers import WebSocketConnectionManager from .base import MCPBaseConnector logger = Logger.get_logger(__name__) class WebSocketConnector(MCPBaseConnector): """Connector for MCP implementations using WebSocket transport. This connector uses WebSockets to communicate with remote MCP implementations, using a connection manager to handle the proper lifecycle management. """ def __init__( self, url: str, auth_token: str | None = None, headers: dict[str, str] | None = None, ): """Initialize a new WebSocket connector. Args: url: The WebSocket URL to connect to. auth_token: Optional authentication token. headers: Optional additional headers. 
""" self.url = url self.auth_token = auth_token self.headers = headers or {} if auth_token: self.headers["Authorization"] = f"Bearer {auth_token}" self.ws: ClientConnection | None = None self._receiver_task: asyncio.Task | None = None self.pending_requests: dict[str, asyncio.Future] = {} self._tools: list[Tool] | None = None # Create connection manager with actual parameters connection_manager = WebSocketConnectionManager(self.url, self.headers) super().__init__(connection_manager) self._connected = False async def _get_streams_from_connection(self): """WebSocket doesn't use streams, return None to skip ClientSession creation.""" return None async def _after_connect(self) -> None: """Set up WebSocket-specific resources after connection. WebSocket doesn't use ClientSession, so we skip the parent's implementation and set up WebSocket-specific resources instead. """ # Store the WebSocket connection self.ws = self._connection # Start the message receiver task self._receiver_task = asyncio.create_task(self._receive_messages(), name="websocket_receiver_task") logger.debug(f"Successfully connected to MCP implementation via WebSocket: {self.url}") async def _receive_messages(self) -> None: """Continuously receive and process messages from the WebSocket.""" if not self.ws: raise RuntimeError("WebSocket is not connected") try: async for message in self.ws: # Parse the message data = json.loads(message) # Check if this is a response to a pending request request_id = data.get("id") if request_id and request_id in self.pending_requests: future = self.pending_requests.pop(request_id) if "result" in data: future.set_result(data["result"]) elif "error" in data: future.set_exception(Exception(data["error"])) logger.debug(f"Received response for request {request_id}") else: logger.debug(f"Received message: {data}") except Exception as e: logger.error(f"Error in WebSocket message receiver: {e}") # If the websocket connection was closed or errored, # reject all pending requests for 
future in self.pending_requests.values(): if not future.done(): future.set_exception(e) async def _before_disconnect(self) -> None: """Clean up WebSocket-specific resources before disconnection.""" errors = [] # First cancel the receiver task if self._receiver_task and not self._receiver_task.done(): try: logger.debug("Cancelling WebSocket receiver task") self._receiver_task.cancel() try: await self._receiver_task except asyncio.CancelledError: logger.debug("WebSocket receiver task cancelled successfully") except Exception as e: logger.warning(f"Error during WebSocket receiver task cancellation: {e}") except Exception as e: error_msg = f"Error cancelling WebSocket receiver task: {e}" logger.warning(error_msg) errors.append(error_msg) finally: self._receiver_task = None # Reject any pending requests if self.pending_requests: logger.debug(f"Rejecting {len(self.pending_requests)} pending requests") for future in self.pending_requests.values(): if not future.done(): future.set_exception(ConnectionError("WebSocket disconnected")) self.pending_requests.clear() # Reset WebSocket and tools self.ws = None self._tools = None if errors: logger.warning(f"Encountered {len(errors)} errors during WebSocket resource cleanup") async def _cleanup_on_connect_failure(self) -> None: """Clean up WebSocket resources on connection failure.""" # Cancel receiver task if it was started if self._receiver_task and not self._receiver_task.done(): try: self._receiver_task.cancel() await self._receiver_task except asyncio.CancelledError: pass except Exception: pass finally: self._receiver_task = None # Reject pending requests for future in self.pending_requests.values(): if not future.done(): future.set_exception(ConnectionError("Connection failed")) self.pending_requests.clear() # Call parent cleanup await super()._cleanup_on_connect_failure() self.ws = None async def _send_request(self, method: str, params: dict[str, Any] | None = None) -> Any: """Send a request and wait for a response.""" if 
not self.ws: raise RuntimeError("WebSocket is not connected") # Create a request ID request_id = str(uuid.uuid4()) # Create a future to receive the response future = asyncio.Future() self.pending_requests[request_id] = future # Send the request await self.ws.send(json.dumps({"id": request_id, "method": method, "params": params or {}})) logger.debug(f"Sent request {request_id} method: {method}") # Wait for the response try: return await future except Exception as e: # Remove the request from pending requests self.pending_requests.pop(request_id, None) logger.error(f"Error waiting for response to request {request_id}: {e}") raise async def initialize(self) -> dict[str, Any]: """Initialize the MCP session and return session information.""" logger.debug("Initializing MCP session") result = await self._send_request("initialize") # Get available tools tools_result = await self.list_tools() self._tools = [Tool(**tool) for tool in tools_result] logger.debug(f"MCP session initialized with {len(self._tools)} tools") return result async def list_tools(self) -> list[dict[str, Any]]: """List all available tools from the MCP implementation.""" logger.debug("Listing tools") result = await self._send_request("tools/list") return result.get("tools", []) @property def tools(self) -> list[Tool]: """Get the list of available tools.""" if not self._tools: raise RuntimeError("MCP client is not initialized") return self._tools async def call_tool(self, name: str, arguments: dict[str, Any]) -> Any: """Call an MCP tool with the given arguments.""" logger.debug(f"Calling tool '{name}' with arguments: {arguments}") return await self._send_request("tools/call", {"name": name, "arguments": arguments}) async def list_resources(self) -> list[dict[str, Any]]: """List all available resources from the MCP implementation.""" logger.debug("Listing resources") result = await self._send_request("resources/list") return result async def read_resource(self, uri: str) -> tuple[bytes, str]: """Read a 
resource by URI.""" logger.debug(f"Reading resource: {uri}") result = await self._send_request("resources/read", {"uri": uri}) return result.get("content", b""), result.get("mimeType", "") async def request(self, method: str, params: dict[str, Any] | None = None) -> Any: """Send a raw request to the MCP implementation.""" logger.debug(f"Sending request: {method} with params: {params}") return await self._send_request(method, params) @property def public_identifier(self) -> str: """Get the identifier for the connector.""" return {"type": "websocket", "url": self.url} ================================================ FILE: anytool/grounding/backends/mcp/transport/task_managers/__init__.py ================================================ """ Connectors for various MCP transports. This module provides interfaces for connecting to MCP implementations through different transport mechanisms. """ from .sse import SseConnectionManager from .stdio import StdioConnectionManager from .streamable_http import StreamableHttpConnectionManager from .websocket import WebSocketConnectionManager __all__ = [ "StdioConnectionManager", "WebSocketConnectionManager", "SseConnectionManager", "StreamableHttpConnectionManager", ] ================================================ FILE: anytool/grounding/backends/mcp/transport/task_managers/sse.py ================================================ """ SSE connection management for MCP implementations. This module provides a connection manager for SSE-based MCP connections that ensures proper task isolation and resource cleanup. """ from typing import Any, Tuple from mcp.client.sse import sse_client from anytool.utils.logging import Logger from anytool.grounding.core.transport.task_managers import ( AsyncContextConnectionManager, ) logger = Logger.get_logger(__name__) class SseConnectionManager(AsyncContextConnectionManager[Tuple[Any, Any], ...]): """Connection manager for SSE-based MCP connections. 
This class handles the proper task isolation for sse_client context managers to prevent the "cancel scope in different task" error. It runs the sse_client in a dedicated task and manages its lifecycle. """ def __init__( self, url: str, headers: dict[str, str] | None = None, timeout: float = 5, sse_read_timeout: float = 60 * 5, ): """Initialize a new SSE connection manager. Args: url: The SSE endpoint URL headers: Optional HTTP headers timeout: Timeout for HTTP operations in seconds sse_read_timeout: Timeout for SSE read operations in seconds """ super().__init__( sse_client, url=url, headers=headers or {}, timeout=timeout, sse_read_timeout=sse_read_timeout, ) self.url = url self.headers = headers or {} logger.debug("SseConnectionManager init url=%s", url) ================================================ FILE: anytool/grounding/backends/mcp/transport/task_managers/stdio.py ================================================ """ StdIO connection management for MCP implementations. This module provides a connection manager for stdio-based MCP connections that ensures proper task isolation and resource cleanup. """ import asyncio import io import logging import sys from typing import Any, TextIO, Tuple from mcp import StdioServerParameters from mcp.client.stdio import stdio_client from anytool.utils.logging import Logger from anytool.grounding.core.transport.task_managers import ( AsyncContextConnectionManager, ) logger = Logger.get_logger(__name__) class FilteredStderrWrapper(io.TextIOBase): """Wrapper for stderr that filters out harmless MCP server shutdown messages. This wrapper suppresses error messages from MCP servers during shutdown that are harmless but create noise in the logs. """ def __init__(self, wrapped_stream: TextIO): """Initialize the wrapper. 
Args: wrapped_stream: The underlying stderr stream """ self._stream = wrapped_stream self._buffer = "" self._in_traceback = False self._traceback_lines = [] self._in_rich_traceback = False # Track rich-formatted tracebacks self._rich_traceback_needs_error_line = False # After ╰, need one more line def write(self, s: str) -> int: """Write to stderr, filtering out harmless error messages. Args: s: The string to write Returns: Number of characters written """ # Buffer the input for line-by-line processing self._buffer += s # Process complete lines while '\n' in self._buffer: line, self._buffer = self._buffer.split('\n', 1) self._process_line(line + '\n') return len(s) def _process_line(self, line: str): """Process a single line and decide whether to output it.""" # Detect start of traceback or exception group if line.lstrip().startswith(("╭", "┏")): self._in_traceback = True self._in_rich_traceback = True self._rich_traceback_needs_error_line = False self._traceback_lines = [line] return if (line.strip().startswith('Traceback (most recent call last)') or line.strip().startswith('Exception Group Traceback (most recent call last)') or line.strip().startswith('BaseExceptionGroup:') or line.strip().startswith('ExceptionGroup:')): self._in_traceback = True self._traceback_lines = [line] self._in_rich_traceback = False self._rich_traceback_needs_error_line = False return # Collect traceback lines if self._in_traceback: self._traceback_lines.append(line) # If not in rich traceback mode, but current line contains rich border characters, switch to rich mode if not self._in_rich_traceback and any(ch in line for ch in ("╭", "┏")): self._in_rich_traceback = True # Check for end of rich-formatted traceback (line with ╰) if self._in_rich_traceback and '╰' in line: # Rich traceback box ended, but we need to collect the error line that follows self._rich_traceback_needs_error_line = True return # If we just ended a rich traceback, this should be the error line if 
self._rich_traceback_needs_error_line: # Now we have the complete rich traceback including the error line if self._is_harmless_error(): logger.debug(f"Suppressed harmless rich-formatted MCP server error") else: # Output the full traceback for tb_line in self._traceback_lines: self._stream.write(tb_line) self._stream.flush() # Reset traceback collection self._in_traceback = False self._in_rich_traceback = False self._rich_traceback_needs_error_line = False self._traceback_lines = [] return # For exception groups, we need to collect more lines # Check if we've collected enough to determine if it's harmless if len(self._traceback_lines) > 5 and not self._in_rich_traceback: # Check periodically if this is a harmless error if self._is_harmless_error(): # Suppress this traceback logger.debug(f"Suppressed harmless MCP server shutdown error") self._in_traceback = False self._in_rich_traceback = False self._rich_traceback_needs_error_line = False self._traceback_lines = [] return # Check if this is the error line (last line of regular traceback) # But not for rich tracebacks which use box characters # A final traceback line is typically unindented and contains "ErrorType: message" if not self._in_rich_traceback and line and not line[0].isspace() and ':' in line: # Check if this is a harmless cleanup error if self._is_harmless_error(): # Suppress this traceback logger.debug(f"Suppressed harmless MCP server shutdown error") else: # Output the full traceback for tb_line in self._traceback_lines: self._stream.write(tb_line) self._stream.flush() # Reset traceback collection self._in_traceback = False self._in_rich_traceback = False self._rich_traceback_needs_error_line = False self._traceback_lines = [] return # If we've collected too many lines without finding the end, output and reset if len(self._traceback_lines) > 100: # Output what we have for tb_line in self._traceback_lines: self._stream.write(tb_line) self._stream.flush() self._in_traceback = False 
def flush(self):
    """Flush any remaining buffered content and the underlying stream.

    Steps, in order:
      1. A pending partial line (text without a trailing newline) is run
         through ``_process_line`` as if it were complete.
      2. Lines of a traceback that is still being collected are written to
         the wrapped stream unfiltered.
      3. The traceback-tracking flags are reset.
      4. The wrapped stream is flushed.
    """
    if self._buffer:
        # Clear the buffer *before* processing: _process_line may log, and a
        # log handler attached to this wrapper would call flush() again and
        # re-process the same text if the buffer were still populated.
        pending, self._buffer = self._buffer, ""
        self._process_line(pending)

    if self._traceback_lines:
        # Flush incomplete traceback
        for line in self._traceback_lines:
            self._stream.write(line)
        self._traceback_lines = []

    # The collected lines are gone, so the "inside a traceback" state must go
    # too. Otherwise later ordinary lines are appended to an empty traceback
    # and withheld until something that looks like a final error line arrives.
    self._in_traceback = False
    self._in_rich_traceback = False
    self._rich_traceback_needs_error_line = False

    self._stream.flush()
@property
def closed(self) -> bool:
    """Check if the stream is closed.

    Returns the wrapped stream's ``closed`` attribute. A wrapped object
    that does not expose ``closed`` is reported as open (``False``), the
    same tolerance ``fileno`` applies to streams without a ``fileno``.
    """
    return getattr(self._stream, 'closed', False)
asyncio.get_event_loop().create_future() async def _runner(): # Runs in its *own* task (same task for enter/exit) try: async with stdio_client(self.server_params, self.errlog) as conn: # Pass connection back to the caller if not self._conn_future.done(): self._conn_future.set_result(conn) # Wait until close is requested await self._stop_event.wait() finally: # Make sure the future is set even on error so awaiters don’t hang if not self._conn_future.done(): self._conn_future.set_exception(RuntimeError("Connection failed")) # Start background runner if not already active if self._runner_task is None or self._runner_task.done(): self._runner_task = asyncio.create_task(_runner(), name="stdio_client_runner") # Wait for the connection tuple from the future conn: Tuple[Any, Any] = await self._conn_future # type: ignore return conn async def _close_connection(self) -> None: """Request the background task to exit its context and wait for it.""" try: # Restore original logging configuration *before* shutdown self._restore_mcp_logging() # Signal the runner to exit its context manager if self._stop_event and not self._stop_event.is_set(): self._stop_event.set() # Await the runner task so that __aexit__ executes in *its* task if self._runner_task: try: await asyncio.wait_for(self._runner_task, timeout=2.0) except asyncio.TimeoutError: logger.warning("Timeout while waiting for stdio_client to shut down") finally: # Clean up helpers so next connect() creates new ones self._runner_task = None self._stop_event = None self._conn_future = None def _suppress_mcp_json_errors(self): """Suppress MCP SDK's JSON parsing error logs. The MCP SDK logs errors when it receives non-JSON messages from servers. These are harmless (the SDK continues working), so we filter them out. 
""" mcp_logger = logging.getLogger("mcp.client.stdio") class JSONErrorFilter(logging.Filter): """Filter out JSON parsing errors from MCP SDK.""" def filter(self, record): # Suppress "Failed to parse JSONRPC message" errors if "Failed to parse JSONRPC message" in str(record.msg): return False return True self._mcp_logger_filter = JSONErrorFilter() mcp_logger.addFilter(self._mcp_logger_filter) def _restore_mcp_logging(self): """Restore MCP SDK logging to normal.""" if self._mcp_logger_filter: mcp_logger = logging.getLogger("mcp.client.stdio") mcp_logger.removeFilter(self._mcp_logger_filter) self._mcp_logger_filter = None if not isinstance(sys.stderr, FilteredStderrWrapper): sys.stderr = FilteredStderrWrapper(sys.stderr) logger.debug("Applied global FilteredStderrWrapper to sys.stderr") ================================================ FILE: anytool/grounding/backends/mcp/transport/task_managers/streamable_http.py ================================================ """ Streamable HTTP connection management for MCP implementations. This module provides a connection manager for streamable HTTP-based MCP connections that ensures proper task isolation and resource cleanup. """ from datetime import timedelta from typing import Any, Tuple from contextlib import asynccontextmanager from mcp.client.streamable_http import streamablehttp_client from anytool.utils.logging import Logger from anytool.grounding.core.transport.task_managers import ( AsyncContextConnectionManager, ) logger = Logger.get_logger(__name__) def _make_shim(): """ Create a shim that wraps streamablehttp_client with improved error handling. 
""" @asynccontextmanager async def _shim(**kw): client_streams = None ctx_manager = None try: # Enter the context - this may raise ExceptionGroup during concurrent init ctx_manager = streamablehttp_client(**kw) try: r, w, _sid_cb = await ctx_manager.__aenter__() client_streams = (r, w) except Exception as conn_error: # Handle connection errors during __aenter__ error_msg = str(conn_error).lower() if "unhandled errors in a taskgroup" in error_msg: logger.debug(f"TaskGroup race condition during connection: {type(conn_error).__name__}") # Clean up and re-raise to trigger retry if ctx_manager: try: await ctx_manager.__aexit__(None, None, None) except Exception: pass # Ignore cleanup errors raise else: # Other connection errors - log and re-raise logger.warning(f"Connection error: {conn_error}") raise # Yield to caller yield client_streams except GeneratorExit: # Normal generator exit - this happens during cleanup logger.debug("StreamableHTTP generator exit (normal cleanup)") finally: # Always try to exit the context manager if ctx_manager is not None: try: await ctx_manager.__aexit__(None, None, None) except (GeneratorExit, RuntimeError, OSError, Exception) as e: # Cleanup errors are expected during concurrent shutdown # Log at debug level and suppress error_type = type(e).__name__ if "ExceptionGroup" in error_type or "TaskGroup" in str(e): logger.debug(f"Benign TaskGroup cleanup error: {error_type}") else: logger.debug(f"Benign cleanup error: {error_type}") return _shim class StreamableHttpConnectionManager( AsyncContextConnectionManager[Tuple[Any, Any], ...] ): """ MCP Streamable-HTTP connection manager based on the generic AsyncContextConnectionManager. Extra session-id callback returned by the SDK is discarded by the shim above. 
def __init__(
    self,
    url: str,
    headers: dict[str, str] | None = None,
    timeout: float = 5,
    read_timeout: float = 60 * 5,
):
    """Set up a streamable-HTTP connection manager for ``url``.

    Args:
        url: Endpoint handed to the client as ``url``
        headers: Optional HTTP headers; an empty dict is used when omitted
        timeout: Seconds, passed on as a ``timedelta`` named ``timeout``
        read_timeout: Seconds, passed on as a ``timedelta`` named
            ``sse_read_timeout``
    """
    client_factory = _make_shim()
    client_kwargs = {
        "url": url,
        "headers": headers or {},
        "timeout": timedelta(seconds=timeout),
        "sse_read_timeout": timedelta(seconds=read_timeout),
    }
    super().__init__(client_factory, **client_kwargs)

    self.url = url
    # Evaluated separately from the kwargs above, as before
    self.headers = headers or {}
    logger.debug("StreamableHttpConnectionManager init url=%s", url)
def __init__(self, url: str, headers: dict[str, str] | None = None):
    """Set up a WebSocket connection manager for ``url``.

    Args:
        url: WebSocket endpoint, forwarded to ``websocket_client``
        headers: Stored on the instance only; they are not forwarded to
            ``websocket_client``
    """
    # Only the URL reaches the client. The current MCP websocket_client
    # implementation doesn't support headers; this would need updating
    # once MCP supports them.
    super().__init__(websocket_client, url)

    self.headers = headers if headers else {}
    self.url = url
    logger.debug("WebSocketConnectionManager init url=%s", url)
async def create_session(self, session_config: SessionConfig) -> ShellSession:
    """Return the provider's single shell session, building it on first use.

    The session is always registered under ``DEFAULT_SID``; an existing
    entry is returned as-is. Otherwise a connector is chosen from the
    shell backend's ``mode`` setting ("local" when the attribute is
    missing), a ``ShellSession`` is built around it, initialized and cached.

    Args:
        session_config: Only its ``connection_params`` are read, and only in
            server mode, to override ``vm_ip`` and ``port``

    Returns:
        The cached or newly created session
    """
    session_key = self.DEFAULT_SID
    if session_key in self._sessions:
        return self._sessions[session_key]

    # Load shell backend configuration
    backend_cfg = get_config().get_backend_config("shell")

    if getattr(backend_cfg, "mode", "local") != "local":
        # ---------- SERVER MODE ----------
        # Connect to a running local_server via HTTP.
        logger.info("Shell backend using SERVER mode (connecting to local_server)")
        server_cfg = get_local_server_config()
        fallback_port = server_cfg.get('port', backend_cfg.default_port)
        params = session_config.connection_params
        host = get_config_value(params, "vm_ip", server_cfg['host'])
        port = get_config_value(params, "port", fallback_port)
        connector = ShellConnector(
            vm_ip=host,
            port=port,
            retry_times=backend_cfg.max_retries,
            retry_interval=backend_cfg.retry_interval,
            security_manager=self.security_manager,
        )
    else:
        # ---------- LOCAL MODE ----------
        # Execute scripts directly via subprocess, no server required.
        logger.info("Shell backend using LOCAL mode (no server required)")
        connector = LocalShellConnector(
            retry_times=backend_cfg.max_retries,
            retry_interval=backend_cfg.retry_interval,
            security_manager=self.security_manager,
        )

    # Create session with config parameters
    new_session = ShellSession(
        connector=connector,
        session_id=session_key,
        security_manager=self.security_manager,
        default_working_dir=backend_cfg.working_dir,
        default_env=backend_cfg.env,
        default_conda_env=backend_cfg.conda_env,
    )
    await new_session.initialize()

    self._sessions[session_key] = new_session
    return new_session
class PythonScriptTool(BaseTool):
    """Internal helper tool that runs Python code through the session's connector."""

    _name = "_python_exec"
    _description = "Internal helper: run python code."

    def __init__(self, session: "ShellSession", default_working_dir: str = None, default_env: dict = None, default_conda_env: str = None):
        """Remember the owning session and the defaults applied to each run.

        Args:
            session: Session whose connector executes the code
            default_working_dir: Working directory used when a call gives none
            default_env: Environment variables merged into every call
            default_conda_env: Conda environment used when a call gives none
        """
        self._session = session
        self._default_working_dir = default_working_dir
        self._default_env = default_env or {}
        self._default_conda_env = default_conda_env
        super().__init__()

    async def _arun(self, code: str, timeout: int = 90, working_dir: str | None = None, env: dict | None = None, conda_env: str | None = None):
        """Run ``code`` via the connector's ``run_python_script``.

        Falsy per-call values fall back to the defaults given at
        construction. Environment variables are merged, with per-call
        entries overriding defaults; an empty result is sent as ``None``.

        Returns:
            Whatever the connector returns
        """
        merged_env = dict(self._default_env)
        merged_env.update(env or {})

        run_kwargs = {
            "timeout": timeout,
            "working_dir": working_dir or self._default_working_dir,
            "env": merged_env or None,
            "conda_env": conda_env or self._default_conda_env,
        }
        return await self._session.connector.run_python_script(code, **run_kwargs)
def __init__(self, session: "ShellSession", default_working_dir: str = None, default_env: dict = None, default_conda_env: str = None): self._session = session self._default_working_dir = default_working_dir self._default_env = default_env or {} self._default_conda_env = default_conda_env super().__init__() async def _arun(self, script: str, timeout: int = 30, working_dir: str | None = None, env: dict | None = None, conda_env: str | None = None): # Use provided params, or fall back to session defaults effective_working_dir = working_dir or self._default_working_dir effective_env = {**self._default_env, **(env or {})} # Merge default and provided env effective_conda_env = conda_env or self._default_conda_env return await self._session.connector.run_bash_script( script, timeout=timeout, working_dir=effective_working_dir, env=effective_env if effective_env else None, conda_env=effective_conda_env ) class ShellAgentTool(BaseTool): _name = "shell_agent" _description = """Execute commands or scripts directly in the computer's terminal. This tool uses an internal agent that will write and run Python or Bash code to accomplish tasks or inspect the current system state. The internal agent will automatically retry and fix errors when possible. 
Use this tool when you need to: - Execute any terminal-based task that requires code - Check the current environment (files, processes, system info) - Run calculations or data processing - Install packages or modify system settings The tool will keep trying until the task succeeds or determines it cannot be completed.""" backend_type = BackendType.SHELL _CODE_RGX = re.compile( r"```(?Ppython|py|bash|shell|sh)[^\n]*\n(?P.*?)```", re.S | re.I, ) def __init__( self, session: "ShellSession", client_password: str = "", max_steps: int = 5, security_manager: SecurityPolicyManager = None, default_working_dir: str = None, default_env: dict = None, default_conda_env: str = None ): self._session = session self._llm = LLMClient() self.client_password = client_password self.max_steps = max_steps self._system_info = None self.security_manager = security_manager self._default_working_dir = default_working_dir self._default_env = default_env or {} self._default_conda_env = default_conda_env self._py_tool = PythonScriptTool(session, default_working_dir=default_working_dir, default_env=default_env, default_conda_env=default_conda_env) self._bash_tool = BashScriptTool(session, default_working_dir=default_working_dir, default_env=default_env, default_conda_env=default_conda_env) super().__init__() async def _get_system_info(self): """ Get system information for shell agent. First tries to get comprehensive info from local server's /platform endpoint. Falls back to simple bash commands if that fails. 
Returns: Dict with at least 'platform' and 'username' keys """ if self._system_info is None: try: # Try to get system info from server via HTTP API try: from anytool.platform import SystemInfoClient # Get base_url from connector base_url = self._session.connector.base_url # Create temporary client async with SystemInfoClient(base_url=base_url, timeout=5) as client: info = await client.get_system_info(use_cache=False) if info: # Use comprehensive info from server self._system_info = { "platform": info.get("system", "Linux"), "username": info.get("username", "user"), "machine": info.get("machine"), "release": info.get("release"), "full_info": info # Keep full info for reference } logger.debug(f"Got system info from server: {info.get('system')}") return self._system_info except ImportError: logger.debug("SystemInfoClient not available, using bash commands") # Fallback: use simple bash commands (original method) platform_result = await self._session.connector.run_bash_script("uname -s", timeout=5) username_result = await self._session.connector.run_bash_script("whoami", timeout=5) platform = self._extract_output(platform_result).strip() username = self._extract_output(username_result).strip() self._system_info = { "platform": platform, "username": username } logger.debug(f"Got system info from bash: {platform}") except Exception as e: logger.warning(f"Failed to get system info: {e}, using defaults") self._system_info = {"platform": "Linux", "username": "user"} return self._system_info async def _arun(self, task: str, timeout: int = 300): from anytool.grounding.core.types import ToolResult, ToolStatus sys_info = await self._get_system_info() conversation_history = [] iteration = 0 last_error = None # record the code history code_history = [] # Build environment context env_context = [] if self._default_working_dir: env_context.append(f"Working Directory: {self._default_working_dir}") if self._default_conda_env: env_context.append(f"Conda Environment: 
{self._default_conda_env}") if self._default_env: env_vars = ", ".join([f"{k}={v}" for k, v in list(self._default_env.items())[:3]]) if len(self._default_env) > 3: env_vars += f", ... (+{len(self._default_env)-3} more)" env_context.append(f"Custom Environment Variables: {env_vars}") env_section = "\n".join([f"# {ctx}" for ctx in env_context]) if env_context else "" SHELL_AGENT_SYSTEM_PROMPT = f"""You are an expert system administrator and programmer focused on executing tasks efficiently. # System: {sys_info["platform"]}, User: {sys_info["username"]} {env_section} # Your task: {task} # IMPORTANT: You MUST provide exactly ONE code block in EVERY response # Either ```bash or ```python - never respond without code # Available actions: 1. Execute bash commands: ```bash ``` 2. Write Python code: ```python ``` # Rules: - ALWAYS include a code block in your response - Write EXACTLY ONE code block per response - If you need to understand the current environment, start with bash commands like: pwd, ls, ps, df, etc. 
- If you get errors, analyze and fix them in the next iteration - For sudo: use 'echo {self.client_password} | sudo -S ' - The environment (working directory, conda env) is managed automatically # CRITICAL: Avoid quote escaping errors in bash: - For complex string operations (JSON, multi-line text, special chars): ALWAYS use Python with heredoc - Good: ```python ``` - Bad: bash commands with nested quotes like: echo "$(cat 'file' | grep "pattern")" - When reading/writing files with complex content: prefer Python over bash - When processing JSON: ALWAYS use Python's json module, never bash string manipulation # Before executing, check if task output already exists: - Use 'ls -la ' to check for existing files - If files exist, read and verify them first before recreating - Avoid redundant work - reuse existing valid outputs # Task completion marking: When you believe the task is COMPLETED, end your response with: [TASK_COMPLETED: brief explanation of what was accomplished] When you encounter an UNRECOVERABLE error that you cannot fix, end your response with: [TASK_FAILED: brief explanation of why it cannot be completed]""" conversation_history.append({"role": "system", "content": SHELL_AGENT_SYSTEM_PROMPT}) no_code_counter = 0 final_message = "" while iteration < self.max_steps: iteration += 1 logger.info(f"[ShellAgent] Step {iteration}/{self.max_steps}: Processing task") try: messages_text = LLMClient.format_messages_to_text(conversation_history) response = await self._llm.complete(messages_text) assistant_content = response["message"]["content"] logger.debug(f"[ShellAgent] Step {iteration} LLM response: {assistant_content[:200]}...") # extract and execute the code, and track the code block code_info, execution_result = await self._execute_code_from_response(assistant_content) if code_info: code_history.append(code_info) logger.info(f"[ShellAgent] Step {iteration} execution result: {execution_result[:100]}...") if execution_result == "ERROR: No valid code block 
found": no_code_counter += 1 if no_code_counter >= 3: final_message = f"Task failed after {iteration} steps: LLM failed to provide code blocks repeatedly" return ToolResult( status=ToolStatus.ERROR, content=final_message, metadata={"tool": self._name, "code_history": code_history} ) else: no_code_counter = 0 completion_status = self._check_task_status(assistant_content, execution_result, last_error) if completion_status["completed"]: content_parts = [f"Task completed successfully after {iteration} steps"] content_parts.append(f"\n{'='*60}") content_parts.append(f"\nFinal Result:") content_parts.append(execution_result) if len(code_history) > 1: content_parts.append(f"\n{'='*60}") content_parts.append(f"\nExecution Summary ({len(code_history)} steps):") for i, code_info in enumerate(code_history, 1): lang = code_info.get("language", "unknown") output = code_info.get("output", "") output_preview = output[:200].replace('\n', ' ') if len(output) > 200: output_preview += "..." content_parts.append(f"\n Step {i} [{lang}]: {output_preview}") content_parts.append(f"\n{'='*60}") content_parts.append(f"\nSummary: {completion_status['reason']}") final_message = "\n".join(content_parts) return ToolResult( status=ToolStatus.SUCCESS, content=final_message, metadata={"tool": self._name, "code_history": code_history} ) elif completion_status["failed"]: final_message = f"Task failed after {iteration} steps: {completion_status['reason']}\nLast result: {execution_result}" return ToolResult( status=ToolStatus.ERROR, content=final_message, metadata={"tool": self._name, "code_history": code_history} ) feedback = self._generate_feedback(execution_result, iteration, last_error) conversation_history.extend([ {"role": "assistant", "content": assistant_content}, {"role": "user", "content": feedback} ]) last_error = execution_result if "ERROR" in execution_result else None except Exception as e: final_message = f"Tool execution failed at step {iteration}: {str(e)}" return ToolResult( 
status=ToolStatus.ERROR, content=final_message, metadata={"tool": self._name, "code_history": code_history} ) final_message = f"Reached maximum steps ({self.max_steps}). Task may be too complex or impossible." return ToolResult( status=ToolStatus.ERROR, content=final_message, metadata={"tool": self._name, "code_history": code_history} ) async def _execute_code_from_response(self, response: str): """ execute the code and track the code block Returns: Tuple[Optional[Dict], str]: (code_info, execution_result) - code_info: {"lang": "python/bash", "code": "...", "status": "success/error"} - execution_result: the execution result string """ matches = list(self._CODE_RGX.finditer(response)) if not matches: return None, "ERROR: No valid code block found" lang, code = matches[0]["lang"].lower(), matches[0]["code"].strip() # standardize the language name lang_normalized = "python" if lang in ["python", "py"] else "bash" code_info = { "lang": lang_normalized, "code": code, } # Security check is only done at the Connector layer to avoid duplicate prompts try: if lang in ["python", "py"]: helper = self._py_tool result = await helper._arun(code) elif lang in ["bash", "shell", "sh"]: helper = self._bash_tool result = await helper._arun(code) else: execution_result = f"ERROR: Unsupported language: {lang}" code_info["status"] = "error" return code_info, execution_result execution_result = self._extract_output(result) code_info["status"] = "success" if "ERROR" not in execution_result else "error" return code_info, execution_result except Exception as e: execution_result = f"EXECUTION ERROR: {str(e)}" code_info["status"] = "error" return code_info, execution_result def _generate_feedback(self, result: str, iteration: int, last_error: str) -> str: feedback = f"Step {iteration} result:\n{result}\n\n" if "ERROR" in result: if last_error and last_error == result: feedback += "Same error as previous step. Try a different approach.\n" else: feedback += "Error occurred. 
Analyze the error and fix it.\n" else: feedback += "Execution successful. Continue to next step if needed.\n" feedback += "\nWhat's your next action? (Remember: provide exactly ONE code block)" return feedback def _extract_output(self, result): if isinstance(result, dict): # Check for execution errors stderr = result.get("error") or result.get("stderr") or "" returncode = result.get("returncode", 0) stdout = result.get("content") or result.get("output") or result.get("stdout") or "" # If there's a non-zero return code or stderr with actual errors, report it if returncode != 0 or (stderr and len(stderr.strip()) > 0): error_msg = f"EXECUTION ERROR (exit code {returncode}):\n" if stderr: error_msg += f"stderr: {stderr}\n" if stdout: error_msg += f"stdout: {stdout}" return error_msg return stdout or str(result) return str(result) def _check_task_status(self, response: str, execution_result: str, last_error: str) -> dict: if "[TASK_COMPLETED:" in response: reason = response.split("[TASK_COMPLETED:")[1].split("]")[0].strip() return {"completed": True, "failed": False, "reason": reason} if "[TASK_FAILED:" in response: reason = response.split("[TASK_FAILED:")[1].split("]")[0].strip() return {"completed": False, "failed": True, "reason": reason} # Extended error pattern detection error_patterns = [ "ERROR:", "EXECUTION ERROR:", "CommandNotFoundError", "Traceback (most recent call last)", "Exception:", "PermissionError", "FileNotFoundError", "SyntaxError:", "ImportError:", "ModuleNotFoundError", "No such file or directory", "command not found", ] has_error = any(pattern in execution_result for pattern in error_patterns) if has_error: if last_error and last_error == execution_result: return {"completed": False, "failed": True, "reason": "Same error repeated - unable to resolve"} return {"completed": False, "failed": False, "reason": "Execution error occurred"} return {"completed": False, "failed": False, "reason": "Task in progress"} 
class ShellConnector(AioHttpConnector):
    """HTTP connector for the shell backend.

    Sends script-execution requests to ``http://{vm_ip}:{port}``.

    Basic routes:
        POST /run_python       {"code": str}
        POST /run_bash_script  {"script": str, "timeout": int, "working_dir": str | None}
    """

    def __init__(
        self,
        vm_ip: str,
        port: int = 5000,
        *,
        retry_times: int = 3,
        retry_interval: float = 5,
        security_manager: "SecurityPolicyManager | None" = None,
    ) -> None:
        """
        Args:
            vm_ip: Host name or IP address of the execution server
            port: TCP port of the execution server
            retry_times: Maximum number of attempts per request
            retry_interval: Seconds to sleep between attempts
            security_manager: Optional policy manager consulted before every execution
        """
        base_url = f"http://{vm_ip}:{port}"
        super().__init__(base_url)
        self.retry_times = retry_times
        self.retry_interval = retry_interval
        self._security_manager = security_manager

    async def _ensure_allowed(self, command: str, kind: str) -> None:
        """Raise PermissionError when the security policy rejects ``command``.

        ``kind`` is only used in the log / error text ("python code", "bash script").
        Does nothing when no security manager was configured.
        """
        if not self._security_manager:
            return
        # Imported lazily, exactly as the public methods did before.
        from anytool.grounding.core.types import BackendType

        allowed = await self._security_manager.check_command_allowed(BackendType.SHELL, command)
        if not allowed:
            logger.error("SecurityPolicy blocked %s execution", kind)
            raise PermissionError(f"SecurityPolicy: {kind} execution blocked")

    @staticmethod
    def _describe_options(
        working_dir: Optional[str],
        env: Optional[Dict[str, str]],
        conda_env: Optional[str],
    ) -> str:
        """Build the optional ", key=value" suffix appended to the execution log line."""
        parts = []
        if working_dir:
            parts.append(f", working_dir={working_dir}")
        if env:
            # Only the variable names are logged, never their values.
            parts.append(f", env={list(env.keys())}")
        if conda_env:
            parts.append(f", conda_env={conda_env}")
        return "".join(parts)

    async def _retry_invoke(
        self,
        name: str,
        payload: Dict[str, Any],
        script_timeout: int,
        *,
        break_on_timeout: bool = False
    ):
        """
        Execute HTTP request and retry

        Args:
            name: RPC method name
            payload: Request payload
            script_timeout: Script execution timeout (forwarded to the server as "timeout")
            break_on_timeout: Whether to exit immediately on timeout (default False)

        Returns:
            Server response result

        Raises:
            RuntimeError: On timeout when break_on_timeout is set, or when no
                attempt was made at all (retry_times < 1)
            Exception: Last exception thrown after all retries fail
        """
        last_exc: Exception | None = None

        # NOTE(review): the original computed ``script_timeout + 60`` as an HTTP-level
        # timeout but never passed it anywhere; the dead local was removed. If
        # AioHttpConnector.invoke accepts a request timeout, it should be wired in here.
        for attempt in range(1, self.retry_times + 1):
            try:
                # Pass timeout parameter to server (fresh dict per attempt).
                result = await self.invoke(name, payload | {"timeout": script_timeout})
            except asyncio.TimeoutError as exc:
                # A timeout usually means the script itself runs too long, so
                # callers may ask to abort instead of retrying.
                if break_on_timeout:
                    logger.error("%s timed out after %d seconds, aborting retry", name, script_timeout)
                    raise RuntimeError(
                        f"Script execution timed out after {script_timeout} seconds"
                    ) from exc
                last_exc = exc
                if attempt == self.retry_times:
                    break
                logger.warning(
                    "%s timed out (attempt %d/%d), retrying in %.1f seconds...",
                    name, attempt, self.retry_times, self.retry_interval
                )
                await asyncio.sleep(self.retry_interval)
            except Exception as exc:
                last_exc = exc
                if attempt == self.retry_times:
                    break
                logger.warning(
                    "%s failed (attempt %d/%d): %s, retrying in %.1f seconds...",
                    name, attempt, self.retry_times, exc, self.retry_interval
                )
                await asyncio.sleep(self.retry_interval)
            else:
                logger.info("%s executed successfully (attempt %d/%d)", name, attempt, self.retry_times)
                return result

        error_msg = f"{name} failed after {self.retry_times} retries"
        logger.error(error_msg)
        raise last_exc or RuntimeError(error_msg)

    async def run_python_script(
        self,
        code: str,
        *,
        timeout: int = 90,
        working_dir: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        conda_env: Optional[str] = None
    ) -> Any:
        """
        Execute Python script on remote server

        Args:
            code: Python code string
            timeout: Execution timeout in seconds (default 90 seconds)
            working_dir: Working directory for script execution (optional)
            env: Environment variables for script execution (optional)
            conda_env: Conda environment name to activate (optional)

        Returns:
            Server response result

        Raises:
            PermissionError: Security policy blocked execution
            RuntimeError: Execution failed or timed out
        """
        await self._ensure_allowed(code, "python code")

        payload = {"code": code, "working_dir": working_dir, "env": env, "conda_env": conda_env}
        logger.info(
            "Executing python script with timeout=%d seconds%s",
            timeout,
            self._describe_options(working_dir, env, conda_env),
        )

        # A timed-out script is not retried (timeout usually means a script logic problem).
        return await self._retry_invoke(
            "POST /run_python",
            payload,
            timeout,
            break_on_timeout=True
        )

    async def run_bash_script(
        self,
        script: str,
        *,
        timeout: int = 90,
        working_dir: Optional[str] = None,
        env: Optional[Dict[str, str]] = None,
        conda_env: Optional[str] = None
    ) -> Any:
        """
        Execute Bash script on remote server

        Args:
            script: Bash script content (can be multi-line)
            timeout: Execution timeout in seconds (default 90 seconds)
            working_dir: Working directory for script execution (optional)
            env: Environment variables for script execution (optional)
            conda_env: Conda environment name to activate (optional)

        Returns:
            Server response result, containing status, output, error, returncode, etc.

        Raises:
            PermissionError: Security policy blocked execution
            RuntimeError: Execution failed or timed out
        """
        await self._ensure_allowed(script, "bash script")

        payload = {"script": script, "working_dir": working_dir, "env": env, "conda_env": conda_env}
        logger.info(
            "Executing bash script with timeout=%d seconds%s",
            timeout,
            self._describe_options(working_dir, env, conda_env),
        )

        # A timed-out script is not retried (timeout usually means a script logic problem).
        result = await self._retry_invoke(
            "POST /run_bash_script",
            payload,
            timeout,
            break_on_timeout=True
        )

        # Record execution result. %s (not %d) because the value comes straight
        # from the server response and is not guaranteed to be an int.
        if isinstance(result, dict) and "returncode" in result:
            logger.info("Bash script executed with return code: %s", result["returncode"])

        return result
================================================ FILE: anytool/grounding/backends/shell/transport/local_connector.py ================================================ """ Local Shell Connector — execute Python / Bash scripts directly via subprocess. This connector has the **same public API** as ShellConnector (HTTP version) but runs everything in-process, removing the need for a local_server. Return format is kept identical so that ShellSession / ShellAgentTool work without any changes. """ import asyncio import os import platform import tempfile import uuid from typing import Any, Optional, Dict from anytool.grounding.core.transport.connectors.base import BaseConnector from anytool.grounding.core.transport.task_managers.noop import NoOpConnectionManager from anytool.grounding.core.security import SecurityPolicyManager from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) platform_name = platform.system() # --------------------------------------------------------------------------- # Conda helpers (mirrored from local_server/main.py) # --------------------------------------------------------------------------- def _get_conda_activation_prefix(conda_env: str | None) -> str: """Generate platform-specific conda activation prefix.""" if not conda_env: return "" if platform_name == "Windows": conda_paths = [ os.path.expandvars(r"%USERPROFILE%\miniconda3\Scripts\activate.bat"), os.path.expandvars(r"%USERPROFILE%\anaconda3\Scripts\activate.bat"), r"C:\ProgramData\Miniconda3\Scripts\activate.bat", r"C:\ProgramData\Anaconda3\Scripts\activate.bat", ] for p in conda_paths: if os.path.exists(p): return f'call "{p}" {conda_env} && ' return f"conda activate {conda_env} && " else: conda_paths = [ os.path.expanduser("~/miniconda3/etc/profile.d/conda.sh"), os.path.expanduser("~/anaconda3/etc/profile.d/conda.sh"), "/opt/conda/etc/profile.d/conda.sh", "/usr/local/miniconda3/etc/profile.d/conda.sh", "/usr/local/anaconda3/etc/profile.d/conda.sh", ] for p in 
conda_paths: if os.path.exists(p): return f'source "{p}" && conda activate {conda_env} && ' return f"conda activate {conda_env} && " def _wrap_script_with_conda(script: str, conda_env: str | None) -> str: """Wrap bash script with conda activation if needed.""" if not conda_env: return script if platform_name == "Windows": prefix = _get_conda_activation_prefix(conda_env) return f"{prefix}{script}" else: conda_paths = [ os.path.expanduser("~/miniconda3/etc/profile.d/conda.sh"), os.path.expanduser("~/anaconda3/etc/profile.d/conda.sh"), os.path.expanduser("~/opt/anaconda3/etc/profile.d/conda.sh"), "/opt/conda/etc/profile.d/conda.sh", ] conda_sh = None for p in conda_paths: if os.path.exists(p): conda_sh = p break if conda_sh: return ( f'#!/bin/bash\n' f'if [ -f "{conda_sh}" ]; then\n' f' . "{conda_sh}"\n' f' conda activate {conda_env} 2>/dev/null || true\n' f'fi\n\n' f'{script}\n' ) else: logger.warning( "Conda environment '%s' requested but conda not found. " "Executing with system Python.", conda_env ) return script class LocalShellConnector(BaseConnector[Any]): """ Shell connector that runs scripts **locally** using asyncio subprocesses, bypassing the Flask local_server entirely. Public API is compatible with ``ShellConnector`` so that ``ShellSession`` works without modification. """ def __init__( self, *, retry_times: int = 3, retry_interval: float = 5, security_manager: "SecurityPolicyManager | None" = None, ) -> None: super().__init__(NoOpConnectionManager()) self.retry_times = retry_times self.retry_interval = retry_interval self._security_manager = security_manager # Provide base_url = None so ShellSession._get_system_info falls back # to bash-based detection instead of HTTP. 
self.base_url: str | None = None # ------------------------------------------------------------------ # connect / disconnect (mostly no-ops for local execution) # ------------------------------------------------------------------ async def connect(self) -> None: """No real connection to establish for local mode.""" if self._connected: return await super().connect() logger.info("LocalShellConnector: ready (local mode, no server required)") # ------------------------------------------------------------------ # Core execution helpers # ------------------------------------------------------------------ async def _run_subprocess( self, cmd: list[str], *, timeout: int = 90, working_dir: str | None = None, env: dict[str, str] | None = None, ) -> Dict[str, Any]: """Run a command via asyncio subprocess and return a result dict matching the format returned by the local_server endpoints.""" exec_env = os.environ.copy() if env: exec_env.update(env) cwd = working_dir or os.getcwd() try: proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd, env=exec_env, ) stdout_b, stderr_b = await asyncio.wait_for( proc.communicate(), timeout=timeout ) stdout = stdout_b.decode("utf-8", errors="replace") if stdout_b else "" stderr = stderr_b.decode("utf-8", errors="replace") if stderr_b else "" returncode = proc.returncode or 0 return { "status": "success" if returncode == 0 else "error", "output": stdout, "content": stdout or "Code executed successfully (no output)", "error": stderr, "returncode": returncode, } except asyncio.TimeoutError: return { "status": "error", "output": f"Execution timed out after {timeout} seconds", "content": f"Execution timed out after {timeout} seconds", "error": "", "returncode": -1, } except Exception as e: return { "status": "error", "output": "", "content": "", "error": str(e), "returncode": -1, } async def _run_shell_command( self, shell_cmd: str, *, timeout: int = 90, working_dir: str | None 
= None, env: dict[str, str] | None = None, ) -> Dict[str, Any]: """Run a shell command string (used for conda-wrapped scripts).""" exec_env = os.environ.copy() if env: exec_env.update(env) cwd = working_dir or os.getcwd() try: proc = await asyncio.create_subprocess_shell( shell_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, cwd=cwd, env=exec_env, ) stdout_b, _ = await asyncio.wait_for( proc.communicate(), timeout=timeout ) stdout = stdout_b.decode("utf-8", errors="replace") if stdout_b else "" returncode = proc.returncode or 0 return { "status": "success" if returncode == 0 else "error", "output": stdout, "content": stdout or "Code executed successfully (no output)", "error": "", "returncode": returncode, } except asyncio.TimeoutError: return { "status": "error", "output": f"Script execution timed out after {timeout} seconds", "content": f"Script execution timed out after {timeout} seconds", "error": "", "returncode": -1, } except Exception as e: return { "status": "error", "output": "", "content": "", "error": str(e), "returncode": -1, } # ------------------------------------------------------------------ # Public API (same signatures as ShellConnector) # ------------------------------------------------------------------ async def run_python_script( self, code: str, *, timeout: int = 90, working_dir: Optional[str] = None, env: Optional[Dict[str, str]] = None, conda_env: Optional[str] = None, ) -> Any: """Execute a Python script locally. Return format matches the server's ``/run_python`` endpoint. 
""" # Security check if self._security_manager: from anytool.grounding.core.types import BackendType allowed = await self._security_manager.check_command_allowed( BackendType.SHELL, code ) if not allowed: logger.error("SecurityPolicy blocked python code execution") raise PermissionError("SecurityPolicy: python code execution blocked") # Write code to temp file (same as local_server) suffix = uuid.uuid4().hex if platform_name == "Windows": temp_filename = os.path.join(tempfile.gettempdir(), f"python_exec_{suffix}.py") else: temp_filename = f"/tmp/python_exec_{suffix}.py" try: with open(temp_filename, "w") as f: f.write(code) logger.info( "Executing python script locally with timeout=%d seconds%s%s%s", timeout, f", working_dir={working_dir}" if working_dir else "", f", env={list(env.keys())}" if env else "", f", conda_env={conda_env}" if conda_env else "", ) if conda_env: activation = _get_conda_activation_prefix(conda_env) if activation: python_cmd = "python" if platform_name == "Windows" else "python3" full_cmd = f'{activation}{python_cmd} "{temp_filename}"' result = await self._run_shell_command( full_cmd, timeout=timeout, working_dir=working_dir, env=env ) else: python_cmd = "python" if platform_name == "Windows" else "python3" result = await self._run_subprocess( [python_cmd, temp_filename], timeout=timeout, working_dir=working_dir, env=env, ) else: python_cmd = "python" if platform_name == "Windows" else "python3" result = await self._run_subprocess( [python_cmd, temp_filename], timeout=timeout, working_dir=working_dir, env=env, ) return result finally: if os.path.exists(temp_filename): os.remove(temp_filename) async def run_bash_script( self, script: str, *, timeout: int = 90, working_dir: Optional[str] = None, env: Optional[Dict[str, str]] = None, conda_env: Optional[str] = None, ) -> Any: """Execute a Bash script locally. Return format matches the server's ``/run_bash_script`` endpoint. 
""" # Security check if self._security_manager: from anytool.grounding.core.types import BackendType allowed = await self._security_manager.check_command_allowed( BackendType.SHELL, script ) if not allowed: logger.error("SecurityPolicy blocked bash script execution") raise PermissionError("SecurityPolicy: bash script execution blocked") # Wrap with conda if needed final_script = _wrap_script_with_conda(script, conda_env) # Write to temp file (same as local_server) suffix = uuid.uuid4().hex if platform_name == "Windows": temp_filename = os.path.join(tempfile.gettempdir(), f"bash_exec_{suffix}.sh") else: temp_filename = f"/tmp/bash_exec_{suffix}.sh" try: with open(temp_filename, "w") as f: f.write(final_script) os.chmod(temp_filename, 0o755) logger.info( "Executing bash script locally with timeout=%d seconds%s%s%s", timeout, f", working_dir={working_dir}" if working_dir else "", f", env={list(env.keys())}" if env else "", f", conda_env={conda_env}" if conda_env else "", ) shell_cmd = ["bash", temp_filename] if platform_name == "Windows" else ["/bin/bash", temp_filename] result = await self._run_subprocess( shell_cmd, timeout=timeout, working_dir=working_dir, env=env, ) return result finally: if os.path.exists(temp_filename): os.unlink(temp_filename) # ------------------------------------------------------------------ # BaseConnector abstract methods # ------------------------------------------------------------------ async def invoke(self, name: str, params: dict[str, Any]) -> Any: """Dispatch by name — same routing as ShellConnector via AioHttpConnector.""" name_upper = name.strip().upper() if "/RUN_PYTHON" in name_upper: return await self.run_python_script( params.get("code", ""), timeout=params.get("timeout", 90), working_dir=params.get("working_dir"), env=params.get("env"), conda_env=params.get("conda_env"), ) elif "/RUN_BASH_SCRIPT" in name_upper: return await self.run_bash_script( params.get("script", ""), timeout=params.get("timeout", 90), 
class WebProvider(Provider[WebSession]):
    """Provider for the web (knowledge research) backend.

    Keeps its sessions in ``self._sessions`` keyed by session name and creates
    one default session, named after the backend, on first initialization.
    """

    DEFAULT_SID = BackendType.WEB.value

    def __init__(self, config: Dict[str, Any] = None):
        super().__init__(BackendType.WEB, config)

    async def initialize(self) -> None:
        """Initialize Web Provider and create default session (runs once)."""
        if self.is_initialized:
            return

        logger.info("Initializing Web provider (Knowledge Research)")
        # Auto-create the default session.
        default_config = SessionConfig(
            session_name=self.DEFAULT_SID,
            backend_type=BackendType.WEB,
            connection_params={}
        )
        await self.create_session(default_config)
        self.is_initialized = True

    async def create_session(self, session_config: SessionConfig) -> WebSession:
        """Create Web session, or return the existing one with the same name."""
        sid = session_config.session_name

        if sid in self._sessions:
            logger.warning(f"Session {sid} already exists, returning existing session")
        else:
            # WebSession is created with auto-connect and auto-initialize enabled.
            self._sessions[sid] = WebSession(
                session_id=sid,
                config=session_config,
                auto_connect=True,
                auto_initialize=True
            )
            logger.info(f"Created Web session (Knowledge Research): {sid}")

        return self._sessions[sid]

    async def close_session(self, session_name: str) -> None:
        """Close Web session; unknown names are ignored."""
        closing = self._sessions.pop(session_name, None)
        if not closing:
            return
        await closing.disconnect()
        logger.info(f"Closed Web session: {session_name}")
class WebSession(BaseSession):
    """Session for the web backend, backed by a ``WebConnector``."""

    backend_type = BackendType.WEB

    def __init__(
        self,
        *,
        session_id: str,
        config: SessionConfig,
        deep_research_api_key: Optional[str] = None,
        deep_research_base_url: str = "https://openrouter.ai/api/v1",
        auto_connect: bool = True,
        auto_initialize: bool = True
    ):
        # An explicit key wins over OPENROUTER_API_KEY; when neither is set an
        # empty string is handed to the connector.
        resolved_key = (
            deep_research_api_key or os.getenv("OPENROUTER_API_KEY") or ""
        )
        super().__init__(
            connector=WebConnector(
                api_key=resolved_key,
                base_url=deep_research_base_url
            ),
            session_id=session_id,
            backend_type=BackendType.WEB,
            auto_connect=auto_connect,
            auto_initialize=auto_initialize
        )
        self.config = config

    @property
    def web_connector(self) -> WebConnector:
        """The session's connector, typed as ``WebConnector``."""
        return self.connector

    async def initialize(self) -> Dict[str, Any]:
        """Ensure the connector is connected, then register the session's tools.

        The connection is established here explicitly (rather than relying on
        the session being entered as a context manager) so that the connector's
        client exists before DeepResearchTool uses it.

        Returns:
            ``{"tools": [tool names], "backend": backend value}``.

        Raises:
            Exception: Whatever ``connect()`` raised, after logging it.
        """
        if not self.is_connected:
            try:
                await self.connect()
            except Exception as e:
                logger.error(f"Failed to connect WebSession {self.session_id}: {e}")
                raise

        if self.tools:
            logger.debug(f"Web session {self.session_id} already initialized, skipping")
        else:
            self.tools = [DeepResearchTool(session=self)]
            logger.info(f"Initialized Web session {self.session_id} with AI Deep Research tool")

        tool_names = [t.name for t in self.tools]
        return {
            "tools": tool_names,
            "backend": BackendType.WEB.value
        }
Returns comprehensive summary ready for immediate use RETURNS: Knowledge-dense comprehensive summary (400-600 words) that: - Retains important details and technical specifics - Focuses on substantive knowledge without losing critical information - Organized and structured for clarity - Directly usable by agents for decision-making and task execution NOT DESIGNED FOR: - Tasks requiring browser interaction or UI manipulation - Direct file downloads or web scraping operations - Real-time system operations or executions USAGE GUIDELINES: - Frame clear, specific questions (e.g., "Explain the architecture of Transformer models") - Specify context when needed (e.g., "Compare PostgreSQL vs MySQL for high-concurrency scenarios") - Suitable for any knowledge or information acquisition needs """ def __init__( self, session: WebSession ): super().__init__() self._session = session self._llm = LLMClient() async def _arun(self, query: str) -> str: if not query: return "ERROR: Missing required parameter: query" try: # Step 1: Deep research logger.info(f"Start deep research: {query}") completion = await self._session.web_connector.client.chat.completions.create( model="perplexity/sonar-deep-research", messages=[{"role": "user", "content": query}] ) full_answer = completion.choices[0].message.content logger.info(f"Research completed, length: {len(full_answer)} characters") # Step 2: Use LLMClient to generate summary and distill key points logger.info(f"Begin to distill key points...") SUMMARY_AGENT_PROMPT = f"""Please distill the following deep research results into a knowledge-dense summary. 
class ErrorCode(str, Enum):
    """Error categories used by the grounding framework.

    Member values are produced by ``auto()`` and are left unchanged here so
    that anything already relying on ``.value`` keeps working.
    """

    # generic
    UNKNOWN = auto()
    CONFIG_INVALID = auto()

    # provider / session / connector
    PROVIDER_ERROR = auto()
    SESSION_NOT_FOUND = auto()

    # connection
    CONNECTION_FAILED = auto()
    CONNECTION_TIMEOUT = auto()

    # tool
    TOOL_NOT_FOUND = auto()
    TOOL_EXECUTION_FAIL = auto()
    AMBIGUOUS_TOOL = auto()


class GroundingError(Exception):
    """
    Framework-wide base exception.

    Parameters
    ----------
    message : str
        Human readable error message.
    code : ErrorCode
        One of the error codes defined above.
    retryable : bool
        Whether the caller may retry the operation automatically.
    context : Dict[str, Any]
        Extra key-value pairs (e.g. tool_name, session_id) for logging / metrics.

    The string form is ``"[<CODE_NAME>] <message>"``.
    """

    __slots__ = ("message", "code", "retryable", "context")

    def __init__(
        self,
        message: str,
        *,
        code: ErrorCode = ErrorCode.UNKNOWN,
        retryable: bool = False,
        **context: Any,
    ):
        # Use the member *name* for the prefix: formatting a str-mixin Enum
        # with an f-string yields different text on different Python versions
        # (the raw value on older releases, "ErrorCode.X" on newer ones).
        super().__init__(f"[{code.name}] {message}")
        self.message: str = message
        self.code: ErrorCode = code
        self.retryable: bool = retryable
        self.context: Dict[str, Any] = context

    def to_dict(self) -> Dict[str, Any]:
        """Serialize error for structured logging / JSON response."""
        return {
            "code": self.code.value,
            "message": self.message,
            "retryable": self.retryable,
            "context": self.context,
        }

    def __str__(self) -> str:
        return f"[{self.code.name}] {self.message}"

    def __repr__(self) -> str:
        return f"GroundingError(code={self.code.name}, msg={self.message!r})"
def _register_providers_from_config(self) -> None:
    """
    Based on GroundingConfig.enabled_backends, register Provider instances to self._registry.
    Here only do *instantiation*, not await initialize(), to avoid blocking the event loop in the import stage;
    Provider will be lazily initialized when it is first used.

    Entries that are malformed, name an unknown backend, or whose provider
    class cannot be imported are logged and skipped so that one bad entry does
    not prevent the remaining backends from being registered.

    Note: SystemProvider is skipped here and registered separately in _register_system_provider()
    because it requires a GroundingClient instance.
    """
    if not self._config.enabled_backends:
        self._logger.warning("No enabled_backends defined in config")
        return

    for item in self._config.enabled_backends:
        be_name: str | None = item.get("name")
        cls_path: str | None = item.get("provider_cls")
        if not (be_name and cls_path):
            self._logger.warning("Invalid backend entry: %s", item)
            continue

        try:
            # Assumes BackendType is an Enum, whose lookup by value raises
            # ValueError for an unknown name — TODO confirm.
            backend = BackendType(be_name.lower())
        except ValueError:
            self._logger.warning("Unknown backend name in config, skipping: %s", be_name)
            continue

        # Skip system backend - it will be registered separately
        if backend == BackendType.SYSTEM:
            self._logger.debug("Skipping system backend in config registration (will be registered separately)")
            continue

        if backend in self._registry.list():
            continue  # Already registered

        # Dynamically import Provider class
        try:
            module_path, _, cls_name = cls_path.rpartition(".")
            module = importlib.import_module(module_path)
            prov_cls = getattr(module, cls_name)
        except (ImportError, AttributeError, ValueError) as e:
            # ImportError covers ModuleNotFoundError and failures inside the
            # imported module's own imports; ValueError is raised for an empty
            # module name (a provider_cls without any dot).
            self._logger.error("Import provider failed: %s (%s)", cls_path, e)
            continue

        backend_cfg = self._config.get_backend_config(be_name)
        provider: Provider = prov_cls(backend_cfg)
        self._registry.register(provider)
""" try: from .system import SystemProvider system_provider = SystemProvider(self) self._registry.register(system_provider) self._logger.debug("SystemProvider registered successfully") except Exception as e: self._logger.warning(f"Failed to register SystemProvider: {e}") def _init_quality_manager(self): """Initialize tool quality manager based on config.""" try: # Check if quality tracking is enabled in config quality_config = getattr(self._config, 'tool_quality', None) if not quality_config or not getattr(quality_config, 'enabled', True): self._logger.debug("Tool quality tracking disabled") return None from .quality import ToolQualityManager, set_quality_manager from pathlib import Path cache_dir = getattr(quality_config, 'cache_dir', None) if cache_dir: cache_dir = Path(cache_dir) manager = ToolQualityManager( cache_dir=cache_dir, enable_persistence=getattr(quality_config, 'enable_persistence', True), auto_save=True, evolve_interval=getattr(quality_config, 'evolve_interval', 5), ) # Set as global manager for BaseTool access set_quality_manager(manager) self._logger.info( f"ToolQualityManager initialized " f"(records={len(manager._records)})" ) return manager except Exception as e: self._logger.warning(f"Failed to initialize ToolQualityManager: {e}") return None @property def quality_manager(self): """Get the tool quality manager.""" return self._quality_manager # Quality API for Upper Layer def get_quality_report(self) -> Dict[str, Any]: """ Get comprehensive tool quality report. """ if not self._quality_manager: return {"status": "disabled", "message": "Quality tracking not enabled"} return self._quality_manager.get_quality_report() async def evolve_quality(self) -> Dict[str, Any]: """ Run quality self-evolution cycle. This triggers: - Tool change detection - Description re-evaluation for updated tools - Adaptive quality weight computation Call this periodically or after tool set changes. 
async def create_session(
    self,
    *,
    backend: BackendType,
    name: str | None = None,
    connection_params: Dict[str, Any] | None = None,
    server: str | None = None,
    **options,
) -> str:
    """Create and initialize a session and return its externally visible name.

    When ``name`` is None it is generated: ``"<backend>-<server>"`` if a
    server is given, otherwise the backend value.

    Args:
        backend: Backend the session belongs to.
        name: Optional explicit session name.
        connection_params: Extra connection parameters. The mapping is copied
            before ``server`` / ``recording_manager`` are injected, so the
            caller's dict is never modified.
        server: Server name; required when ``backend`` is ``BackendType.MCP``.
        **options: Forwarded unchanged to ``SessionConfig``.

    Returns:
        The session name. If a session with that name already exists it is
        reused and no new session is created.

    Raises:
        GroundingError: If the configured ``max_concurrent_sessions`` limit
            (default 100) is reached, or an MCP session is requested without
            ``server``.
    """
    async with self._lock:
        # Check concurrent sessions limit
        max_sessions = get_config_value(self._config, "max_concurrent_sessions", 100)
        if len(self._sessions) >= max_sessions:
            raise GroundingError(f"Reached maximum session limit: {max_sessions}")

        # Session naming strategy
        if server:
            # Only MCP will pass in server
            name = name or f"{backend.value}-{server}"
        else:
            # Other backends have a fixed 1 session
            name = name or backend.value

        if name in self._sessions:
            # Reuse existing session
            self._logger.warning("Session '%s' exists, reusing.", name)
            return name

    # Get Provider (initialize if first time). This runs outside the lock:
    # the lock is taken again below to store the session.
    provider = self._registry.get(backend)
    if not provider.is_initialized:
        await provider.initialize()

    if backend == BackendType.MCP and server is None:
        raise GroundingError("Must specify 'server' when creating MCP session")

    # Work on a private copy: the setdefault calls below must not leak
    # 'server' or the recording manager into the caller's dict.
    connection_params = dict(connection_params or {})
    if server:
        connection_params.setdefault("server", server)

    # Inject recording_manager for GUI backend (for intermediate step recording)
    if backend == BackendType.GUI and self._recording_manager is not None:
        connection_params.setdefault("recording_manager", self._recording_manager)

    sess_cfg = SessionConfig(
        session_name=name,  # Use external visible name
        backend_type=backend,
        connection_params=connection_params,
        **options,
    )

    session_obj = await provider.create_session(sess_cfg)

    # Store session and monitoring info
    async with self._lock:
        self._sessions[name] = session_obj
        now = datetime.utcnow()
        self._session_info[name] = SessionInfo(
            session_name=name,
            backend_type=backend,
            status=SessionStatus.CONNECTED,
            created_at=now,
            last_activity=now,
        )
        if server:
            self._server_session_map[(backend, server)] = name

    self._logger.info("Session created: %s", name)
    return name
session.disconnect() finally: self._logger.info("Session closed: %s", name) async def close_all_sessions(self) -> None: for sid in list(self._sessions.keys()): await self.close_session(sid) async def ensure_session(self, backend: BackendType, server: str | None = None) -> str: sid = backend.value if server is None else f"{backend.value}-{server}" if sid not in self._sessions: await self.create_session(backend=backend, name=sid, server=server) return sid def get_session_info(self, name: str) -> SessionInfo: """Get session monitoring info""" if name not in self._session_info: raise ErrorCode.SESSION_NOT_FOUND(name) return self._session_info[name] def get_session(self, name: str) -> BaseSession: """Get session""" if name not in self._sessions: raise ErrorCode.SESSION_NOT_FOUND(name) return self._sessions[name] async def _fetch_tools( self, backend: BackendType, *, session_name: str | None = None, use_cache: bool = False, bind_runtime_info: bool = True, ) -> List[BaseTool]: """ Fetch tools from provider. 
Args: backend: Backend type session_name: - None: fetch all tools from all sessions of this backend - str: fetch tools from specific session use_cache: Whether to use cache bind_runtime_info: Whether to bind runtime info to tool instances """ now = time.time() # Auto-generate cache_scope from parameters if session_name: cache_scope = session_name else: cache_scope = f"backend-{backend.value}" # Check cache if use_cache: async with self._cache_lock: if cache_scope in self._tool_cache: tools, ts = self._tool_cache[cache_scope] if now - ts < self._tool_cache_ttl: self._tool_cache.move_to_end(cache_scope) return tools provider = self._registry.get(backend) if not provider.is_initialized: await provider.initialize() tools = await provider.list_tools(session_name=session_name) if bind_runtime_info: # If session_name is specified, bind all tools to that session if session_name: server_name = None if backend == BackendType.MCP: server_name = session_name.replace(f"{backend.value}-", "", 1) for tool in tools: tool.bind_runtime_info( backend=backend, session_name=session_name, server_name=server_name, grounding_client=self, ) else: # No session_name specified - get tools from all sessions # For each backend, find the default/primary session # For Shell/Web/GUI: use the default session (backend.value) # For MCP: tools should already be bound by the provider default_session_name = None # Try to find an existing session for this backend for sid, info in self._session_info.items(): if info.backend_type == backend: default_session_name = sid break # Fallback: use backend default naming if not default_session_name: default_session_name = backend.value server_name = None if backend == BackendType.MCP and default_session_name: server_name = default_session_name.replace(f"{backend.value}-", "", 1) for tool in tools: # Only bind if tool doesn't have runtime info already # (some providers like MCP bind runtime info during list_tools) if not tool.is_bound: tool.bind_runtime_info( 
backend=backend, session_name=default_session_name, server_name=server_name, grounding_client=self, ) elif not tool.runtime_info.grounding_client: # Tool has runtime info but no grounding_client, add it tool.bind_runtime_info( backend=tool.runtime_info.backend, session_name=tool.runtime_info.session_name, server_name=tool.runtime_info.server_name, grounding_client=self, ) # Save to cache if use_cache: async with self._cache_lock: self._tool_cache[cache_scope] = (tools, now) self._tool_cache.move_to_end(cache_scope) while len(self._tool_cache) > self._tool_cache_maxsize: self._tool_cache.popitem(last=False) return tools async def list_tools( self, backend: BackendType | list[BackendType] | None = None, session_name: str | None = None, *, use_cache: bool = False, ) -> List[BaseTool]: """ List tools from backend(s) or session. 1. session_name is provided → return tools from that session 2. backend is list → return tools from multiple backends 3. backend is single → return tools from that backend 4. 
async def list_session_tools(
    self,
    session_name: str,
    use_cache: bool = False
) -> list[BaseTool]:
    """List the tools of one existing session.

    Args:
        session_name: Name of a session known to this client.
        use_cache: Whether to use the tool cache.

    Returns:
        Whatever ``list_tools`` returns for that session.

    Raises:
        Whatever ``ErrorCode.SESSION_NOT_FOUND(session_name)`` produces when
        the session is not present in ``self._session_info``.
    """
    if session_name not in self._session_info:
        raise ErrorCode.SESSION_NOT_FOUND(session_name)
    backend = self._session_info[session_name].backend_type
    # ``use_cache`` is keyword-only on ``list_tools``; passing it
    # positionally raises TypeError.
    return await self.list_tools(backend, session_name, use_cache=use_cache)
async def search_tools(
    self,
    task_description: str,
    *,
    backend: BackendType | list[BackendType] | None = None,
    session_name: str | None = None,
    max_tools: int | None = None,
    search_mode: str | None = None,
    use_cache: bool = True,
    llm_callable = None,
    enable_llm_filter: bool | None = None,
    llm_filter_threshold: int | None = None,
    enable_cache_persistence: bool | None = None,
    cache_dir: str | None = None,
) -> list[BaseTool]:
    """Search the tools of backend(s) or a session for a task.

    The ``SearchCoordinator`` is created lazily on the first call that has
    candidates and is reused afterwards. The constructor-level options
    (``llm_callable``, ``enable_llm_filter``, ``llm_filter_threshold``,
    ``enable_cache_persistence``, ``cache_dir`` and the constructor's
    ``max_tools``) therefore only take effect on that first call.

    Args:
        task_description: Task description for searching relevant tools.
        backend: Backend type(s) to search.
        session_name: Specific session to search.
        max_tools: Maximum number of tools to return.
        search_mode: Search mode ("semantic", "keyword", "hybrid").
        use_cache: Whether to use the cached tool list.
        llm_callable: LLM client for intelligent filtering.
        enable_llm_filter: Whether to use LLM pre-filtering.
        llm_filter_threshold: Threshold for applying the LLM filter.
        enable_cache_persistence: Whether to persist embeddings to disk.
        cache_dir: Directory for the persistent cache.

    Returns:
        The coordinator's result. An empty list when there are no candidate
        tools. If the search raises, the error is logged and the first
        ``max_tools`` candidates (or ``config.tool_search.max_tools`` when
        ``max_tools`` is falsy) are returned instead.
    """
    pool = await self.list_tools(
        backend=backend,
        session_name=session_name,
        use_cache=use_cache,
    )
    if not pool:
        self._logger.warning("No candidate tools found for search")
        return []

    # Lazily build the coordinator once; later calls reuse it as-is.
    if self._search_coordinator is None:
        quality_settings = getattr(self._config, 'tool_quality', None)
        if quality_settings:
            rank_by_quality = getattr(quality_settings, 'enable_quality_ranking', True)
        else:
            rank_by_quality = True

        self._search_coordinator = SearchCoordinator(
            max_tools=max_tools,
            llm=llm_callable,
            enable_llm_filter=enable_llm_filter,
            llm_filter_threshold=llm_filter_threshold,
            enable_cache_persistence=enable_cache_persistence,
            cache_dir=cache_dir,
            quality_manager=self._quality_manager,
            enable_quality_ranking=rank_by_quality,
        )

    coordinator = self._search_coordinator
    try:
        # Execute search and sort.
        return await coordinator._arun(
            task_prompt=task_description,
            candidate_tools=pool,
            max_tools=max_tools,
            mode=search_mode,
        )
    except Exception as exc:
        self._logger.error(f"Tool search failed: {exc}")
        # Fallback: return the top N candidates unranked.
        limit = max_tools or self._config.tool_search.max_tools
        return pool[:limit]
""" if self._search_coordinator is None: return None return self._search_coordinator.get_last_search_debug_info() async def get_tools_with_auto_search( self, *, task_description: str | None = None, backend: BackendType | list[BackendType] | None = None, session_name: str | None = None, max_tools: int | None = None, search_mode: str | None = None, use_cache: bool = True, llm_callable = None, enable_llm_filter: bool | None = None, llm_filter_threshold: int | None = None, enable_cache_persistence: bool | None = None, cache_dir: str | None = None, ) -> list[BaseTool]: """ Intelligent tool retrieval: automatically decides whether to return all tools or trigger search. Logic: - If tool_count <= max_tools: return all tools directly - If tool_count > max_tools: trigger search and return top max_tools Args: task_description: Task description (required for search if triggered). If None, search will not be triggered even if tool count exceeds max_tools. backend: Backend type(s) to query session_name: Specific session name max_tools: Maximum number of tools to return. Also acts as the threshold for triggering search. - None: Use value from config (default: 30) search_mode: Search mode ("semantic", "keyword", "hybrid") use_cache: Whether to use cache llm_callable: LLM client (for intelligent filtering) enable_llm_filter: Whether to use LLM for backend/server pre-filtering. - None: Use config default - False: Disable LLM filter, use tool-level search only - True: Enable LLM filter llm_filter_threshold: Only apply LLM filter when tool count > this threshold. - None: Use default (50) - N: Only apply LLM filter when > N tools enable_cache_persistence: Whether to persist embeddings to disk. If None, uses config value. cache_dir: Directory for persistent cache. If None, uses config value or default. 
Returns: List of tools (at most max_tools) Examples: # Scenario 1: Auto-detect whether search is needed tools = await gc.get_tools_with_auto_search( task_description="Create a flowchart", backend=BackendType.MCP ) # Scenario 2: Custom max_tools tools = await gc.get_tools_with_auto_search( task_description="Edit file", backend=BackendType.SHELL, max_tools=30 # Return at most 30 tools ) # Scenario 3: Disable search (return all tools regardless of count) tools = await gc.get_tools_with_auto_search( backend=BackendType.MCP # No task_description = no search ) """ # Fetch all candidate tools all_tools = await self.list_tools( backend=backend, session_name=session_name, use_cache=use_cache, ) if not all_tools: self._logger.warning("No tools found") return [] # Determine max_tools from config if not provided if max_tools is None: max_tools = self._config.tool_search.max_tools # Decide whether search is needed tools_count = len(all_tools) need_search = tools_count > max_tools and task_description is not None if need_search: self._logger.info( f"Tool count ({tools_count}) > max_tools ({max_tools}), " f"triggering search to filter relevant tools..." 
async def invoke_tool(
    self,
    tool: BaseTool | str,
    parameters: Dict[str, Any] | None = None,
    *,
    backend: BackendType | None = None,
    session_name: str | None = None,
    server: str | None = None,
    keep_session: bool = False,
    **kwargs
) -> ToolResult:
    """Universal tool invocation method.

    Supports multiple calling patterns:
    1. Using BaseTool instance with bound runtime info
    2. Using BaseTool instance with explicit backend/session
    3. Using tool name with automatic lookup
    4. Using tool name with explicit backend/session/server

    Args:
        tool: BaseTool instance or tool name string.
        parameters: Tool parameters as dict. When falsy, ``**kwargs`` are
            used as the parameters instead.
        backend: Backend type (optional for BaseTool with runtime_info).
        session_name: Session name (optional for BaseTool with runtime_info).
        server: Server name (for MCP, optional for BaseTool with runtime_info).
        keep_session: Whether to keep the session alive after invocation.
            When False, the session is closed afterwards if a server was
            resolved or the session name starts with the backend value.
        **kwargs: Alternative parameter passing.

    Returns:
        ToolResult returned by the provider's ``call_tool``.

    Raises:
        TypeError: If ``tool`` is neither a ``BaseTool`` nor a ``str``.
        GroundingError: If no backend can be determined for a tool instance,
            the named tool is not found, or the name is ambiguous.

    Examples:
        # Pattern 1: Tool instance with runtime info (from list_tools)
        tools = await gc.list_tools()
        tool = next(t for t in tools if t.name == "read_file")
        result = await gc.invoke_tool(tool, {"path": "/tmp/a.txt"})

        # Pattern 2: Tool instance with explicit backend/session
        my_tool = MyTool()
        result = await gc.invoke_tool(
            my_tool,
            {"arg": "value"},
            backend=BackendType.SHELL
        )

        # Pattern 3: Tool name with automatic lookup
        result = await gc.invoke_tool("read_file", {"path": "/tmp/a.txt"})

        # Pattern 4: Tool name with explicit backend/server
        result = await gc.invoke_tool(
            "read_file",
            {"path": "/tmp/a.txt"},
            backend=BackendType.MCP,
            server="filesystem"
        )
    """
    params = parameters or kwargs

    # BaseTool instance
    if isinstance(tool, BaseTool):
        tool_name = tool.schema.name

        if tool.is_bound and not (backend or session_name or server):
            # No explicit routing given: use the bound runtime info.
            runtime_backend = tool.runtime_info.backend
            runtime_session = tool.runtime_info.session_name
            runtime_server = tool.runtime_info.server_name
        else:
            # Use provided values, falling back to the tool's default backend.
            runtime_backend = backend or tool.backend_type
            runtime_session = session_name
            runtime_server = server

            if runtime_backend == BackendType.NOT_SET:
                raise GroundingError(
                    f"Cannot invoke tool '{tool_name}': no backend specified. "
                    f"Either bind runtime info or provide backend parameter.",
                    code=ErrorCode.TOOL_EXECUTION_FAIL
                )

    # Tool name string
    elif isinstance(tool, str):
        tool_name = tool

        if backend or session_name:
            # Explicit backend/session provided: use them.
            runtime_session = session_name
            runtime_server = server

            # Infer backend: prefer explicit backend; otherwise get it from the session.
            if backend is not None:
                runtime_backend = backend
            else:
                if runtime_session not in self._session_info:
                    raise ErrorCode.SESSION_NOT_FOUND(runtime_session)
                runtime_backend = self._session_info[runtime_session].backend_type
        else:
            # Auto-lookup: search all tools for a unique name match.
            all_tools = await self.list_tools(use_cache=True)
            matching = [t for t in all_tools if t.name == tool_name]

            if not matching:
                raise GroundingError(
                    f"Tool '{tool_name}' not found",
                    code=ErrorCode.TOOL_NOT_FOUND
                )

            if len(matching) > 1:
                sources = [
                    f"{t.runtime_info.backend.value}/{t.runtime_info.session_name}"
                    for t in matching
                    if t.is_bound
                ]
                raise GroundingError(
                    f"Multiple tools named '{tool_name}' found in: {sources}. "
                    f"Please specify 'backend' or 'session_name' parameter.",
                    code=ErrorCode.AMBIGUOUS_TOOL
                )

            # Use the found tool's runtime info.
            found_tool = matching[0]
            runtime_backend = found_tool.runtime_info.backend
            runtime_session = found_tool.runtime_info.session_name
            runtime_server = found_tool.runtime_info.server_name

    else:
        # Without this branch the routing variables below would be unbound
        # and the call would fail with an opaque UnboundLocalError.
        raise TypeError(
            f"tool must be a BaseTool instance or a tool name string, "
            f"got {type(tool).__name__}"
        )

    # Ensure the session exists (the SYSTEM backend does not use sessions).
    # Cached tools carry a session_name even if that session is not running.
    if runtime_backend != BackendType.SYSTEM:
        if not runtime_session or runtime_session not in self._sessions:
            runtime_session = await self.ensure_session(runtime_backend, runtime_server)

    try:
        provider = self._registry.get(runtime_backend)
        # SystemProvider doesn't use sessions, pass a dummy value.
        session_param = runtime_session if runtime_session else "system"
        result = await provider.call_tool(session_param, tool_name, params)

        # Update last_activity in session_info (skipped for the SYSTEM backend).
        if (
            runtime_backend != BackendType.SYSTEM
            and runtime_session
            and runtime_session in self._session_info
        ):
            async with self._lock:
                old_info = self._session_info[runtime_session]
                self._session_info[runtime_session] = old_info.model_copy(
                    update={"last_activity": datetime.utcnow()}
                )

        return result

    finally:
        # Auto-close the session if requested (skipped for the SYSTEM backend).
        if runtime_backend != BackendType.SYSTEM and not keep_session and runtime_session:
            if runtime_server or runtime_session.startswith(runtime_backend.value):
                await self.close_session(runtime_session)
class Provider(ABC, Generic[TSession]):
    """Backend provider base class.

    A provider owns the sessions of one backend (kept in ``self._sessions``,
    keyed by session name) and routes tool listing and tool calls to them.
    Subclasses implement ``initialize``, ``create_session`` and
    ``close_session``.
    """

    def __init__(self, backend_type: BackendType, config: Optional[Dict[str, Any]] = None):
        """Store backend type and config, and set up the security policy.

        Args:
            backend_type: Backend this provider serves.
            config: Provider configuration; ``None`` is stored as ``{}``.
        """
        self.backend_type = backend_type
        self.config = config or {}
        self.is_initialized = False
        self._sessions: Dict[str, TSession] = {}  # session management
        self._session_counter: int = 0
        self.security_manager = SecurityPolicyManager()
        self._setup_security_policy(config)

    def _setup_security_policy(self, config: dict | None = None):
        """Load this backend's security policy from the global config.

        ``config`` is accepted for interface compatibility and is not used.
        """
        security_policy = get_config().get_security_policy(self.backend_type.value)
        # Register the policy under the backend it was loaded for.
        self.security_manager.set_backend_policy(self.backend_type, security_policy)
        if self.backend_type != BackendType.SHELL:
            # NOTE(review): the policy used to be registered under SHELL for
            # every provider. The SHELL slot is still filled in case existing
            # lookups are keyed on it - TODO confirm and remove.
            self.security_manager.set_backend_policy(BackendType.SHELL, security_policy)

    async def ensure_initialized(self) -> None:
        """Internal helper. Guarantee that `initialize()` has been executed."""
        if not self.is_initialized:
            await self.initialize()

    @abstractmethod
    async def initialize(self) -> None:
        """Initialize provider, call `create_session` to create all sessions if not exist.

        Subclasses should set `self.is_initialized = True` after successful initialization.
        """
        pass

    @abstractmethod
    async def create_session(self, session_config: SessionConfig) -> TSession:
        """Create session, update _sessions."""
        pass

    @abstractmethod
    async def close_session(self, session_name: str) -> None:
        """Close session."""
        pass

    def list_sessions(self) -> List[str]:
        """Return the names of all sessions held by this provider."""
        return list(self._sessions.keys())

    def get_session(self, session_name: str) -> Optional[TSession]:
        """Return the session with that name, or None if it is unknown."""
        return self._sessions.get(session_name)

    async def close_all_sessions(self) -> None:
        """Provider shutdown cleanup.

        Closes every session; a failure on one session is logged and does not
        stop the others. Afterwards the session table is cleared and the
        provider is marked uninitialized.
        """
        for session_name in list(self._sessions.keys()):
            try:
                await self.close_session(session_name)
            except Exception as e:
                # Use the module logger (as call_tool does) instead of print.
                logger.error("Error closing session %s: %s", session_name, e)

        self._sessions.clear()
        self.is_initialized = False

    def __repr__(self) -> str:
        return (f"Provider(backend={self.backend_type.value}, "
                f"initialized={self.is_initialized}, "
                f"sessions={len(self._sessions)}, "
                f"config_items={len(self.config)})")

    async def list_tools(self, session_name: Optional[str] = None) -> List[BaseTool]:
        """Return the tools of one session or of all sessions.

        Args:
            session_name: If given, only that session's tools are returned
                (an empty list when the session is unknown). If omitted, the
                tools of all sessions are concatenated.
        """
        await self.ensure_initialized()

        if session_name:
            session = self._sessions.get(session_name)
            return await session.list_tools() if session else []

        tools: list[BaseTool] = []
        for sess in self._sessions.values():
            tools.extend(await sess.list_tools())
        return tools

    async def call_tool(
        self,
        session_name: str,
        tool_name: str,
        parameters: Dict[str, Any] | None = None,
    ) -> ToolResult:
        """Call a tool on a session, converting failures into error results.

        Returns:
            The session's ``ToolResult``. An unknown session or an exception
            raised by the session yields a ``ToolResult`` with
            ``ToolStatus.ERROR`` instead of raising.
        """
        await self.ensure_initialized()

        parameters = parameters or {}
        session = self._sessions.get(session_name)
        if session is None:
            return ToolResult(
                status=ToolStatus.ERROR,
                content="",
                error=f"Session '{session_name}' not found",
                metadata={"session_name": session_name, "tool_name": tool_name},
            )

        try:
            return await session.call_tool(tool_name, parameters)
        except Exception as e:
            logger.error("Execute tool error: %s @%s - %s", tool_name, session_name, e)
            return ToolResult(
                status=ToolStatus.ERROR,
                content="",
                error=str(e),
                metadata={"session_name": session_name, "tool_name": tool_name},
            )
# Process-wide quality manager; stays None until set_quality_manager() is called.
_global_manager: "ToolQualityManager | None" = None


def get_quality_manager() -> "ToolQualityManager | None":
    """Return the manager last passed to ``set_quality_manager`` (None if never set)."""
    return _global_manager


def set_quality_manager(manager: "ToolQualityManager") -> None:
    """Replace the module-level quality manager with ``manager``."""
    global _global_manager
    _global_manager = manager
def get_tool_key(self, tool: "BaseTool") -> str:
    """Build the ``"<backend>:<server>:<tool name>"`` key identifying a tool.

    Bound tools use the backend and server from their runtime info (server
    falls back to ``"default"`` when falsy). Unbound tools use their declared
    backend type, or ``"unknown"`` when it is ``BackendType.NOT_SET``, with
    server ``"default"``.
    """
    from anytool.grounding.core.types import BackendType

    if not tool.is_bound:
        declared = tool.backend_type
        backend = declared.value if declared != BackendType.NOT_SET else "unknown"
        server = "default"
        return f"{backend}:{server}:{tool.name}"

    info = tool.runtime_info
    backend = info.backend.value
    server = info.server_name or "default"
    return f"{backend}:{server}:{tool.name}"
def get_record(self, tool: "BaseTool") -> ToolQualityRecord:
    """Return the quality record for ``tool``, creating it on first access.

    A new record is stored in ``self._records`` under the tool key; its
    backend, server and tool name come from splitting that key on the first
    two colons, and its description hash is computed from the tool.
    """
    key = self.get_tool_key(tool)
    if key in self._records:
        return self._records[key]

    # Key layout is "<backend>:<server>:<tool name>"; the name keeps any
    # further colons because of maxsplit=2.
    backend_part, server_part, name_part = key.split(":", 2)
    fresh_record = ToolQualityRecord(
        tool_key=key,
        backend=backend_part,
        server=server_part,
        tool_name=name_part,
        description_hash=self._compute_description_hash(tool),
    )
    self._records[key] = fresh_record
    return fresh_record
""" if not self._llm_client: logger.debug("LLM client not available for description evaluation") return None record = self.get_record(tool) # Skip if already evaluated and not forced if record.description_quality and not force: # Check if description changed current_hash = self._compute_description_hash(tool) if current_hash == record.description_hash: return record.description_quality # Build evaluation prompt desc = tool.description or "No description provided" if len(desc) > 4000: desc = desc[:4000] + "\n... (truncated for length)" params = tool.schema.parameters or {} if params: param_lines = [] # Extract parameter names and types from JSON schema if "properties" in params: for param_name, param_info in params.get("properties", {}).items(): param_type = param_info.get("type", "unknown") param_desc = param_info.get("description", "") param_lines.append(f"- {param_name} ({param_type}): {param_desc}" if param_desc else f"- {param_name} ({param_type})") param_text = "\n".join(param_lines) if param_lines else "No parameter descriptions available" else: param_text = "No parameters" prompt = f"""# Task: Evaluate this tool's documentation quality ## Tool Information Name: {tool.name} Description: {desc} Parameters: {param_text} ## Evaluation Task Rate the documentation on two dimensions (0.0 to 1.0 scale): ### 1. Clarity How clear is the tool's purpose and usage? - 0.0-0.3: No description or completely unclear - 0.4-0.6: Basic purpose but vague - 0.7-0.8: Clear purpose and functionality - 0.9-1.0: Very clear with usage examples or context ### 2. Completeness Are inputs/outputs properly documented? 
- 0.0-0.3: Missing critical information - 0.4-0.6: Basic info but lacks details - 0.7-0.8: Well documented with types - 0.9-1.0: Comprehensive with constraints and examples ## Scoring Guidelines - Short descriptions can score high if clear and accurate - If parameters exist but aren't explained in description, reduce completeness score - Missing description means clarity = 0.0 ## Output Respond with JSON only: ```json {{ "reasoning": "Brief 1-2 sentence analysis", "clarity": 0.8, "completeness": 0.7 }} ```""" try: response = await self._llm_client.complete(prompt) content = response["message"]["content"] # Parse JSON response import json # Extract complete JSON object def extract_json_object(text: str) -> str | None: """Extract first complete JSON object from text by counting braces.""" start = text.find('{') if start == -1: return None count = 0 in_string = False escape_next = False for i, char in enumerate(text[start:], start): if escape_next: escape_next = False continue if char == '\\': escape_next = True continue if char == '"' and not escape_next: in_string = not in_string continue if not in_string: if char == '{': count += 1 elif char == '}': count -= 1 if count == 0: return text[start:i+1] return None json_str = extract_json_object(content) if not json_str: logger.warning(f"Could not find JSON in LLM response for {tool.name}") return None data = json.loads(json_str) # Extract and validate scores with robust error handling def safe_float(value, default=0.5, min_val=0.0, max_val=1.0): """Safely convert to float and clamp to valid range.""" try: if value is None: return default f = float(value) return max(min_val, min(max_val, f)) except (ValueError, TypeError): logger.warning(f"Invalid score value: {value}, using default {default}") return default clarity = safe_float(data.get("clarity"), default=0.5) completeness = safe_float(data.get("completeness"), default=0.5) reasoning = str(data.get("reasoning", ""))[:500] # Limit reasoning length quality = 
# Quality-Aware Ranking
def adjust_ranking(
    self,
    tools_with_scores: List[Tuple["BaseTool", float]],
) -> List[Tuple["BaseTool", float]]:
    """Re-rank tools by multiplying each semantic score by the tool's penalty.

    Args:
        tools_with_scores: List of ``(tool, semantic_score)`` tuples.

    Returns:
        A new list of ``(tool, adjusted_score)`` tuples sorted by adjusted
        score, highest first. Ties keep their input order (stable sort).
    """
    adjusted = [
        (tool, semantic_score * self.get_penalty(tool))
        for tool, semantic_score in tools_with_scores
    ]
    adjusted.sort(key=lambda pair: pair[1], reverse=True)
    return adjusted


def get_penalty(self, tool: "BaseTool") -> float:
    """Return the penalty factor stored on the tool's quality record."""
    return self.get_record(tool).penalty


# Change Detection
def check_changes(self, tools: List["BaseTool"]) -> Dict[str, str]:
    """Classify each tool as new, updated or unchanged.

    A tool is "new" when it has no record, and "updated" when its current
    description hash differs from the recorded one. For updated tools the
    stale description evaluation is dropped and the stored hash refreshed.

    Returns:
        Dict mapping tool key to ``"new"``, ``"updated"`` or ``"unchanged"``.
    """
    changes: Dict[str, str] = {}
    for tool in tools:
        key = self.get_tool_key(tool)
        current_hash = self._compute_description_hash(tool)

        if key not in self._records:
            changes[key] = "new"
        elif self._records[key].description_hash != current_hash:
            changes[key] = "updated"
            # The old evaluation described the previous text; discard it.
            self._records[key].description_quality = None
            self._records[key].description_hash = current_hash
        else:
            changes[key] = "unchanged"

    statuses = list(changes.values())
    new_count = statuses.count("new")
    updated_count = statuses.count("updated")
    if new_count or updated_count:
        logger.info(f"Tool changes: {new_count} new, {updated_count} updated")

    return changes


async def save(self) -> None:
    """Persist all records, together with the global execution count.

    Usually unnecessary because the auto-save paths already persist; this is
    the explicit public entry point. Does nothing when no store is configured.

    The execution count must be passed along: the store writes whatever count
    it is given (default 0), so omitting it would reset the persisted counter.
    """
    if self._store:
        await self._store.save_all(self._records, self._global_execution_count)


def clear_cache(self) -> None:
    """Drop all in-memory records and clear the store, if one is configured."""
    self._records.clear()
    if self._store:
        self._store.clear()


def get_stats(self) -> Dict:
    """Return aggregate quality statistics over all tracked tools.

    Returns:
        ``{"total_tools": 0}`` when nothing is tracked; otherwise a dict with
        the tool count, total executions, mean success rate, mean quality
        score and the number of tools that have a description evaluation.
    """
    records = list(self._records.values())
    if not records:
        return {"total_tools": 0}

    count = len(records)
    return {
        "total_tools": count,
        "total_executions": sum(r.total_calls for r in records),
        "avg_success_rate": sum(r.success_rate for r in records) / count,
        "avg_quality_score": sum(r.quality_score for r in records) / count,
        "tools_with_description_eval": sum(
            1 for r in records if r.description_quality
        ),
    }
Args: n: Number of tools to return backend: Filter by backend type (optional) min_calls: Minimum calls required (to filter untested tools) """ records = [ r for r in self._records.values() if r.total_calls >= min_calls and (backend is None or r.backend == backend) ] records.sort(key=lambda r: r.quality_score, reverse=True) return records[:n] def get_problematic_tools( self, success_rate_threshold: float = 0.5, min_calls: int = 5, ) -> List[ToolQualityRecord]: """ Get tools with low success rate (candidates for review/removal). Args: success_rate_threshold: Tools below this rate are flagged min_calls: Minimum calls required (avoid flagging new tools) """ return [ r for r in self._records.values() if r.total_calls >= min_calls and r.recent_success_rate < success_rate_threshold ] def get_quality_report(self) -> Dict: """ Generate comprehensive quality report for upper layer. Returns structured report with: - Overall stats - Per-backend breakdown - Top/problematic tools - Improvement suggestions """ if not self._records: return {"status": "no_data", "message": "No quality data collected yet"} records = list(self._records.values()) tested_records = [r for r in records if r.total_calls >= 3] # Per-backend stats backends = {} for r in records: if r.backend not in backends: backends[r.backend] = { "tools": 0, "total_calls": 0, "success_count": 0, "servers": set() } backends[r.backend]["tools"] += 1 backends[r.backend]["total_calls"] += r.total_calls backends[r.backend]["success_count"] += r.success_count backends[r.backend]["servers"].add(r.server) # Convert sets to counts for b in backends: backends[b]["servers"] = len(backends[b]["servers"]) backends[b]["success_rate"] = ( backends[b]["success_count"] / backends[b]["total_calls"] if backends[b]["total_calls"] > 0 else 0 ) # Top and problematic tools top_tools = self.get_top_tools(5) problematic = self.get_problematic_tools() return { "summary": { "total_tools": len(records), "tested_tools": len(tested_records), 
"total_executions": sum(r.total_calls for r in records), "overall_success_rate": ( sum(r.success_count for r in records) / max(1, sum(r.total_calls for r in records)) ), "avg_quality_score": ( sum(r.quality_score for r in tested_records) / len(tested_records) if tested_records else 0 ), }, "by_backend": backends, "top_tools": [ {"key": r.tool_key, "score": r.quality_score, "success_rate": r.success_rate} for r in top_tools ], "problematic_tools": [ {"key": r.tool_key, "success_rate": r.success_rate, "calls": r.total_calls} for r in problematic ], "recommendations": self._generate_recommendations(records, problematic), } def _generate_recommendations( self, records: List[ToolQualityRecord], problematic: List[ToolQualityRecord], ) -> List[str]: """Generate actionable recommendations based on quality data.""" recommendations = [] # Check for problematic tools if problematic: tool_names = [r.tool_name for r in problematic[:3]] recommendations.append( f"Review low-success tools: {', '.join(tool_names)}" ) # Check for tools needing description evaluation unevaluated = [r for r in records if not r.description_quality and r.total_calls >= 3] if unevaluated: recommendations.append( f"{len(unevaluated)} tools need description quality evaluation" ) # Check for low description quality poor_docs = [ r for r in records if r.description_quality and r.description_quality.overall_score < 0.5 ] if poor_docs: recommendations.append( f"{len(poor_docs)} tools have poor documentation quality" ) return recommendations def compute_adaptive_quality_weight(self) -> float: """ Compute adaptive quality weight based on data confidence. Returns higher weight when we have more reliable quality data, lower weight when data is sparse. 
""" if not self._records: return 0.1 # Low weight when no data records = list(self._records.values()) tested_count = sum(1 for r in records if r.total_calls >= 3) if tested_count == 0: return 0.1 # More tested tools -> higher confidence -> higher weight coverage = tested_count / len(records) # Average calls per tested tool -> data richness avg_calls = sum(r.total_calls for r in records) / len(records) richness = min(1.0, avg_calls / 20) # Cap at 20 calls average # Combine coverage and richness confidence = (coverage * 0.5 + richness * 0.5) # Map to weight range [0.1, 0.5] weight = 0.1 + confidence * 0.4 return round(weight, 2) def should_reevaluate_description(self, tool: "BaseTool") -> bool: """ Check if a tool's description should be re-evaluated. Triggers re-evaluation when: - Description hash changed - Success rate dropped significantly - No evaluation yet but enough calls """ record = self._records.get(self.get_tool_key(tool)) if not record: return True # Check hash change current_hash = self._compute_description_hash(tool) if current_hash != record.description_hash: return True # No evaluation yet but enough data if not record.description_quality and record.total_calls >= 5: return True # Success rate dropped significantly (maybe description is misleading) if record.description_quality and record.total_calls >= 10: if record.recent_success_rate < 0.5 and record.description_quality.overall_score > 0.7: # High doc quality but low success -> mismatch return True return False async def evolve(self, tools: List["BaseTool"]) -> Dict: """ Run self-evolution cycle on given tools. This method: 1. Detects tool changes 2. Re-evaluates descriptions where needed 3. Updates quality weights 4. Returns evolution report """ report = { "changes_detected": {}, "descriptions_evaluated": 0, "adaptive_weight": 0.0, "recommendations": [], } # 1. Detect changes report["changes_detected"] = self.check_changes(tools) # 2. 
def should_evolve(self) -> bool:
    """Report whether enough executions have happened since the last cycle.

    True once the global execution count reaches the count recorded at the
    last evolution plus the configured interval.
    """
    next_due = self._last_evolve_count + self._evolve_interval
    return self._global_execution_count >= next_due


def get_tool_insights(self, tool: "BaseTool") -> Dict:
    """Return a detailed snapshot of one tool's quality data.

    Returns:
        ``{"status": "not_tracked", "tool": <name>}`` when the tool has no
        record; otherwise call counts, success rates, timing, quality score,
        the description evaluation (or None), the number of failures among
        the last 20 recorded executions, and ISO-formatted timestamps.
    """
    record = self._records.get(self.get_tool_key(tool))
    if not record:
        return {"status": "not_tracked", "tool": tool.name}

    # Only the 20 most recent executions are inspected for failures.
    latest = record.recent_executions[-20:]
    recent_failures_count = len([entry for entry in latest if not entry.success])

    insights = {
        "tool_key": record.tool_key,
        "total_calls": record.total_calls,
        "success_rate": record.success_rate,
        "recent_success_rate": record.recent_success_rate,
        "avg_execution_time_ms": record.avg_execution_time_ms,
        "quality_score": record.quality_score,
    }

    evaluation = record.description_quality
    if evaluation:
        insights["description_quality"] = {
            "overall_score": evaluation.overall_score,
            "clarity": evaluation.clarity,
            "completeness": evaluation.completeness,
            "reasoning": evaluation.reasoning,
        }
    else:
        insights["description_quality"] = None

    insights["recent_failures_count"] = recent_failures_count
    insights["first_seen"] = record.first_seen.isoformat()
    insights["last_updated"] = record.last_updated.isoformat()
    return insights
def __init__(self, cache_dir: Optional[Path] = None):
    """Prepare the on-disk location for quality records.

    Args:
        cache_dir: Directory for the record files. Defaults to
            ``PROJECT_ROOT/.anytool/tool_quality``. Created if missing.
    """
    chosen_dir = PROJECT_ROOT / ".anytool" / "tool_quality" if cache_dir is None else cache_dir

    self._cache_dir = Path(chosen_dir)
    self._cache_dir.mkdir(parents=True, exist_ok=True)

    self._records_file = self._cache_dir / "records.json"
    self._backup_file = self._cache_dir / "records_backup.json"
    # Held by the async save path while it writes.
    self._write_lock = asyncio.Lock()

    logger.debug(f"QualityStore initialized at {self._cache_dir}")


def load_all(self) -> tuple[Dict[str, ToolQualityRecord], int]:
    """Read every quality record plus the global execution count from disk.

    Returns:
        Tuple of ``(records_dict, global_execution_count)``. ``({}, 0)`` is
        returned when the file is missing, has a different version, or
        cannot be read; individual records that fail to parse are skipped
        with a warning.
    """
    if not self._records_file.exists():
        return {}, 0

    try:
        with open(self._records_file, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        if payload.get("version") != self.VERSION:
            logger.warning(f"Cache version mismatch, clearing cache")
            return {}, 0

        loaded = {}
        for key, raw_record in payload.get("records", {}).items():
            # One bad record must not prevent the others from loading.
            try:
                loaded[key] = ToolQualityRecord.from_dict(raw_record)
            except Exception as e:
                logger.warning(f"Failed to load record {key}: {e}")

        global_count = payload.get("global_execution_count", 0)
        logger.info(f"Loaded {len(loaded)} quality records from cache (global_count={global_count})")
        return loaded, global_count

    except Exception as e:
        logger.error(f"Failed to load quality cache: {e}")
        return {}, 0
@dataclass
class ExecutionRecord:
    """Outcome of a single tool invocation."""
    timestamp: datetime
    success: bool
    execution_time_ms: float
    error_message: Optional[str] = None


@dataclass
class DescriptionQuality:
    """Documentation-quality scores produced by an LLM evaluation."""
    clarity: float  # 0-1: how clear the purpose and usage are
    completeness: float  # 0-1: how well inputs/outputs are documented
    evaluated_at: datetime
    reasoning: str = ""  # the evaluator's explanation for the scores

    @property
    def overall_score(self) -> float:
        """Mean of the clarity and completeness scores."""
        return (self.clarity + self.completeness) / 2


@dataclass
class ToolQualityRecord:
    """
    Accumulated quality data for one tool.

    Key: "{backend}:{server}:{tool_name}"
    """
    tool_key: str
    backend: str
    server: str
    tool_name: str

    # Lifetime execution counters
    total_calls: int = 0
    success_count: int = 0
    total_execution_time_ms: float = 0.0

    # Rolling window of the most recent executions
    recent_executions: List[ExecutionRecord] = field(default_factory=list)

    # LLM evaluation of the description, when one has been made
    description_quality: Optional[DescriptionQuality] = None

    # Bookkeeping
    description_hash: Optional[str] = None
    first_seen: datetime = field(default_factory=datetime.now)
    last_updated: datetime = field(default_factory=datetime.now)

    # Size cap of the rolling window
    MAX_RECENT_EXECUTIONS: ClassVar[int] = 100

    # Tools whose recent success rate is at or above this value are not penalized
    PENALTY_THRESHOLD: ClassVar[float] = 0.4

    @property
    def success_rate(self) -> float:
        """Lifetime success ratio; 0.0 before the first call."""
        if self.total_calls == 0:
            return 0.0
        return self.success_count / self.total_calls

    @property
    def avg_execution_time_ms(self) -> float:
        """Mean execution time over all calls; 0.0 before the first call."""
        if self.total_calls == 0:
            return 0.0
        return self.total_execution_time_ms / self.total_calls

    @property
    def recent_success_rate(self) -> float:
        """Success ratio within the rolling window.

        Falls back to the lifetime rate when the window is empty.
        """
        window = self.recent_executions
        if not window:
            return self.success_rate
        succeeded = len([entry for entry in window if entry.success])
        return succeeded / len(window)

    @property
    def consecutive_failures(self) -> int:
        """Length of the failure streak ending at the most recent execution."""
        for streak, entry in enumerate(reversed(self.recent_executions)):
            if entry.success:
                return streak
        # No success in the window: every recorded execution failed.
        return len(self.recent_executions)

    @property
    def penalty(self) -> float:
        """
        Ranking multiplier in [0.2, 1.0] derived from recent failures.

        - Fewer than 3 calls: 1.0 (not enough data to judge).
        - Recent success rate >= PENALTY_THRESHOLD: 1.0 (no penalty).
        - Otherwise the rate is mapped linearly from 0.3 (rate 0) towards 1.0
          (rate at the threshold), reduced by 0.1 per consecutive failure
          beyond the second (at most 0.3), and clamped to [0.2, 1.0].
        """
        if self.total_calls < 3:
            return 1.0

        rate = self.recent_success_rate
        cutoff = self.PENALTY_THRESHOLD
        if rate >= cutoff:
            return 1.0

        factor = 0.3 + (rate / cutoff) * 0.7

        # A failure streak hints at a systematic problem; penalize harder.
        streak = self.consecutive_failures
        if streak >= 3:
            factor -= min(0.3, (streak - 2) * 0.1)

        return max(0.2, min(1.0, factor))

    @property
    def quality_score(self) -> float:
        """
        Legacy alias kept for backward compatibility; returns ``penalty``.
        """
        return self.penalty

    def add_execution(self, record: ExecutionRecord) -> None:
        """Fold one execution into the counters and the rolling window."""
        self.total_calls += 1
        if record.success:
            self.success_count += 1
        self.total_execution_time_ms += record.execution_time_ms

        self.recent_executions.append(record)
        cap = self.MAX_RECENT_EXECUTIONS
        if len(self.recent_executions) > cap:
            # Keep only the newest entries.
            self.recent_executions = self.recent_executions[-cap:]

        self.last_updated = datetime.now()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-compatible dict (datetimes as ISO strings)."""
        history = [
            {
                "timestamp": entry.timestamp.isoformat(),
                "success": entry.success,
                "execution_time_ms": entry.execution_time_ms,
                "error_message": entry.error_message,
            }
            for entry in self.recent_executions
        ]

        evaluation = None
        if self.description_quality:
            evaluation = {
                "clarity": self.description_quality.clarity,
                "completeness": self.description_quality.completeness,
                "evaluated_at": self.description_quality.evaluated_at.isoformat(),
                "reasoning": self.description_quality.reasoning,
            }

        return {
            "tool_key": self.tool_key,
            "backend": self.backend,
            "server": self.server,
            "tool_name": self.tool_name,
            "total_calls": self.total_calls,
            "success_count": self.success_count,
            "total_execution_time_ms": self.total_execution_time_ms,
            "recent_executions": history,
            "description_quality": evaluation,
            "description_hash": self.description_hash,
            "first_seen": self.first_seen.isoformat(),
            "last_updated": self.last_updated.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ToolQualityRecord":
        """Rebuild a record from the dict produced by ``to_dict``.

        Counters, the hash, the execution history and the description
        evaluation are optional; identity fields and both timestamps are
        required.
        """
        restored = cls(
            tool_key=data["tool_key"],
            backend=data["backend"],
            server=data["server"],
            tool_name=data["tool_name"],
            total_calls=data.get("total_calls", 0),
            success_count=data.get("success_count", 0),
            total_execution_time_ms=data.get("total_execution_time_ms", 0.0),
            description_hash=data.get("description_hash"),
            first_seen=datetime.fromisoformat(data["first_seen"]),
            last_updated=datetime.fromisoformat(data["last_updated"]),
        )

        restored.recent_executions.extend(
            ExecutionRecord(
                timestamp=datetime.fromisoformat(item["timestamp"]),
                success=item["success"],
                execution_time_ms=item["execution_time_ms"],
                error_message=item.get("error_message"),
            )
            for item in data.get("recent_executions", [])
        )

        evaluation = data.get("description_quality")
        if evaluation:
            restored.description_quality = DescriptionQuality(
                clarity=evaluation.get("clarity", 0.5),  # default for older data
                completeness=evaluation.get("completeness", 0.5),
                evaluated_at=datetime.fromisoformat(evaluation["evaluated_at"]),
                reasoning=evaluation.get("reasoning", ""),
            )

        return restored
""" # Check for remote API config from environment self._api_base_url = os.getenv("EMBEDDING_BASE_URL") self._api_key = os.getenv("EMBEDDING_API_KEY") self._use_remote_api = bool(self._api_key and self._api_base_url) # Get model name: env > param > config > default if model_name is None: model_name = os.getenv("EMBEDDING_MODEL") if model_name is None: try: from anytool.config import get_config config = get_config() model_name = config.tool_search.embedding_model except Exception as exc: logger.warning(f"Failed to load config, using default model: {exc}") model_name = "BAAI/bge-small-en-v1.5" self._model_name = model_name self._embed_model = None # lazy load self._embedding_fn = None if self._use_remote_api: logger.info(f"Using remote embedding API: {self._api_base_url}, model: {model_name}") # Persistent cache settings self._enable_cache_persistence = enable_cache_persistence if cache_dir is None: cache_dir = PROJECT_ROOT / ".anytool" / "embedding_cache" self._cache_dir = Path(cache_dir) # Log cache settings logger.info( f"ToolRanker initialized: enable_cache_persistence={enable_cache_persistence}, " f"cache_dir={self._cache_dir}" ) # Structured in-memory cache # Structure: {backend: {server: {tool_name: {"embedding": np.ndarray, "description": str, "cached_at": str}}}} self._structured_cache: Dict[str, Dict[str, Dict[str, Dict[str, Any]]]] = {} # For backward compatibility and quick lookup: {text -> (backend, server, tool_name)} self._text_to_key: Dict[str, Tuple[str, str, str]] = {} # Load persistent cache if enabled if self._enable_cache_persistence: logger.info(f"Loading persistent cache from {self._cache_dir}") self._load_persistent_cache() def _get_cache_key(self, tool: BaseTool) -> Tuple[str, str, str]: """Get structured cache key (backend, server, tool_name) from tool.""" if tool.is_bound: backend = tool.runtime_info.backend.value server = tool.runtime_info.server_name or "default" else: if not tool.backend_type or tool.backend_type == BackendType.NOT_SET: 
backend = "UNKNOWN" else: backend = tool.backend_type.value server = "default" return (backend, server, tool.name) def _get_cache_file_path(self) -> Path: """Get the cache file path for the current model.""" # Use model name in filename to support multiple models safe_model_name = self._model_name.replace("/", "_").replace("\\", "_") return self._cache_dir / f"embeddings_{safe_model_name}_v{self.CACHE_VERSION}.pkl" def _load_persistent_cache(self) -> None: """Load embeddings from disk cache.""" cache_file = self._get_cache_file_path() if not cache_file.exists(): logger.debug(f"No persistent cache found at {cache_file}") return try: with open(cache_file, 'rb') as f: data = pickle.load(f) # Validate cache version if isinstance(data, dict) and data.get("version") == self.CACHE_VERSION: self._structured_cache = data.get("embeddings", {}) self._rebuild_text_index() # Count total embeddings total = sum( len(tools) for backend in self._structured_cache.values() for tools in backend.values() ) logger.info(f"Loaded {total} embeddings from cache: {cache_file}") else: logger.warning(f"Cache version mismatch or invalid format, starting fresh") self._structured_cache = {} except Exception as exc: logger.warning(f"Failed to load persistent cache: {exc}") self._structured_cache = {} def _rebuild_text_index(self) -> None: """Rebuild text-to-key mapping for quick lookup.""" self._text_to_key.clear() for backend, servers in self._structured_cache.items(): for server, tools in servers.items(): for tool_name, tool_data in tools.items(): desc = tool_data.get("description", "") text = f"{tool_name}: {desc}" self._text_to_key[text] = (backend, server, tool_name) def _save_persistent_cache(self) -> None: """Save embeddings to disk cache.""" if not self._enable_cache_persistence or not self._structured_cache: return cache_file = self._get_cache_file_path() try: # Create directory if it doesn't exist cache_file.parent.mkdir(parents=True, exist_ok=True) # Build cache data with metadata 
cache_data = { "version": self.CACHE_VERSION, "model_name": self._model_name, "last_updated": datetime.now().isoformat(), "embeddings": self._structured_cache } # Save cache with open(cache_file, 'wb') as f: pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL) # Count total embeddings total = sum( len(tools) for backend in self._structured_cache.values() for tools in backend.values() ) logger.debug(f"Saved {total} embeddings to cache: {cache_file}") except Exception as exc: logger.warning(f"Failed to save persistent cache: {exc}") def rank( self, query: str, tools: List[BaseTool], *, top_k: int = 50, mode: SearchMode = SearchMode.SEMANTIC, ) -> List[Tuple[BaseTool, float]]: if mode == SearchMode.KEYWORD: return self._keyword_search(query, tools, top_k) if mode == SearchMode.SEMANTIC: return self._semantic_search(query, tools, top_k) # hybrid return self._hybrid_search(query, tools, top_k) @staticmethod def _tokenize(text: str) -> list[str]: tokens = re.split(r"[^\w]+", text.lower()) tokens = [tok for tok in tokens if tok] return tokens def _keyword_search( self, query: str, tools: Iterable[BaseTool], top_k: int ) -> List[Tuple[BaseTool, float]]: try: from rank_bm25 import BM25Okapi # type: ignore except ImportError: BM25Okapi = None # fallback below tool_list = list(tools) if not tool_list: return [] corpus_tokens: list[list[str]] = [self._tokenize(f"{t.name} {t.description}") for t in tool_list] query_tokens = self._tokenize(query) if BM25Okapi and corpus_tokens: bm25 = BM25Okapi(corpus_tokens) scores = bm25.get_scores(query_tokens) scored = [(t, float(s)) for t, s in zip(tool_list, scores, strict=True)] else: # fallback: simple term overlap ratio q_set = set(query_tokens) scored = [] for t, toks in zip(tool_list, corpus_tokens, strict=True): if not toks: scored.append((t, 0.0)) # Include tool with 0 score continue overlap = q_set.intersection(toks) score = len(overlap) / len(q_set) if len(q_set) > 0 else 0.0 scored.append((t, score)) 
scored.sort(key=lambda x: x[1], reverse=True) result = scored[:top_k] # If no matches found (all scores are 0), return all tools if not result or all(score == 0.0 for _, score in result): logger.debug(f"Keyword search found no matches, returning all {len(tool_list)} tools") return [(t, 0.0) for t in tool_list] return result def _ensure_model(self) -> bool: """Ensure embedding model is ready (local or remote).""" if self._embedding_fn is not None: return True if self._use_remote_api: return self._init_remote_embedding() return self._init_local_embedding() def _init_remote_embedding(self) -> bool: """Initialize remote embedding API (OpenRouter/OpenAI compatible).""" try: def embed_texts(texts: List[str]) -> List[np.ndarray]: with httpx.Client(timeout=60.0) as client: response = client.post( f"{self._api_base_url}/embeddings", headers={ "Authorization": f"Bearer {self._api_key}", "Content-Type": "application/json" }, json={"model": self._model_name, "input": texts} ) response.raise_for_status() data = response.json() return [np.array(item["embedding"]) for item in data["data"]] self._embedding_fn = embed_texts logger.info(f"Remote embedding API initialized: {self._model_name}") return True except Exception as exc: logger.error(f"Failed to initialize remote embedding API: {exc}") return False def _init_local_embedding(self) -> bool: """Initialize local fastembed model.""" try: from fastembed import TextEmbedding logger.debug(f"fastembed imported successfully, loading model: {self._model_name}") except ImportError as e: logger.warning( f"fastembed not installed (ImportError: {e}), semantic search unavailable. 
" f"Install with: pip install fastembed" ) return False try: logger.info(f"Loading embedding model: {self._model_name}...") self._embed_model = TextEmbedding(model_name=self._model_name) self._embedding_fn = lambda txts: list(self._embed_model.embed(txts)) logger.info(f"Embedding model '{self._model_name}' loaded successfully") return True except Exception as exc: logger.error(f"Embedding model '{self._model_name}' loading failed: {exc}") return False def _get_embedding(self, tool: BaseTool) -> Optional[np.ndarray]: """Get embedding from structured cache.""" backend, server, tool_name = self._get_cache_key(tool) if backend not in self._structured_cache: return None if server not in self._structured_cache[backend]: return None if tool_name not in self._structured_cache[backend][server]: return None return self._structured_cache[backend][server][tool_name].get("embedding") def _set_embedding(self, tool: BaseTool, embedding: np.ndarray) -> None: """Store embedding in structured cache.""" backend, server, tool_name = self._get_cache_key(tool) # Initialize nested structure if needed if backend not in self._structured_cache: self._structured_cache[backend] = {} if server not in self._structured_cache[backend]: self._structured_cache[backend][server] = {} # Store embedding with metadata self._structured_cache[backend][server][tool_name] = { "embedding": embedding, "description": tool.description or "", "cached_at": datetime.now().isoformat() } # Update text index for backward compatibility text = f"{tool.name}: {tool.description}" self._text_to_key[text] = (backend, server, tool_name) def _semantic_search( self, query: str, tools: Iterable[BaseTool], top_k: int ) -> List[Tuple[BaseTool, float]]: if not self._ensure_model(): logger.debug("Semantic search unavailable, returning empty list") return [] tools_list = list(tools) # Collect embeddings with cache reuse missing_tools = [t for t in tools_list if self._get_embedding(t) is None] cache_updated = False if missing_tools: 
try: # Generate embeddings for missing tools missing_texts = [f"{t.name}: {t.description}" for t in missing_tools] new_embs = self._embedding_fn(missing_texts) for tool, emb in zip(missing_tools, new_embs, strict=True): self._set_embedding(tool, emb) cache_updated = True logger.debug(f"Computed embeddings for {len(missing_tools)} new tools") except Exception as exc: logger.error("Failed to generate embeddings: %s", exc) return [] # Save to persistent cache if updated if cache_updated: self._save_persistent_cache() try: q_emb = self._embedding_fn([query])[0] except Exception as exc: logger.error("Failed to embed query: %s", exc) return [] scored: list[tuple[BaseTool, float]] = [] for t in tools_list: emb = self._get_embedding(t) if emb is None: # Should not happen, but handle gracefully logger.warning(f"No embedding found for tool: {t.name}") scored.append((t, 0.0)) continue # Calculate cosine similarity with zero-division protection q_norm = np.linalg.norm(q_emb) emb_norm = np.linalg.norm(emb) if q_norm == 0 or emb_norm == 0: sim = 0.0 else: sim = float(np.dot(q_emb, emb) / (q_norm * emb_norm)) scored.append((t, sim)) scored.sort(key=lambda x: x[1], reverse=True) return scored[:top_k] def _hybrid_search( self, query: str, tools: Iterable[BaseTool], top_k: int ) -> List[Tuple[BaseTool, float]]: # keyword filter kw_top = self._keyword_search(query, tools, top_k * 3) if not kw_top: # No keyword matches, try semantic search semantic_results = self._semantic_search(query, tools, top_k) if semantic_results: return semantic_results # Both failed, return top N tools logger.warning("Both keyword and semantic search failed, returning top N tools") return [(t, 0.0) for t in list(tools)[:top_k]] # semantic ranking on keyword results semantic_results = self._semantic_search(query, [t for t, _ in kw_top], top_k) if semantic_results: return semantic_results # Semantic unavailable, return keyword results logger.debug("Semantic search unavailable, using keyword results only") 
return kw_top[:top_k] def get_cache_stats(self) -> Dict[str, Any]: """Get statistics about the embedding cache. Returns: Dict with structure: { "total_embeddings": int, "backends": { "backend_name": { "total": int, "servers": { "server_name": int # count of tools } } } } """ stats = { "total_embeddings": 0, "backends": {} } for backend, servers in self._structured_cache.items(): backend_total = 0 server_stats = {} for server, tools in servers.items(): tool_count = len(tools) backend_total += tool_count server_stats[server] = tool_count stats["backends"][backend] = { "total": backend_total, "servers": server_stats } stats["total_embeddings"] += backend_total return stats def clear_cache(self, backend: Optional[str] = None, server: Optional[str] = None) -> int: """Clear embeddings from cache. Args: backend: If provided, only clear this backend. If None, clear all. server: If provided (and backend is provided), only clear this server. Returns: Number of embeddings cleared. """ cleared_count = 0 if backend is None: # Clear everything for b in self._structured_cache.values(): for s in b.values(): cleared_count += len(s) self._structured_cache.clear() self._text_to_key.clear() elif server is None: # Clear specific backend if backend in self._structured_cache: for s in self._structured_cache[backend].values(): cleared_count += len(s) del self._structured_cache[backend] # Rebuild text index self._rebuild_text_index() else: # Clear specific backend+server if backend in self._structured_cache and server in self._structured_cache[backend]: cleared_count = len(self._structured_cache[backend][server]) del self._structured_cache[backend][server] # Clean up empty backend if not self._structured_cache[backend]: del self._structured_cache[backend] # Rebuild text index self._rebuild_text_index() # Save after clearing if cleared_count > 0 and self._enable_cache_persistence: self._save_persistent_cache() logger.info(f"Cleared {cleared_count} embeddings from cache") return 
class SearchDebugInfo:
    """Mutable record describing one run of the tool-search pipeline."""

    def __init__(self):
        # --- overall shape of the search ---
        self.search_mode: str = ""
        self.total_candidates: int = 0
        self.mcp_count: int = 0
        self.non_mcp_count: int = 0

        # --- LLM pre-filter outcome ---
        self.llm_filter_used: bool = False
        self.llm_brief_plan: str = ""
        # Maps server name -> names of the tools picked on that server
        self.llm_utility_tools: dict[str, list[str]] = {}
        self.llm_domain_servers: list[str] = []
        self.llm_utility_count: int = 0
        self.llm_domain_count: int = 0

        # --- ranking scores; items look like {name, server, score, selected} ---
        self.tool_scores: list[dict[str, Any]] = []

        # --- final selection; items look like {name, server, backend} ---
        self.selected_tools: list[dict[str, Any]] = []

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict with the LLM fields nested under ``"llm_filter"``.

        The list/dict values are the live attribute objects, not copies.
        """
        summary: Dict[str, Any] = {
            field: getattr(self, field)
            for field in ("search_mode", "total_candidates", "mcp_count", "non_mcp_count")
        }
        summary["llm_filter"] = {
            "used": self.llm_filter_used,
            "brief_plan": self.llm_brief_plan,
            "utility_tools": self.llm_utility_tools,
            "domain_servers": self.llm_domain_servers,
            "utility_count": self.llm_utility_count,
            "domain_count": self.llm_domain_count,
        }
        summary["tool_scores"] = self.tool_scores
        summary["selected_tools"] = self.selected_tools
        return summary
def __init__(
    self,
    *,
    max_tools: Optional[int] = None,
    llm: Optional[LLMClient] = None,
    enable_llm_filter: Optional[bool] = None,
    llm_filter_threshold: Optional[int] = None,
    enable_cache_persistence: Optional[bool] = None,
    cache_dir: Optional[str | Path] = None,
    quality_manager: Optional["ToolQualityManager"] = None,
    enable_quality_ranking: bool = True,
):
    """Create a SearchCoordinator.

    Settings left as ``None`` are resolved with the priority
    explicit argument -> ``tool_search`` section of the loaded config ->
    class-level ``DEFAULT_*`` constant.

    Args:
        max_tools: Max number of tools to return.
        llm: LLM client. When omitted (or ``None``) a fresh ``LLMClient()``
            is created for this coordinator.
        enable_llm_filter: Whether to use the LLM to pre-filter by
            backend/server.
        llm_filter_threshold: Tool-count threshold for the LLM filter. When
            ``None`` it falls back to the config value, then to
            ``DEFAULT_LLM_THRESHOLD``.
        enable_cache_persistence: Whether to persist embeddings to disk
            (forwarded to ``ToolRanker``).
        cache_dir: Directory for the persistent embedding cache (forwarded
            to ``ToolRanker``); ``None`` leaves the choice to the config or
            to the ranker.
        quality_manager: Optional quality manager stored for quality-aware
            ranking.
        enable_quality_ranking: Whether quality-aware ranking is enabled.
    """
    super().__init__()

    # Load config (stays None if loading fails, so defaults apply)
    tool_search_config = None
    try:
        from anytool.config import get_config
        tool_search_config = getattr(get_config(), 'tool_search', None)
    except Exception as exc:
        logger.warning(f"Failed to load config: {exc}")

    def resolve(user_value, config_attr: str, default):
        """Priority: user_value → config → default"""
        if user_value is not None:
            return user_value
        if tool_search_config is not None:
            config_value = getattr(tool_search_config, config_attr, None)
            if config_value is not None:
                return config_value
        return default

    # Resolve each setting with priority: user → config → default
    self.max_tools = resolve(max_tools, 'max_tools', self.DEFAULT_MAX_TOOLS)
    enable_llm_filter = resolve(enable_llm_filter, 'enable_llm_filter', self.DEFAULT_LLM_FILTER)
    llm_filter_threshold = resolve(llm_filter_threshold, 'llm_filter_threshold', self.DEFAULT_LLM_THRESHOLD)
    enable_cache_persistence = resolve(enable_cache_persistence, 'enable_cache_persistence', self.DEFAULT_CACHE_PERSISTENCE)
    cache_dir = resolve(cache_dir, 'cache_dir', None)
    # The search mode has no constructor argument: config or default only
    self._default_mode = resolve(None, 'search_mode', self.DEFAULT_SEARCH_MODE)

    # Log cache settings for debugging
    logger.info(
        f"SearchCoordinator initialized with cache settings: "
        f"enable_cache_persistence={enable_cache_persistence}, cache_dir={cache_dir}"
    )

    self._ranker = ToolRanker(
        enable_cache_persistence=enable_cache_persistence,
        cache_dir=cache_dir
    )
    # Built here rather than in the signature: a default-argument expression
    # runs once at definition time and would be shared by every instance.
    self._llm: LLMClient | None = llm if llm is not None else LLMClient()

    # LLM filter settings
    self._enable_llm_filter = enable_llm_filter
    self._llm_filter_threshold = llm_filter_threshold

    # Quality-aware ranking settings
    self._quality_manager = quality_manager
    self._enable_quality_ranking = enable_quality_ranking

    # Debug info from last search
    self._last_search_debug_info: Optional[SearchDebugInfo] = None
are always included, skip all filtering mcp_tools = [] non_mcp_tools = [] for t in candidate_tools: if t.is_bound: backend = t.runtime_info.backend.value else: backend = t.backend_type.value if t.backend_type else "UNKNOWN" if backend.lower() == "mcp": mcp_tools.append(t) else: non_mcp_tools.append(t) debug_info.mcp_count = len(mcp_tools) debug_info.non_mcp_count = len(non_mcp_tools) logger.info(f"Tool split: {len(mcp_tools)} MCP, {len(non_mcp_tools)} non-MCP (always included)") # If MCP tools within limit, return all if len(mcp_tools) <= max_tools: result = mcp_tools + non_mcp_tools self._query_cache[cache_key] = result self._populate_selected_tools(debug_info, result) return result mcp_count = len(mcp_tools) should_use_llm_filter = ( self._llm and self._enable_llm_filter and mcp_count > self._llm_filter_threshold ) # Path 1: LLM pre-filter (large MCP tool set) if should_use_llm_filter: logger.info(f"Path 1: MCP count ({mcp_count}) > threshold, using LLM filter...") debug_info.llm_filter_used = True try: utility_tools, domain_tools, llm_filter_info = await self._llm_filter_with_planning( task_prompt, mcp_tools ) # Record LLM filter results debug_info.llm_brief_plan = llm_filter_info.get("brief_plan", "") debug_info.llm_utility_tools = llm_filter_info.get("utility_tools", {}) debug_info.llm_domain_servers = llm_filter_info.get("domain_servers", []) utility_count = len(utility_tools) domain_count = len(domain_tools) debug_info.llm_utility_count = utility_count debug_info.llm_domain_count = domain_count total_count = utility_count + domain_count if total_count <= max_tools: mcp_result = utility_tools + domain_tools else: # Exceeds limit: keep utility, search domain domain_quota = max(max_tools - utility_count, 5) logger.info( f"Total ({total_count}) > max_tools ({max_tools}), " f"keeping {utility_count} utility, searching {domain_count} domain (quota: {domain_quota})" ) # Compute scores for utility tools (marked as LLM-selected) if utility_tools: utility_ranked = 
self._ranker.rank( task_prompt, utility_tools, top_k=len(utility_tools), mode=SearchMode(mode) ) self._record_tool_scores(debug_info, utility_ranked, is_selected=True) if domain_tools: # Rank all domain tools to see all scores for debugging all_domain_ranked = self._ranker.rank( task_prompt, domain_tools, top_k=len(domain_tools), mode=SearchMode(mode) ) # Save scores for all domain tools (mark which ones are selected) for i, (tool, score) in enumerate(all_domain_ranked): server_name = None if tool.is_bound and tool.runtime_info: server_name = tool.runtime_info.server_name debug_info.tool_scores.append({ "name": tool.name, "server": server_name, "score": round(score, 4), "selected": i < domain_quota, }) searched_domain = [t for t, _ in all_domain_ranked[:domain_quota]] else: searched_domain = [] mcp_result = utility_tools + searched_domain except Exception as exc: logger.warning(f"LLM filter failed ({exc}), fallback to direct ranking") ranked = self._ranker.rank(task_prompt, mcp_tools, top_k=max_tools, mode=SearchMode(mode)) self._record_tool_scores(debug_info, ranked, is_selected=True) mcp_result = [t for t, _ in ranked] # Path 2: Plan-enhanced search (small MCP tool set) else: logger.info(f"Path 2: MCP count ({mcp_count}) <= threshold, using enhanced search...") debug_info.llm_filter_used = False if self._llm: try: enhanced_query = await self._generate_search_query(task_prompt) except Exception: enhanced_query = task_prompt else: enhanced_query = task_prompt try: ranked = self._ranker.rank( enhanced_query, mcp_tools, top_k=max_tools, mode=SearchMode(mode) ) # Record all scores from semantic search self._record_tool_scores(debug_info, ranked, is_selected=True) mcp_result = [t for t, _ in ranked] except Exception: ranked = self._ranker._keyword_search( enhanced_query, mcp_tools, max_tools ) self._record_tool_scores(debug_info, ranked, is_selected=True) mcp_result = [t for t, _ in ranked] # Apply quality ranking on MCP results if self._enable_quality_ranking and 
async def _llm_filter_with_planning(
    self, task_prompt: str, tools: list[BaseTool]
) -> tuple[list[BaseTool], list[BaseTool], Dict[str, Any]]:
    """LLM pre-filter for MCP servers.

    The tools are grouped by server, the grouping is shown to the LLM, and the
    LLM's JSON answer is mapped back onto tool objects.

    Args:
        task_prompt: The task the tools are being selected for.
        tools: Candidate tools.

    Returns:
        ``(utility_tools, domain_tools, llm_filter_info)`` — always three
        values:

        * ``utility_tools``: individually named tools, looked up on the server
          the LLM named them under;
        * ``domain_tools``: every tool of each selected domain server, minus
          tools already present in ``utility_tools``;
        * ``llm_filter_info``: ``{"brief_plan", "utility_tools",
          "domain_servers"}`` for debugging.

        When the answer cannot be parsed, is not a JSON object, or matches no
        tool, all ``tools`` are returned as domain tools.
    """
    from collections import defaultdict

    # Group tools by server name
    server_tools: Dict[str, list[BaseTool]] = defaultdict(list)
    for t in tools:
        if t.is_bound and t.runtime_info:
            server = t.runtime_info.server_name or "default"
        else:
            server = "unknown"
        server_tools[server].append(t)

    # Build server description with tool names
    lines: list[str] = ["Available MCP servers:", ""]
    for server, tool_list in server_tools.items():
        lines.append(f"### Server: {server} ({len(tool_list)} tools)")
        lines.append(f" All tools: {', '.join(t.name for t in tool_list)}")

        if tool_list:
            lines.append(" Example capabilities:")
            for tool in tool_list[:5]:
                tool_desc = tool.description or "No description"
                if len(tool_desc) > 100:
                    tool_desc = tool_desc[:97] + "..."
                lines.append(f" - {tool.name}: {tool_desc}")
        lines.append("")

    servers_block = "\n".join(lines)

    TOOL_FILTER_SYSTEM_PROMPT = f"""You are an expert tool selection assistant.

# Your task
Analyze the given task and determine which MCP servers and tools are needed.
Think about how you would accomplish this task step by step, then classify needed servers and tools.

# Important guidelines
- **Focus on tool names and capabilities**: Carefully examine the tool names to understand what each server can do
- **Be inclusive for domain servers**: If a server has tools that might be relevant to the core task, include it
- **Be precise for utility tools**: Only select the specific auxiliary tools needed (e.g., file save, time query)
- **When in doubt, include in domain_servers**: It's better to include a server than miss relevant tools

{servers_block}

# Output format
Return ONLY a JSON object (no markdown, no explanation):
{{
    "brief_plan": "1-2 sentence execution plan",
    "utility_tools": {{
        "server1": ["tool1", "tool2"]
    }},
    "domain_servers": ["server2", "server3"]
}}

- **utility_tools**: Dict mapping server name to list of specific tool names. These are auxiliary tools for supporting operations (e.g., filesystem: ["write_file"], time-server: ["get_time"]). Only include the specific tools needed, NOT the entire server.
- **domain_servers**: Server names that directly provide the main capabilities for the task. All tools from these servers will be considered. Be inclusive here."""

    user_query = f"Task: {task_prompt}\n\nClassify the needed servers and tools."

    messages_text = LLMClient.format_messages_to_text([
        {"role": "system", "content": TOOL_FILTER_SYSTEM_PROMPT},
        {"role": "user", "content": user_query}
    ])
    resp = await self._llm.complete(messages_text)
    # A missing/None content is treated like an unparseable answer
    content = (resp["message"]["content"] or "").strip()

    # Extract JSON: prefer a fenced code block, else the outermost {...} span
    code_block_pattern = r'```(?:json)?\s*\n?(.*?)\n?```'
    match = re.search(code_block_pattern, content, re.DOTALL)
    if match:
        content = match.group(1).strip()
    else:
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            content = json_match.group()

    # Returned on every "could not use the answer" path so that callers can
    # always unpack three values.
    empty_info: Dict[str, Any] = {"brief_plan": "", "utility_tools": {}, "domain_servers": []}

    try:
        result = json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse LLM response: {e}")
        return [], tools, empty_info
    if not isinstance(result, dict):
        logger.warning("LLM response is not a JSON object, returning all as domain")
        return [], tools, empty_info

    # Parse utility_tools: {server: [tool_names]}; ignore malformed shapes
    utility_tools_config = result.get("utility_tools") or {}
    if not isinstance(utility_tools_config, dict):
        utility_tools_config = {}

    raw_domain = result.get("domain_servers") or []
    if isinstance(raw_domain, str):
        # A bare string names one server; it must not be split into characters
        raw_domain = [raw_domain]
    elif not isinstance(raw_domain, (list, tuple, set)):
        raw_domain = []
    domain_servers = {name for name in raw_domain if isinstance(name, str)}

    brief_plan = result.get("brief_plan", "N/A")

    logger.info(f"LLM Planning: {brief_plan}")
    logger.info(f"Utility tools: {utility_tools_config}")
    logger.info(f"Domain servers: {domain_servers}")

    # Collect utility tools (specific tools only). Names are resolved inside
    # the server they were listed under, so an identically named tool on a
    # different server is never picked by mistake.
    utility_tools: list[BaseTool] = []
    picked_ids: set[int] = set()
    for server_name, tool_names in utility_tools_config.items():
        candidates = server_tools.get(server_name)
        if not candidates or not isinstance(tool_names, (list, tuple)):
            continue
        by_name = {t.name: t for t in candidates}
        for tool_name in tool_names:
            tool = by_name.get(tool_name) if isinstance(tool_name, str) else None
            if tool is not None and id(tool) not in picked_ids:
                picked_ids.add(id(tool))
                utility_tools.append(tool)

    # Collect domain tools (entire servers), skipping tools already selected
    # as utility so the combined list has no duplicates
    domain_tools: list[BaseTool] = [
        t
        for server, tool_list in server_tools.items()
        if server in domain_servers
        for t in tool_list
        if id(t) not in picked_ids
    ]

    logger.info(f"LLM filter result: {len(utility_tools)} utility tools, {len(domain_tools)} domain tools")

    # Build LLM filter info for debugging
    llm_filter_info = {
        "brief_plan": brief_plan,
        "utility_tools": utility_tools_config,
        "domain_servers": sorted(domain_servers),
    }

    # Fallback if no match
    if not utility_tools and not domain_tools:
        logger.warning("LLM filter matched 0 tools, returning all as domain")
        return [], tools, llm_filter_info

    return utility_tools, domain_tools, llm_filter_info
(+{len(tool_names)-8} more)" lines.append(f"{prefix}{tools_display}") lines.append(f"{'='*60}\n") # Use info level so users can see it logger.info("\n".join(lines)) @staticmethod def _format_tool_list(tools: list[BaseTool]) -> str: rows = [f"{i}. **{t.name}**: {t.description}" for i, t in enumerate(tools, 1)] return f"Total {len(tools)} tools, list out directly:\n\n" + "\n".join(rows) @staticmethod def _format_ranked(results: list[tuple[BaseTool, float]], mode: SearchMode) -> str: lines = [f"Search results (mode={mode}) total {len(results)}:\n"] for i, (tool, score) in enumerate(results, 1): lines.append(f"{i}. {tool.name} (score: {score:.3f})\n {tool.description}") return "\n".join(lines) def _run(self, *args, **kwargs): raise NotImplementedError("SearchCoordinator only supports asynchronous calls. Use _arun instead.") def get_embedding_cache_stats(self) -> Dict[str, Any]: """Get statistics about the embedding cache. Returns: Dict with cache statistics including total embeddings and breakdown by backend/server. """ return self._ranker.get_cache_stats() def clear_embedding_cache(self, backend: Optional[str] = None, server: Optional[str] = None) -> int: """Clear embeddings from cache. Args: backend: If provided, only clear this backend. If None, clear all. server: If provided (and backend is provided), only clear this server. Returns: Number of embeddings cleared. """ return self._ranker.clear_cache(backend=backend, server=server) def get_last_search_debug_info(self) -> Optional[Dict[str, Any]]: """Get debug info from the last search operation. Returns: Dict containing search debug info, or None if no search has been performed. 
Includes: - search_mode: The search mode used - total_candidates: Total number of candidate tools - mcp_count/non_mcp_count: Tool counts by type - llm_filter: LLM filter information if used - tool_scores: Similarity scores for each tool - selected_tools: Final selected tools """ if self._last_search_debug_info is None: return None return self._last_search_debug_info.to_dict() ================================================ FILE: anytool/grounding/core/security/__init__.py ================================================ from .sandbox import BaseSandbox, SandboxManager from .policies import SecurityPolicyManager, SecurityPolicy # Try to import E2BSandbox (optional dependency) try: from .e2b_sandbox import E2BSandbox E2B_AVAILABLE = True except ImportError: E2BSandbox = None E2B_AVAILABLE = False __all__ = [ "BaseSandbox", "SandboxManager", "SecurityPolicyManager", "SecurityPolicy" ] if E2B_AVAILABLE: __all__.append("E2BSandbox") ================================================ FILE: anytool/grounding/core/security/e2b_sandbox.py ================================================ """ E2B Sandbox implementation. This module provides a concrete implementation of BaseSandbox using E2B. 
class E2BSandbox(BaseSandbox):
    """E2B sandbox implementation for secure code execution.

    Wraps one ``Sandbox`` object from the optional ``e2b_code_interpreter``
    package and remembers the handle returned by the most recent command.
    """

    def __init__(self, options: SandboxOptions):
        """Initialize E2B sandbox.

        No sandbox is created here; that happens in :meth:`start`.

        Args:
            options: Sandbox configuration options including:
                - api_key: E2B API key (or use E2B_API_KEY env var)
                - sandbox_template_id: Template ID for the sandbox (default: "base")
                - timeout: Command execution timeout in seconds (default: 600)

        Raises:
            ImportError: If the E2B SDK could not be imported.
            ValueError: If no API key is given in ``options`` or the environment.
        """
        super().__init__(options)

        if not E2B_AVAILABLE:
            raise ImportError(
                "E2B SDK (e2b-code-interpreter) not found. Please install it with "
                "'pip install e2b-code-interpreter'."
            )

        # Get API key from options or environment (options take precedence)
        self.api_key = options.get("api_key") or os.environ.get("E2B_API_KEY")
        if not self.api_key:
            raise ValueError(
                "E2B API key is required. Provide it via 'options.api_key'"
                " or the E2B_API_KEY environment variable."
            )

        # Get sandbox configuration
        self.sandbox_template_id = options.get("sandbox_template_id", "base")
        self.timeout = options.get("timeout", 600)  # Default 10 minutes

        # Sandbox instance (using Any to avoid import issues with optional dependency)
        self._sandbox: Any = None
        # Handle returned by the most recent execute_safe() call
        self._process: Any = None

    async def start(self) -> bool:
        """Start the E2B sandbox instance.

        Calling this on an already active sandbox is a no-op. Errors raised
        while creating the sandbox are logged, not propagated.

        Returns:
            True if sandbox started successfully, False otherwise.
        """
        if self._active:
            logger.debug("E2B sandbox already active")
            return True

        try:
            logger.debug(f"Creating E2B sandbox with template: {self.sandbox_template_id}")
            # NOTE(review): the constructor is called without await inside this
            # coroutine — confirm it does not block the event loop for long.
            self._sandbox = Sandbox(
                template=self.sandbox_template_id,
                api_key=self.api_key,
            )
            self._active = True
            logger.info(f"E2B sandbox started successfully (template: {self.sandbox_template_id})")
            return True
        except Exception as e:
            logger.error(f"Failed to start E2B sandbox: {e}")
            self._active = False
            return False

    async def stop(self) -> None:
        """Stop the E2B sandbox instance.

        Kills the last recorded process (if any) and then the sandbox itself.
        Errors from either ``kill()`` call are logged as warnings and do not
        prevent the references from being cleared. Does nothing when the
        sandbox is not active.
        """
        if not self._active:
            logger.debug("E2B sandbox not active")
            return

        try:
            # Terminate any running process
            if self._process:
                try:
                    logger.debug("Terminating sandbox process")
                    self._process.kill()
                except Exception as e:
                    logger.warning(f"Error terminating sandbox process: {e}")
                finally:
                    # Drop the handle whether or not kill() succeeded
                    self._process = None

            # Close the sandbox
            if self._sandbox:
                try:
                    logger.debug("Closing E2B sandbox instance")
                    self._sandbox.kill()
                    logger.info("E2B sandbox stopped successfully")
                except Exception as e:
                    logger.warning(f"Error closing E2B sandbox: {e}")
                finally:
                    self._sandbox = None

            self._active = False
        except Exception as e:
            # The kill() failures above are handled where they occur, so this
            # only sees errors raised outside those inner handlers.
            logger.error(f"Error stopping E2B sandbox: {e}")
            raise

    async def execute_safe(self, command: str, **kwargs) -> Any:
        """Execute a command safely in the E2B sandbox.

        Args:
            command: The command to execute
            **kwargs: Additional options:
                - envs: Environment variables (dict)
                - timeout: Command timeout in milliseconds
                - background: Run in background (bool)
                - on_stdout: Stdout callback function
                - on_stderr: Stderr callback function

        Returns:
            The value returned by ``self._sandbox.commands.run`` (described by
            the original author as a CommandHandle for the running process).
            It is also stored and exposed through :attr:`process`.

        Raises:
            RuntimeError: If the sandbox has not been started.
            Exception: Any error raised while running the command is logged
                and re-raised.
        """
        if not self._active or not self._sandbox:
            raise RuntimeError("E2B sandbox is not active. Call start() first.")

        try:
            # Extract execution options
            envs = kwargs.get("envs", {})
            # NOTE(review): self.timeout is documented as seconds and is scaled
            # by 1000 here; confirm which unit commands.run() expects for
            # `timeout` — if it is seconds, the default is 1000x too long.
            timeout = kwargs.get("timeout", self.timeout * 1000)  # Convert to ms
            background = kwargs.get("background", False)
            on_stdout = kwargs.get("on_stdout")
            on_stderr = kwargs.get("on_stderr")

            logger.debug(f"Executing command in E2B sandbox: {command}")

            # Execute the command; only the most recent handle is retained
            self._process = self._sandbox.commands.run(
                command,
                envs=envs,
                timeout=timeout,
                background=background,
                on_stdout=on_stdout,
                on_stderr=on_stderr,
            )
            return self._process
        except Exception as e:
            logger.error(f"Failed to execute command in E2B sandbox: {e}")
            raise

    def get_connector(self) -> Any:
        """Get the underlying E2B sandbox connector.

        Returns:
            The E2B Sandbox instance, or None if not active.
        """
        return self._sandbox

    def get_host(self, port: int) -> str:
        """Get the host URL for a specific port.

        Delegates to the sandbox object's own ``get_host``.

        Args:
            port: The port number to get the host for

        Returns:
            The host URL string

        Raises:
            RuntimeError: If sandbox is not active
        """
        if not self._active or not self._sandbox:
            raise RuntimeError("E2B sandbox is not active. Call start() first.")
        return self._sandbox.get_host(port)

    @property
    def sandbox(self) -> Any:
        """Get the underlying E2B Sandbox instance."""
        return self._sandbox

    @property
    def process(self) -> Any:
        """Get the current running process handle."""
        return self._process
async def check_command_allowed(self, backend_type: BackendType, command: str) -> bool:
    """Decide whether ``command`` may run on ``backend_type``.

    A command accepted by the backend's policy is allowed immediately.
    Otherwise a warning listing the detected dangerous tokens and the affected
    lines is built and the decision is delegated to ``self._ask_user``.

    Args:
        backend_type: Backend the command is about to run on.
        command: Full command text; may span several lines.

    Returns:
        True if the policy accepts the command or the user confirms it,
        False otherwise.
    """
    policy = self.get_policy(backend_type)
    if policy.check(command=command):
        return True

    # Find dangerous tokens
    dangerous_tokens = policy.find_dangerous_tokens(command)
    # Lines are lowercased for matching, so the tokens must be lowercased too;
    # otherwise a token containing uppercase characters could never match.
    lowered_tokens = [token.lower() for token in dangerous_tokens]

    # Extract only lines containing dangerous commands (1-based line numbers)
    lines = command.split('\n')
    dangerous_lines = [
        (line_no, line.strip())
        for line_no, line in enumerate(lines, 1)
        if any(token in line.lower() for token in lowered_tokens)
    ]

    # If no specific dangerous lines found but policy failed, show first few lines
    if not dangerous_lines:
        dangerous_lines = [(line_no, line.strip()) for line_no, line in enumerate(lines[:5], 1)]

    # Format dangerous lines for display (limit to 10 lines)
    max_display_lines = 10
    truncated = len(dangerous_lines) > max_display_lines
    display_lines = dangerous_lines[:max_display_lines]

    # Build formatted command display
    formatted_cmd_lines = []
    for line_num, line in display_lines:
        # Truncate very long lines
        if len(line) > 80:
            line = line[:77] + "..."
        formatted_cmd_lines.append(f" L{line_num}: {line}")
    if truncated:
        formatted_cmd_lines.append(" ... (more lines)")

    # Show which dangerous commands were detected (at most five)
    dangerous_list = ', '.join([f"{Colors.RED}{tok}{Colors.RESET}" for tok in dangerous_tokens[:5]])

    from anytool.utils.display import Box, BoxStyle, colorize

    # Build command box
    box = Box(width=66, style=BoxStyle.SQUARE, color='gr')
    cmd_box = [
        box.top_line(2),
        box.empty_line(2),
    ]
    for line in formatted_cmd_lines:
        cmd_box.append(box.text_line(line, indent=2))
    cmd_box.extend([
        box.empty_line(2),
        box.bottom_line(2)
    ])

    message = (
        f"\n{colorize('Potentially dangerous command detected', color=Colors.WHITE)}\n\n"
        f"Backend: {colorize(backend_type.value, color=Colors.CYAN)}\n"
        f"Dangerous commands: {dangerous_list}\n\n"
        f"Affected lines:\n"
        + "\n".join(cmd_box)
        + "\n\n"
        f"{colorize('This command may contain risky operations. Continue?', color=Colors.YELLOW)}"
    )
    return await self._ask_user(message)
class SandboxManager:
    """Registry of sandboxes keyed by backend type, with bulk start/stop helpers."""

    def __init__(self):
        self._sandboxes: Dict[BackendType, BaseSandbox] = {}

    def register_sandbox(self, backend_type: BackendType, sandbox: BaseSandbox) -> None:
        """Register *sandbox* for *backend_type*, replacing any previous entry."""
        self._sandboxes[backend_type] = sandbox

    def get_sandbox(self, backend_type: BackendType) -> Optional[BaseSandbox]:
        """Return the sandbox registered for *backend_type*, or None."""
        return self._sandboxes.get(backend_type)

    async def start_all(self) -> None:
        """Start every registered sandbox in registration order (fails fast)."""
        for sandbox in self._sandboxes.values():
            await sandbox.start()

    async def stop_all(self) -> None:
        """Stop every registered sandbox, even if some of them fail to stop.

        Raises:
            Exception: The first error raised by a ``stop()`` call, re-raised
                after all remaining sandboxes have been given a chance to stop.
        """
        first_error = None
        for sandbox in self._sandboxes.values():
            try:
                await sandbox.stop()
            except Exception as exc:
                # Keep going so one broken sandbox does not leave the others running.
                if first_error is None:
                    first_error = exc
        if first_error is not None:
            raise first_error
async def __aenter__(self) -> "BaseSession":
    """Enter the async context: optionally connect, then optionally initialize.

    Returns:
        The session itself.

    Raises:
        Whatever ``connect()`` or ``initialize()`` raises. When initialization
        fails, the session is disconnected first so the connection is not
        leaked.
    """
    if self.auto_connect:
        await self.connect()
    if self.auto_initialize:
        try:
            self.session_info = await self.initialize()
        except BaseException:
            # __aexit__ is not called when __aenter__ raises, so release the
            # connection here, mirroring what __aexit__ would have done.
            try:
                await self.disconnect()
            except Exception as cleanup_error:
                # Do not let a cleanup failure hide the initialization error.
                logger.debug("Disconnect after failed initialize also failed: %s", cleanup_error)
            raise
    return self
async def call_tool(self, tool_name: str, parameters=None) -> ToolResult:
    """Run the session tool called *tool_name* with *parameters*.

    Tools are discovered lazily: when ``self.tools`` is empty, ``initialize()``
    is awaited first and its result stored in ``self.session_info``.

    Raises:
        ValueError: If no tool with that schema name exists in the session.
    """
    call_args = parameters if parameters else {}

    # Lazy discovery for sessions that were never initialized.
    if not self.tools:
        logger.debug(f"Tools not initialized for session {self.session_id}, initializing now...")
        self.session_info = await self.initialize()

    by_name = {}
    for candidate in self.tools:
        by_name[candidate.schema.name] = candidate

    if tool_name not in by_name:
        raise ValueError(f"Unknown tool: {tool_name}")

    outcome = await by_name[tool_name].arun(**call_args)
    # Activity is only refreshed once the call has returned.
    self._touch()
    return outcome
class ListBackendToolsTool(_BaseSystemTool):
    """System tool that lists the names of the static tools of one backend."""

    _name = "list_backend_tools"
    _description = "List static tools for a backend"

    async def _arun(self, backend: str) -> ToolResult:
        """Return the comma-separated tool names of *backend*.

        Args:
            backend: Backend identifier; it is lower-cased before being
                converted to a ``BackendType``.

        Returns:
            A SUCCESS result whose content is the joined tool names, or an
            ERROR result when *backend* is not a valid ``BackendType`` value.
        """
        try:
            be = BackendType(backend.lower())
        except ValueError:
            # Pass status by keyword, like every other ToolResult construction here.
            return ToolResult(status=ToolStatus.ERROR, error=f"Unknown backend '{backend}'")

        tools = await self.client.list_backend_tools(be)
        names = [t.schema.name for t in tools]
        return ToolResult(
            status=ToolStatus.SUCCESS,
            content=", ".join(names),
        )
class ListAllBackendToolsTool(_BaseSystemTool):
    """System tool that reports the static tool names of every registered backend."""

    _name = "list_all_backend_tools"
    _description = "List static tools for every registered backend"

    async def _arun(self, use_cache: bool = False) -> ToolResult:
        """Return one ``backend: tool, tool, ...`` line per backend."""
        per_backend = await self.client.list_all_backend_tools(use_cache=use_cache)

        report = []
        for backend, tools in per_backend.items():
            tool_names = ', '.join(t.schema.name for t in tools)
            report.append(f"{backend.value}: {tool_names}")

        return ToolResult(status=ToolStatus.SUCCESS, content="\n".join(report))
class ToolRuntimeInfo:
    """Runtime information for a tool instance"""

    def __init__(
        self,
        backend: BackendType,
        session_name: str,
        server_name: Optional[str] = None,
        grounding_client: Optional['GroundingClient'] = None,
    ):
        self.backend = backend
        self.session_name = session_name
        self.server_name = server_name
        self.grounding_client = grounding_client

    def __repr__(self):
        """Return a non-empty debug string with backend, session and (if set) server."""
        # Accept both enum members (use .value) and plain values.
        backend = getattr(self.backend, "value", self.backend)
        parts = [f"backend={backend}", f"session={self.session_name}"]
        if self.server_name:
            parts.append(f"server={self.server_name}")
        return f"<ToolRuntimeInfo {' '.join(parts)}>"
get_parameters_schema(cls) -> Dict[str, Any]: """Auto-generate JSON-schema from _run() or _arun() signature. Returns empty dict for tools with no parameters. Priority: prefer _arun if overridden, otherwise use _run. """ # Priority: prefer _arun if it's overridden by subclass, else use _run # This allows async-first tools to define their signature via _arun sig_src = None # Check if _arun is overridden (not from BaseTool) if cls._arun is not BaseTool._arun: sig_src = cls._arun # Otherwise check if _run is overridden elif cls._run is not BaseTool._run: sig_src = cls._run # If neither is overridden, raise error else: raise ValueError( f"{cls.__name__} must implement _run() or _arun() to define its parameters schema" ) sig = inspect.signature(sig_src) fields: dict[str, Any] = {} for name, p in sig.parameters.items(): # Skip 'self' and **kwargs / *args if name == "self" or p.kind in (inspect.Parameter.VAR_KEYWORD, inspect.Parameter.VAR_POSITIONAL): continue typ = p.annotation if p.annotation is not inspect._empty else str default = p.default if p.default is not inspect._empty else ... 
def run(self, **kwargs):
    """Synchronous entry point for ``invoke()``.

    Returns:
        The result of ``invoke(**kwargs)`` when no event loop is running in
        the current thread; otherwise an ``asyncio.Task`` scheduled on the
        running loop, which the caller must await.

    Any exception raised by the tool itself (including ``RuntimeError``)
    propagates unchanged.
    """
    # Detect the running loop up front. Catching RuntimeError around
    # asyncio.run() would also swallow RuntimeErrors raised by the tool and
    # leave an un-awaited coroutine behind.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(self.invoke(**kwargs))
    return loop.create_task(self.invoke(**kwargs))
async def invoke(
    self,
    parameters: Dict[str, Any] | None = None,
    keep_session: bool = True,
    **kwargs
) -> ToolResult:
    """Execute the tool, routing through the bound grounding client if any.

    *parameters* takes precedence; when it is empty or None, the keyword
    arguments are used as the parameters instead. Without a bound grounding
    client the tool runs locally via ``arun()``.
    """
    call_params = parameters if parameters else kwargs

    client = self._runtime_info.grounding_client if self.is_bound else None
    if not client:
        # Unbound, or bound without a client: execute in-process.
        return await self.arun(**call_params)

    return await client.invoke_tool(
        tool=self,
        parameters=call_params,
        keep_session=keep_session,
    )
def __repr__(self):
    """Return a short debug string: class, tool name, backend and bound session."""
    # backend_type is normally an enum member; fall back to the raw value.
    backend = getattr(self.backend_type, "value", self.backend_type)
    base = f"<{self.__class__.__name__} name={self.name!r} backend={backend}"
    info = getattr(self, "_runtime_info", None)
    if info is not None:
        base += f" session={info.session_name}"
    # repr() requires a str; the previous body built a value but returned None.
    return base + ">"
class LocalTool(BaseTool):
    """Tool that executes entirely inside the current Python process."""

    def _run(self, **kwargs):
        raise NotImplementedError

    async def _arun(self, **kwargs):
        return await self._dispatch_run(**kwargs)

    async def _dispatch_run(self, **kwargs) -> Any:
        """Pick the implementation to run: an overridden _arun, else _run in a thread."""
        impl = self.__class__

        # A subclass supplied its own _arun: defer to the parent implementation.
        if impl._arun is not LocalTool._arun:
            return await super()._arun(**kwargs)

        # Neither _arun nor _run was overridden: nothing to execute.
        if impl._run is LocalTool._run:
            raise NotImplementedError(
                f"{self.__class__.__name__} must implement _run() or _arun()"
            )

        # Synchronous _run goes to the default executor to keep the loop responsive.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: self._run(**kwargs))
class RemoteTool(BaseTool):
    """Tool whose execution is delegated to a remote endpoint through a connector."""

    backend_type = BackendType.NOT_SET

    def __init__(
        self,
        schema: ToolSchema | None = None,
        connector: Optional[BaseConnector] = None,
        remote_name: str = "",
        *,
        verbose: bool = False,
        backend: BackendType = BackendType.NOT_SET,
    ):
        self._conn = connector
        # An explicit remote name wins, then the schema name, then an empty string.
        if remote_name:
            self._remote_name = remote_name
        elif schema:
            self._remote_name = schema.name
        else:
            self._remote_name = ""
        self.backend_type = backend
        super().__init__(schema=schema, verbose=verbose)

    async def _arun(self, **kwargs):
        """Invoke the remote tool and normalise its response into a ToolResult.

        Raises:
            RuntimeError: If the tool was created without a connector.
        """
        # Without a connector the tool can only be run through the grounding client.
        if self._conn is None:
            raise RuntimeError(
                f"Tool '{self.name}' has no connector. "
                "Use grounding_client.invoke_tool() to execute it with on-demand server startup."
            )

        raw = await self._conn.invoke(self._remote_name, kwargs)

        # Responses exposing both `content` and `isError` are flattened item by item.
        if hasattr(raw, 'content') and hasattr(raw, 'isError'):
            pieces = []
            for item in (raw.content or []):
                if hasattr(item, 'text') and item.text:
                    # Text item: keep the text itself.
                    pieces.append(item.text)
                elif hasattr(item, 'data'):
                    # Binary payload: only note its size.
                    size = len(item.data) if item.data else 0
                    pieces.append(f"[Image data: {size} bytes]")
                elif hasattr(item, 'resource'):
                    # Embedded resource: reference it by URI.
                    uri = getattr(item.resource, 'uri', 'unknown')
                    pieces.append(f"[Embedded resource: {uri}]")

            content = "\n".join(pieces)
            failed = getattr(raw, 'isError', False)
            return ToolResult(
                status=ToolStatus.ERROR if failed else ToolStatus.SUCCESS,
                content=content,
                error=content if failed else None,
            )

        if isinstance(raw, (dict, list, tuple)):
            # Containers are pretty-printed as JSON when they can be serialised.
            import json
            try:
                content = json.dumps(raw, ensure_ascii=False, indent=2)
            except (TypeError, ValueError):
                content = str(raw)
        elif isinstance(raw, str):
            content = raw
        else:
            # Numbers, booleans and any other type fall back to str().
            content = str(raw)

        return ToolResult(
            status=ToolStatus.SUCCESS,
            content=content,
        )
async def post_json(
    self,
    path: str,
    payload: Any | BaseModel,
    *,
    response_model: type[BaseModel] | None = None,
    **kw,
) -> Any | BaseModel:
    """POST *payload* as JSON to *path* and return the decoded JSON response.

    Args:
        path: Path relative to the connector's base URL.
        payload: Request body, passed to ``_request`` as ``json=``.
        response_model: Optional pydantic model the response is parsed into.
        **kw: Extra keyword arguments forwarded to ``_request``.

    Returns:
        The decoded body (None when it is not valid JSON), parsed into
        *response_model* when one is given.

    Raises:
        aiohttp.ClientResponseError: For status codes >= 400. The message
            includes the "output", "message" or "error" field of a JSON
            object body, or the body itself when it is a JSON string.
    """
    resp = await self._request("POST", path, json=payload, **kw)
    try:
        data = await resp.json()
    except Exception:
        # Body is not JSON; carry on without error details.
        data = None

    if resp.status >= 400:
        # Error bodies are not guaranteed to be JSON objects: a list or a bare
        # string has no .get(), so only look up keys on real dicts.
        detail = ""
        if isinstance(data, dict):
            detail = data.get("output") or data.get("message") or data.get("error") or ""
        elif isinstance(data, str):
            detail = data

        error_msg = f"{resp.status}, message='{resp.reason}'"
        if detail:
            error_msg += f", detail='{detail}'"
        raise aiohttp.ClientResponseError(
            resp.request_info,
            resp.history,
            status=resp.status,
            message=error_msg,
        )
    return self._parse_as(data, response_model)
async def _handle_other_json(self, method: str, path: str, params: dict[str, Any]):
    """Send a PUT/PATCH/DELETE request with a JSON body; subclasses may override.

    Returns the decoded JSON body, or the raw text when decoding fails.
    Non-success statuses are raised by ``raise_for_status()``.
    """
    response = await self._request(method, path, json=params)
    response.raise_for_status()
    try:
        body = await response.json()
    except Exception:
        # Not JSON: hand back the response text instead.
        body = await response.text()
    return body
async def disconnect(self) -> None:
    """Close the session/connection and reset state.

    Runs the ``_before_disconnect`` hook, stops the connection manager, then
    runs the ``_after_disconnect`` hook. Does nothing when not connected.

    If any of those steps raises, the error propagates, but the connector is
    still marked as disconnected and its connection reference is dropped.
    """
    if not self._connected:
        return

    try:
        # Hook: before disconnection
        await self._before_disconnect()

        # Stop the connection manager
        if self._connection_manager:
            await self._connection_manager.stop()
        self._connection = None

        # Hook: after disconnection
        await self._after_disconnect()
    finally:
        # Reset state even on failure so the connector is not left claiming a
        # connection whose manager is in an unknown state.
        self._connection = None
        self._connected = False
Override in subclasses if needed.""" pass @property def is_connected(self) -> bool: """Return True iff `connect()` has completed successfully.""" return self._connected @staticmethod def _to_json_compatible(obj: Any) -> Any: """ Convert a Pydantic BaseModel to a JSON-serialisable dict (by_alias=True). Leave all other types unchanged. """ if isinstance(obj, BaseModel): return obj.model_dump(by_alias=True) return obj @staticmethod def _parse_as(data: Any, model_cls: "Type[BaseModel] | None" = None) -> Any: """ Try to parse *data* into *model_cls* (a subclass of BaseModel). If `model_cls` is None or not a subclass of BaseModel, return the original data. """ if model_cls is None: return data if isinstance(model_cls, type) and issubclass(model_cls, BaseModel): return model_cls.model_validate(data) return data @abstractmethod async def invoke(self, name: str, params: dict[str, Any]) -> Any: """ Unified RPC entry for all tools. Sub-class maps this to its actual RPC like call_tool / run_cmd. """ raise NotImplementedError @abstractmethod async def request(self, *args: Any, **kwargs: Any) -> Any: """Abstract RPC / HTTP / WS request method to be implemented by child classes.""" raise NotImplementedError("This connector has not implemented 'request'") ================================================ FILE: anytool/grounding/core/transport/task_managers/__init__.py ================================================ from .base import BaseConnectionManager from .aiohttp_connection_manager import AioHttpConnectionManager from .async_ctx import AsyncContextConnectionManager from .placeholder import PlaceholderConnectionManager from .noop import NoOpConnectionManager __all__ = [ "BaseConnectionManager", "AioHttpConnectionManager", "AsyncContextConnectionManager", "PlaceholderConnectionManager", "NoOpConnectionManager", ] ================================================ FILE: anytool/grounding/core/transport/task_managers/aiohttp_connection_manager.py 
class AioHttpConnectionManager(
    AsyncContextConnectionManager[aiohttp.ClientSession, ...]
):
    """Connection manager that owns one long-lived aiohttp.ClientSession."""

    def __init__(
        self,
        base_url: str,
        headers: Optional[dict[str, str]] = None,
        timeout: float = 30,
    ):
        """Configure the session factory.

        Args:
            base_url: Service base URL; stored without trailing slashes.
            headers: Default headers handed to the ClientSession
                (an empty dict when omitted).
            timeout: Total timeout in seconds, wrapped in aiohttp.ClientTimeout.
        """
        self.base_url = base_url.rstrip("/")
        session_kwargs = {
            "timeout": aiohttp.ClientTimeout(total=timeout),
            "headers": headers or {},
        }
        super().__init__(aiohttp.ClientSession, **session_kwargs)
        self._logger.debug(
            "Init AioHttpConnectionManager base_url=%s timeout=%s",
            self.base_url,
            timeout,
        )

    async def _establish_connection(self) -> aiohttp.ClientSession:
        """Create and enter the aiohttp.ClientSession context."""
        entered = await super()._establish_connection()
        self._logger.debug("aiohttp ClientSession created")
        return entered

    async def _close_connection(self) -> None:
        """Close the session, then run the parent cleanup.

        The parent cleanup always runs, even if close() fails.
        """
        import asyncio

        session = self._ctx
        if session:
            try:
                self._logger.debug("Closing aiohttp ClientSession")
                await session.close()
                # Give aiohttp time to finish its internal cleanup callbacks.
                await asyncio.sleep(0.1)
            except Exception as exc:
                self._logger.warning(f"Error closing aiohttp ClientSession: {exc}")

        await super()._close_connection()
""" import sys from typing import Any, Callable, Generic, Optional, ParamSpec, TypeVar from .base import BaseConnectionManager # BaseExceptionGroup only exists in Python 3.11+ if sys.version_info >= (3, 11): _BaseExceptionGroup = BaseExceptionGroup else: # Dummy class for older Python versions class _BaseExceptionGroup(Exception): pass T = TypeVar("T") # Return type of the async context P = ParamSpec("P") # Parameter specification of the factory class AsyncContextConnectionManager(Generic[T, P], BaseConnectionManager[T]): def __init__(self, ctx_factory: Callable[P, Any], *args: P.args, **kwargs: P.kwargs): super().__init__() self._factory = ctx_factory self._factory_args = args self._factory_kwargs = kwargs self._ctx: Optional[Any] = None async def _establish_connection(self) -> T: """Create the context manager and enter it.""" self._logger.debug("Creating context via %s", self._factory.__name__) try: self._ctx = self._factory(*self._factory_args, **self._factory_kwargs) result: T = await self._ctx.__aenter__() self._logger.debug("Context %s entered successfully", self._factory.__name__) return result except Exception as e: # Check if this is a benign ExceptionGroup/TaskGroup error # These occur during concurrent initialization and cleanup error_msg = str(e).lower() is_taskgroup_error = ( "unhandled errors in a taskgroup" in error_msg or "cancel scope in a different task" in error_msg or "exceptiongroup" in type(e).__name__.lower() ) if is_taskgroup_error: # This is a benign race condition during concurrent connection setup # Log at debug level and re-raise to trigger retry logic self._logger.debug( f"Benign TaskGroup race condition during {self._factory.__name__} connection: {type(e).__name__}" ) # Clean up the partially created context if self._ctx is not None: try: await self._ctx.__aexit__(None, None, None) except Exception: pass # Ignore cleanup errors self._ctx = None raise else: # Real error - log at error level self._logger.error(f"Error establishing 
connection via {self._factory.__name__}: {e}") raise async def _close_connection(self) -> None: """Exit the context manager if it exists. Uses try-finally to ensure ctx is cleared even if __aexit__ fails. This prevents resource leaks when cleanup encounters errors. """ if self._ctx is not None: try: self._logger.debug("Exiting context %s", self._factory.__name__) # Give subprocesses a moment to flush buffers before closing import asyncio await asyncio.sleep(0.05) # Try to exit the context, but catch all possible exceptions try: await self._ctx.__aexit__(None, None, None) except BaseException as e: # Catch absolutely everything including SystemExit, KeyboardInterrupt, etc. # Check if it's a benign error benign_error_types = ( BrokenPipeError, ConnectionResetError, ValueError, OSError, IOError, ProcessLookupError, RuntimeError, GeneratorExit ) is_benign = False # Check direct exception type if isinstance(e, benign_error_types): is_benign = True # Check for BaseExceptionGroup (Python 3.11+) elif hasattr(e, 'exceptions'): # It's an exception group, check all sub-exceptions is_benign = all(isinstance(sub_e, benign_error_types) for sub_e in e.exceptions) if is_benign: self._logger.debug(f"Benign cleanup error for {self._factory.__name__}: {type(e).__name__}") else: self._logger.warning(f"Error during context exit for {self._factory.__name__}: {type(e).__name__}: {e}") # Don't re-raise - we want cleanup to complete except Exception as e: # Catch any other unexpected errors in the outer try block self._logger.warning(f"Unexpected error during cleanup for {self._factory.__name__}: {e}") finally: self._ctx = None ================================================ FILE: anytool/grounding/core/transport/task_managers/base.py ================================================ """ Base connection manager for all backend connectors. This module provides an abstract base class for different types of connection managers used in all backend connectors. 
Flow: start() → launch_connection_task() → call subclass _establish_connection() → notify ready → maintain connection until stop() → call subclass _close_connection() → cleanup """ import asyncio from abc import ABC, abstractmethod from typing import Generic, TypeVar from anytool.utils.logging import Logger T = TypeVar("T") class BaseConnectionManager(Generic[T], ABC): """Abstract base class for connection managers. This class defines the interface for different types of connection managers used with all backend connectors. """ def __init__(self): """Initialize a new connection manager.""" self._ready_event = asyncio.Event() self._done_event = asyncio.Event() self._exception: Exception | None = None self._connection: T | None = None self._task: asyncio.Task | None = None self._logger = Logger.get_logger(f"{__name__}.{self.__class__.__name__}") @abstractmethod async def _establish_connection(self) -> T: """Establish the connection. This method should be implemented by subclasses to establish the specific type of connection needed. Returns: The established connection. Raises: Exception: If connection cannot be established. """ pass @abstractmethod async def _close_connection(self) -> None: """Close the connection. This method should be implemented by subclasses to close the specific type of connection. """ pass async def start(self, timeout: float | None = None) -> T: """Start the connection manager and establish a connection. Args: timeout: Optional timeout in seconds. If None, waits indefinitely. If specified, will cancel the background task on timeout. Returns: The established connection. Raises: TimeoutError: If connection establishment times out. Exception: If connection cannot be established. 
""" # Reset state self._ready_event.clear() self._done_event.clear() self._exception = None # Create a task to establish and maintain the connection self._task = asyncio.create_task(self._connection_task(), name=f"{self.__class__.__name__}_task") # Wait for the connection to be ready or fail (with optional timeout) try: if timeout is not None: await asyncio.wait_for(self._ready_event.wait(), timeout=timeout) else: await self._ready_event.wait() except asyncio.TimeoutError: # Timeout! Cancel the background task self._logger.warning(f"Connection establishment timed out after {timeout}s, cancelling...") if self._task and not self._task.done(): self._task.cancel() try: await asyncio.wait_for(self._task, timeout=2.0) # Give it 2s to cleanup except (asyncio.CancelledError, asyncio.TimeoutError): pass except Exception as e: self._logger.debug(f"Error during task cancellation: {e}") raise TimeoutError(f"Connection establishment timed out after {timeout}s") # If there was an exception, raise it if self._exception: # Check if this is a benign TaskGroup race condition error_msg = str(self._exception).lower() is_benign_taskgroup_error = ( "unhandled errors in a taskgroup" in error_msg or "cancel scope in a different task" in error_msg or "exceptiongroup" in type(self._exception).__name__.lower() ) if is_benign_taskgroup_error: # Log as debug - this is expected and will be retried self._logger.debug(f"Benign TaskGroup race condition, will retry: {type(self._exception).__name__}") else: # Real error - log at error level self._logger.error(f"Failed to start connection: {self._exception}") raise self._exception # Return the connection if self._connection is None: error_msg = "Connection was not established" self._logger.error(error_msg) raise RuntimeError(error_msg) self._logger.info("Connection manager started successfully") return self._connection async def stop(self, timeout: float = 5.0) -> None: """Stop the connection manager and close the connection. 
Args: timeout: Maximum time to wait for cleanup (default 5s). Ensures all async resources (including aiohttp sessions) are properly closed. """ if self._task and not self._task.done(): self._task.cancel() try: await asyncio.wait_for(self._task, timeout=timeout) except asyncio.TimeoutError: self._logger.warning(f"Task cleanup timed out after {timeout}s") except asyncio.CancelledError: pass # Expected except Exception as e: self._logger.warning(f"Error stopping task: {e}") # Wait for the connection to be done (with timeout) try: await asyncio.wait_for(self._done_event.wait(), timeout=timeout) except asyncio.TimeoutError: self._logger.warning(f"Done event wait timed out after {timeout}s") self._logger.info("Connection manager stopped") def get_streams(self) -> T | None: """Get the current connection streams. Returns: The current connection (typically a tuple of read_stream, write_stream) or None if not connected. """ return self._connection async def _connection_task(self) -> None: """Run the connection task. This task establishes and maintains the connection until cancelled. 
""" try: # Establish the connection self._connection = await self._establish_connection() self._logger.debug("Connection established") # Signal that the connection is ready self._ready_event.set() # Wait indefinitely until cancelled try: await asyncio.Event().wait() except asyncio.CancelledError: raise except asyncio.CancelledError: raise except Exception as e: # Store the exception self._exception = e # Check if this is a benign TaskGroup race condition error_msg = str(e).lower() is_benign_taskgroup_error = ( "unhandled errors in a taskgroup" in error_msg or "cancel scope in a different task" in error_msg or "exceptiongroup" in type(e).__name__.lower() ) if is_benign_taskgroup_error: # Log as debug - this is expected during concurrent connection setup self._logger.debug(f"Benign TaskGroup race condition in connection task: {type(e).__name__}") else: # Real error - log at error level self._logger.error(f"Connection task failed: {e}") # Signal that the connection is ready (with error) self._ready_event.set() finally: # Close the connection if it was established if self._connection is not None: try: await self._close_connection() except Exception as e: self._logger.warning(f"Error closing connection: {e}") self._connection = None # Signal that the connection is done self._done_event.set() ================================================ FILE: anytool/grounding/core/transport/task_managers/noop.py ================================================ """No-op connection manager for local (in-process) connectors. Local connectors execute commands directly via subprocess, so they don't need a real network connection. This manager satisfies the BaseConnectionManager interface that BaseConnector requires. """ import asyncio from typing import Any from .base import BaseConnectionManager class NoOpConnectionManager(BaseConnectionManager[Any]): """Connection manager that immediately reports 'ready' without establishing any real connection. 
Used by LocalShellConnector and LocalGUIConnector. """ async def _establish_connection(self) -> Any: """No-op: return a sentinel value.""" return True async def _close_connection(self) -> None: """No-op: nothing to close.""" pass ================================================ FILE: anytool/grounding/core/transport/task_managers/placeholder.py ================================================ from typing import Any from .base import BaseConnectionManager class PlaceholderConnectionManager(BaseConnectionManager[Any]): """A placeholder connection manager that does nothing. This is used by connectors that set up their real connection manager during the connect() phase. """ async def _establish_connection(self) -> Any: """Establish the connection (placeholder implementation).""" raise NotImplementedError("PlaceholderConnectionManager should be replaced before use") async def _close_connection(self) -> None: """Close the connection (placeholder implementation).""" pass ================================================ FILE: anytool/grounding/core/types.py ================================================ from enum import Enum from datetime import datetime from typing import Any, Dict, Generic, List, TypeVar, Optional import jsonschema from pydantic import BaseModel, Field, ConfigDict # Pydantic v2 compatibility try: from pydantic import RootModel PYDANTIC_V2 = True except ImportError: PYDANTIC_V2 = False class BackendType(str, Enum): MCP = "mcp" SHELL = "shell" WEB = "web" GUI = "gui" SYSTEM = "system" NOT_SET = "not_set" class ToolStatus(str, Enum): SUCCESS = "success" ERROR = "error" class SessionStatus(str, Enum): CONNECTED = "connected" DISCONNECTED = "disconnected" CONNECTING = "connecting" ProgressToken = str | int RequestId = str | int RequestParamsT = TypeVar("RequestParamsT", bound=BaseModel | Dict[str, Any] | None) NotificationParamsT = TypeVar("NotificationParamsT", bound=BaseModel | Dict[str, Any] | None) MethodT = TypeVar("MethodT", bound=str) class 
class BaseEntity(BaseModel):
    """Common base model: carries free-form metadata and allows extra fields."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    model_config = ConfigDict(extra="allow")


class JsonRpcBase(BaseEntity):
    """Base for JSON-RPC style messages; fixes the protocol version field."""

    jsonrpc: str = "2.0"


class RpcMessage(JsonRpcBase, Generic[MethodT, RequestParamsT]):
    """Generic message carrying a method name and its parameters."""

    method: MethodT
    params: RequestParamsT


class Request(RpcMessage[MethodT, RequestParamsT]):
    """RPC request; ``id`` stays None when the message is a notification."""

    id: RequestId | None = None  # id is None means Notification


class Notification(RpcMessage[MethodT, NotificationParamsT]):
    """RPC message without a request id."""

    pass


class Result(JsonRpcBase):
    """Base class for RPC results."""

    pass


class ErrorData(BaseEntity):
    """Structured error: numeric code, message and optional payload."""

    code: int
    message: str
    data: Any | None = None


class ToolResult(Result):
    """Tool execution result"""

    status: ToolStatus
    content: Any = ""
    error: ErrorData | str | None = None
    execution_time: float | None = None

    @property
    def is_success(self) -> bool:
        """True when ``status`` is ToolStatus.SUCCESS."""
        return self.status == ToolStatus.SUCCESS

    @property
    def is_error(self) -> bool:
        """True when ``status`` is ToolStatus.ERROR."""
        return self.status == ToolStatus.ERROR


def _tokenize_command(command: str) -> List[str]:
    """Split *command* into lower-cased shell tokens.

    ``shlex.split`` raises ValueError on malformed input such as an
    unterminated quote; in that case fall back to plain whitespace splitting
    so a security check never crashes on the very input it is meant to vet.
    """
    import shlex

    try:
        parts = shlex.split(command, posix=True)
    except ValueError:
        parts = command.split()
    return [part.lower() for part in parts]


class SecurityPolicy(BaseEntity):
    """Allow/deny rules for shell commands and network domains."""

    allow_shell_commands: bool = True
    allow_network_access: bool = True
    allow_file_access: bool = True
    allowed_domains: List[str] = Field(default_factory=list)
    blocked_commands: List[str] = Field(default_factory=list)
    sandbox_enabled: bool = False

    @classmethod
    def from_dict(cls, data: Dict) -> "SecurityPolicy":
        """Create SecurityPolicy from configuration dict.

        Keys that are not model fields are ignored.

        Supports two formats for blocked_commands:
        1. List format (applies to all OS): ["cmd1", "cmd2"]
        2. Dict format (OS-specific):
           {
               "common": ["cmd1", "cmd2"],
               "linux": ["cmd3"],
               "darwin": ["cmd4"],
               "windows": ["cmd5"]
           }

        When using dict format, merges 'common' commands with the commands of
        the current OS. Any other value type yields an empty list.
        """
        import sys

        processed_data = {}
        for k, v in data.items():
            if k not in cls.model_fields:
                continue

            if k != "blocked_commands":
                processed_data[k] = v
                continue

            # Special handling for blocked_commands
            if isinstance(v, dict):
                # Dict format: merge common + OS-specific
                blocked_list = list(v.get("common", []))

                # Determine current OS
                system = sys.platform
                if system.startswith("linux"):
                    os_key = "linux"
                elif system == "darwin":
                    os_key = "darwin"
                elif system.startswith("win"):
                    os_key = "windows"
                else:
                    os_key = None

                # Merge OS-specific commands
                if os_key and os_key in v:
                    blocked_list.extend(v[os_key])

                processed_data[k] = blocked_list
            elif isinstance(v, list):
                # List format: use as-is
                processed_data[k] = v
            else:
                # Invalid format, use empty list
                processed_data[k] = []

        return cls(**processed_data)

    def check(self, *, command: str | None = None, domain: str | None = None) -> bool:
        """Return True if allowed, False if denied.

        Command check uses token-level matching to prevent simple
        space/escape bypasses. Commands that cannot be tokenised as shell
        syntax are split on whitespace instead of raising, matching
        ``find_dangerous_tokens``.
        """
        # Shell / Python command check
        if command:
            if not self.allow_shell_commands:
                return False
            tokens = _tokenize_command(command)
            blocked_set = {b.lower() for b in self.blocked_commands}
            if any(tok in blocked_set for tok in tokens):
                return False

        # Network access check
        if domain:
            if not self.allow_network_access:
                return False
            # An empty allow-list means "no domain restriction".
            if self.allowed_domains and domain not in self.allowed_domains:
                return False

        return True

    def find_dangerous_tokens(self, command: str) -> List[str]:
        """Find and return all dangerous tokens in the command.

        Returns an empty list if no dangerous tokens are found.
        """
        if not command:
            return []

        tokens = _tokenize_command(command)
        blocked_set = {b.lower() for b in self.blocked_commands}
        return [tok for tok in tokens if tok in blocked_set]


class ToolSchema(BaseEntity):
    """Description of a tool: name, JSON-Schema parameters and policy."""

    name: str
    description: str | None = None
    parameters: Dict[str, Any] = Field(default_factory=dict)  # JSON Schema, optional
    return_schema: Dict[str, Any] = Field(default_factory=dict)
    examples: List[dict] = Field(default_factory=list)
    usage_hint: str | None = None
    latency_hint: str | None = None
    backend_type: BackendType
    security_policy: SecurityPolicy | None = None

    def validate_parameters(self, params: Dict[str, Any], *, raise_exc: bool = False) -> bool:
        """Validate *params* against ``parameters`` using jsonschema.

        Returns True if parameters are valid or if the tool has no parameters
        and none were given.

        Raises:
            ValueError: With ``raise_exc`` set, when the tool takes no
                parameters but some were provided.
            jsonschema.ValidationError: With ``raise_exc`` set, when
                validation fails.
        """
        if not self.parameters:
            # No schema defined: only an empty argument set is acceptable.
            if not params:
                return True
            if raise_exc:
                raise ValueError(f"Tool '{self.name}' does not accept any parameters, but got: {list(params.keys())}")
            return False

        try:
            jsonschema.validate(params, self.parameters)
            return True
        except jsonschema.ValidationError:
            if raise_exc:
                raise
            return False

    def is_allowed(self, *, command: str | None = None, domain: str | None = None) -> bool:
        """Check the security policy; everything is allowed when none is set."""
        return self.security_policy.check(command=command, domain=domain) if self.security_policy else True


class SessionConfig(BaseEntity):
    """Configuration for one backend session."""

    session_name: str
    backend_type: BackendType
    connection_params: Dict[str, Any] = Field(default_factory=dict)
    timeout: int = 30
    max_retries: int = 3
    auto_reconnect: bool = True
    auto_connect: bool = True
    health_check_interval: int = 5
    custom_settings: Dict[str, Any] = Field(default_factory=dict)
def _strip_titles(schema: Dict) -> None:
    """Remove the 'title' annotation from *schema* and every nested schema, in place.

    Only schema dicts are touched: the ``properties`` mapping itself is never
    popped from, so a property that happens to be *named* "title" survives.
    """
    schema.pop("title", None)

    properties = schema.get("properties")
    if isinstance(properties, dict):
        for prop_schema in properties.values():
            if isinstance(prop_schema, dict):
                _strip_titles(prop_schema)

    items = schema.get("items")
    if isinstance(items, dict):
        _strip_titles(items)


def _sanitize_schema(params: Dict) -> Dict:
    """Sanitize tool parameter schema to comply with Claude API requirements.

    Fixes common issues:
    - Empty schemas become an empty object schema
    - Non-object top-level schemas are wrapped in an object under "value"
    - Object schemas missing properties/required get empty defaults
    - 'title' annotations are removed at every nesting level

    The input is never mutated; a sanitized deep copy is returned.
    """
    if not params:
        return {"type": "object", "properties": {}, "required": []}

    # Deep copy to avoid modifying the original
    import copy
    sanitized = copy.deepcopy(params)

    # Anthropic API requires the top-level type to be 'object'. Anything else
    # is wrapped as the single required property "value" of an object.
    top_level_type = sanitized.get("type")
    if top_level_type and top_level_type != "object":
        logger.debug(f"[SCHEMA_SANITIZE] Wrapping non-object schema (type={top_level_type}) into object")
        sanitized = {
            "type": "object",
            "properties": {"value": sanitized},
            "required": ["value"],
        }

    # If type is object but missing properties/required, add them
    if sanitized.get("type") == "object":
        sanitized.setdefault("properties", {})
        sanitized.setdefault("required", [])

    # Remove non-standard fields that may cause issues (like 'title'),
    # recursively through nested properties and array items.
    _strip_titles(sanitized)

    return sanitized


def _schema_to_openai(schema: "ToolSchema") -> "ChatCompletionToolParam":
    """Convert ToolSchema to OpenAI ChatCompletion tool format"""
    function_def = {
        "name": schema.name,
        "description": schema.description or "",
    }

    if schema.parameters:
        sanitized = _sanitize_schema(schema.parameters)
        function_def["parameters"] = sanitized
        # Debug: verify sanitization worked
        if "title" in schema.parameters and "title" not in sanitized:
            logger.debug(f"Sanitized tool '{schema.name}': removed title")
    else:
        # Claude requires parameters field even if empty
        function_def["parameters"] = {"type": "object", "properties": {}, "required": []}

    return {"type": "function", "function": function_def}


def _prepare_tools_for_llmclient(
    tools: "List[BaseTool] | None",
    fmt: str = "openai",
) -> "tuple[Sequence[Union[ToolSchema, ChatCompletionToolParam]], Dict[str, BaseTool]]":
    """Convert BaseTool list to LLMClient usable format, with deduplication.

    Args:
        tools: BaseTool instance list (should be obtained from GroundingClient
            and bound to runtime_info); None or an empty list yields
            ``([], {})``.
        fmt: output format, "openai" for OpenAI format; any other value
            returns the raw ToolSchema objects.

    Returns:
        ``(tool_definitions, tool_map)`` where ``tool_map`` maps the name
        shown to the LLM to its BaseTool. In "openai" format, tools whose
        schema name occurs more than once are renamed
        ``<server_name>__<name>`` ("unknown" when no server name is bound),
        and a tool whose final name was already emitted is skipped.
    """
    if not tools:
        return [], {}

    if fmt != "openai":
        tool_map = {tool.schema.name: tool for tool in tools}
        return [tool.schema for tool in tools], tool_map

    # Count how often each schema name occurs to detect clashes.
    name_count = {}
    for tool in tools:
        name = tool.schema.name
        name_count[name] = name_count.get(name, 0) + 1

    result = []
    tool_map = {}  # llm_name -> BaseTool
    seen_names = set()

    for tool in tools:
        original_name = tool.schema.name

        if name_count[original_name] > 1:
            server_name = "unknown"
            if tool.is_bound and tool.runtime_info and tool.runtime_info.server_name:
                server_name = tool.runtime_info.server_name
            llm_name = f"{server_name}__{original_name}"
        else:
            llm_name = original_name

        if llm_name in seen_names:
            logger.warning(f"[TOOL_DEDUP] Skipping duplicate tool: {llm_name}")
            continue
        seen_names.add(llm_name)

        tool_param = _schema_to_openai(tool.schema)
        tool_param["function"]["name"] = llm_name
        result.append(tool_param)
        tool_map[llm_name] = tool

        if llm_name != original_name:
            logger.info(f"[TOOL_RENAME] {original_name} -> {llm_name}")

    logger.info(f"[SCHEMA_SANITIZE] Prepared {len(result)} tools for LLM (from {len(tools)} total)")
    return result, tool_map


DEFAULT_SUMMARIZE_THRESHOLD_CHARS = 200000  # ~50K tokens, lowered from 400K to prevent context overflow
MAX_TOOL_RESULT_CHARS = 200000  # Fallback truncation limit when summarization fails (~50K tokens)
async def _summarize_tool_result(
    content: str,
    tool_name: str,
    task: str = "",
    model: str = "openrouter/anthropic/claude-sonnet-4.5",
    timeout: float = 60.0
) -> Optional[str]:
    """Use LLM to summarize large tool results.

    Args:
        content: Full tool output.
        tool_name: Tool name, used in the prompt and in log messages.
        task: Optional user task; when given, the summary is focused on it.
        model: Model identifier passed to litellm.
        timeout: Per-request timeout in seconds.

    Returns:
        The summary prefixed with ``[SUMMARY of N chars]`` where N is the
        size of the original content, or None when summarization failed
        (any exception, or an empty model reply) so the caller can fall back.
    """
    original_len = len(content)
    try:
        logger.info(f"Summarizing tool result from '{tool_name}': {original_len:,} chars")

        # Pre-truncate if content is too large for the model (leave room for prompt + output)
        # Assuming ~4 chars per token, 200K tokens limit, 8K output, ~500 tokens for prompt
        # Safe input limit: (200K - 8K - 0.5K) * 4 = ~766K chars, but be conservative at 400K
        max_input_chars = 400000
        if original_len > max_input_chars:
            logger.warning(f"Pre-truncating content for summarization: {original_len:,} -> {max_input_chars:,} chars")
            content = content[:max_input_chars] + f"\n\n[TRUNCATED for summarization: original was {original_len:,} chars]"

        task_hint = f"\n\nUser's task: {task}\nSummarize with focus on information relevant to this task." if task else ""

        # Report the original size, not the size after pre-truncation.
        prompt = f"""Tool '{tool_name}' returned a large result ({original_len:,} chars). Summarize it concisely.{task_hint}

**Guidelines:**
- Structured data (coordinates, steps, etc.): Keep key summary (totals, start/end), omit repetitive details.
- Markup content (HTML, XML): Extract text and key data only, ignore tags/scripts.
- Long documents: Keep structure outline and essential sections.
- Lists/arrays: Summarize count and most relevant items.
- Always preserve: numbers, URLs, file paths, IDs, key identifiers.

Content:
{content}

Concise summary:"""

        response = await asyncio.wait_for(
            litellm.acompletion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                timeout=timeout
            ),
            timeout=timeout + 5
        )

        # The message content may be None; treat that like an empty reply.
        summary = (response.choices[0].message.content or "").strip()
        if not summary:
            logger.warning(f"Summarization failed for '{tool_name}': empty response")
            return None

        result = f"[SUMMARY of {original_len:,} chars]\n{summary}"
        logger.info(f"Tool result summarized: {original_len:,} -> {len(result):,} chars")
        return result

    except Exception as e:
        logger.warning(f"Summarization failed for '{tool_name}': {e}")
        return None


async def _tool_result_to_message_async(
    result: ToolResult,
    *,
    tool_call_id: str,
    tool_name: str,
    task: str = "",
    summarize_threshold: int = DEFAULT_SUMMARIZE_THRESHOLD_CHARS,
    summarize_model: str = "openrouter/anthropic/claude-sonnet-4.5",
    enable_summarization: bool = True
) -> Dict:
    """Convert ToolResult to LLMClient usable message format with LLM summarization for large results.

    Args:
        result: Tool execution result
        tool_call_id: OpenAI tool_call ID
        tool_name: Tool name
        task: User's original task for context-aware summarization
        summarize_threshold: If content exceeds this, use LLM summarization
        summarize_model: Model to use for summarization
        enable_summarization: Whether to enable LLM summarization

    Returns:
        OpenAI ChatCompletion tool message (text only)
    """
    if result.is_error:
        text_content = f"[ERROR] {result.error or 'unknown error'}"
    elif isinstance(result.content, str):
        text_content = result.content
    else:
        text_content = json.dumps(result.content, ensure_ascii=False, default=str)

    original_len = len(text_content)

    # Use LLM summarization if content exceeds threshold
    if original_len > summarize_threshold and enable_summarization:
        summary = await _summarize_tool_result(text_content, tool_name, task, summarize_model)
        if summary:
            text_content = summary
        elif original_len > MAX_TOOL_RESULT_CHARS:
            # Fallback: truncate if summarization failed and content is too large.
            # The marker length is subtracted so the final text stays within the limit.
            truncate_msg = f"\n\n[TRUNCATED: Original content was {original_len:,} chars, showing first {MAX_TOOL_RESULT_CHARS:,}]"
            text_content = text_content[:MAX_TOOL_RESULT_CHARS - len(truncate_msg)] + truncate_msg
            logger.warning(f"Tool result truncated for '{tool_name}': {original_len:,} -> {len(text_content):,} chars (summarization failed)")

    return {
        "role": "tool",
        "name": tool_name,
        "content": text_content,
        "tool_call_id": tool_call_id,
    }


async def _execute_tool_call(
    tool: BaseTool,
    openai_tool_call: Dict,
) -> ToolResult:
    """Execute LLMClient returned tool_call

    Args:
        tool: BaseTool instance (must be obtained from GroundingClient and bound to runtime_info)
        openai_tool_call: LLMClient usable tool_call object, contains id, type, function etc. fields

    Raises:
        ValueError: If the tool is not bound to runtime_info.
        json.JSONDecodeError: If the arguments string is not valid JSON.
    """
    if not tool.is_bound:
        raise ValueError(
            f"Tool '{tool.schema.name}' is not bound to runtime_info. "
            f"Please ensure tools are obtained from GroundingClient.list_tools() "
            f"with bind_runtime_info=True"
        )

    func = openai_tool_call["function"]
    arguments = func.get("arguments", "{}")
    if isinstance(arguments, str):
        arguments = json.loads(arguments or "{}")
    if arguments is None:
        # A JSON "null" payload means "no arguments".
        arguments = {}

    # Filter out parameters that are not in the tool's schema
    if isinstance(arguments, dict) and tool.schema.parameters:
        # Get valid parameter names from tool schema (JSON Schema format)
        schema_params = tool.schema.parameters
        valid_params = set()
        if isinstance(schema_params, dict) and "properties" in schema_params:
            valid_params = set(schema_params["properties"].keys())

        # "skip_visual_analysis" is always dropped; other names are dropped
        # only when the schema declares properties and does not list them.
        invalid_params = [
            param_name
            for param_name in arguments
            if param_name == "skip_visual_analysis"
            or (valid_params and param_name not in valid_params)
        ]

        for param in invalid_params:
            arguments.pop(param)
            logger.debug(
                f"Removed parameter '{param}' from {tool.schema.name} "
                f"(not in tool schema)"
            )

    return await tool.invoke(
        parameters=arguments,
        keep_session=True
    )
async def _call_with_retry(self, **completion_kwargs):
    """Call the LLM, retrying on timeouts, rate limits and server errors.

    Timeout and retry strategy:
    - Single call timeout: ``self.timeout`` seconds, enforced with
      ``asyncio.wait_for``.
    - Timeout retries wait a fixed ``self.retry_delay`` seconds.
    - Rate limit retries wait 60s, 90s, 120s, ... (60 + 30 * attempt).
    - Server overload retries wait 5s, 10s, 20s, ... capped at 60s.
    - Any other exception is re-raised immediately.

    At least one attempt is always made, even if ``self.max_retries`` is
    zero or negative.

    Args:
        **completion_kwargs: Passed through to ``litellm.acompletion``.

    Returns:
        The response object returned by ``litellm.acompletion``.

    Raises:
        TimeoutError: If the final attempt timed out.
        Exception: The last error raised by the completion call when it is
            not retryable or the attempts are exhausted.
    """
    # With max_retries <= 0 the loop would never run and the trailing
    # `raise last_exception` would raise None (a TypeError).
    max_attempts = max(1, self.max_retries)
    last_exception = None

    for attempt in range(max_attempts):
        is_last_attempt = attempt >= max_attempts - 1
        try:
            # Add timeout to the completion call
            response = await asyncio.wait_for(
                litellm.acompletion(**completion_kwargs),
                timeout=self.timeout
            )
            return response
        except asyncio.TimeoutError:
            self._logger.error(
                f"LLM call timed out after {self.timeout}s (attempt {attempt + 1}/{max_attempts})"
            )
            last_exception = TimeoutError(f"LLM call timed out after {self.timeout}s")
            if is_last_attempt:
                raise last_exception
            # Retry on timeout with the (shorter) configured delay
            self._logger.info(f"Retrying after {self.retry_delay}s delay...")
            await asyncio.sleep(self.retry_delay)
        except Exception as e:
            last_exception = e
            # Classification is a substring heuristic over the error text.
            error_str = str(e).lower()
            is_rate_limit = any(
                keyword in error_str
                for keyword in ('rate limit', 'rate_limit', 'too many requests', '429')
            )
            is_overloaded = any(
                keyword in error_str
                for keyword in ('overloaded', '500', '502', '503', '504',
                                'internal server error', 'service unavailable')
            )

            if is_last_attempt or not (is_rate_limit or is_overloaded):
                # Not a retryable error, or max retries reached
                if is_last_attempt:
                    self._logger.error(f"Max retries ({max_attempts}) reached, giving up")
                raise

            if is_rate_limit:
                # Longer backoff so the retry lands in a new rate-limit window
                backoff_delay = 60 + (attempt * 30)  # 60s, 90s, 120s
                error_type = "Rate limit"
            else:
                # Exponential backoff for server errors
                backoff_delay = min(5 * (2 ** attempt), 60)  # 5s, 10s, 20s, max 60s
                error_type = "Server overload"

            self._logger.warning(
                f"{error_type} error (attempt {attempt + 1}/{max_attempts}), "
                f"waiting {backoff_delay}s before retry..."
            )
            await asyncio.sleep(backoff_delay)

    # Defensive: every path above returns or raises on the last attempt.
    raise last_exception
async def complete(
    self,
    messages: List[Dict] | str,
    tools: List[BaseTool] | None = None,
    execute_tools: bool = True,
    summary_prompt: Optional[str] = None,
    tool_result_callback: Optional[callable] = None,
    **kwargs
) -> Dict:
    """
    Single-round LLM call with optional tool execution.

    Args:
        messages: conversation history (List[Dict] for standard OpenAI format, or str for text format)
        tools: BaseTool instance list (must be obtained from GroundingClient and bound to runtime_info)
               if None or empty list, only perform conversation, no tools
        execute_tools: if LLM returns tool_calls, whether to automatically execute tools
        summary_prompt: Optional custom prompt for requesting iteration summary.
                       If provided, will request summary after tool execution.
                       If None, no summary will be requested.
        tool_result_callback: Optional async callback to process tool results after execution.
                             Signature: async def callback(result: ToolResult, tool_name: str, tool_call: Dict, backend: str) -> ToolResult
                             Only called for non-error results; a failing callback is logged and ignored.
        **kwargs: additional parameters for litellm completion. The keys read here are
                 ``model``, ``enable_thinking``, ``tool_choice`` and ``reasoning_effort``.

    Returns:
        Dict with keys ``message`` (assistant message), ``tool_results`` (one entry per
        executed tool call), ``messages`` (updated conversation), ``has_tool_calls``
        and ``iteration_summary`` (str, or None when no summary was requested).

    Raises:
        ValueError: if ``messages`` is neither a list nor a str, or the LLM response has no choices.
    """
    # 1. Process messages
    if isinstance(messages, str):
        current_messages = [{"role": "user", "content": messages}]
        user_task = messages
    elif isinstance(messages, list):
        current_messages = messages.copy()
        # Extract first user message as task for context-aware summarization
        user_task = next(
            (m.get("content", "") for m in messages if m.get("role") == "user"),
            ""
        )
    else:
        raise ValueError("messages must be List[Dict] or str")

    # 2. prepare base litellm completion kwargs
    completion_kwargs = {
        "model": kwargs.get("model", self.model),
        **self.litellm_kwargs,
    }

    # Add thinking/reasoning_effort only if explicitly enabled and not using tools
    enable_thinking = kwargs.get("enable_thinking", self.enable_thinking)

    # 3. if tools are provided, add them to the request
    llm_tools = None
    tool_map = {}  # llm_name -> BaseTool
    if tools:
        llm_tools, tool_map = _prepare_tools_for_llmclient(tools, fmt="openai")
        if llm_tools:
            completion_kwargs["tools"] = llm_tools
            completion_kwargs["tool_choice"] = kwargs.get("tool_choice", "auto")
            # Disable thinking when using tools to avoid format conflicts
            enable_thinking = False
            self._logger.debug(f"Prepared {len(llm_tools)} tools for LLM")
        else:
            self._logger.warning("Tools provided but none could be prepared for LLM")

    # Add thinking parameters if enabled
    if enable_thinking:
        completion_kwargs["reasoning_effort"] = kwargs.get("reasoning_effort", "medium")

    # 4. Apply rate limiting
    await self._rate_limit()

    # 5. Call LLM with retry (single round)
    completion_kwargs["messages"] = current_messages
    response = await self._call_with_retry(**completion_kwargs)

    if not response.choices:
        raise ValueError("LLM response has no choices")

    response_message = response.choices[0].message

    # 6. Build assistant message
    assistant_message = {
        "role": "assistant",
        "content": response_message.content or "",
    }

    tool_calls = getattr(response_message, 'tool_calls', None)
    if tool_calls:
        assistant_message["tool_calls"] = [
            {
                "id": tc.id,
                "type": "function",
                "function": {
                    "name": tc.function.name,
                    "arguments": tc.function.arguments
                }
            }
            for tc in tool_calls
        ]

    # Add assistant message to conversation
    current_messages.append(assistant_message)

    # 7. Execute tools if requested
    tool_results = []
    if execute_tools and tool_calls and tools:
        self._logger.info(f"Executing {len(tool_calls)} tool calls...")

        for tool_call in tool_calls:
            tool_name = tool_call.function.name

            # Extract tool metadata (best effort; never blocks execution)
            tool_obj = tool_map.get(tool_name)
            backend = None
            server_name = None
            if tool_obj:
                try:
                    # Prefer runtime_info if bound
                    if getattr(tool_obj, 'is_bound', False) and getattr(tool_obj, 'runtime_info', None):
                        backend = tool_obj.runtime_info.backend.value
                        server_name = tool_obj.runtime_info.server_name
                    else:
                        backend = tool_obj.backend_type.value if hasattr(tool_obj, 'backend_type') else None
                except Exception as e:
                    self._logger.debug(f"Could not read metadata for tool {tool_name}: {e}")

            # Log tool execution. Parsing is only for pretty-printing, so on
            # failure fall back to the raw arguments instead of dropping the
            # log line (and without swallowing KeyboardInterrupt/SystemExit).
            raw_args = tool_call.function.arguments
            try:
                if isinstance(raw_args, str):
                    args = json.loads(raw_args.strip() or "{}")
                else:
                    args = raw_args
                args_str = json.dumps(args, ensure_ascii=False)[:200]
            except Exception:
                args_str = str(raw_args)[:200]
            self._logger.info(f"Calling {tool_name} with args: {args_str}")

            if tool_name not in tool_map:
                result = ToolResult(
                    status=ToolStatus.ERROR,
                    error=f"Tool '{tool_name}' not found"
                )
            else:
                try:
                    result = await _execute_tool_call(
                        tool=tool_map[tool_name],
                        openai_tool_call={
                            "id": tool_call.id,
                            "type": "function",
                            "function": {
                                "name": tool_call.function.name,
                                "arguments": tool_call.function.arguments
                            }
                        }
                    )

                    # Apply tool result callback if provided
                    if tool_result_callback and not result.is_error:
                        try:
                            result = await tool_result_callback(
                                result=result,
                                tool_name=tool_name,
                                tool_call=tool_call,
                                backend=backend
                            )
                        except Exception as e:
                            self._logger.warning(f"Tool result callback failed for {tool_name}: {e}")

                except Exception as e:
                    # A failing tool becomes an error result so the remaining
                    # tool calls in this round still run.
                    result = ToolResult(
                        status=ToolStatus.ERROR,
                        error=str(e)
                    )

            # Use async version with LLM summarization for large results
            tool_message = await _tool_result_to_message_async(
                result,
                tool_call_id=tool_call.id,
                tool_name=tool_name,
                task=user_task,
                summarize_threshold=self.summarize_threshold_chars,
                summarize_model=self.model,
                enable_summarization=self.enable_tool_result_summarization
            )
            current_messages.append(tool_message)

            # Store result
            tool_results.append({
                "tool_call": tool_call,
                "result": result,
                "message": tool_message,
                "backend": backend,
                "server_name": server_name,
            })

        self._logger.info(f"Tool execution completed, {len(tool_results)} tools executed")

    # 8. Request summary if provided and tools were executed
    iteration_summary = None
    if summary_prompt and tool_results:
        self._logger.debug("Requesting iteration summary from LLM")

        summary_message = {
            "role": "system",
            "content": summary_prompt
        }
        current_messages.append(summary_message)

        # Apply rate limiting before summary call
        await self._rate_limit()

        # Call LLM to generate summary (without tools). Reuse the model that
        # served the main request so a per-call `model` override is honoured.
        summary_kwargs = {
            **self.litellm_kwargs,
            "model": completion_kwargs["model"],
            "messages": current_messages,
            "tools": [],
            "tool_choice": "none",
        }

        summary_response = await self._call_with_retry(**summary_kwargs)

        if summary_response.choices:
            summary_message = summary_response.choices[0].message
            iteration_summary = summary_message.content or ""

            # Add summary response to messages
            current_messages.append({
                "role": "assistant",
                "content": iteration_summary
            })

            self._logger.debug(f"Generated iteration summary: {iteration_summary[:100]}...")

    # 9. Return single-round result
    return {
        "message": assistant_message,
        "tool_results": tool_results,
        "messages": current_messages,
        "has_tool_calls": bool(tool_calls),
        "iteration_summary": iteration_summary
    }
summarize_model=self.model, enable_summarization=self.enable_tool_result_summarization ) current_messages.append(tool_message) # Store result tool_results.append({ "tool_call": tool_call, "result": result, "message": tool_message, "backend": backend, "server_name": server_name, }) self._logger.info(f"Tool execution completed, {len(tool_results)} tools executed") # 8. Request summary if provided and tools were executed iteration_summary = None if summary_prompt and tool_results: self._logger.debug("Requesting iteration summary from LLM") summary_message = { "role": "system", "content": summary_prompt } current_messages.append(summary_message) # Apply rate limiting before summary call await self._rate_limit() # Call LLM to generate summary (without tools) summary_kwargs = { **self.litellm_kwargs, "model": self.model, "messages": current_messages, "tools": [], "tool_choice": "none", } summary_response = await self._call_with_retry(**summary_kwargs) if summary_response.choices: summary_message = summary_response.choices[0].message iteration_summary = summary_message.content or "" # Add summary response to messages current_messages.append({ "role": "assistant", "content": iteration_summary }) self._logger.debug(f"Generated iteration summary: {iteration_summary[:100]}...") # 9. Return single-round result return { "message": assistant_message, "tool_results": tool_results, "messages": current_messages, "has_tool_calls": bool(tool_calls), "iteration_summary": iteration_summary } @staticmethod def format_messages_to_text(messages: List[Dict]) -> str: """Format conversation history to readable text (for logging/debugging)""" formatted = "" for msg in messages: role = msg.get("role", "unknown").upper() content = msg.get("content", "") formatted += f"[{role}]\n{content}\n\n" return formatted ================================================ FILE: anytool/local_server/README.md ================================================ # AnyTool Local Server (Desktop Version) ## 1. 
Introduction

The AnyTool Local Server is a **lightweight, cross-platform** Flask service that runs on the host workstation and exposes a uniform HTTP interface for controlling the native desktop environment. By translating REST calls into deterministic GUI actions (mouse and keyboard synthesis, window management, screenshot capture, file I/O), it enables higher-level AnyTool agents to interact with real software instead of simulated environments.

**Supported platforms:** Windows 10/11, macOS 11+ (Intel & Apple Silicon), and mainstream Linux distributions (X11/Wayland).

## 2. System Architecture

* **PlatformAdapter** abstracts OS-specific primitives (Windows, macOS, Linux).
* **Accessibility Helper** queries the UI accessibility tree for semantic information.
* **Screenshot Helper** captures full or partial screenshots (PNG).
* **Recorder** streams screen recordings for offline analysis.
* **Health / Feature Checker** validates runtime capabilities and permissions.

## 3. REST Endpoints

| Path | Method | Semantics |
|------|--------|-----------|
| `/` | GET | Liveness probe |
| `/platform` | GET | Return host OS metadata |
| `/execute` | POST | Execute a PyAutoGUI script fragment |
| `/execute_with_verification` | POST | Execute fragment and verify via template matching |
| `/run_python` | POST | Run arbitrary Python within a sandbox |
| `/run_bash_script` | POST | Run shell script (optional conda activation) |
| `/screenshot` | GET | Return PNG screenshot (full or ROI) |
| `/cursor_position` | GET | Current mouse coordinates |
| `/screen_size` | GET/POST | Query or set virtual screen resolution |
| `/list_directory` | POST | List directory contents |

See `main.py` for ~20 additional endpoints.

## 4. Setup & Launch

> [!NOTE]
> Prerequisites:
> - Python 3.12
> - Accessibility / screen-recording permissions (macOS: *System Settings ▸ Privacy & Security*)
### Dependency Installation ```bash cd anytool/local_server pip install -r requirements.txt ``` ### Launching the Server *Python entry point* ```bash python -m anytool.local_server.main \ --host 127.0.0.1 --port 5000 # flags optional; defaults read from config.json ``` *Bash helper script* ```bash ./run.sh # reads config.json then starts the service ``` Press `Ctrl+C` at any time to gracefully stop the server. --- ## 5. Configuration Runtime options live in `config.json`: ```json { "server": { "host": "127.0.0.1", // listening address (0.0.0.0 for all interfaces) "port": 5000, // default port "debug": false // verbose Flask logs } } ``` ================================================ FILE: anytool/local_server/__init__.py ================================================ from .main import app, run_server __all__ = ["app", "run_server"] ================================================ FILE: anytool/local_server/config.json ================================================ { "server": { "host": "127.0.0.1", "port": 5000, "debug": false, "threaded": true } } ================================================ FILE: anytool/local_server/feature_checker.py ================================================ import platform import subprocess import tempfile from typing import Dict, Any from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) platform_name = platform.system() class FeatureChecker: def __init__(self, platform_adapter=None, accessibility_helper=None): self.platform_adapter = platform_adapter self.accessibility_helper = accessibility_helper self.platform = platform_name self._cache = {} def check_screenshot_available(self, use_cache: bool = True) -> bool: if use_cache and 'screenshot' in self._cache: return self._cache['screenshot'] try: import pyautogui from PIL import Image size = pyautogui.size() result = size.width > 0 and size.height > 0 self._cache['screenshot'] = result logger.info(f"Screenshot check: {'available' if result else 
def check_python_available(self, use_cache: bool = True) -> bool:
    """Report whether a Python interpreter can be launched on this host.

    Runs ``<cmd> --version`` for each candidate command (``py``, ``python``,
    ``python3`` on Windows; ``python3``, ``python`` elsewhere) and stops at
    the first one that exits with code 0. The outcome is stored in the
    feature cache under ``'python'``.

    Args:
        use_cache: Return the cached outcome when one exists.

    Returns:
        True if any candidate interpreter ran successfully, else False.
    """
    if use_cache:
        try:
            return self._cache['python']
        except KeyError:
            pass  # nothing cached yet, probe below

    if self.platform == "Windows":
        candidates = ('py', 'python', 'python3')
    else:
        candidates = ('python3', 'python')

    for candidate in candidates:
        try:
            proc = subprocess.run(
                [candidate, '--version'],
                capture_output=True,
                timeout=2,
                text=True
            )
            if proc.returncode != 0:
                continue
            # The version string may arrive on either stream.
            version = proc.stdout.strip() or proc.stderr.strip()
            self._cache['python'] = True
            logger.info(f"Python check: available ({candidate} - {version})")
            return True
        except FileNotFoundError:
            # Command not installed: try the next candidate.
            continue
        except Exception as e:
            logger.debug(f"Error testing {candidate}: {e}")
            continue

    logger.warning("Python check failed - no valid Python interpreter found")
    self._cache['python'] = False
    return False
def check_window_mgmt_available(self, use_cache: bool = True) -> bool:
    """Report whether the platform adapter offers any window-management call.

    The feature counts as available when the adapter exposes at least one of
    ``activate_window``, ``close_window`` or ``list_windows``. The outcome is
    stored in the feature cache under ``'window_mgmt'``.

    Args:
        use_cache: Return the cached outcome when one exists.

    Returns:
        True if at least one of the methods is present; False when no adapter
        is loaded, none of the methods exist, or the probe itself fails.
    """
    not_cached = object()
    if use_cache:
        cached = self._cache.get('window_mgmt', not_cached)
        if cached is not not_cached:
            return cached

    try:
        if not self.platform_adapter:
            logger.warning("Window management check failed - no platform adapter loaded")
            self._cache['window_mgmt'] = False
            return False

        available_methods = []
        for method in ('activate_window', 'close_window', 'list_windows'):
            if hasattr(self.platform_adapter, method):
                available_methods.append(method)

        available = len(available_methods) > 0
        self._cache['window_mgmt'] = available

        if not available:
            logger.warning(f"Window management check failed - platform adapter missing required methods")
        else:
            logger.info(f"Window management check: {'available' if available else 'unavailable'} - supported methods: {', '.join(available_methods)}")
        return available
    except Exception as e:
        logger.error(f"Window management check failed: {e}")
        self._cache['window_mgmt'] = False
        return False
def check_accessibility_available(self, use_cache: bool = True) -> bool:
    """Report whether the accessibility helper is loaded and usable.

    Delegates to ``accessibility_helper.is_available()`` and stores the
    outcome in the feature cache under ``'accessibility'``.

    Args:
        use_cache: Return the cached outcome when one exists.

    Returns:
        The helper's availability; False when no helper is loaded or the
        helper raises.
    """
    cache = self._cache
    if use_cache and 'accessibility' in cache:
        return cache['accessibility']

    try:
        helper = self.accessibility_helper
        if helper:
            available = helper.is_available()
            self._cache['accessibility'] = available
            logger.info(f"Accessibility check: {'available' if available else 'unavailable'}")
            return available

        logger.warning("Accessibility check failed - no accessibility helper loaded")
        self._cache['accessibility'] = False
        return False
    except Exception as e:
        logger.error(f"Accessibility check failed: {e}")
        self._cache['accessibility'] = False
        return False
class HealthStatus:
    """Result of one health check: feature availability plus endpoint outcome.

    Attributes are plain instance fields:
      - feature_available: whether the underlying feature is usable.
      - endpoint_available: True/False for a tested endpoint, or None when
        the endpoint was not tested.
      - endpoint_detail: short human-readable detail shown by ``__str__``.
    """

    def __init__(self, feature_available: bool, endpoint_available: Optional[bool], endpoint_detail: str = ""):
        self.feature_available = feature_available
        self.endpoint_available = endpoint_available
        self.endpoint_detail = endpoint_detail

    @property
    def fully_available(self) -> bool:
        """Fully available: feature and endpoint are available.

        Always returns a real bool; an untested endpoint (None) counts as
        not available.
        """
        return bool(self.feature_available and self.endpoint_available)

    def __str__(self):
        if not self.feature_available:
            return "Feature N/A"
        elif self.endpoint_available is None:
            return "Feature OK (endpoint not tested)"
        elif self.endpoint_available:
            return f"OK ({self.endpoint_detail})"
        else:
            return f"Endpoint failed: {self.endpoint_detail}"

    def __repr__(self):
        return (
            f"{type(self).__name__}(feature_available={self.feature_available!r}, "
            f"endpoint_available={self.endpoint_available!r}, "
            f"endpoint_detail={self.endpoint_detail!r})"
        )
def cleanup_temp_files(self):
    """Clean up all temporary test files.

    Does nothing except log when ``auto_cleanup`` is disabled. Otherwise it
    removes every registered file that still exists, clears the
    registration list, and removes the test output directory if that leaves
    it empty. Filesystem errors are logged and never raised.
    """
    if not self.auto_cleanup:
        logger.info(f"Auto-cleanup disabled. Test files kept in: {self.test_output_dir}")
        return

    cleaned = 0
    for filepath in self.temp_files:
        try:
            if os.path.exists(filepath):
                os.remove(filepath)
                cleaned += 1
                logger.debug(f"Cleaned up: {filepath}")
        except OSError as e:
            logger.warning(f"Failed to clean up {filepath}: {e}")

    self.temp_files.clear()

    # If the directory is empty, delete it. This is best effort: only
    # filesystem errors are ignored, so Ctrl+C / SystemExit still propagate.
    try:
        if self.test_output_dir.exists() and not any(self.test_output_dir.iterdir()):
            self.test_output_dir.rmdir()
            logger.debug(f"Removed empty directory: {self.test_output_dir}")
    except OSError as e:
        logger.debug(f"Could not remove {self.test_output_dir}: {e}")

    if cleaned > 0:
        logger.info(f"Cleaned up {cleaned} test files")
def check_cursor_position(self) -> Tuple[bool, str]:
    """Probe the ``/cursor_position`` endpoint.

    Skipped with ``"Feature N/A"`` when the screenshot feature check fails.

    Returns:
        ``(True, "(x, y)")`` when the JSON reply holds both coordinates,
        otherwise ``(False, reason)`` where reason is the HTTP status,
        ``"Invalid response"`` or the first 30 characters of the error.
    """
    if not self.feature_checker.check_screenshot_available():
        return False, "Feature N/A"

    try:
        reply = requests.get(f"{self.base_url}/cursor_position", timeout=5)
        if reply.status_code == 200:
            payload = reply.json()
            has_coords = 'x' in payload and 'y' in payload
            if not has_coords:
                return False, "Invalid response"
            return True, f"({payload['x']}, {payload['y']})"
        return False, f"HTTP {reply.status_code}"
    except Exception as e:
        return False, str(e)[:30]
def check_python_execution(self) -> Tuple[bool, str]:
    """Functionally test the ``/run_python`` endpoint.

    Posts a one-line script that prints a marker and looks for that marker
    in the ``content`` field of the JSON reply. Skipped with
    ``"Feature N/A"`` when the Python feature check fails.

    Returns:
        ``(True, "Python executed")`` when the marker is echoed back,
        otherwise ``(False, reason)``.
    """
    python_ready = self.feature_checker.check_python_available()
    if not python_ready:
        return False, "Feature N/A"

    try:
        payload = {"code": 'print("test_output_123")'}
        reply = requests.post(
            f"{self.base_url}/run_python",
            json=payload,
            timeout=5
        )
        if reply.status_code != 200:
            return False, f"HTTP {reply.status_code}"

        # Verify Python executed correctly
        printed = reply.json().get('content', '')
        if 'test_output_123' not in printed:
            return False, "Execution failed"
        return True, "Python executed"
    except Exception as e:
        return False, str(e)[:30]
def check_window_management(self) -> Tuple[bool, str]:
    """Probe the ``/setup/activate_window`` endpoint.

    Asks the server to activate a window that is not expected to exist, so
    only the endpoint's responsiveness is judged. Skipped with
    ``"Feature N/A"`` when the window-management feature check fails.

    Returns:
        ``(True, "API available")`` for HTTP 200, 404 or 501, otherwise
        ``(False, reason)``.
    """
    if not self.feature_checker.check_window_mgmt_available():
        return False, "Feature N/A"

    # 200 (success), 404 (not found), 501 (not supported) are all acceptable
    acceptable = (200, 404, 501)
    try:
        reply = requests.post(
            f"{self.base_url}/setup/activate_window",
            json={"window_name": "NonExistentWindow"},
            timeout=5
        )
        code = reply.status_code
        if code not in acceptable:
            return False, f"HTTP {reply.status_code}"
        return True, "API available"
    except Exception as e:
        return False, str(e)[:30]
def check_all(self, test_endpoints: bool = True) -> Dict[str, HealthStatus]:
    """
    Check all features with functional testing

    Args:
        test_endpoints: Whether to test endpoints (False only checks features)

    Returns:
        {Feature name: HealthStatus}. With ``test_endpoints=False`` the keys
        are the feature names reported by the feature checker and every
        ``endpoint_available`` is None; otherwise the keys are the display
        names of the functional tests below. The same mapping is stored on
        ``self.results``.
    """
    if not test_endpoints:
        # Only check features, not endpoints
        feature_results = self.feature_checker.check_all_features()
        results = {
            name: HealthStatus(available, None, "")
            for name, available in feature_results.items()
        }
        self.results = results
        return results

    # Functional tests, run in this order
    test_functions = {
        'Health Check': self.check_health_endpoint,
        'Platform Info': self.check_platform_info,
        'Screenshot': self.check_screenshot,
        'Cursor Position': self.check_cursor_position,
        'Screen Size': self.check_screen_size,
        'Shell Command': self.check_shell_command,
        'Python Execution': self.check_python_execution,
        'Bash Script': self.check_bash_script,
        'File Operations': self.check_file_operations,
        'Desktop Path': self.check_desktop_path,
        'Window Management': self.check_window_management,
        'Recording': self.check_recording,
        'Accessibility': self.check_accessibility,
    }

    results = {}
    try:
        for name, test_func in test_functions.items():
            success, detail = test_func()

            # Each test signals a missing feature with the "Feature N/A"
            # detail; in that case the endpoint was never contacted.
            if detail == "Feature N/A":
                feature_available = False
                endpoint_available = None
            else:
                feature_available = True
                endpoint_available = success

            results[name] = HealthStatus(feature_available, endpoint_available, detail)
    finally:
        # Clean up temporary files even if the run is interrupted part-way
        # (e.g. Ctrl+C during the recording test).
        self.cleanup_temp_files()

    self.results = results
    return results
dot instead of emoji if status.fully_available: icon = _c("●", 'g') elif not status.feature_available: icon = _c("●", 'rd') elif status.endpoint_available is None: icon = _c("●", 'y') else: icon = _c("●", 'y') text = _c(name, 'gr' if not status.feature_available else '') basic_items.append((icon, text, status)) # Display in rows of 4 for i in range(0, len(basic_items), 4): line_items = [] for j in range(4): if i + j < len(basic_items): icon, text, status = basic_items[i + j] line_items.append(f"{icon} {text:<15}") print(" " + " ".join(line_items)) # Show details if requested if show_endpoint_details: for name in basic: if name in results: status = results[name] print(f" {_c('·', 'gr')} {name}: {_c(str(status), 'gr')}") # Advanced Features print() print(_c(" - Advanced", 'c', bold=True)) advanced_items = [] for name, status in results.items(): if name not in basic: # Use colored dot instead of emoji if status.fully_available: icon = _c("●", 'g') elif not status.feature_available: icon = _c("●", 'rd') elif status.endpoint_available is None: icon = _c("●", 'y') else: icon = _c("●", 'y') text = _c(name, 'gr' if not status.feature_available else '') advanced_items.append((icon, text, status)) # Display in rows of 4 for i in range(0, len(advanced_items), 4): line_items = [] for j in range(4): if i + j < len(advanced_items): icon, text, _ = advanced_items[i + j] line_items.append(f"{icon} {text:<15}") print(" " + " ".join(line_items)) # Show details if requested if show_endpoint_details: for name, status in results.items(): if name not in basic: print(f" {_c('·', 'gr')} {name}: {_c(str(status), 'gr')}") # Summary from anytool.utils.display import print_separator print() print_separator() print(f" {_c('Summary:', 'c', bold=True)} {_c(str(feature_available) + '/' + str(total), 'g' if feature_available == total else 'y')} features available", end='') if any(s.endpoint_available is not None for s in results.values()): print(f", {_c(str(fully_available) + '/' + str(total), 'g' 
def get_summary(self) -> dict:
    """Summarise the stored check results as a plain dictionary.

    Returns:
        An empty dict when ``self.results`` is empty; otherwise a dict with
        the number of entries (``total``), how many have a truthy
        ``feature_available`` / ``fully_available`` attribute, and a
        ``details`` mapping of each name to ``str()`` of its status.
    """
    results = self.results
    if not results:
        return {}

    statuses = list(results.values())
    summary = {'total': len(results)}
    summary['feature_available'] = len([st for st in statuses if st.feature_available])
    summary['fully_available'] = len([st for st in statuses if st.fully_available])
    summary['details'] = {name: str(status) for name, status in results.items()}
    return summary
def get_conda_activation_prefix(conda_env: str = None) -> str:
    """
    Generate platform-specific conda activation command prefix

    The prefix is meant to be prepended to a shell command line; it always
    ends with ``' && '`` so the real command can follow directly.

    Args:
        conda_env: Conda environment name (e.g., 'myenv')

    Returns:
        Activation command prefix string, empty if no conda_env.
        Note that a non-empty ``conda_env`` always yields a non-empty prefix:
        when no known conda installation is found the function falls back to
        a bare ``conda activate ...`` and assumes conda is on PATH.
    """
    if not conda_env:
        return ""

    if platform_name == "Windows":
        # Windows: use conda.bat or conda.exe
        # Try common conda installation paths
        candidates = [
            os.path.expandvars("%USERPROFILE%\\miniconda3\\Scripts\\activate.bat"),
            os.path.expandvars("%USERPROFILE%\\anaconda3\\Scripts\\activate.bat"),
            "C:\\ProgramData\\Miniconda3\\Scripts\\activate.bat",
            "C:\\ProgramData\\Anaconda3\\Scripts\\activate.bat",
        ]
        activate_script = next((p for p in candidates if os.path.exists(p)), None)

        # NOTE(review): cmd.exe quoting rules differ from POSIX shells, so the
        # environment name is passed through unchanged on Windows.
        if activate_script:
            return f'call "{activate_script}" {conda_env} && '
        # Fallback: assume conda is in PATH
        return f'conda activate {conda_env} && '

    # Linux/macOS: source conda.sh then activate.
    # The prefix is executed by a shell, so quote the environment name; names
    # or paths containing spaces or shell metacharacters would otherwise split
    # into extra words or commands. Plain names are left untouched by quote().
    env_arg = shlex.quote(conda_env)
    candidates = [
        os.path.expanduser("~/miniconda3/etc/profile.d/conda.sh"),
        os.path.expanduser("~/anaconda3/etc/profile.d/conda.sh"),
        "/opt/conda/etc/profile.d/conda.sh",
        "/usr/local/miniconda3/etc/profile.d/conda.sh",
        "/usr/local/anaconda3/etc/profile.d/conda.sh",
    ]
    conda_sh = next((p for p in candidates if os.path.exists(p)), None)

    if conda_sh:
        return f'source "{conda_sh}" && conda activate {env_arg} && '
    # Fallback: assume conda is already initialized in shell
    return f'conda activate {env_arg} && '
def wrap_script_with_conda(script: str, conda_env: str = None) -> str:
    """
    Wrap script with conda activation command.

    Args:
        script: Shell script text to run.
        conda_env: Conda environment name; falsy means "no wrapping".

    Returns:
        On Windows, the activation prefix from get_conda_activation_prefix()
        followed by the script. On Linux/macOS, a bash script that sources
        conda.sh, activates the environment (activation failures are ignored)
        and then runs the user script. If conda is not available, returns
        original script without conda activation.
    """
    if not conda_env:
        return script

    if platform_name == "Windows":
        activation_prefix = get_conda_activation_prefix(conda_env)
        return f"{activation_prefix}{script}"

    # Same search order as before, followed by the system-wide locations that
    # get_conda_activation_prefix() also checks, so both helpers agree on
    # whether conda is installed.
    candidates = [
        os.path.expanduser("~/miniconda3/etc/profile.d/conda.sh"),
        os.path.expanduser("~/anaconda3/etc/profile.d/conda.sh"),
        os.path.expanduser("~/opt/anaconda3/etc/profile.d/conda.sh"),
        "/opt/conda/etc/profile.d/conda.sh",
        "/usr/local/miniconda3/etc/profile.d/conda.sh",
        "/usr/local/anaconda3/etc/profile.d/conda.sh",
    ]
    conda_sh = next((p for p in candidates if os.path.exists(p)), None)

    if not conda_sh:
        # Conda not found - log warning and execute script directly without conda
        logger.warning(f"Conda environment '{conda_env}' requested but conda not found. Executing with system Python.")
        return script

    # The name lands inside a bash script: quote it so names/paths with spaces
    # or metacharacters stay a single argument. Plain names are unchanged.
    quoted_env = shlex.quote(conda_env)
    return f"""#!/bin/bash
# Initialize conda
if [ -f "{conda_sh}" ]; then
    . "{conda_sh}"
    conda activate {quoted_env} 2>/dev/null || true
fi

# Run user script
{script}
"""
def _window_is_listed(window_name):
    """Return True when an open window's title contains *window_name* (case-insensitive)."""
    needle = window_name.lower()
    if platform_name == 'Linux':
        listing = subprocess.run(
            ['wmctrl', '-l'],
            capture_output=True,
            text=True,
            check=True
        )
        return needle in listing.stdout.lower()
    if platform_adapter:
        # Use platform adapter to check window existence
        windows = platform_adapter.list_windows() if hasattr(platform_adapter, 'list_windows') else []
        return any(needle in str(w).lower() for w in windows)
    # No way to inspect windows here; the check is treated as satisfied.
    return True


def _verification_passed(verification):
    """Evaluate every criterion in *verification* once; any error counts as a failure."""
    passed = True

    # Check window existence if specified
    if 'window_exists' in verification:
        try:
            if not _window_is_listed(verification['window_exists']):
                passed = False
        except Exception:
            passed = False

    # Check command execution if specified. Deliberately not short-circuited:
    # the verify command runs on every round even if the window check failed.
    if 'command_success' in verification:
        try:
            verify_result = subprocess.run(
                verification['command_success'],
                shell=True,
                capture_output=True,
                text=True,
                timeout=5
            )
            if verify_result.returncode != 0:
                passed = False
        except Exception:
            passed = False

    return passed


@app.route('/execute_with_verification', methods=['POST'])
@app.route('/setup/execute_with_verification', methods=['POST'])
def execute_command_with_verification():
    """Execute command and verify the result based on provided verification criteria

    Request JSON fields:
        command: list of arguments, or a string (split with shlex unless
            ``shell`` is true).
        shell: run the command through the shell (default False).
        verification: optional dict with ``window_exists`` (title substring)
            and/or ``command_success`` (shell command that must exit 0).
        max_wait_time: seconds to keep polling the verification (default 10).
        check_interval: seconds between polls (default 1).

    Returns:
        200 with the command output when no verification is requested or the
        verification passes; 500 with ``status: verification_failed`` when it
        does not pass in time; 500 with ``status: error`` on any exception.
    """
    data = request.json or {}
    shell = data.get('shell', False)
    command = data.get('command', "" if shell else [])
    verification = data.get('verification', {})
    max_wait_time = data.get('max_wait_time', 10)  # Maximum wait time in seconds
    check_interval = data.get('check_interval', 1)  # Check interval in seconds

    if isinstance(command, str) and not shell:
        command = shlex.split(command)

    # Expand user directory (non-string arguments are passed through untouched
    # so that subprocess reports them inside the try block below)
    if isinstance(command, list):
        command = [
            os.path.expanduser(arg) if isinstance(arg, str) and arg.startswith("~/") else arg
            for arg in command
        ]

    run_kwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'shell': shell,
        'text': True,
        'timeout': 120,
    }
    if platform_name == "Windows":
        # Avoid flashing a console window for every command.
        run_kwargs['creationflags'] = subprocess.CREATE_NO_WINDOW

    # Execute the main command
    try:
        result = subprocess.run(command, **run_kwargs)

        # If no verification is needed, return immediately
        if not verification:
            return jsonify({
                'status': 'success',
                'output': result.stdout,
                'error': result.stderr,
                'returncode': result.returncode
            })

        # Wait and verify the result. The check runs at least once, even when
        # max_wait_time is zero or negative, and once more at the deadline.
        start_time = time.time()
        while True:
            if _verification_passed(verification):
                return jsonify({
                    'status': 'success',
                    'output': result.stdout,
                    'error': result.stderr,
                    'returncode': result.returncode,
                    'verification': 'passed',
                    'wait_time': time.time() - start_time
                })
            if time.time() - start_time >= max_wait_time:
                break
            time.sleep(check_interval)

        # Verification failed
        return jsonify({
            'status': 'verification_failed',
            'output': result.stdout,
            'error': result.stderr,
            'returncode': result.returncode,
            'verification': 'failed',
            'wait_time': max_wait_time
        }), 500

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
_get_machine_architecture() -> str: """Get the machine architecture, e.g., x86_64, arm64, aarch64, i386, etc. Returns 'amd' for x86/AMD architectures, 'arm' for ARM architectures, or 'unknown'. """ architecture = platform.machine().lower() if architecture in ['amd32', 'amd64', 'x86', 'x86_64', 'x86-64', 'x64', 'i386', 'i686']: return 'amd' elif architecture in ['arm64', 'aarch64', 'aarch32']: return 'arm' else: return 'unknown' @app.route('/setup/launch', methods=["POST"]) def launch_app(): data = request.json shell = data.get("shell", False) command = data.get("command", "" if shell else []) if isinstance(command, str) and not shell: command = shlex.split(command) # Expand user directory if isinstance(command, list): for i, arg in enumerate(command): if arg.startswith("~/"): command[i] = os.path.expanduser(arg) try: # ARM architecture compatibility: replace google-chrome with chromium # ARM64 Chrome is not available yet, can only use Chromium if isinstance(command, list) and 'google-chrome' in command and _get_machine_architecture() == 'arm': index = command.index('google-chrome') command[index] = 'chromium' logger.info("ARM architecture detected: replacing 'google-chrome' with 'chromium'") subprocess.Popen(command, shell=shell) cmd_str = command if shell else " ".join(command) logger.info(f"Application launched successfully: {cmd_str}") return jsonify({ 'status': 'success', 'message': f'{cmd_str} launched successfully' }) except Exception as e: logger.error(f"Application launch failed: {str(e)}") return jsonify({ 'status': 'error', 'message': str(e) }), 500 @app.route("/run_python", methods=['POST']) def run_python(): data = request.json code = data.get('code', None) timeout = data.get('timeout', 30) working_dir = data.get('working_dir', None) env = data.get('env', None) conda_env = data.get('conda_env', None) if not code: return jsonify({'status': 'error', 'message': 'Code not supplied!'}), 400 # Generate unique filename if platform_name == "Windows": 
temp_filename = os.path.join(tempfile.gettempdir(), f"python_exec_{uuid.uuid4().hex}.py") else: temp_filename = f"/tmp/python_exec_{uuid.uuid4().hex}.py" try: with open(temp_filename, 'w') as f: f.write(code) # Prepare environment variables exec_env = os.environ.copy() if env: exec_env.update(env) # If conda_env is specified, try to use bash/cmd to activate and run # If conda is not available, fall back to system Python if conda_env: activation_cmd = get_conda_activation_prefix(conda_env) # Check if conda activation command is empty (conda not found) if not activation_cmd: logger.warning(f"Conda environment '{conda_env}' requested but conda not found. Using system Python.") conda_env = None # Disable conda and use default path if conda_env and get_conda_activation_prefix(conda_env): if platform_name == "Windows": # Windows: use cmd with activation activation_cmd = get_conda_activation_prefix(conda_env) full_cmd = f'{activation_cmd}python "{temp_filename}"' result = subprocess.run( ['cmd', '/c', full_cmd], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout, cwd=working_dir or os.getcwd(), env=exec_env ) else: # Linux/macOS: use bash with activation activation_cmd = get_conda_activation_prefix(conda_env) full_cmd = f'{activation_cmd}python3 "{temp_filename}"' result = subprocess.run( ['/bin/bash', '-c', full_cmd], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout, cwd=working_dir or os.getcwd(), env=exec_env ) else: # No conda activation needed python_cmd = 'python' if platform_name == "Windows" else 'python3' result = subprocess.run( [python_cmd, temp_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=timeout, cwd=working_dir or os.getcwd(), env=exec_env ) os.remove(temp_filename) output = result.stdout + result.stderr return jsonify({ 'status': 'success' if result.returncode == 0 else 'error', 'content': output or "Code executed successfully (no output)", 'returncode': result.returncode }) 
except subprocess.TimeoutExpired: if os.path.exists(temp_filename): os.remove(temp_filename) return jsonify({ 'status': 'error', 'message': f'Execution timeout after {timeout} seconds' }), 408 except Exception as e: if os.path.exists(temp_filename): os.remove(temp_filename) return jsonify({ 'status': 'error', 'message': str(e) }), 500 @app.route("/run_bash_script", methods=['POST']) def run_bash_script(): data = request.json script = data.get('script', None) timeout = data.get('timeout', 30) working_dir = data.get('working_dir', None) env = data.get('env', None) conda_env = data.get('conda_env', None) if not script: return jsonify({'status': 'error', 'message': 'Script not supplied!'}), 400 # Generate unique filename if platform_name == "Windows": temp_filename = os.path.join(tempfile.gettempdir(), f"bash_exec_{uuid.uuid4().hex}.sh") else: temp_filename = f"/tmp/bash_exec_{uuid.uuid4().hex}.sh" try: # Wrap script with conda activation if needed final_script = wrap_script_with_conda(script, conda_env) with open(temp_filename, 'w') as f: f.write(final_script) os.chmod(temp_filename, 0o755) if platform_name == "Windows": shell_cmd = ['bash', temp_filename] else: shell_cmd = ['/bin/bash', temp_filename] # Prepare environment variables exec_env = os.environ.copy() if env: exec_env.update(env) result = subprocess.run( shell_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout, cwd=working_dir or os.getcwd(), env=exec_env ) os.unlink(temp_filename) return jsonify({ 'status': 'success' if result.returncode == 0 else 'error', 'output': result.stdout, 'error': "", 'returncode': result.returncode }) except subprocess.TimeoutExpired: if os.path.exists(temp_filename): os.unlink(temp_filename) return jsonify({ 'status': 'error', 'output': f'Script execution timed out after {timeout} seconds', 'error': "", 'returncode': -1 }), 500 except Exception as e: if os.path.exists(temp_filename): try: os.unlink(temp_filename) except: pass return jsonify({ 
@app.route('/screenshot', methods=['GET'])
def capture_screen_with_cursor():
    """Capture screenshot (including mouse cursor)

    The screenshot helper writes to a uniquely named temporary PNG, which is
    read into memory and always deleted before the response is returned.

    Returns:
        The PNG image on success, or a JSON error with status 500 when the
        capture reports failure or raises.
    """
    tmp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4().hex}.png")
    try:
        if not screenshot_helper.capture(tmp_path, with_cursor=True):
            return jsonify({'status': 'error', 'message': 'Screenshot failed'}), 500

        with open(tmp_path, 'rb') as f:
            buf = BytesIO(f.read())
        return send_file(buf, mimetype='image/png')

    except Exception as e:
        logger.error(f"Screenshot failed: {str(e)}")
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
    finally:
        # Previously the temp file was only removed on the success path, so a
        # failed capture or read error left stray PNGs in the temp directory.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError as e:
            logger.error(f"Cannot delete temporary screenshot: {e}")
@app.route('/file', methods=['POST'])
def file_operation():
    """Handle the ``read`` and ``exists`` file operations.

    Request JSON fields:
        operation: ``'read'`` (default) or ``'exists'``.
        path: target path; ``~`` is expanded. Required.

    Returns:
        JSON with ``content`` (read) or ``exists`` (exists) on success;
        400 for a missing path or unknown operation; 500 on any exception.
    """
    payload = request.json
    operation = payload.get('operation', 'read')
    raw_path = payload.get('path')

    if not raw_path:
        return jsonify({'status': 'error', 'message': 'Path required'}), 400

    path = os.path.expanduser(raw_path)

    try:
        if operation == 'exists':
            return jsonify({
                'status': 'success',
                'exists': os.path.exists(path)
            })

        if operation != 'read':
            return jsonify({
                'status': 'error',
                'message': f'Unknown operation: {operation}'
            }), 400

        with open(path, 'r') as handle:
            text = handle.read()
        return jsonify({
            'status': 'success',
            'content': text
        })

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
@app.route("/setup/close_window", methods=["POST"])
def close_window():
    """Close window

    Request JSON fields:
        window_name: name of the window to close. Required.
        strict: forwarded to the platform adapter as ``strict`` (default False).

    Returns:
        The adapter's result dict (200 when its ``status`` is ``'success'``,
        404 otherwise); 400 when ``window_name`` is missing; 501 when the
        platform adapter cannot close windows; 500 on any exception.
    """
    data = request.json
    window_name = data.get("window_name")
    strict = data.get("strict", False)
    # NOTE(review): requests may also carry "by_class"; it was read into an
    # unused local and never forwarded to the adapter, so the dead read was
    # dropped. Confirm whether the adapter is supposed to receive it.

    if not window_name:
        return jsonify({'status': 'error', 'message': 'window_name required'}), 400

    try:
        if platform_adapter and hasattr(platform_adapter, 'close_window'):
            result = platform_adapter.close_window(window_name, strict=strict)
            if result['status'] == 'success':
                return jsonify(result)
            return jsonify(result), 404

        return jsonify({
            'status': 'error',
            'message': f'Window closing not supported on {platform_name}'
        }), 501
    except Exception as e:
        logger.error(f"Window closing failed: {str(e)}")
        return jsonify({'status': 'error', 'message': str(e)}), 500
@app.route('/start_recording', methods=['POST'])
def start_recording():
    """Begin a screen recording through the platform adapter.

    Returns:
        200 once the adapter reports success (the returned process handle is
        kept in the module-level ``recording_process``); 501 when the adapter
        has no recording support; 400 when a recording is still running;
        500 when the adapter reports failure or raises.
    """
    global recording_process

    # Recording needs an adapter that implements start_recording
    adapter_can_record = platform_adapter and hasattr(platform_adapter, 'start_recording')
    if not adapter_can_record:
        return jsonify({
            'status': 'error',
            'message': f'Recording not supported on {platform_name}'
        }), 501

    # Refuse to start a second recording while one is alive
    if recording_process and recording_process.poll() is None:
        return jsonify({
            'status': 'error',
            'message': 'Recording is already in progress.'
        }), 400

    # Remove a stale file from an earlier run; failure is logged, not fatal
    if os.path.exists(recording_path):
        try:
            os.remove(recording_path)
        except OSError as e:
            logger.error(f"Cannot delete old recording file: {e}")

    try:
        outcome = platform_adapter.start_recording(recording_path)

        if outcome['status'] != 'success':
            logger.error(f"Failed to start recording: {outcome.get('message', 'Unknown error')}")
            return jsonify({
                'status': 'error',
                'message': outcome.get('message', 'Failed to start recording')
            }), 500

        recording_process = outcome.get('process')
        logger.info("Recording started successfully")
        return jsonify({
            'status': 'success',
            'message': 'Recording started'
        })

    except Exception as e:
        logger.error(f"Failed to start recording: {str(e)}")
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
@app.route('/terminal', methods=['GET'])
def get_terminal_output():
    """Return the terminal text obtained from the platform adapter.

    Returns:
        200 with ``output`` when the adapter yields non-empty text; 404 when
        it yields nothing; 501 when the adapter has no terminal support;
        500 on any exception.
    """
    try:
        supported = platform_adapter and hasattr(platform_adapter, 'get_terminal_output')
        if not supported:
            return jsonify({
                'status': 'error',
                'message': f'Terminal output not supported on {platform_name}'
            }), 501

        text = platform_adapter.get_terminal_output()
        if not text:
            return jsonify({
                'status': 'error',
                'message': f'No terminal output available on {platform_name}',
                'platform_note': 'Make sure a terminal window is open and active'
            }), 404

        return jsonify({'output': text, 'status': 'success'})

    except Exception as e:
        logger.error(f"Failed to get terminal output: {str(e)}")
        return jsonify({'status': 'error', 'message': str(e)}), 500
@app.route("/setup/upload", methods=["POST"])
def upload_file():
    """Upload file

    Expects a multipart request with a ``file`` part and an optional ``path``
    form field naming the destination directory (default ``~/Desktop``;
    ``~`` is expanded and the directory is created when missing).

    Returns:
        JSON with the saved ``path`` on success; 400 when no file is supplied
        or its name is empty/invalid; 500 on any exception while saving.
    """
    if 'file' not in request.files:
        return jsonify({'status': 'error', 'message': 'No file provided'}), 400

    file = request.files['file']
    if not file.filename:
        return jsonify({'status': 'error', 'message': 'No file selected'}), 400

    # The filename is supplied by the client. Keep only its final component
    # (for both separator styles) so a name such as "../../x" or "/etc/x"
    # cannot escape the requested target directory.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'status': 'error', 'message': 'Invalid file name'}), 400

    try:
        # Get target path
        target_path = request.form.get('path', os.path.expanduser('~/Desktop'))
        target_path = os.path.expanduser(target_path)

        # Ensure directory exists
        os.makedirs(target_path, exist_ok=True)

        # Save file
        file_path = os.path.join(target_path, safe_name)
        file.save(file_path)

        logger.info(f"File uploaded successfully: {file_path}")
        return jsonify({
            'status': 'success',
            'path': file_path,
            'message': 'File uploaded successfully'
        })

    except Exception as e:
        logger.error(f"File upload failed: {str(e)}")
        return jsonify({'status': 'error', 'message': str(e)}), 500
def run_health_check_async():
    """Run the full health check on a background daemon thread.

    The worker sleeps two seconds, runs every endpoint test through the
    module-level ``health_checker``, prints the results and logs how many
    checks are fully available.
    """
    def _run():
        from anytool.utils.display import colorize

        # Short pause before the first request is sent.
        time.sleep(2)

        banner = colorize("\n - Starting health check...\n", 'c', bold=True)
        print(banner)

        outcome = health_checker.check_all(test_endpoints=True)
        health_checker.print_results(outcome, show_endpoint_details=False)

        stats = health_checker.get_summary()
        logger.info(f"Health check completed: {stats['fully_available']}/{stats['total']} fully available")

    worker = threading.Thread(target=_run, daemon=True)
    worker.start()
def main():
    """
    Command-line entry point: resolve settings and start the server.

    Settings are resolved with the precedence
    explicit CLI flag > ``server`` section of the config file > built-in
    default. Debug mode is on when either the CLI flag or the config file
    enables it. A config file that cannot be read or parsed is logged and
    ignored. Errors raised by the server itself are not caught here, so a
    failed start is never mistaken for a config problem and never triggers a
    second start attempt.
    """
    import argparse
    from anytool.config.utils import get_config_value

    default_host = '127.0.0.1'
    default_port = 5000

    parser = argparse.ArgumentParser(
        description='AnyTool Local Server - Desktop Control Server'
    )
    # default=None distinguishes "flag not given" from "flag given with the
    # default value", so an explicit --host 127.0.0.1 / --port 5000 still
    # overrides the config file.
    parser.add_argument('--host', type=str, default=None, help='Server host (default: 127.0.0.1)')
    parser.add_argument('--port', type=int, default=None, help='Server port (default: 5000)')
    parser.add_argument('--debug', action='store_true', help='Enable debug mode')
    parser.add_argument('--config', type=str, help='Path to config.json file')

    args = parser.parse_args()

    config_path = args.config
    if not config_path:
        config_path = os.path.join(os.path.dirname(__file__), 'config.json')

    file_host, file_port, file_debug = default_host, default_port, False
    if os.path.exists(config_path):
        try:
            with open(config_path, 'r') as f:
                config = json.load(f)

            server_config = get_config_value(config, 'server', {})
            loaded = (
                get_config_value(server_config, 'host', default_host),
                get_config_value(server_config, 'port', default_port),
                get_config_value(server_config, 'debug', False),
            )
        except Exception as e:
            logger.error(f"Failed to load config: {e}")
        else:
            # Only adopt file values once every lookup succeeded, so a
            # partially read config never leaves a mixed state.
            file_host, file_port, file_debug = loaded

    host = args.host if args.host is not None else file_host
    port = args.port if args.port is not None else file_port
    debug = args.debug or file_debug

    # Deliberately outside the try block above: server errors must propagate.
    run_server(host=host, port=port, debug=debug)
def capture_screenshot_with_cursor(self, output_path: str) -> bool:
    """
    Use pyautogui + pyxcursor to capture screenshot (including cursor)

    Drawing the cursor is best-effort: if the cursor image cannot be
    obtained the screenshot is still saved, just without the cursor.

    Args:
        output_path: Output file path

    Returns:
        Whether the screenshot is successful
    """
    try:
        # Use pyautogui to capture screenshot
        screenshot = pyautogui.screenshot()

        # Try to add cursor
        try:
            try:
                # Normal case: this module is part of the package, so the
                # sibling module is importable without touching sys.path.
                from .pyxcursor import Xcursor
            except ImportError:
                # Fallback when loaded outside the package. Add the directory
                # once only; inserting it on every call would grow sys.path
                # without bound.
                import sys
                adapter_dir = os.path.dirname(__file__)
                if adapter_dir not in sys.path:
                    sys.path.insert(0, adapter_dir)
                from pyxcursor import Xcursor

            cursor_obj = Xcursor()
            imgarray = cursor_obj.getCursorImageArrayFast()
            cursor_img = Image.fromarray(imgarray)
            cursor_x, cursor_y = pyautogui.position()
            # The cursor image is also used as the paste mask
            screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img)
            logger.info("Linux screenshot successfully (with cursor)")
        except Exception as e:
            logger.warning(f"Failed to add cursor to screenshot: {e}")
            logger.info("Linux screenshot successfully (without cursor)")

        screenshot.save(output_path)
        return True

    except Exception as e:
        logger.error(f"Linux screenshot failed: {e}")
        return False
def _serialize_atspi_element(
    self,
    element: "Accessible",
    depth: int = 0,
    max_depth: int = 10,
    max_width: int = 50
) -> Optional[Dict[str, Any]]:
    """
    Serialize AT-SPI element to dictionary

    Every optional property (states, attributes, geometry, text, children)
    is read independently, so a failure on one of them only omits that
    property instead of dropping the element.

    Args:
        element: AT-SPI accessible element
        depth: Current depth
        max_depth: Maximum depth
        max_width: Maximum number of children serialized per element

    Returns:
        Serialized dictionary, or None when depth exceeds max_depth or the
        element's role/name cannot be read
    """
    # NOTE: the annotation on `element` is a string on purpose. `Accessible`
    # only exists when pyatspi imported successfully; an unquoted annotation
    # is evaluated when the class body runs and would raise NameError,
    # breaking the import of this module on systems without pyatspi.
    if depth > max_depth:
        return None

    try:
        result = {
            'depth': depth,
            'role': element.getRoleName(),
            'name': element.name,
        }

        # Read the state set once; it feeds both the 'states' list and the
        # visibility check below.
        try:
            state_set = element.getState().get_states()
        except Exception:
            state_set = []

        try:
            result['states'] = [
                StateType._enum_lookup[st].split('_', 1)[1].lower()
                for st in state_set
                if st in StateType._enum_lookup
            ]
        except Exception:
            result['states'] = []

        # Get attributes
        try:
            attributes = element.get_attributes()
            if attributes:
                result['attributes'] = dict(attributes)
        except Exception:
            result['attributes'] = {}

        # Get position and size (only for elements that are showing)
        if state_set and STATE_SHOWING in state_set:
            try:
                component = element.queryComponent()
                bbox = component.getExtents(pyatspi.XY_SCREEN)
                result['position'] = {'x': bbox[0], 'y': bbox[1]}
                result['size'] = {'width': bbox[2], 'height': bbox[3]}
            except Exception:
                # Element does not expose the Component interface
                pass

        # Get text content
        try:
            text_obj = element.queryText()
            text = text_obj.getText(0, text_obj.characterCount)
            if text:
                # Drop object-replacement / replacement characters
                result['text'] = text.replace("\ufffc", "").replace("\ufffd", "")
        except Exception:
            # Element does not expose the Text interface
            pass

        # Recursively get child elements
        result['children'] = []
        try:
            child_count = min(element.childCount, max_width)
            for i in range(child_count):
                try:
                    child = element.getChildAtIndex(i)
                    child_data = self._serialize_atspi_element(
                        child, depth + 1, max_depth, max_width
                    )
                    if child_data:
                        result['children'].append(child_data)
                except Exception as e:
                    logger.debug(f"Cannot serialize child element {i}: {e}")
                    continue
        except Exception as e:
            logger.debug(f"Cannot get child elements: {e}")

        return result

    except Exception as e:
        logger.debug(f"Failed to serialize element (depth={depth}): {e}")
        return None
def _find_terminals(self, element) -> "List[Accessible]":
    """
    Recursively find terminal components

    Walks the subtree rooted at ``element`` depth-first and collects every
    node (the root included) whose role name is "terminal". The walk is
    best-effort: a node that cannot be inspected is skipped and its siblings
    are still visited.

    Args:
        element: AT-SPI accessible element to start from

    Returns:
        Terminal elements in depth-first order (possibly empty)
    """
    # NOTE: the return annotation is a string on purpose. `Accessible` only
    # exists when pyatspi imported successfully; an unquoted annotation is
    # evaluated when the class body runs and would raise NameError on systems
    # without pyatspi.
    terminals = []

    try:
        if element.getRoleName() == "terminal":
            terminals.append(element)
        child_count = element.childCount
    except Exception as e:
        logger.debug(f"Cannot inspect element while searching terminals: {e}")
        return terminals

    for i in range(child_count):
        try:
            child = element.getChildAtIndex(i)
        except Exception as e:
            # One unreachable child must not hide terminals in the others
            logger.debug(f"Cannot get child element {i}: {e}")
            continue
        terminals.extend(self._find_terminals(child))

    return terminals
def start_recording(self, output_path: str) -> Dict[str, Any]:
    """
    Start recording the X11 screen to a video file with ffmpeg (x11grab)

    Args:
        output_path: Path of the video file to write (overwritten if present)

    Returns:
        Result dictionary. On success it contains 'status', 'message' and
        'process' (the running ffmpeg Popen handle, to be passed to
        stop_recording); on failure 'status' and 'message'.
    """
    try:
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True, timeout=5)
        except (subprocess.CalledProcessError, FileNotFoundError):
            return {
                'status': 'error',
                'message': 'ffmpeg not installed. Install with: sudo apt install ffmpeg'
            }

        # Determine the capture size; fall back to 1920x1080 if it cannot
        # be queried.
        screen_width, screen_height = 1920, 1080
        try:
            if LINUX_LIBS_AVAILABLE:
                from Xlib import display as xdisplay
                d = xdisplay.Display()
                try:
                    screen = d.screen()
                    screen_width = screen.width_in_pixels
                    screen_height = screen.height_in_pixels
                finally:
                    # Do not leak one X connection per recording
                    d.close()
            else:
                # use pyautogui as fallback
                size = pyautogui.size()
                screen_width = size.width
                screen_height = size.height
        except Exception:
            screen_width, screen_height = 1920, 1080

        # Record the display this process is attached to; ':0.0' is only a
        # fallback when DISPLAY is not set.
        x_display = os.environ.get('DISPLAY') or ':0.0'

        command = [
            'ffmpeg',
            '-y',
            # stderr is a pipe that nobody reads while recording. Without
            # these flags ffmpeg keeps writing progress statistics to it and
            # blocks once the pipe buffer is full, freezing the recording.
            '-hide_banner',
            '-loglevel', 'error',
            '-nostats',
            '-f', 'x11grab',
            '-draw_mouse', '1',
            '-s', f'{screen_width}x{screen_height}',
            '-i', x_display,
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-r', '30',
            output_path
        ]

        process = subprocess.Popen(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True
        )

        # Give ffmpeg a moment: if it is going to fail on startup
        # (bad display, bad output path, ...) it exits almost immediately.
        import time
        time.sleep(1)

        if process.poll() is not None:
            error_output = process.stderr.read() if process.stderr else "Unknown error"
            return {
                'status': 'error',
                'message': f'Failed to start recording: {error_output}'
            }

        logger.info(f"Linux recording started: {output_path}")
        return {
            'status': 'success',
            'message': 'Recording started',
            'process': process
        }

    except Exception as e:
        logger.error(f"Linux start recording failed: {e}")
        return {
            'status': 'error',
            'message': str(e)
        }
def capture_screenshot_with_cursor(self, output_path: str) -> bool:
    """
    Take a screenshot that includes the mouse cursor via the native
    ``screencapture`` tool

    Args:
        output_path: Output file path

    Returns:
        True when screencapture exited successfully, False otherwise
    """
    # -C: include cursor, -x: no capture sound, -m: main display only
    screencapture_cmd = ["screencapture", "-C", "-x", "-m", output_path]

    try:
        subprocess.run(screencapture_cmd, check=True)
        logger.info(f"macOS screenshot successfully: {output_path}")
    except Exception as e:
        logger.error(f"macOS screenshot failed: {e}")
        return False

    return True
def close_window(self, window_name: str, strict: bool = False) -> Dict[str, Any]:
    """
    Close window or application (macOS uses AppleScript)

    The application is first asked to quit via AppleScript. If it does not
    respond within the timeout, its process is terminated with ``killall``.

    Args:
        window_name: Window name or application name
        strict: Whether to strictly match (accepted for interface
            compatibility; not used by this implementation)

    Returns:
        Result dictionary
    """
    try:
        # The name is embedded in an AppleScript string literal, so backslash
        # and double quote must be escaped; otherwise a name containing a
        # quote breaks the script or injects arbitrary AppleScript.
        escaped_name = window_name.replace('\\', '\\\\').replace('"', '\\"')
        script = f'tell application "{escaped_name}" to quit'

        subprocess.run(['osascript', '-e', script], check=True, timeout=5)
        logger.info(f"macOS window/application closed successfully: {window_name}")
        return {'status': 'success', 'message': 'Window/Application closed'}

    except subprocess.TimeoutExpired:
        # The polite quit hung (e.g. an unsaved-changes dialog). Do not send
        # another quit here: it would block the same way and the kill would
        # never be reached. The name is passed as an argv element, so no
        # shell is involved and no quoting is needed.
        try:
            subprocess.run(['killall', window_name], check=True, timeout=5)
            logger.info(f"macOS application force closed: {window_name}")
            return {'status': 'success', 'message': 'Application force closed'}
        except Exception as e2:
            logger.error(f"macOS force close failed: {e2}")
            return {'status': 'error', 'message': str(e2)}

    except Exception as e:
        logger.error(f"macOS close window failed: {e}")
        return {'status': 'error', 'message': str(e)}
def _serialize_ax_element(
    self,
    element,
    depth: int = 0,
    max_depth: int = 10,
    max_width: int = 30
) -> Optional[Dict[str, Any]]:
    """
    Serialize macOS accessibility element to dictionary

    Each attribute is read independently; an attribute that is missing or
    fails to read is replaced by its fallback value (or omitted for
    position/size) instead of dropping the element.

    Args:
        element: AX element
        depth: Current depth
        max_depth: Maximum depth
        max_width: Maximum number of children serialized per element

    Returns:
        Serialized dictionary, or None when depth exceeds max_depth
    """
    if depth > max_depth:
        return None

    try:
        result = {
            'depth': depth
        }

        # (result key, AX attribute, fallback, optional converter)
        simple_attributes = (
            ('role', 'AXRole', 'unknown', None),
            ('title', 'AXTitle', '', None),
            ('description', 'AXDescription', '', None),
            ('value', 'AXValue', '', str),
            ('enabled', 'AXEnabled', False, None),
            ('focused', 'AXFocused', False, None),
        )
        for key, attr_name, fallback, convert in simple_attributes:
            try:
                if hasattr(element, attr_name):
                    raw = getattr(element, attr_name)
                    result[key] = convert(raw) if convert else raw
                else:
                    result[key] = fallback
            except Exception:
                result[key] = fallback

        # Position and size
        try:
            if hasattr(element, 'AXPosition'):
                pos = element.AXPosition
                result['position'] = {'x': pos.x, 'y': pos.y}
        except Exception:
            pass

        try:
            if hasattr(element, 'AXSize'):
                size = element.AXSize
                result['size'] = {'width': size.width, 'height': size.height}
        except Exception:
            pass

        # Recursively get child elements (at most max_width per element)
        result['children'] = []
        try:
            children = element.AXChildren if hasattr(element, 'AXChildren') else None
            if children:
                for i, child in enumerate(children[:max_width]):
                    try:
                        child_data = self._serialize_ax_element(
                            child, depth + 1, max_depth, max_width
                        )
                        if child_data:
                            result['children'].append(child_data)
                    except Exception as e:
                        logger.debug(f"Cannot serialize child element {i}: {e}")
                        continue
        except Exception as e:
            logger.debug(f"Cannot get child elements: {e}")

        return result

    except Exception as e:
        logger.debug(f"Failed to serialize element (depth={depth}): {e}")
        return None
def get_system_info(self) -> Dict[str, Any]:
    """
    Collect basic macOS system information

    Returns:
        Dictionary with 'platform', 'version' (output of
        ``sw_vers -productVersion``), 'model' (output of
        ``sysctl -n hw.model``) and 'available'; on failure a dictionary
        with 'platform' and 'error'.
    """
    def _stdout_of(argv):
        # Run a command and return its trimmed standard output
        completed = subprocess.run(argv, capture_output=True, text=True)
        return completed.stdout.strip()

    try:
        info = {'platform': 'macOS'}
        # macOS version
        info['version'] = _stdout_of(['sw_vers', '-productVersion'])
        # Hardware model identifier
        info['model'] = _stdout_of(['sysctl', '-n', 'hw.model'])
        info['available'] = self.available
        return info
    except Exception as e:
        logger.error(f"Failed to get system information: {e}")
        return {
            'platform': 'macOS',
            'error': str(e)
        }
def start_recording(self, output_path: str) -> Dict[str, Any]:
    """
    Start recording the screen to a video file with ffmpeg (avfoundation)

    Args:
        output_path: Path of the video file to write (overwritten if present)

    Returns:
        Result dictionary. On success it contains 'status', 'message' and
        'process' (the running ffmpeg Popen handle, to be passed to
        stop_recording); on failure 'status' and 'message'.
    """
    try:
        # Check if libx264 encoder is available
        try:
            result = subprocess.run(
                ['ffmpeg', '-encoders'],
                capture_output=True,
                text=True,
                timeout=5
            )
        except FileNotFoundError:
            return {
                'status': 'error',
                'message': 'ffmpeg not installed. Install with: brew install ffmpeg'
            }
        has_libx264 = 'libx264' in result.stdout

        # Get screen resolution (informational only: it is logged, the
        # capture size itself is left to avfoundation)
        try:
            if MACOS_LIBS_AVAILABLE:
                from AppKit import NSScreen
                screen = NSScreen.mainScreen()
                frame = screen.frame()
                width = int(frame.size.width)
                height = int(frame.size.height)
                logger.info(f"Screen resolution: {width}x{height}")
            else:
                width, height = 1920, 1080
                logger.info(f"Using default resolution: {width}x{height}")
        except Exception:
            width, height = 1920, 1080
            logger.info(f"Using default resolution: {width}x{height}")

        # Detect screen capture device
        screen_dev = self._detect_screen_device()
        logger.info(f"Screen capture device: {screen_dev}")

        # Build ffmpeg command
        command = [
            'ffmpeg',
            '-y',
            # stderr is a pipe that nobody reads while recording. Without
            # these flags ffmpeg keeps writing progress statistics to it and
            # blocks once the pipe buffer is full, freezing the recording.
            # Error messages are still emitted, so the startup diagnostics
            # below keep working.
            '-hide_banner',
            '-loglevel', 'error',
            '-nostats',
            '-f', 'avfoundation',
            '-capture_cursor', '1',
            '-capture_mouse_clicks', '1',
            '-framerate', '30',
            '-i', screen_dev,  # Use detected screen device
        ]

        if has_libx264:
            command.extend(['-c:v', 'libx264', '-pix_fmt', 'yuv420p'])
            logger.info("Using libx264 encoder")
        else:
            command.extend(['-c:v', 'mpeg4'])
            logger.info("Using mpeg4 encoder")

        command.extend(['-r', '30', output_path])

        logger.info(f"Starting recording with command: {' '.join(command)}")

        # stdin stays a pipe so stop_recording can send 'q' to ffmpeg
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True
        )

        import time
        time.sleep(1.5)  # Wait for a longer time to ensure ffmpeg starts

        # Check if process exited early
        if process.poll() is not None:
            err = process.stderr.read() if process.stderr else ""
            logger.error(f"FFmpeg exited early with stderr: {err}")

            if "Operation not permitted" in err or "Screen Recording" in err:
                return {
                    "status": "error",
                    "message": "Screen-recording permission denied. Please grant permission in System Settings → Privacy & Security → Screen Recording."
                }

            # Check if it's a device error
            if "Input/output error" in err or "Invalid argument" in err or "does not exist" in err:
                return {
                    "status": "error",
                    "message": f"Invalid screen capture device. Please ensure screen recording is enabled. Error: {err[:200]}"
                }

            error_output = err or "Unknown error"
            return {
                'status': 'error',
                'message': f'Failed to start recording: {error_output[:300]}'
            }

        logger.info(f"macOS recording started successfully: {output_path}")
        return {
            'status': 'success',
            'message': 'Recording started',
            'process': process
        }

    except Exception as e:
        logger.error(f"macOS start recording failed: {e}")
        return {
            'status': 'error',
            'message': str(e)
        }
def list_windows(self) -> List[Dict[str, Any]]:
    """
    List all windows

    Asks System Events (via AppleScript) for the windows of every visible
    process.

    Returns:
        Window list; each entry has 'app_name' and 'window_title'. Empty
        list if the query fails.
    """
    try:
        # The script returns one line per window in the form
        # "<process name><TAB><window title>". Flattening an AppleScript
        # list to comma-separated text is ambiguous as soon as a name or a
        # title contains a comma, so explicit separators are used instead.
        script = '''
        set fieldSep to character id 9
        set recordSep to character id 10
        set windowLines to {}
        tell application "System Events"
            repeat with theProcess in (every process whose visible is true)
                try
                    set processName to name of theProcess
                    tell theProcess
                        repeat with theWindow in windows
                            try
                                set windowTitle to name of theWindow
                                if windowTitle is missing value then set windowTitle to ""
                                set end of windowLines to processName & fieldSep & windowTitle
                            end try
                        end repeat
                    end tell
                end try
            end repeat
        end tell
        set AppleScript's text item delimiters to recordSep
        return windowLines as text
        '''

        result = subprocess.run(
            ['osascript', '-e', script],
            capture_output=True,
            text=True,
            timeout=10
        )

        windows = []
        if result.returncode == 0 and result.stdout:
            # Strip only the trailing newline: an empty title leaves a
            # trailing TAB that must survive for the split below.
            for line in result.stdout.rstrip('\n').split('\n'):
                app_name, sep, window_title = line.partition('\t')
                if not sep:
                    # Not a "<process>TAB<title>" record (blank output or the
                    # continuation of a multi-line title): skip it
                    continue
                windows.append({
                    'app_name': app_name.strip(),
                    'window_title': window_title.strip()
                })

        return windows

    except Exception as e:
        logger.error(f"List windows failed: {e}")
        return []
"Terminal" if (count of windows) > 0 then try set currentTab to selected tab of front window set terminalOutput to contents of currentTab return terminalOutput on error return "" end try else return "" end if end tell ''' result = subprocess.run( ['osascript', '-e', script], capture_output=True, text=True, timeout=5 ) if result.returncode == 0 and result.stdout: output = result.stdout.strip() if output: return output # Try iTerm2 if Terminal.app failed iterm_script = ''' tell application "iTerm" if (count of windows) > 0 then try tell current session of current window set terminalOutput to contents return terminalOutput end tell on error return "" end try else return "" end if end tell ''' result = subprocess.run( ['osascript', '-e', iterm_script], capture_output=True, text=True, timeout=5 ) if result.returncode == 0 and result.stdout: output = result.stdout.strip() if output: return output return None except Exception as e: logger.error(f"Failed to get terminal output: {e}") return None ================================================ FILE: anytool/local_server/platform_adapters/pyxcursor.py ================================================ import os import ctypes import ctypes.util import numpy as np # A helper function to convert data from Xlib to byte array. import struct, array # Define ctypes version of XFixesCursorImage structure. 
PIXEL_DATA_PTR = ctypes.POINTER(ctypes.c_ulong)
Atom = ctypes.c_ulong


class XFixesCursorImage(ctypes.Structure):
    """
    ctypes mirror of the XFixesCursorImage C structure.

    See /usr/include/X11/extensions/Xfixes.h

    typedef struct {
        short x, y;
        unsigned short width, height;
        unsigned short xhot, yhot;
        unsigned long cursor_serial;
        unsigned long *pixels;
    #if XFIXES_MAJOR >= 2
        Atom atom;          /* Version >= 2 only */
        const char *name;   /* Version >= 2 only */
    #endif
    } XFixesCursorImage;
    """
    _fields_ = [('x', ctypes.c_short),
                ('y', ctypes.c_short),
                ('width', ctypes.c_ushort),
                ('height', ctypes.c_ushort),
                ('xhot', ctypes.c_ushort),
                ('yhot', ctypes.c_ushort),
                ('cursor_serial', ctypes.c_ulong),
                ('pixels', PIXEL_DATA_PTR),
                ('atom', Atom),
                ('name', ctypes.c_char_p)]


class Display(ctypes.Structure):
    """Opaque placeholder for Xlib's Display; only used through pointers."""


class Xcursor:
    """Read the current X11 mouse-cursor image through libXfixes."""

    # Class-level default; __init__ stores the opened display on the instance.
    display = None

    def __init__(self, display=None):
        """
        Load libXfixes/libX11 and open the X display.

        Args:
            display: Display name as bytes or str (e.g. ":0"). When falsy,
                the DISPLAY environment variable is used.

        Raises:
            Exception: if DISPLAY is unset, a library cannot be located,
                or the display cannot be opened.
        """
        if not display:
            try:
                display = os.environ["DISPLAY"]
            except KeyError:
                raise Exception("$DISPLAY not set.")
        if isinstance(display, str):
            # XOpenDisplay is declared with a c_char_p argument below.
            display = display.encode("utf-8")

        xfixes_path = ctypes.util.find_library("Xfixes")
        if not xfixes_path:
            raise Exception("No XFixes library found.")
        self.XFixeslib = ctypes.cdll.LoadLibrary(xfixes_path)

        x11_path = ctypes.util.find_library("X11")
        if not x11_path:
            raise Exception("No X11 library found.")
        self.xlib = ctypes.cdll.LoadLibrary(x11_path)

        # Declare the ctypes signature of XFixesGetCursorImage.
        get_cursor_image = self.XFixeslib.XFixesGetCursorImage
        get_cursor_image.restype = ctypes.POINTER(XFixesCursorImage)
        get_cursor_image.argtypes = [ctypes.POINTER(Display)]
        self.XFixesGetCursorImage = get_cursor_image

        open_display = self.xlib.XOpenDisplay
        open_display.restype = ctypes.POINTER(Display)
        open_display.argtypes = [ctypes.c_char_p]

        if not self.display:
            self.display = open_display(display)
            if not self.display:
                # A NULL display would otherwise be handed to
                # XFixesGetCursorImage later on.
                raise Exception(f"Cannot open X display {display!r}.")

    def argbdata_to_pixdata(self, data, len):
        """
        Convert ``len`` packed ARGB words into a flat R, G, B, A byte array.

        Args:
            data: Indexable sequence (or ctypes pointer) of integers whose
                low 32 bits hold one 0xAARRGGBB pixel each.
            len: Number of pixels to convert.

        Returns:
            ``array.array('b')`` of ``4 * len`` bytes in R, G, B, A order,
            or None when ``data`` is None or ``len`` is below 1.
        """
        if data is None or len < 1:
            return None

        pixels = array.array('b', b'\x00' * 4 * len)
        pack_into = struct.Struct("=BBBB").pack_into
        for index in range(len):
            argb = data[index] & 0xffffffff
            pack_into(
                pixels,
                4 * index,
                (argb >> 16) & 0xff,  # R
                (argb >> 8) & 0xff,   # G
                argb & 0xff,          # B
                (argb >> 24) & 0xff,  # A
            )
        return pixels

    def getCursorImageData(self):
        """
        Fetch the current cursor via XFixesGetCursorImage.

        Returns:
            The XFixesCursorImage structure the returned pointer refers to.

        Raises:
            Exception: if the call returns a NULL pointer.
        """
        cursor_data = self.XFixesGetCursorImage(self.display)
        if not (cursor_data and cursor_data[0]):
            raise Exception("Cannot read XFixesGetCursorImage()")
        # cursor_data is a pointer; index 0 dereferences it.
        return cursor_data[0]

    def getCursorImageArray(self):
        """Return the cursor as a (height, width, 4) uint8 array in RGBA order."""
        data = self.getCursorImageData()
        height, width = data.height, data.width

        bytearr = self.argbdata_to_pixdata(data.pixels, height * width)
        imgarray = np.array(bytearr, dtype=np.uint8)
        imgarray = imgarray.reshape(height, width, 4)
        del bytearr
        return imgarray

    def getCursorImageArrayFast(self):
        """
        Return the cursor as a (height, width, 4) array without a Python loop.

        The pixel buffer is copied wholesale and the first four bytes of
        every ``unsigned long`` are kept, in memory order. Unlike
        getCursorImageArray the channels are not reordered.
        NOTE(review): on a little-endian host that order is B, G, R, A -
        confirm callers expect it.
        """
        data = self.getCursorImageData()
        height, width = data.height, data.width

        # sizeof(unsigned long) is 8 on LP64 platforms but 4 elsewhere.
        word_size = ctypes.sizeof(ctypes.c_ulong)
        buffer_type = ctypes.POINTER(ctypes.c_ulong * (height * width))
        raw = ctypes.cast(data.pixels, buffer_type)[0]
        imgarray = np.array(bytearray(raw))
        imgarray = imgarray.reshape(height, width, word_size)[:, :, (0, 1, 2, 3)]
        del raw
        return imgarray

    def saveImage(self, imgarray, text):
        """Save ``imgarray`` as an image file at path ``text`` using Pillow."""
        from PIL import Image
        img = Image.fromarray(imgarray)
        img.save(text)


if __name__ == "__main__":
    cursor = Xcursor()
    imgarray = cursor.getCursorImageArrayFast()
    cursor.saveImage(imgarray, 'cursor_image.png')
def __init__(self):
    """Record whether the optional Windows libraries could be imported."""
    if not WINDOWS_LIBS_AVAILABLE:
        logger.warning("Windows libraries are not fully installed, some features may not be available")
    self.available = WINDOWS_LIBS_AVAILABLE


def capture_screenshot_with_cursor(self, output_path: str) -> bool:
    """
    Capture a screenshot with ImageGrab and overlay the mouse cursor.

    Drawing the cursor is best-effort: if the Windows libraries are
    missing or the overlay fails, the plain screenshot is still saved.

    Args:
        output_path: Output file path

    Returns:
        True when the image was saved, False on any capture/save error.
    """
    try:
        img = ImageGrab.grab(bbox=None, include_layered_windows=True)

        cursor_drawn = False
        if WINDOWS_LIBS_AVAILABLE:
            try:
                cursor, hotspot = self._get_cursor()
                if cursor is not None:
                    # The cursor position is multiplied by the display
                    # scale factor before the hotspot offset is removed.
                    ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
                    pos_win = win32gui.GetCursorPos()
                    pos = (
                        round(pos_win[0] * ratio - hotspot[0]),
                        round(pos_win[1] * ratio - hotspot[1])
                    )
                    img.paste(cursor, pos, cursor)
                    cursor_drawn = True
            except Exception as e:
                logger.warning(f"Cannot add cursor to screenshot: {e}")

        img.save(output_path)
        if cursor_drawn:
            logger.info("Windows screenshot successfully (with cursor)")
        else:
            logger.info("Windows screenshot successfully (without cursor)")
        return True

    except Exception as e:
        logger.error(f"Windows screenshot failed: {e}")
        return False


def _get_cursor(self) -> tuple:
    """
    Render the current cursor into a 36x36 RGBA image.

    The hotspot is read before any drawing. The cursor handle comes from
    GetCursorInfo and is not owned by this process, so it is deliberately
    NOT passed to DestroyIcon. All GDI objects created here are released
    in ``finally``, including on failure.

    Returns:
        (cursor_image, (hotspot_x, hotspot_y)), or (None, (0, 0)) on error.
    """
    screen_dc = None
    mem_dc = None
    bitmap = None
    try:
        hcursor = win32gui.GetCursorInfo()[1]
        # NOTE(review): GetIconInfo also returns mask/colour bitmaps;
        # confirm the installed pywin32 frees them automatically.
        hotspot = win32gui.GetIconInfo(hcursor)[1:3]

        screen_dc = win32gui.GetDC(0)
        src_dc = win32ui.CreateDCFromHandle(screen_dc)
        bitmap = win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(src_dc, 36, 36)
        mem_dc = src_dc.CreateCompatibleDC()
        mem_dc.SelectObject(bitmap)
        mem_dc.DrawIcon((0, 0), hcursor)

        info = bitmap.GetInfo()
        cursor = Image.frombuffer(
            'RGB',
            (info['bmWidth'], info['bmHeight']),
            bitmap.GetBitmapBits(True),
            'raw', 'BGRX', 0, 1
        ).convert("RGBA")

        # Make black pixels transparent
        pixels = cursor.load()
        width, height = cursor.size
        for y in range(height):
            for x in range(width):
                if pixels[x, y] == (0, 0, 0, 255):
                    pixels[x, y] = (0, 0, 0, 0)

        return (cursor, hotspot)

    except Exception as e:
        logger.debug(f"Failed to get cursor image: {e}")
        return (None, (0, 0))

    finally:
        # Best-effort cleanup: DC first (the bitmap is selected into it),
        # then the bitmap, then the screen DC obtained from GetDC(0).
        cleanup_steps = (
            lambda: mem_dc is not None and mem_dc.DeleteDC(),
            lambda: bitmap is not None and win32gui.DeleteObject(bitmap.GetHandle()),
            lambda: screen_dc is not None and win32gui.ReleaseDC(0, screen_dc),
        )
        for step in cleanup_steps:
            try:
                step()
            except Exception as e:
                logger.debug(f"Cursor GDI cleanup step failed: {e}")


def activate_window(self, window_name: str, strict: bool = False) -> Dict[str, Any]:
    """
    Activate a window found through pygetwindow.

    Args:
        window_name: Window title
        strict: When True the title must equal ``window_name`` exactly;
            otherwise the first window returned by
            ``getWindowsWithTitle`` is used.

    Returns:
        Result dictionary with 'status' ('success'/'error') and 'message'.
    """
    if not WINDOWS_LIBS_AVAILABLE:
        return {'status': 'error', 'message': 'Windows libraries not available'}

    try:
        windows = gw.getWindowsWithTitle(window_name)
        if not windows:
            logger.warning(f"Window not found: {window_name}")
            return {'status': 'error', 'message': f'Window {window_name} not found'}

        if strict:
            window = next((wnd for wnd in windows if wnd.title == window_name), None)
            if not window:
                return {'status': 'error', 'message': f'Window {window_name} not found (strict mode)'}
        else:
            window = windows[0]

        window.activate()
        logger.info(f"Windows window activated successfully: {window_name}")
        return {'status': 'success', 'message': 'Window activated'}

    except Exception as e:
        logger.error(f"Windows window activation failed: {e}")
        return {'status': 'error', 'message': str(e)}
def get_accessibility_tree(self, max_depth: int = 10, max_width: int = 50) -> Dict[str, Any]:
    """
    Get the Windows accessibility tree (using pywinauto's UIA backend).

    Args:
        max_depth: Maximum depth
        max_width: Maximum number of child elements per level

    Returns:
        ``{'tree': ..., 'platform': 'Windows'}`` on success, otherwise
        ``{'error': <message>}``.
    """
    if not WINDOWS_LIBS_AVAILABLE:
        return {'error': 'Windows accessibility libraries not available'}

    try:
        desktop = Desktop(backend="uia")
        tree = self._serialize_uia_element(
            desktop,
            depth=0,
            max_depth=max_depth,
            max_width=max_width,
            visited=set()
        )
        return {
            'tree': tree,
            'platform': 'Windows'
        }

    except Exception as e:
        logger.error(f"Windows get accessibility tree failed: {e}")
        return {'error': str(e)}


def _serialize_uia_element(
    self,
    element,
    depth: int = 0,
    max_depth: int = 10,
    max_width: int = 50,
    visited: Optional[set] = None
) -> Optional[Dict[str, Any]]:
    """
    Serialize a UIA element and its descendants to a dictionary.

    Attribute lookups are individually best-effort: a failing accessor
    falls back to a default or is skipped. Only ``Exception`` subclasses
    are swallowed, so KeyboardInterrupt/SystemExit still propagate.

    Args:
        element: UIA element
        depth: Current depth
        max_depth: Maximum depth (elements deeper than this are dropped)
        max_width: Maximum number of children serialized per element
        visited: Set of elements already serialized (cycle protection)

    Returns:
        Dictionary with 'depth', 'class_name', 'name', optional 'states',
        optional 'position'/'size', and 'children'; None when the element
        is too deep, already visited, or serialization failed.
    """
    if visited is None:
        visited = set()

    if depth > max_depth or element in visited:
        return None
    visited.add(element)

    try:
        result = {'depth': depth}

        # Basic attributes
        try:
            result['class_name'] = element.class_name()
        except Exception:
            result['class_name'] = 'unknown'

        try:
            result['name'] = element.window_text()
        except Exception:
            result['name'] = ''

        # States: only those the element exposes and can report
        states = {}
        for method_name in (
            'is_enabled', 'is_visible', 'is_minimized', 'is_maximized',
            'is_focused', 'is_checked', 'is_selected'
        ):
            if hasattr(element, method_name):
                try:
                    states[method_name] = getattr(element, method_name)()
                except Exception:
                    # This state is not applicable to the element.
                    continue
        if states:
            result['states'] = states

        # Position and size
        try:
            rectangle = element.rectangle()
            result['position'] = {
                'left': rectangle.left,
                'top': rectangle.top
            }
            result['size'] = {
                'width': rectangle.width(),
                'height': rectangle.height()
            }
        except Exception:
            # Geometry is optional; omit both keys when unavailable.
            pass

        # Children, limited to max_width per level
        result['children'] = []
        try:
            children = element.children()
            for i, child in enumerate(children[:max_width]):
                try:
                    child_data = self._serialize_uia_element(
                        child, depth + 1, max_depth, max_width, visited
                    )
                    if child_data:
                        result['children'].append(child_data)
                except Exception as e:
                    logger.debug(f"Cannot serialize child element {i}: {e}")
                    continue
        except Exception as e:
            logger.debug(f"Cannot get child elements: {e}")

        return result

    except Exception as e:
        logger.debug(f"Failed to serialize element (depth={depth}): {e}")
        return None


def list_windows(self) -> List[Dict[str, Any]]:
    """
    List all titled windows known to pygetwindow.

    Returns:
        One dictionary per window with 'title', 'left', 'top', 'width',
        'height', 'visible' and 'active'; empty when the Windows
        libraries are unavailable or enumeration fails.
    """
    if not WINDOWS_LIBS_AVAILABLE:
        return []

    try:
        described = []
        for win in gw.getAllWindows():
            if not win.title:
                # Only return windows with titles
                continue
            described.append({
                'title': win.title,
                'left': win.left,
                'top': win.top,
                'width': win.width,
                'height': win.height,
                'visible': win.visible,
                'active': win.isActive
            })
        return described
    except Exception as e:
        logger.error(f"List windows failed: {e}")
        return []
def get_system_info(self) -> Dict[str, Any]:
    """
    Get Windows system information.

    Returns:
        Dictionary with 'platform', 'version', 'release', 'edition' and
        'available'; on failure 'platform' and 'error' only.
    """
    try:
        import platform as plat

        edition = plat.win32_edition() if hasattr(plat, 'win32_edition') else 'Unknown'
        return {
            'platform': 'Windows',
            'version': plat.version(),
            'release': plat.release(),
            'edition': edition,
            'available': self.available
        }
    except Exception as e:
        logger.error(f"Failed to get system information: {e}")
        return {
            'platform': 'Windows',
            'error': str(e)
        }


def start_recording(self, output_path: str) -> Dict[str, Any]:
    """
    Start an ffmpeg (gdigrab) recording of the desktop.

    ffmpeg is started with a stdin pipe so stop_recording can request a
    clean shutdown with ``q``. Its log output is limited to errors because
    stderr is a pipe that is only read if start-up fails.

    Args:
        output_path: Path of the video file ffmpeg writes.

    Returns:
        ``{'status': 'success', 'message': ..., 'process': Popen}`` or
        ``{'status': 'error', 'message': ...}``.
    """
    import time

    try:
        try:
            subprocess.run(
                ['ffmpeg', '-version'],
                capture_output=True,
                check=True,
                timeout=5,
                creationflags=subprocess.CREATE_NO_WINDOW
            )
        except (subprocess.CalledProcessError, FileNotFoundError):
            return {
                'status': 'error',
                'message': 'ffmpeg not installed. Download from: https://ffmpeg.org/download.html'
            }

        try:
            user32 = ctypes.windll.user32
            width = user32.GetSystemMetrics(0)   # SM_CXSCREEN
            height = user32.GetSystemMetrics(1)  # SM_CYSCREEN
        except Exception:
            width, height = 1920, 1080

        command = [
            'ffmpeg',
            '-y',
            # Keep stderr quiet so the pipe cannot fill up mid-recording.
            '-hide_banner',
            '-loglevel', 'error',
            '-nostats',
            '-f', 'gdigrab',
            '-draw_mouse', '1',
            '-framerate', '30',
            '-video_size', f'{width}x{height}',
            '-i', 'desktop',
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-pix_fmt', 'yuv420p',
            '-r', '30',
            output_path
        ]

        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            creationflags=subprocess.CREATE_NO_WINDOW
        )

        # Give ffmpeg a moment; an early exit means start-up failed.
        time.sleep(1)
        if process.poll() is not None:
            error_output = process.stderr.read() if process.stderr else "Unknown error"
            return {
                'status': 'error',
                'message': f'Failed to start recording: {error_output}'
            }

        logger.info(f"Windows recording started: {output_path}")
        return {
            'status': 'success',
            'message': 'Recording started',
            'process': process
        }

    except Exception as e:
        logger.error(f"Windows start recording failed: {e}")
        return {
            'status': 'error',
            'message': str(e)
        }


def stop_recording(self, process) -> Dict[str, Any]:
    """
    Stop a recording started by start_recording.

    ffmpeg is first asked to quit by writing ``q`` to its stdin. If the
    process has no stdin pipe or the write fails, CTRL_C_EVENT and then
    ``terminate()`` are tried. A process that has not exited after 15 s
    is killed.

    Args:
        process: The Popen object returned under 'process'.

    Returns:
        Result dictionary with 'status' and 'message'.
    """
    import signal

    try:
        if not process or process.poll() is not None:
            return {
                'status': 'error',
                'message': 'No recording in progress'
            }

        quit_requested = False
        if getattr(process, 'stdin', None):
            try:
                process.stdin.write('q')
                process.stdin.flush()
                quit_requested = True
            except Exception as e:
                logger.warning(f"Failed to send 'q': {e}, trying CTRL_C_EVENT")

        if not quit_requested:
            try:
                process.send_signal(signal.CTRL_C_EVENT)
            except Exception:
                process.terminate()

        try:
            process.wait(timeout=15)
        except subprocess.TimeoutExpired:
            logger.warning("ffmpeg did not respond, killing process")
            process.kill()
            process.wait()

        logger.info("Windows recording stopped successfully")
        return {
            'status': 'success',
            'message': 'Recording stopped'
        }

    except Exception as e:
        logger.error(f"Windows stop recording failed: {e}")
        return {
            'status': 'error',
            'message': str(e)
        }


def get_running_applications(self) -> List[Dict[str, str]]:
    """
    Get a de-duplicated list of running applications (via psutil).

    Processes without an executable path, and a few well-known system
    processes, are skipped; only the first process per name is reported.

    Returns:
        List of ``{'name', 'pid', 'path'}`` dictionaries; empty when the
        Windows libraries or psutil are unavailable or on error.
    """
    if not WINDOWS_LIBS_AVAILABLE:
        return []

    try:
        import psutil

        skipped_names = {'System', 'Registry', 'svchost.exe', 'csrss.exe'}
        apps = []
        seen_names = set()

        for proc in psutil.process_iter(['pid', 'name', 'exe']):
            try:
                pinfo = proc.info
                name = pinfo['name']
                exe = pinfo['exe']

                if not exe or name in skipped_names:
                    continue
                if name in seen_names:
                    continue
                seen_names.add(name)

                apps.append({
                    'name': name,
                    'pid': pinfo['pid'],
                    'path': exe or ''
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # The process vanished or is not inspectable; skip it.
                pass

        return apps

    except ImportError:
        logger.warning("psutil not installed, cannot get running applications")
        return []
    except Exception as e:
        logger.error(f"Failed to get running applications list: {e}")
        return []


def get_screen_size(self) -> Dict[str, int]:
    """
    Get the primary screen size from GetSystemMetrics.

    Returns:
        ``{'width': ..., 'height': ...}``; 1920x1080 when the query fails.
    """
    try:
        user32 = ctypes.windll.user32
        return {
            'width': user32.GetSystemMetrics(0),   # SM_CXSCREEN
            'height': user32.GetSystemMetrics(1),  # SM_CYSCREEN
        }
    except Exception as e:
        logger.error(f"Failed to get screen size: {e}")
        return {'width': 1920, 'height': 1080}  # Default value


def get_terminal_output(self) -> Optional[str]:
    """
    Get a substitute for terminal output on Windows.

    Console contents are not read. The last 50 lines of the PSReadLine
    history file are returned when present; otherwise the output of
    ``doskey /history`` is tried.

    Returns:
        A labelled history string, or None when nothing is available.
    """
    try:
        # PowerShell (PSReadLine) history file
        try:
            history_path = os.path.expanduser(
                '~\\AppData\\Roaming\\Microsoft\\Windows\\PowerShell\\PSReadLine\\ConsoleHost_history.txt'
            )
            if os.path.exists(history_path):
                with open(history_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
                recent_history = ''.join(lines[-50:])
                if recent_history:
                    return f"PowerShell History (last 50 commands):\n{recent_history}"
        except Exception as e:
            logger.debug(f"Cannot read PowerShell history: {e}")

        # Command Prompt history via doskey
        try:
            result = subprocess.run(
                ['doskey', '/history'],
                capture_output=True,
                text=True,
                timeout=2,
                creationflags=subprocess.CREATE_NO_WINDOW
            )
            if result.returncode == 0 and result.stdout:
                return f"Command Prompt History:\n{result.stdout}"
        except Exception as e:
            logger.debug(f"Cannot get Command Prompt history: {e}")

        logger.warning("Windows terminal output is limited - only command history available")
        return None

    except Exception as e:
        logger.error(f"Failed to get terminal output: {e}")
        return None
class AccessibilityHelper:
    """Platform-neutral helper around the per-OS accessibility adapters.

    Besides fetching the tree from the adapter, it offers pure-Python
    utilities (search, flatten, filter, statistics) over the returned
    ``{'tree': {...}}`` dictionary.
    """

    def __init__(self):
        """Instantiate the adapter matching the current platform, if importable."""
        self.platform = platform_name
        self.adapter = None

        try:
            if platform_name == "Darwin":
                from ..platform_adapters.macos_adapter import MacOSAdapter
                self.adapter = MacOSAdapter()
            elif platform_name == "Linux":
                from ..platform_adapters.linux_adapter import LinuxAdapter
                self.adapter = LinuxAdapter()
            elif platform_name == "Windows":
                from ..platform_adapters.windows_adapter import WindowsAdapter
                self.adapter = WindowsAdapter()
        except ImportError as e:
            logger.warning(f"Failed to import platform adapter: {e}")

    def get_tree(self, max_depth: int = 10) -> Dict[str, Any]:
        """Return the adapter's accessibility tree, or an error dictionary."""
        if not self.adapter:
            return {
                'error': f'No adapter available for {self.platform}',
                'platform': self.platform
            }

        try:
            return self.adapter.get_accessibility_tree(max_depth=max_depth)
        except Exception as e:
            logger.error(f"Failed to get accessibility tree: {e}")
            return {
                'error': str(e),
                'platform': self.platform
            }

    def is_available(self) -> bool:
        """Return True when an adapter exists and reports itself available."""
        return bool(self.adapter is not None and getattr(self.adapter, 'available', False))

    def find_element_by_name(self, tree: Dict[str, Any], name: str) -> Optional[Dict[str, Any]]:
        """Return the first node (depth-first) whose 'name' equals ``name``."""
        if not tree or 'tree' not in tree:
            return None
        return self._search_tree(tree['tree'], 'name', name)

    def find_element_by_role(self, tree: Dict[str, Any], role: str) -> Optional[Dict[str, Any]]:
        """Return the first node (depth-first) whose 'role' equals ``role``."""
        if not tree or 'tree' not in tree:
            return None
        return self._search_tree(tree['tree'], 'role', role)

    def _search_tree(self, node: Dict[str, Any], key: str, value: str) -> Optional[Dict[str, Any]]:
        """Depth-first search for the first node with ``node[key] == value``."""
        if not node:
            return None

        if key in node and node[key] == value:
            return node

        for child in node.get('children', ()):
            found = self._search_tree(child, key, value)
            if found:
                return found
        return None

    def flatten_tree(self, tree: Dict[str, Any]) -> list:
        """Return every node in pre-order, each without its 'children' key."""
        if not tree or 'tree' not in tree:
            return []

        result = []
        self._flatten_node(tree['tree'], result)
        return result

    def _flatten_node(self, node: Dict[str, Any], result: list):
        """Recursively append ``node`` and its descendants to ``result``."""
        if not node:
            return

        # Copy the node without its children so the output is flat.
        result.append({k: v for k, v in node.items() if k != 'children'})

        for child in node.get('children', ()):
            self._flatten_node(child, result)

    def get_visible_elements(self, tree: Dict[str, Any]) -> list:
        """Return the flattened elements considered visible on this platform.

        Linux: 'showing' is among the element's 'states'. Darwin: 'enabled'
        is truthy. Windows: ``states['is_visible']`` is truthy.
        """
        visible = []
        for element in self.flatten_tree(tree):
            if self.platform == "Linux":
                is_visible = 'showing' in (element.get('states') or [])
            elif self.platform == "Darwin":
                is_visible = element.get('enabled', False)
            elif self.platform == "Windows":
                is_visible = element.get('states', {}).get('is_visible', False)
            else:
                is_visible = False

            if is_visible:
                visible.append(element)
        return visible

    @staticmethod
    def _normalize_role(role: Any) -> str:
        """Lower-case ``role`` and drop every non-alphanumeric character."""
        if role is None:
            return ''
        return ''.join(ch for ch in str(role).lower() if ch.isalnum())

    def get_clickable_elements(self, tree: Dict[str, Any]) -> list:
        """Return the flattened elements whose role names a clickable control.

        Both the element role and the known clickable role names are
        normalized (case and separators removed) before the substring
        comparison, so 'AXMenuItem', 'menu-item' and 'menu item' all match.
        """
        clickable_roles = [
            'button', 'push-button', 'toggle-button', 'radio-button',
            'link', 'menu-item', 'AXButton', 'AXLink', 'AXMenuItem'
        ]
        tokens = {self._normalize_role(name) for name in clickable_roles}

        clickable = []
        for element in self.flatten_tree(tree):
            role = self._normalize_role(element.get('role', ''))
            if any(token in role for token in tokens):
                clickable.append(element)
        return clickable

    def get_statistics(self, tree: Dict[str, Any]) -> Dict[str, Any]:
        """Return element counts (total, visible, clickable, per role)."""
        all_elements = self.flatten_tree(tree)

        roles = {}
        for element in all_elements:
            role = element.get('role', 'unknown')
            roles[role] = roles.get(role, 0) + 1

        return {
            'total_elements': len(all_elements),
            'visible_elements': len(self.get_visible_elements(tree)),
            'clickable_elements': len(self.get_clickable_elements(tree)),
            'roles': roles,
            'platform': self.platform
        }
def capture(self, output_path: str, with_cursor: bool = True) -> bool:
    """
    Capture a full-screen screenshot.

    Args:
        output_path: Output path. A bare filename (no directory part) is
            written relative to the current working directory.
        with_cursor: Use the platform adapter (which draws the cursor)
            when one is available; otherwise fall back to pyautogui.

    Returns:
        Whether successful
    """
    try:
        # os.makedirs('') raises, so only create a directory if one is named.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if with_cursor and self.adapter:
            return self.adapter.capture_screenshot_with_cursor(output_path)

        screenshot = pyautogui.screenshot()
        screenshot.save(output_path)
        logger.info(f"Screenshot successfully (without cursor): {output_path}")
        return True

    except Exception as e:
        logger.error(f"Screenshot failed: {e}")
        return False


def capture_region(
    self,
    output_path: str,
    x: int,
    y: int,
    width: int,
    height: int
) -> bool:
    """
    Capture specified screen region

    Args:
        output_path: Output path
        x: Starting x coordinate
        y: Starting y coordinate
        width: Width
        height: Height

    Returns:
        Whether successful
    """
    try:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        screenshot = pyautogui.screenshot(region=(x, y, width, height))
        screenshot.save(output_path)
        logger.info(f"Region screenshot successfully: {output_path}")
        return True

    except Exception as e:
        logger.error(f"Region screenshot failed: {e}")
        return False


def get_screen_size(self) -> Tuple[int, int]:
    """
    Get screen size

    Returns:
        (width, height); (1920, 1080) when pyautogui fails.
    """
    try:
        size = pyautogui.size()
        return (size.width, size.height)
    except Exception as e:
        logger.error(f"Failed to get screen size: {e}")
        return (1920, 1080)  # Default value


def get_cursor_position(self) -> Tuple[int, int]:
    """
    Get cursor position

    Returns:
        (x, y); (0, 0) when pyautogui fails.
    """
    try:
        pos = pyautogui.position()
        return (pos.x, pos.y)
    except Exception as e:
        logger.error(f"Failed to get cursor position: {e}")
        return (0, 0)


def capture_to_base64(self, with_cursor: bool = True) -> Optional[str]:
    """
    Capture screenshot and convert to base64

    The screenshot is written to a temporary PNG that is always removed,
    including when capturing or reading fails.

    Args:
        with_cursor: Whether to include cursor

    Returns:
        Base64 encoded image string, or None on failure.
    """
    import tempfile
    import base64

    tmp_path = None
    try:
        # Only the name is needed; capture() reopens the path itself.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            tmp_path = tmp.name

        if not self.capture(tmp_path, with_cursor):
            return None

        with open(tmp_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    except Exception as e:
        logger.error(f"Failed to convert screenshot to base64: {e}")
        return None

    finally:
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as e:
                logger.debug(f"Cannot remove temporary screenshot {tmp_path}: {e}")


def compare_screenshots(self, path1: str, path2: str) -> float:
    """
    Compare similarity between two screenshots

    Both images are converted to RGB, the second is resized to the first
    if needed, and the per-pixel RMS difference is normalized by its
    maximum (sqrt(3) * 255).

    Args:
        path1: First image path
        path2: Second image path

    Returns:
        Similarity (0-1), 1 means identical; 0.0 on error.
    """
    try:
        from PIL import ImageChops
        import math

        with Image.open(path1) as first, Image.open(path2) as second:
            img1 = first.convert('RGB')
            img2 = second.convert('RGB')

        if img1.size != img2.size:
            img2 = img2.resize(img1.size)

        # histogram() concatenates the 256 bins of each band, so the
        # intensity of bin i is i % 256, not i.
        histogram = ImageChops.difference(img1, img2).histogram()
        sum_of_squares = sum(
            count * ((index % 256) ** 2)
            for index, count in enumerate(histogram)
        )

        pixel_count = float(img1.size[0] * img1.size[1])
        rms = math.sqrt(sum_of_squares / pixel_count)

        max_rms = math.sqrt(3) * 255
        similarity = 1 - (rms / max_rms)
        return max(0.0, min(1.0, similarity))

    except Exception as e:
        logger.error(f"Failed to compare screenshots: {e}")
        return 0.0
def get_local_server_config() -> Dict[str, Any]:
    """
    Read local server configuration.

    Priority:
    1. Environment variable LOCAL_SERVER_URL (parsed into host/port; a
       value without a scheme such as "localhost:5000" is accepted)
    2. Config file local_server/config.json (its "server" section)
    3. Defaults (127.0.0.1:5000)

    Returns:
        Dict with 'host', 'port' and 'debug'.
    """
    defaults = {
        'host': '127.0.0.1',
        'port': 5000,
        'debug': False,
    }

    # Environment variable first (for OSWorld/remote VM integration)
    env_url = os.getenv("LOCAL_SERVER_URL")
    if env_url:
        try:
            from urllib.parse import urlparse

            candidate = env_url.strip()
            if '://' not in candidate:
                # Without a scheme urlparse treats "host:port" as
                # "scheme:path" and reports no hostname at all.
                candidate = f'http://{candidate}'

            parsed = urlparse(candidate)
            host = parsed.hostname or defaults['host']
            port = parsed.port or defaults['port']

            logger.debug(f"Using LOCAL_SERVER_URL: {host}:{port}")
            return {
                'host': host,
                'port': port,
                'debug': False,
            }
        except Exception as e:
            logger.warning(f"Failed to parse LOCAL_SERVER_URL: {e}")

    # Config file, located relative to this module
    try:
        current_dir = os.path.dirname(__file__)
        config_path = os.path.abspath(
            os.path.join(current_dir, '../local_server/config.json')
        )

        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)

            server_config = config.get('server', {})
            return {
                'host': server_config.get('host', defaults['host']),
                'port': server_config.get('port', defaults['port']),
                'debug': server_config.get('debug', defaults['debug']),
            }
    except Exception as e:
        logger.debug(f"Failed to read local server config: {e}")

    return defaults
Default http://localhost:5000 Returns: Base URL string """ # Check environment variable first env_url = os.getenv("LOCAL_SERVER_URL") if env_url: return env_url # Read from config file config = get_local_server_config() host = config['host'] port = config['port'] # Convert 0.0.0.0 to localhost for client if host == '0.0.0.0': host = 'localhost' return f"http://{host}:{port}" ================================================ FILE: anytool/platform/recording.py ================================================ import aiohttp from typing import Optional from anytool.utils.logging import Logger from .config import get_client_base_url logger = Logger.get_logger(__name__) class RecordingClient: """ Client for screen recording via HTTP API. This client directly calls the local server's recording endpoints: - POST /start_recording - POST /end_recording """ def __init__( self, base_url: Optional[str] = None, timeout: int = 30 ): """ Initialize recording client. Args: base_url: Base URL of the local server (default: read from local_server/config.json or env LOCAL_SERVER_URL) timeout: Request timeout in seconds """ # Get base_url: priority is explicit > env > config file if base_url is None: base_url = get_client_base_url() self.base_url = base_url.rstrip("/") self.timeout = timeout self._session: Optional[aiohttp.ClientSession] = None async def _get_session(self) -> aiohttp.ClientSession: """Get or create aiohttp session.""" if self._session is None or self._session.closed: self._session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=self.timeout) ) return self._session async def start_recording(self, auto_cleanup: bool = True) -> bool: """ Start screen recording. 
Args: auto_cleanup: If True, automatically end previous recording if one is in progress """ try: session = await self._get_session() url = f"{self.base_url}/start_recording" async with session.post(url) as response: if response.status == 200: logger.info("Screen recording started") return True elif response.status == 400 and auto_cleanup: # Check if error is due to recording already in progress error_text = await response.text() if "already in progress" in error_text.lower(): logger.warning("Recording already in progress, stopping previous recording...") # Try to end the previous recording video_bytes = await self.end_recording() if video_bytes: logger.info("Previous recording ended successfully, retrying start...") else: logger.warning("Failed to end previous recording, but will retry start anyway...") # Retry starting recording (without auto_cleanup to avoid infinite loop) return await self.start_recording(auto_cleanup=False) else: logger.error(f"Failed to start recording: HTTP {response.status} - {error_text}") return False else: error_text = await response.text() logger.error(f"Failed to start recording: HTTP {response.status} - {error_text}") return False except Exception as e: logger.error(f"Failed to start recording: {e}") return False async def end_recording(self, dest: Optional[str] = None) -> Optional[bytes]: """ End screen recording and optionally save to file. 
class RecordingContextManager:
    """
    Async context manager that records the screen for the duration of a block.

    Entering the context asks the wrapped RecordingClient to start recording;
    leaving it ends the recording (saving to ``output_path`` when one is set)
    and closes the client's HTTP session.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        output_path: Optional[str] = None,
        timeout: Optional[int] = None
    ):
        """
        Initialize recording context manager.

        Args:
            base_url: Base URL of the local server (default: resolved by RecordingClient)
            output_path: Path to save recording (default: from config, if configured)
            timeout: Request timeout in seconds (default: RecordingClient's own default)
        """
        # Load output_path from config if not provided
        if output_path is None:
            try:
                from anytool.config import get_config
                config = get_config()
                if config.recording.screen_recording_path:
                    output_path = config.recording.screen_recording_path
            except Exception as e:
                # Best effort: a missing/broken config just means "do not save to disk"
                logger.debug(f"Could not load recording output path from config: {e}")

        # Only forward timeout when the caller supplied one. Passing None through
        # would replace RecordingClient's default and end up as
        # aiohttp.ClientTimeout(total=None), i.e. requests without any timeout.
        client_kwargs = {"base_url": base_url}
        if timeout is not None:
            client_kwargs["timeout"] = timeout

        self.client = RecordingClient(**client_kwargs)
        self.output_path = output_path
        self.recording_started = False

    async def __aenter__(self) -> RecordingClient:
        """Start recording on context entry.

        Returns:
            The underlying RecordingClient (returned even if starting failed)
        """
        success = await self.client.start_recording()
        if success:
            self.recording_started = True
            logger.info("Recording context started")
        else:
            logger.warning("Failed to start recording in context")
        return self.client

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Stop recording on context exit and always close the client.

        Returns:
            False, so exceptions raised inside the context propagate
        """
        try:
            if self.recording_started:
                try:
                    await self.client.end_recording(dest=self.output_path)
                    logger.info("Recording context ended")
                except Exception as e:
                    logger.error(f"Failed to end recording in context: {e}")
        finally:
            # Runs even when ending the recording is interrupted by something
            # that is not an Exception subclass (e.g. task cancellation), so the
            # HTTP session is not leaked.
            await self.client.close()
        return False
Args: base_url: Base URL of local_server (default: read from config/env, typically http://127.0.0.1:5000) timeout: Request timeout (seconds) """ # Get base_url from config if not provided if base_url is None: base_url = get_client_base_url() self.base_url = base_url.rstrip("/") self.timeout = timeout self._session = None logger.debug(f"ScreenshotClient initialized: {self.base_url}") async def _get_session(self) -> aiohttp.ClientSession: """Get or create aiohttp session.""" if self._session is None or self._session.closed: self._session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=self.timeout) ) return self._session @staticmethod def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool: """ Validate image response using magic bytes. Args: content_type: HTTP Content-Type header data: Response data bytes Returns: True if data is valid PNG/JPEG image """ if not isinstance(data, (bytes, bytearray)) or not data: return False # PNG magic bytes: \x89PNG\r\n\x1a\n if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n": return True # JPEG magic bytes: \xff\xd8\xff if len(data) >= 3 and data[:3] == b"\xff\xd8\xff": return True # Fallback to content-type check if content_type and ("image/png" in content_type or "image/jpeg" in content_type): return True return False async def capture(self) -> Optional[bytes]: """ Capture screenshot via HTTP API. 
async def capture_to_file(self, output_path: str) -> bool:
    """
    Capture a screenshot and write it to a file.

    Args:
        output_path: Destination file path; its parent directory is created
            if it does not exist

    Returns:
        True if a screenshot was captured and written, False if the capture
        returned nothing or any step raised
    """
    try:
        image_bytes = await self.capture()
        if not image_bytes:
            return False

        import os
        # A bare filename has no directory part; fall back to the current directory
        target_dir = os.path.dirname(output_path) or '.'
        os.makedirs(target_dir, exist_ok=True)

        with open(output_path, 'wb') as out_file:
            out_file.write(image_bytes)

        logger.info(f"Screenshot saved to: {output_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to save screenshot to file: {e}")
        return False
class AutoScreenshotWrapper:
    """
    Wrapper that automatically captures screenshots after backend calls.

    This wrapper can be used to wrap any backend tool/session and
    automatically capture screenshots after each operation.

    Usage:
        # Wrap a backend tool
        wrapped_tool = AutoScreenshotWrapper(
            tool=gui_tool,
            screenshot_client=screenshot_client,
            on_screenshot=lambda screenshot: recorder.record_step(...)
        )

        # Use wrapped tool normally
        result = await wrapped_tool.execute(...)
        # Screenshot is automatically captured and handled
    """

    def __init__(
        self,
        tool,
        screenshot_client: Optional["ScreenshotClient"] = None,
        on_screenshot=None,
        enabled: bool = True
    ):
        """
        Initialize auto-screenshot wrapper.

        Args:
            tool: The tool/session to wrap
            screenshot_client: Screenshot client to use (created if None)
            on_screenshot: Callback function(screenshot_bytes) called after each
                screenshot; may be a plain function or return an awaitable
            enabled: Whether auto-screenshot is enabled
        """
        self._tool = tool
        self._screenshot_client = screenshot_client or ScreenshotClient()
        self._on_screenshot = on_screenshot
        self._enabled = enabled

    def __getattr__(self, name):
        """Delegate attribute access to wrapped tool.

        Raises:
            AttributeError: if the wrapped tool has not been set yet or does
                not provide the attribute
        """
        # __getattr__ only runs when normal lookup fails. Reading self._tool here
        # would re-enter __getattr__ forever on an instance whose __init__ has not
        # run (copy/pickle create such instances), so go through __dict__ instead.
        try:
            tool = self.__dict__["_tool"]
        except KeyError:
            raise AttributeError(name) from None
        return getattr(tool, name)

    async def _capture_and_notify(self):
        """Capture screenshot and notify callback.

        Failures are logged as warnings and never propagate to the caller.
        """
        import inspect

        if not self._enabled:
            return

        try:
            screenshot = await self._screenshot_client.capture()
        except Exception as e:
            logger.warning(f"Failed to auto-capture screenshot: {e}")
            return

        if not screenshot or not self._on_screenshot:
            return

        try:
            outcome = self._on_screenshot(screenshot)
            # Support both plain callbacks and ones returning a coroutine/awaitable;
            # unconditionally awaiting a plain callback's None raised TypeError.
            if inspect.isawaitable(outcome):
                await outcome
        except Exception as e:
            logger.warning(f"Screenshot callback failed: {e}")

    async def execute(self, *args, **kwargs):
        """
        Execute tool and auto-capture screenshot.

        Returns:
            Whatever the wrapped tool's execute() returns
        """
        # Execute original method
        result = await self._tool.execute(*args, **kwargs)

        # Capture screenshot after execution
        await self._capture_and_notify()

        return result

    async def _arun(self, *args, **kwargs):
        """
        Run tool and auto-capture screenshot.

        Returns:
            Whatever the wrapped tool's _arun() returns
        """
        # Execute original method
        result = await self._tool._arun(*args, **kwargs)

        # Capture screenshot after execution
        await self._capture_and_notify()

        return result

    def enable(self):
        """Enable auto-screenshot."""
        self._enabled = True

    def disable(self):
        """Disable auto-screenshot."""
        self._enabled = False
async def get_system_info(self, use_cache: bool = True) -> Optional[Dict[str, Any]]:
    """
    Fetch system information from the server's /platform endpoint.

    The result is the JSON payload returned by the server. Per the existing
    documentation it is expected to describe the platform (system, release,
    version, machine, processor and platform-specific extras); only the
    'system' field is read here, for logging.

    Args:
        use_cache: When True (default), return a previously cached non-empty
            result if there is one, and cache a freshly fetched result

    Returns:
        The decoded JSON payload, or None on a non-200 response or any error
    """
    cached = self._cached_info
    if use_cache and cached:
        logger.debug("Using cached system info")
        return cached

    try:
        http = await self._get_session()
        endpoint = f"{self.base_url}/platform"

        async with http.get(endpoint) as response:
            # Handle the failure case first so the success path reads straight through
            if response.status != 200:
                error_text = await response.text()
                logger.error(f"Failed to get system info: HTTP {response.status} - {error_text}")
                return None

            platform_info = await response.json()
            if use_cache:
                self._cached_info = platform_info

            logger.debug(f"System info retrieved: {platform_info.get('system')}")
            return platform_info

    except Exception as e:
        logger.error(f"Failed to get system info: {e}")
        return None
async def get_system_info(base_url: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    One-shot convenience wrapper around SystemInfoClient.get_system_info.

    Opens a temporary client, fetches the system information without using
    the client's cache, and closes the client again.

    Args:
        base_url: Base URL of the local server (None lets the client resolve it)

    Returns:
        Whatever the client's get_system_info(use_cache=False) returns
    """
    client = SystemInfoClient(base_url=base_url)
    async with client:
        info = await client.get_system_info(use_cache=False)
        return info
SystemInfoClient(base_url=base_url) as client: return await client.get_screen_size() ================================================ FILE: anytool/prompts/__init__.py ================================================ from anytool.prompts.grounding_agent_prompts import GroundingAgentPrompts __all__ = ["GroundingAgentPrompts"] ================================================ FILE: anytool/prompts/grounding_agent_prompts.py ================================================ from typing import List class GroundingAgentPrompts: TASK_COMPLETE = "" SYSTEM_PROMPT = f"""You are a Grounding Agent. Execute tasks using tools. # Tool Execution - Select appropriate tools from descriptions and schemas - Provide correct parameters - Call multiple tools if needed - Tools execute immediately, results appear in next iteration - If you need results to decide next action, wait for next iteration # Tool Selection Tips - **MCP tools** and **Shell tools** are typically faster and more accurate when applicable - **GUI tools** offer finer-grained control and can handle tasks not covered by MCP/shell tools - Choose based on the task requirements and tool availability; prefer MCP/shell when they fit well # Visual Analysis Control GUI tools auto-analyze screenshots to extract information. To skip analysis when NOT needed, add parameter: ```json {{"task_description": "...", "skip_visual_analysis": true}} ``` **Decision Rule:** - Task goal is OPERATIONAL (open/navigate/click/show): Skip analysis - Task goal requires KNOWLEDGE EXTRACTION (read/extract/save data): Keep analysis **Examples:** - "Open settings page": Operational only, skip analysis - "Open settings and record all values": Needs knowledge, keep analysis - "Navigate to GitHub homepage": Operational only, skip analysis - "Search Python tutorials and save top 5 titles": Needs knowledge, keep analysis **Key principle:** If you need to extract information FROM the screen for subsequent steps or user reporting, keep analysis (don't skip). 
**Note:** Only GUI tools support this parameter. Other backend tools ignore it. # Task Completion After each iteration, evaluate if the task is complete: **If task is COMPLETE:** - Write a response summarizing what was accomplished - Include the completion token `{TASK_COMPLETE}` on a new line at the end of your response - Example response format: ``` I have successfully completed the task. The file has been created at /path/to/file.txt with the requested content. {TASK_COMPLETE} ``` **If task is NOT complete:** - Continue by calling the appropriate tools - Do NOT output `{TASK_COMPLETE}` - Tool results will appear in the next iteration The token `{TASK_COMPLETE}` signals that no further iterations are needed.""" @staticmethod def iteration_summary( instruction: str, iteration: int, max_iterations: int ) -> str: """ Build iteration summary prompt for LLMClient auto-summary. LLM extracts information directly from tool results in conversation history. """ return f"""Based on the original task and the tool execution results in the conversation above, generate a structured iteration summary. 
**Original Task:** {instruction} **Progress:** Iteration {iteration} of {max_iterations} **Generate Summary in This Format:** ## Iteration {iteration} Progress Actions taken: Knowledge obtained (COMPLETE and SPECIFIC): - File locations: - Visual content: - Data retrieved: - URLs/Links: - System state: Errors encountered: CRITICAL GUIDELINES: - This summary is for preserving knowledge for subsequent iterations - Extract ALL concrete information from tool outputs in the conversation above - Filenames, paths, URLs - use exact values from tool outputs - Visual content - extract actual text/data visible, not just "saw something" - Search results - include specific data, not vague descriptions - The next iteration cannot see current tool outputs - this summary is the ONLY source of knowledge""" @staticmethod def visual_analysis( tool_name: str, num_screenshots: int, task_description: str = "" ) -> str: """ Build prompt for visual analysis of screenshots. Args: tool_name: Tool name that generated the screenshots num_screenshots: Number of screenshots task_description: Original task description for context """ screenshot_text = "screenshot" if num_screenshots == 1 else f"{num_screenshots} screenshots" these_text = "this screenshot" if num_screenshots == 1 else "these screenshots" task_context = f""" **Original Task**: {task_description} Focus on extracting information RELEVANT to this task. Prioritize content that helps accomplish the goal. """ if task_description else "" return f"""Extract the KNOWLEDGE and INFORMATION from {these_text}. This will be passed to the next iteration so it can continue working with the information (search, analyze, save, etc.). Without this extraction, the visual content would only be viewable by humans and unusable for subsequent operations. {task_context} **EXTRACT all visible knowledge content** (prioritize task-relevant information): 1. **Text content**: Articles, documentation, code, messages, descriptions - extract the actual text 2. 
**Data points**: Numbers, statistics, measurements, values, percentages - be specific 3. **List items**: Names, titles, entries in lists/search results/files - list them out 4. **Structured data**: Information from tables, charts, forms - describe what they contain 5. **Key information**: URLs, paths, names, IDs, dates, labels - anything useful for next steps **IGNORE interface elements**: - Buttons, menus, toolbars, navigation bars - UI design, layout, colors, styling - Non-informational visual elements **Goal**: Extract usable knowledge that enables the next agent to work with this information programmatically. Be SPECIFIC and COMPLETE, but FOCUS on what's relevant to the task. {screenshot_text.capitalize()} from tool '{tool_name}'""" @staticmethod def final_summary( instruction: str, iterations: int ) -> str: """ Build prompt for generating final summary across all iterations. """ return f"""Based on the complete conversation history above (including all {iterations} iteration summaries and tool executions), generate a comprehensive final summary. ## Final Task Summary Task: {instruction} What was accomplished: Key information obtained: - Files: - Data: - Findings: Issues encountered: Result: <"Success" or "Incomplete"> Guidelines: - Consolidate information from ALL iteration summaries - Include concrete deliverables (file paths, data, etc.) - Be comprehensive but concise - Focus on what the user cares about""" @staticmethod def workspace_directory(workspace_dir: str) -> str: """ Build workspace directory information for cross-iteration/cross-backend data sharing. 
""" # Check if this is a benchmark scenario (LiveMCPBench /root mapping) # In benchmark mode, paths in query are already converted by caller (e.g., map_path_to_local) is_benchmark = "/root" in workspace_dir or "LiveMCPBench/root" in workspace_dir if is_benchmark: # Benchmark mode: all task files are in workspace directory return f"""**Working Directory**: `{workspace_dir}` - All task files (input/output) are located in this directory - Read from and write to this directory for all file operations""" else: # Normal mode: workspace is for intermediate results return f"""**Working Directory**: `{workspace_dir}` - Persist intermediate results here; later iterations/backends can read what you saved earlier - Note: User's personal files are NOT here - search in ~/Desktop, ~/Documents, ~/Downloads, etc.""" @staticmethod def workspace_matching_files(matching_files: List[str]) -> str: """ Build alert for files matching task requirements. """ files_str = ', '.join([f"`{f}`" for f in matching_files]) return f"""**Workspace Alert**: Files matching task requirements found: {files_str} - Read these files to verify if they satisfy the task - If satisfied, mark task as completed - If not satisfied, modify or recreate as needed""" @staticmethod def workspace_recent_files(total_files: int, recent_files: List[str]) -> str: """ Build info for recently modified files. """ recent_list = ', '.join([f"`{f}`" for f in recent_files[:15]]) return f"""**Workspace Info**: {total_files} files exist, {len(recent_files)} recently modified Recent files: {recent_list} Consider checking recent files before creating new ones""" @staticmethod def workspace_file_list(files: List[str]) -> str: """ Build list of all existing files. 
""" files_list = ', '.join([f"`{f}`" for f in files[:15]]) if len(files) > 15: files_list += f" (and {len(files) - 15} more)" return f"**Workspace Info**: {len(files)} existing file(s): {files_list}" @staticmethod def iteration_feedback( iteration: int, llm_summary: str, add_guidance: bool = True ) -> str: """ Build feedback message to pass iteration summary to next iteration. """ content = f"""## Iteration {iteration} Summary {llm_summary}""" if add_guidance: content += f""" --- Now continue with iteration {iteration + 1}. You can see the full conversation history above. Based on all progress so far, decide whether to: - Call more tools if the task is not yet complete - Output {GroundingAgentPrompts.TASK_COMPLETE} if the task is fully accomplished""" return content ================================================ FILE: anytool/recording/__init__.py ================================================ """ RecordingManager ├── internal management of platform.RecordingClient ├── internal management of platform.ScreenshotClient ├── internal management of TrajectoryRecorder └── internal management of ActionRecorder """ # Auto-record the tool execution from .manager import RecordingManager # Low-level components (advanced users) from .recorder import TrajectoryRecorder from .action_recorder import ActionRecorder # Utility functions from .utils import ( load_trajectory_from_jsonl, load_metadata, format_trajectory_for_export, analyze_trajectory, load_recording_session, filter_trajectory, extract_errors, generate_summary_report, ) from .action_recorder import ( load_agent_actions, analyze_agent_actions, format_agent_actions, ) __all__ = [ # Manager 'RecordingManager', # Recorders 'TrajectoryRecorder', 'ActionRecorder', # Trajectory utils 'load_trajectory_from_jsonl', 'load_metadata', 'format_trajectory_for_export', 'analyze_trajectory', 'load_recording_session', 'filter_trajectory', 'extract_errors', 'generate_summary_report', # Agent action utils 'load_agent_actions', 
class ActionRecorder:
    """
    Records agent actions and decision-making processes.

    This recorder captures the 'thinking' layer of the agent:
    - Task planning and decomposition
    - Tool selection reasoning
    - Evaluation decisions

    Each recorded action is appended as one JSON object per line to
    ``agent_actions.jsonl`` inside the trajectory directory.
    """

    def __init__(self, trajectory_dir: Path):
        """
        Initialize action recorder.

        Args:
            trajectory_dir: Directory to save action records. Path-like values
                such as plain strings are accepted and converted to Path.
        """
        # Coerce so that a str directory does not fail on the "/" operator below
        self.trajectory_dir = Path(trajectory_dir)
        self.actions_file = self.trajectory_dir / "agent_actions.jsonl"
        self.step_counter = 0

        # Ensure directory exists
        self.trajectory_dir.mkdir(parents=True, exist_ok=True)

    async def record_action(
        self,
        agent_name: str,
        action_type: str,
        input_data: Optional[Dict[str, Any]] = None,
        reasoning: Optional[Dict[str, Any]] = None,
        output_data: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        related_tool_steps: Optional[list] = None,
        correlation_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Record an agent action.

        Args:
            agent_name: Name of the agent performing the action
            action_type: Type of action (plan | execute | evaluate | monitor)
            input_data: Input data the agent received (simplified)
            reasoning: Agent's reasoning process (structured)
            output_data: Agent's output/decision (structured)
            metadata: Additional metadata (LLM model, tokens, duration, etc.)
            related_tool_steps: List of tool execution step numbers related to this action
            correlation_id: Optional correlation ID to link related events

        Returns:
            The action record that was appended to the JSONL file
        """
        self.step_counter += 1
        timestamp = datetime.datetime.now().isoformat()

        # Infer agent type from agent name
        agent_type = self._infer_agent_type(agent_name)

        action_info = {
            "step": self.step_counter,
            "timestamp": timestamp,
            "agent_name": agent_name,
            "agent_type": agent_type,
            "action_type": action_type,
            "correlation_id": correlation_id or f"action_{self.step_counter}_{timestamp}",
        }

        # Optional sections are only written when non-empty. Input and output
        # strings are capped at 1000 characters, reasoning strings at 2000.
        if input_data:
            action_info["input"] = self._truncate_data(input_data, max_length=1000)

        if reasoning:
            action_info["reasoning"] = self._truncate_data(reasoning, max_length=2000)

        if output_data:
            action_info["output"] = self._truncate_data(output_data, max_length=1000)

        # Metadata is stored as given (not truncated)
        if metadata:
            action_info["metadata"] = metadata

        # Add related tool steps for correlation
        if related_tool_steps:
            action_info["related_tool_steps"] = related_tool_steps

        # Append to JSONL file
        await self._append_to_file(action_info)

        logger.debug(
            f"Recorded {action_type} action from {agent_name} (step {self.step_counter})"
        )

        return action_info

    def _infer_agent_type(self, agent_name: str) -> str:
        """Map an agent name to a coarse type via case-insensitive keyword match.

        Keywords are tried in order (host, grounding, eval, coordinator); the
        first one contained in the name wins, otherwise "unknown" is returned.
        """
        name_lower = agent_name.lower()
        for keyword in ("host", "grounding", "eval", "coordinator"):
            if keyword in name_lower:
                return keyword
        return "unknown"

    def _truncate_data(self, data: Any, max_length: int) -> Any:
        """Return a copy of data with every over-long string shortened.

        Strings longer than max_length are cut to max_length characters and
        suffixed with "... [truncated]". Dicts and lists are rebuilt
        recursively; any other value is returned unchanged.
        """
        if isinstance(data, str):
            if len(data) > max_length:
                return data[:max_length] + "... [truncated]"
            return data
        if isinstance(data, dict):
            return {
                key: self._truncate_data(value, max_length)
                for key, value in data.items()
            }
        if isinstance(data, list):
            return [self._truncate_data(item, max_length) for item in data]
        return data

    async def _append_to_file(self, action_info: Dict[str, Any]):
        """Append action to JSONL file."""
        # Serialize first and emit the record and its newline in one write
        line = json.dumps(action_info, ensure_ascii=False) + "\n"
        with open(self.actions_file, "a", encoding="utf-8") as f:
            f.write(line)

    def get_step_count(self) -> int:
        """Get current step count."""
        return self.step_counter
def format_agent_actions(actions: list, format_type: str = "compact") -> str:
    """
    Format agent actions for display.

    Args:
        actions: Action records (dicts) as produced by the action recorder
        format_type: "compact" for one line per action, "detailed" for a
            multi-line section per action with JSON-dumped reasoning, output
            and metadata

    Returns:
        The formatted text, or "No agent actions recorded" when actions is empty

    Raises:
        ValueError: if actions is non-empty and format_type is not recognized
    """
    if not actions:
        return "No agent actions recorded"

    if format_type == "compact":
        lines = []
        for action in actions:
            step = action.get("step", "?")
            agent = action.get("agent_name", "?")
            action_type = action.get("action_type", "?")

            # Try to extract key info from reasoning. Records are loaded from
            # arbitrary JSON, so reasoning is not guaranteed to be a dict.
            key_info = ""
            reasoning = action.get("reasoning")
            thought = reasoning.get("thought", "") if isinstance(reasoning, dict) else ""
            if thought:
                thought_text = str(thought)
                # Only mark the preview with an ellipsis when text was actually cut
                ellipsis = "..." if len(thought_text) > 60 else ""
                key_info = f": {thought_text[:60]}{ellipsis}"

            lines.append(f"Step {step}: [{agent}] {action_type}{key_info}")

        return "\n".join(lines)

    elif format_type == "detailed":
        lines = []
        for action in actions:
            lines.append(f"\n{'='*60}")
            lines.append(f"Step {action.get('step', '?')}: {action.get('agent_name', '?')}")
            lines.append(f"Type: {action.get('action_type', '?')}")
            lines.append(f"Time: {action.get('timestamp', '?')}")

            # Each optional section is emitted only when present and non-empty
            for key, heading in (
                ("reasoning", "\nReasoning:"),
                ("output", "\nOutput:"),
                ("metadata", "\nMetadata:"),
            ):
                if action.get(key):
                    lines.append(heading)
                    lines.append(json.dumps(action[key], indent=2, ensure_ascii=False))

        return "\n".join(lines)

    else:
        raise ValueError(f"Unknown format type: {format_type}")
import types from typing import Any, Dict, List, Optional from pathlib import Path from anytool.utils.logging import Logger from .recorder import TrajectoryRecorder from .action_recorder import ActionRecorder logger = Logger.get_logger(__name__) class RecordingManager: # Global instance management (singleton pattern) _global_instance: Optional['RecordingManager'] = None def __init__( self, enabled: bool = True, task_id: str = "", log_dir: str = "./logs/recordings", backends: Optional[List[str]] = None, enable_screenshot: bool = True, enable_video: bool = False, enable_conversation_log: bool = True, auto_save_interval: int = 10, server_url: Optional[str] = None, agent_name: str = "GroundingAgent", ): """ Initialize automatic recording manager Args: enabled: whether to enable recording task_id: task ID (for naming recording directory) log_dir: log directory path backends: list of backends to record (None = all) (optional: "mcp", "gui", "shell", "system", "web") enable_screenshot: whether to enable screenshot (through platform.ScreenshotClient) enable_video: whether to enable video recording (through platform.RecordingClient) enable_conversation_log: whether to save LLM conversations to conversations.jsonl (default: True) auto_save_interval: automatic save interval (steps) server_url: local server address (None = read from config/environment variables) agent_name: name of the agent performing the recording (default: "GroundingAgent") """ self.enabled = enabled self.task_id = task_id self.log_dir = log_dir self.backends = set(backends) if backends else {"mcp", "gui", "shell", "system", "web"} self.enable_screenshot = enable_screenshot self.enable_video = enable_video self.enable_conversation_log = enable_conversation_log self.auto_save_interval = auto_save_interval self.server_url = server_url self.agent_name = agent_name # internal state self._recorder: Optional[TrajectoryRecorder] = None self._action_recorder: Optional[ActionRecorder] = None self._is_started = False 
self._step_counter = 0 # registered LLM clients (for automatic recording) self._registered_llm_clients = [] # Store original methods for restoration self._original_methods = {} # video/screenshot clients (internal management) self._recording_client = None self._screenshot_client = None # Register as global instance RecordingManager._global_instance = self @classmethod def is_recording(cls) -> bool: """ Check if there is an active recording session Returns: bool: True if recording is active """ return cls._global_instance is not None and cls._global_instance._is_started @classmethod async def record_retrieved_tools( cls, task_instruction: str, tools: List[Any], search_debug_info: Optional[Dict[str, Any]] = None, ): """ Record the tools retrieved for a task Args: task_instruction: The task instruction used for retrieval tools: List of retrieved tools search_debug_info: Debug info from search (similarity scores, LLM selections) """ instance = cls._global_instance if not instance or not instance._is_started or not instance._recorder: return # Extract tool info tool_info = [] for tool in tools: info = { "name": getattr(tool, "name", str(tool)), } if hasattr(tool, "backend_type"): info["backend"] = tool.backend_type.value if hasattr(tool.backend_type, "value") else str(tool.backend_type) if hasattr(tool, "_runtime_info") and tool._runtime_info: info["server_name"] = tool._runtime_info.server_name tool_info.append(info) # Build metadata metadata = { "instruction": task_instruction[:500], # Truncate long instructions "count": len(tools), "tools": tool_info, } # Add search debug info if available if search_debug_info: metadata["search_debug"] = { "search_mode": search_debug_info.get("search_mode", ""), "total_candidates": search_debug_info.get("total_candidates", 0), "mcp_count": search_debug_info.get("mcp_count", 0), "non_mcp_count": search_debug_info.get("non_mcp_count", 0), "llm_filter": search_debug_info.get("llm_filter", {}), "tool_scores": 
search_debug_info.get("tool_scores", []), } # Save to metadata await instance._recorder.add_metadata("retrieved_tools", metadata) logger.info(f"Recorded {len(tools)} retrieved tools (with search debug info: {search_debug_info is not None})") @classmethod async def record_iteration_context( cls, iteration: int, messages_input: List[Dict[str, Any]], messages_output: List[Dict[str, Any]], llm_response_summary: Dict[str, Any], max_content_length: int = 5000, ): """ Record a single iteration's LLM conversation to conversations.jsonl (real-time). Args: iteration: Iteration number messages_input: Messages sent to LLM messages_output: Messages after LLM response llm_response_summary: Summary of LLM response max_content_length: Max length for message content truncation """ instance = cls._global_instance if not instance or not instance._is_started or not instance._recorder: return # Check if conversation recording is enabled if not getattr(instance, 'enable_conversation_log', True): return def truncate_message_content(messages: List[Dict]) -> List[Dict]: """Truncate message content to avoid huge log files.""" result = [] for msg in messages: new_msg = {"role": msg.get("role", "unknown")} content = msg.get("content", "") if isinstance(content, str): if len(content) > max_content_length: new_msg["content"] = content[:max_content_length] + f"... [truncated, total {len(content)} chars]" else: new_msg["content"] = content elif isinstance(content, list): # Handle multi-part content (e.g., with images) new_content = [] for item in content: if isinstance(item, dict): if item.get("type") == "image": new_content.append({"type": "image", "note": "[image data omitted]"}) elif item.get("type") == "text": text = item.get("text", "") if len(text) > max_content_length: new_content.append({ "type": "text", "text": text[:max_content_length] + f"... 
[truncated, total {len(text)} chars]" }) else: new_content.append(item) else: new_content.append(item) else: new_content.append(item) new_msg["content"] = new_content else: new_msg["content"] = str(content)[:max_content_length] if "tool_calls" in msg: new_msg["tool_calls"] = msg["tool_calls"] result.append(new_msg) return result # Build record import datetime record = { "iteration": iteration, "timestamp": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), "llm_response_summary": llm_response_summary, "messages_input": truncate_message_content(messages_input), "messages_output": truncate_message_content(messages_output), } # Append to conversations.jsonl (real-time) conv_file = instance._recorder.trajectory_dir / "conversations.jsonl" try: with open(conv_file, "a", encoding="utf-8") as f: f.write(json.dumps(record, ensure_ascii=False)) f.write("\n") except Exception as e: logger.debug(f"Failed to write conversation log: {e}") @classmethod async def record_tool_execution( cls, tool_name: str, backend: str, parameters: Dict[str, Any], result: Any, server_name: Optional[str] = None, is_success: bool = True, metadata: Optional[Dict[str, Any]] = None, ): """ Record tool execution (internal method, called by BaseTool automatically) Args: tool_name: Name of the tool backend: Backend type (gui, shell, mcp, etc.) parameters: Tool parameters result: Tool execution result (content or error message) server_name: Server name for MCP backend is_success: Whether the tool execution was successful (default: True for backward compatibility) metadata: Tool result metadata (e.g. 
intermediate_steps for GUI) """ if not cls._global_instance or not cls._global_instance._is_started: return instance = cls._global_instance # Check if should record this backend if backend not in instance.backends: return # Create mock tool_call and result objects for compatibility with existing _record_* methods class MockFunctionCall: def __init__(self, name, arguments): self.name = name self.arguments = arguments class MockToolCall: def __init__(self, name, arguments): self.function = MockFunctionCall(name, arguments) class MockResult: def __init__(self, content, is_success=True, metadata=None): self.content = content self.is_success = is_success self.is_error = not is_success self.error = content if not is_success else None self.metadata = metadata or {} tool_call = MockToolCall(tool_name, parameters) mock_result = MockResult(result, is_success=is_success, metadata=metadata) try: if backend == "mcp": server = server_name or "unknown" await instance._record_mcp(tool_call, mock_result, server) elif backend == "gui": await instance._record_gui(tool_call, mock_result) elif backend == "shell": await instance._record_shell(tool_call, mock_result) elif backend == "system": await instance._record_system(tool_call, mock_result) elif backend == "web": await instance._record_web(tool_call, mock_result) instance._step_counter += 1 except Exception as e: logger.debug(f"Failed to record tool execution: {e}") @staticmethod def _parse_arguments(arg_data): """Safely parse tool_call.function.arguments which may be JSON string. Handles: 1. Proper JSON strings with true/false/null 2. Python literal strings (produced by OpenAI) using ast.literal_eval 3. 
Already-dict objects (returned by SDK) """ if not isinstance(arg_data, str): return arg_data or {} # First, try JSON try: return json.loads(arg_data) except json.JSONDecodeError: pass # Fallback to Python literal try: return ast.literal_eval(arg_data) except Exception: logger.debug("Failed to parse arguments, returning raw string") return {"raw": arg_data} async def start(self, task_id: Optional[str] = None): """Start automatic recording Args: task_id: If provided, override the current task_id for this recording session. This allows external callers (e.g. Coordinator) to specify a meaningful task identifier without having to recreate the RecordingManager instance. """ # Allow dynamic update of task_id before recording actually starts if task_id: self.task_id = task_id if not self.enabled or self._is_started: return try: # check server availability (only when video or screenshot is enabled) if self.enable_video or self.enable_screenshot: await self._check_server_availability() self._recorder = TrajectoryRecorder( task_name=self.task_id, log_dir=self.log_dir, enable_screenshot=self.enable_screenshot, enable_video=self.enable_video, server_url=self.server_url, ) # create action recorder for agent decision tracking self._action_recorder = ActionRecorder( trajectory_dir=Path(self._recorder.get_trajectory_dir()) ) # create video client (internal management) if self.enable_video: from anytool.platform import RecordingClient self._recording_client = RecordingClient(base_url=self.server_url) success = await self._recording_client.start_recording() if success: logger.info("Video recording started") else: logger.warning("Video recording failed to start") # create screenshot client (internal management) if self.enable_screenshot: from anytool.platform import ScreenshotClient self._screenshot_client = ScreenshotClient(base_url=self.server_url) logger.debug("Screenshot client ready") # save initial metadata await self._recorder.add_metadata("task_id", self.task_id) await 
self._recorder.add_metadata("backends", list(self.backends)) await self._recorder.add_metadata("start_time", datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")) # Capture and save initial screenshot if enabled if self.enable_screenshot and self._screenshot_client: try: init_shot = await self._screenshot_client.capture() if init_shot: await self._recorder.save_init_screenshot(init_shot) logger.debug("Initial screenshot saved") except Exception as e: logger.debug(f"Failed to capture initial screenshot: {e}") self._is_started = True logger.info(f"Recording started: {self._recorder.get_trajectory_dir()}") except Exception as e: logger.error(f"Recording failed to start: {e}") raise async def _check_server_availability(self): """Check if local server is available""" try: from anytool.platform import SystemInfoClient # Use context manager to ensure aiohttp session is closed, avoiding warning of unclosed session async with SystemInfoClient(base_url=self.server_url) as client: info = await client.get_system_info() if info: logger.info(f"Server connected ({info.get('platform', 'unknown')})") else: logger.warning("Server not responding, video/screenshot functionality unavailable") except Exception: logger.warning("Cannot connect to server, video/screenshot functionality unavailable") async def stop(self): """Stop automatic recording""" if not self.enabled or not self._is_started: return try: # stop video recording and save if self._recording_client: try: video_path = None if self._recorder: video_path = str(Path(self._recorder.get_trajectory_dir()) / "screen_recording.mp4") video_bytes = await self._recording_client.end_recording(dest=video_path) if video_bytes and video_path: video_size_mb = len(video_bytes) / (1024 * 1024) logger.info(f"Video recording saved: {video_path} ({video_size_mb:.2f} MB)") except Exception as e: logger.warning(f"Video recording failed to save: {e}") # close RecordingClient session, avoid unclosed session warning try: if self._recording_client: 
await self._recording_client.close() except Exception as e: logger.debug(f"Failed to close RecordingClient session: {e}") # close screenshot client if self._screenshot_client: try: await self._screenshot_client.close() except Exception as e: logger.debug(f"Screenshot client failed to close: {e}") finally: self._screenshot_client = None # finalize trajectory recording if self._recorder: # save final metadata await self._recorder.add_metadata("end_time", datetime.datetime.now().isoformat()) await self._recorder.add_metadata("total_steps", self._step_counter) # generate summary await self.generate_summary() # finalize recording await self._recorder.finalize() logger.info(f"Recording completed: {self._recorder.get_trajectory_dir()}") # Restore original methods for registered LLM clients for client in self._registered_llm_clients: client_id = id(client) if client_id in self._original_methods: try: # Restore original complete method original_method = self._original_methods[client_id] client.complete = original_method except Exception as e: logger.debug(f"Failed to restore original method for LLM client: {e}") # Clear registered clients and original methods self._registered_llm_clients.clear() self._original_methods.clear() self._is_started = False self._recorder = None self._action_recorder = None except Exception as e: logger.error(f"Recording failed to stop: {e}") def register_to_llm(self, llm_client): if not self.enabled: return # Check if already registered to avoid double-wrapping if id(llm_client) in self._original_methods: logger.warning(f"LLM client {llm_client} is already registered, skipping") return # Save original complete method for restoration original_complete = llm_client.complete self._original_methods[id(llm_client)] = original_complete # Wrap complete method async def wrapped_complete(self_client, *args, **kwargs): # Call original method response = await original_complete(*args, **kwargs) # Automatically record tool calls if 
response.get("tool_results"): await self._auto_record_tool_results(response["tool_results"]) return response # Replace method with properly bound method llm_client.complete = types.MethodType(wrapped_complete, llm_client) self._registered_llm_clients.append(llm_client) async def _auto_record_tool_results(self, tool_results: List[Dict]): """ Internal method: automatically record tool execution results from LLM client This is called by register_to_llm() wrapper. Tool results should contain backend and server_name information. """ if not self._recorder or not self._is_started: return for tool_result in tool_results: # Get necessary information from tool_result tool_call = tool_result.get("tool_call") result = tool_result.get("result") backend = tool_result.get("backend") server_name = tool_result.get("server_name") if not tool_call or not result: logger.warning("Tool result missing 'tool_call' or 'result', skipping") continue if not backend: logger.warning( f"Tool result missing 'backend' field, skipping recording. " f"Tool: {tool_call.function.name}. " f"Ensure your LLM client provides backend information in tool_results." ) continue # Extract metadata for embedding intermediate_steps (GUI) result_metadata = result.metadata if hasattr(result, 'metadata') else None await RecordingManager.record_tool_execution( tool_name=tool_call.function.name, backend=backend, parameters=self._parse_arguments(tool_call.function.arguments), result=result.content if hasattr(result, 'content') else str(result), server_name=server_name, is_success=result.is_success if hasattr(result, 'is_success') else True, metadata=result_metadata, ) async def _record_mcp(self, tool_call, result, server: str): tool_name = tool_call.function.name parameters = self._parse_arguments(tool_call.function.arguments) command = f"{server}.{tool_name}" result_str = str(result.content) if result.is_success else str(result.error) result_brief = result_str[:200] + "..." 
if len(result_str) > 200 else result_str is_actual_success = result.is_success and not result_str.startswith("ERROR:") step_info = await self._recorder.record_step( backend="mcp", tool=tool_name, command=command, result={ "status": "success" if is_actual_success else "error", "output": result_brief, }, parameters=parameters, extra={ "server": server, }, auto_screenshot=self.enable_screenshot ) # Add agent_name to step_info step_info["agent_name"] = self.agent_name async def _record_gui(self, tool_call, result): tool_name = tool_call.function.name parameters = self._parse_arguments(tool_call.function.arguments) # Extract actual pyautogui command (from action_history) command = "gui_agent" if result.is_success and hasattr(result, 'metadata') and result.metadata: action_history = result.metadata.get("action_history", []) if action_history: # Get last successful execution action for action in reversed(action_history): planned_action = action.get("planned_action", {}) execution_result = action.get("execution_result", {}) if planned_action.get("action_type") == "PYAUTOGUI_COMMAND": cmd = planned_action.get("command", "") if cmd and execution_result.get("status") == "success": command = cmd break elif execution_result.get("status") == "success": action_type = planned_action.get("action_type", "") if action_type and action_type not in ["WAIT", "DONE", "FAIL"]: params = planned_action.get("parameters", {}) if params: param_str = ", ".join([f"{k}={v}" for k, v in list(params.items())[:2]]) command = f"{action_type}({param_str})" else: command = action_type break result_str = str(result.content) if result.is_success else str(result.error) is_actual_success = result.is_success if result.is_success: first_200_chars = result_str[:200] if result_str else "" critical_failure_patterns = ["Task failed", "CRITICAL ERROR:", "FATAL:"] has_critical_failure = any(pattern in first_200_chars for pattern in critical_failure_patterns) is_actual_success = not has_critical_failure # Extract 
intermediate_steps from metadata for embedding in traj.jsonl extra = {} if hasattr(result, 'metadata') and result.metadata: intermediate_steps = result.metadata.get("intermediate_steps") if intermediate_steps: extra["intermediate_steps"] = intermediate_steps step_info = await self._recorder.record_step( backend="gui", tool="gui_agent", command=command, result={ "status": "success" if is_actual_success else "error", "output": result_str, }, parameters=parameters, auto_screenshot=self.enable_screenshot, extra=extra if extra else None, ) step_info["agent_name"] = self.agent_name async def _record_shell(self, tool_call, result): tool_name = tool_call.function.name parameters = self._parse_arguments(tool_call.function.arguments) task = parameters.get("task", tool_name) exit_code = 0 if result.is_success else 1 stdout = str(result.content) if result.is_success else "" stderr = str(result.error) if result.is_error else "" command = task if hasattr(result, 'metadata') and result.metadata: code_history = result.metadata.get("code_history", []) if code_history: # Try to find the last successful execution found_success = False for code_info in reversed(code_history): if code_info.get("status") == "success": lang = code_info.get("lang", "bash") code = code_info.get("code", "") # String format code block: ```lang\ncode\n``` command = f"```{lang}\n{code}\n```" found_success = True break # If no successful execution found, use last code block if not found_success and code_history: last_code = code_history[-1] lang = last_code.get("lang", "bash") code = last_code.get("code", "") command = f"```{lang}\n{code}\n```" stdout_brief = stdout[:200] + "..." if len(stdout) > 200 else stdout stderr_brief = stderr[:200] + "..." 
if len(stderr) > 200 else stderr is_actual_success = result.is_success if result.is_success: first_200_chars = stdout[:200] if stdout else "" critical_failure_patterns = ["Task failed after", "[TASK_FAILED:"] has_critical_failure = any(pattern in first_200_chars for pattern in critical_failure_patterns) is_actual_success = not has_critical_failure step_info = await self._recorder.record_step( backend="shell", tool="shell_agent", command=command, result={ "status": "success" if is_actual_success else "error", "exit_code": exit_code, "stdout": stdout_brief, "stderr": stderr_brief, }, auto_screenshot=self.enable_screenshot ) step_info["agent_name"] = self.agent_name async def _record_system(self, tool_call, result): tool_name = tool_call.function.name parameters = self._parse_arguments(tool_call.function.arguments) command = tool_name if parameters: key_params = [] for key in ['path', 'file', 'directory', 'name', 'provider', 'backend']: if key in parameters and parameters[key]: key_params.append(f"{parameters[key]}") if key_params: command = f"{tool_name}({', '.join(key_params[:2])})" result_str = str(result.content) if result.is_success else str(result.error) result_brief = result_str[:200] + "..." 
if len(result_str) > 200 else result_str is_actual_success = result.is_success if result.is_success and result_str: is_actual_success = not result_str.startswith("ERROR:") step_info = await self._recorder.record_step( backend="system", tool=tool_name, command=command, result={ "status": "success" if is_actual_success else "error", "output": result_brief, }, auto_screenshot=self.enable_screenshot ) step_info["agent_name"] = self.agent_name async def _record_web(self, tool_call, result): tool_name = tool_call.function.name parameters = self._parse_arguments(tool_call.function.arguments) query = parameters.get("query", "") command = query if query else "deep_research" result_str = str(result.content) if result.is_success else str(result.error) is_actual_success = result.is_success if result.is_success and result_str: is_actual_success = not result_str.startswith("ERROR:") step_info = await self._recorder.record_step( backend="web", tool="deep_research_agent", command=command, result={ "status": "success" if is_actual_success else "error", "output": result_str, # Full output preserved for training/replay }, auto_screenshot=self.enable_screenshot ) # Add agent_name to step_info step_info["agent_name"] = self.agent_name async def add_metadata(self, key: str, value: Any): if self._recorder: await self._recorder.add_metadata(key, value) async def save_plan(self, plan: Dict[str, Any], agent_name: str = "GroundingAgent"): """ Save agent plan to recording directory. This integrates planning information with execution trajectory. 
async def log_decision(
    self,
    agent_name: str,
    decision: str,
    context: Optional[Dict[str, Any]] = None
):
    """
    Append a timestamped agent decision to ``decisions.log``.

    The entry is one line of the form ``[timestamp] agent: decision``,
    followed by a JSON-encoded context line when ``context`` is given.
    Nothing is written (a warning is logged) if recording has not started;
    write failures are logged and swallowed.

    Args:
        agent_name: Name of the agent making the decision
        decision: Description of the decision
        context: Additional context information
    """
    if not (self._recorder and self._is_started):
        logger.warning("Cannot log decision: recording not started")
        return

    try:
        target = Path(self._recorder.get_trajectory_dir()) / "decisions.log"

        stamp = datetime.datetime.now().isoformat()
        pieces = [f"[{stamp}] {agent_name}: {decision}"]
        if context:
            pieces.append(f"\n Context: {json.dumps(context, ensure_ascii=False)}")
        pieces.append("\n")

        with open(target, 'a', encoding='utf-8') as f:
            f.write("".join(pieces))

        logger.debug(f"Logged decision from {agent_name}")
    except Exception as e:
        logger.error(f"Failed to log decision: {e}")
async def generate_summary(self) -> Dict[str, Any]:
    """
    Build ``summary.json`` for the current recording session.

    Reads back ``traj.jsonl`` and the recorded agent actions from the
    trajectory directory, analyses both, and writes the combined statistics
    next to them.

    Returns:
        The summary dict, or ``{}`` when recording has not started or any
        step of the generation fails (the failure is logged).
    """
    if not (self._recorder and self._is_started):
        logger.warning("Cannot generate summary: recording not started")
        return {}

    try:
        from .action_recorder import load_agent_actions, analyze_agent_actions
        from .utils import load_trajectory_from_jsonl, analyze_trajectory

        traj_dir = self._recorder.get_trajectory_dir()

        # Load all recorded data, then analyse it
        trajectory = load_trajectory_from_jsonl(f"{traj_dir}/traj.jsonl")
        agent_actions = load_agent_actions(traj_dir)
        traj_stats = analyze_trajectory(trajectory)
        action_stats = analyze_agent_actions(agent_actions)

        recorded_meta = self._recorder.metadata

        # (summary key, statistics key, default) for the trajectory section
        trajectory_fields = (
            ("total_steps", "total_steps", 0),
            ("success_count", "success_count", 0),
            ("success_rate", "success_rate", 0),
            ("by_backend", "backends", {}),
            ("by_tool", "tools", {}),
        )
        trajectory_section = {
            out_key: traj_stats.get(stat_key, fallback)
            for out_key, stat_key, fallback in trajectory_fields
        }

        actions_section = {
            "total_actions": action_stats.get("total_actions", 0),
            "by_agent": action_stats.get("by_agent", {}),
            "by_type": action_stats.get("by_type", {}),
        }

        summary = {
            "task_id": self.task_id,
            "start_time": recorded_meta.get("start_time", ""),
            "end_time": recorded_meta.get("end_time", ""),
            "trajectory": trajectory_section,
            "agent_actions": actions_section,
        }

        # Save summary to file
        summary_file = Path(traj_dir) / "summary.json"
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        logger.info(f"Generated summary: {summary_file}")
        return summary

    except Exception as e:
        logger.error(f"Failed to generate summary: {e}")
        return {}
if task_name is provided, otherwise use timestamp only if task_name: folder_name = f"{task_name}_{timestamp}" else: folder_name = timestamp self.trajectory_dir = Path(log_dir) / folder_name self.trajectory_dir.mkdir(parents=True, exist_ok=True) # Create screenshots directory if enable_screenshot: self.screenshots_dir = self.trajectory_dir / "screenshots" self.screenshots_dir.mkdir(exist_ok=True) else: self.screenshots_dir = None # Config self.task_name = task_name self.enable_screenshot = enable_screenshot self.enable_video = enable_video self.server_url = server_url # Trajectory data self.steps: List[Dict] = [] self.step_counter = 0 # Metadata self.metadata = { "task_name": task_name, "start_time": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), "enable_screenshot": enable_screenshot, "enable_video": enable_video, } # Video recorder (lazy initialization) self._video_recorder = None # Save initial metadata self._save_metadata() async def record_step( self, backend: str, tool: str, command: str, result: Optional[Dict[str, Any]] = None, parameters: Optional[Dict[str, Any]] = None, screenshot: Optional[bytes] = None, extra: Optional[Dict[str, Any]] = None, auto_screenshot: bool = False, ) -> Dict[str, Any]: """ Record one step operation Args: backend: backend type (gui/shell/mcp/web/system) tool: tool name (name of BaseTool) command: human-readable core command result: execution result parameters: tool parameters screenshot: screenshot bytes (if provided) extra: extra information (e.g. 
server field for MCP) auto_screenshot: whether to automatically capture screenshot (through platform.ScreenshotClient) """ self.step_counter += 1 step_num = self.step_counter timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") step_info = { "step": step_num, "timestamp": timestamp, "backend": backend, } # MCP needs to record server (between backend and tool) if extra and "server" in extra: step_info["server"] = extra.pop("server") # General fields step_info["tool"] = tool # BaseTool name step_info["command"] = command # human-readable core command # parameters unified write to top level if parameters: step_info["parameters"] = parameters elif extra and "parameters" in extra: step_info["parameters"] = extra.pop("parameters") # Execution result remains original step_info["result"] = result or {} # Other extra information (e.g. coordinates/url) only added when needed if extra: step_info.update(extra) # Automatic screenshot (if enabled and no screenshot provided) if auto_screenshot and screenshot is None and self.enable_screenshot: screenshot = await self._capture_screenshot() # Save screenshot if screenshot and self.enable_screenshot and self.screenshots_dir: screenshot_filename = f"step_{step_num:03d}.png" screenshot_path = self.screenshots_dir / screenshot_filename with open(screenshot_path, "wb") as f: f.write(screenshot) step_info["screenshot"] = f"screenshots/{screenshot_filename}" # Add to trajectory self.steps.append(step_info) # Save to traj.jsonl in real time await self._append_to_traj_file(step_info) return step_info async def _capture_screenshot(self) -> Optional[bytes]: """Capture screenshot automatically through platform.ScreenshotClient""" try: from anytool.platform import ScreenshotClient # Lazy initialization screenshot client if not hasattr(self, '_screenshot_client'): try: self._screenshot_client = ScreenshotClient(base_url=self.server_url) except Exception: self._screenshot_client = None return None if self._screenshot_client is None: 
return None return await self._screenshot_client.capture() except Exception: return None async def save_init_screenshot(self, screenshot: bytes, filename: str = "init.png"): """Save initial screenshot to screenshots dir and update metadata.""" if not (self.enable_screenshot and self.screenshots_dir and screenshot): return try: filepath = self.screenshots_dir / filename with open(filepath, "wb") as f: f.write(screenshot) # Update metadata self.metadata["init_screenshot"] = f"screenshots/{filename}" self._save_metadata() except Exception as e: logger.debug(f"Failed to save initial screenshot: {e}") async def _append_to_traj_file(self, step_info: Dict[str, Any]): """Add step to traj.jsonl file""" traj_file = self.trajectory_dir / "traj.jsonl" with open(traj_file, "a", encoding="utf-8") as f: f.write(json.dumps(step_info, ensure_ascii=False)) f.write("\n") def _save_metadata(self): """Save metadata to metadata.json""" metadata_file = self.trajectory_dir / "metadata.json" with open(metadata_file, "w", encoding="utf-8") as f: json.dump(self.metadata, f, indent=2, ensure_ascii=False) async def start_video_recording(self): """Start video recording (through platform.RecordingClient)""" if not self.enable_video: return try: from anytool.recording.video import VideoRecorder video_path = self.trajectory_dir / "recording.mp4" self._video_recorder = VideoRecorder(str(video_path), base_url=self.server_url) success = await self._video_recorder.start() if not success: self._video_recorder = None except Exception as e: logger.warning(f"Video recording failed to start: {e}") self._video_recorder = None async def stop_video_recording(self): """Stop video recording""" if self._video_recorder: try: await self._video_recorder.stop() except Exception: pass finally: self._video_recorder = None async def add_metadata(self, key: str, value: Any): """Add metadata""" self.metadata[key] = value self._save_metadata() async def finalize(self): """Finalize recording, save final information""" 
self.metadata["end_time"] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") self.metadata["total_steps"] = self.step_counter # Backend statistics backend_counts = {} for step in self.steps: backend = step.get("backend", "unknown") backend_counts[backend] = backend_counts.get(backend, 0) + 1 self.metadata["backend_counts"] = backend_counts self._save_metadata() # Close internal ScreenshotClient, avoid unclosed session warning await self._cleanup_screenshot_client() # Stop video recording await self.stop_video_recording() logger.info(f"Recording completed: {self.trajectory_dir} (steps: {self.step_counter})") async def _cleanup_screenshot_client(self): """Cleanup screenshot client resources""" if hasattr(self, '_screenshot_client') and self._screenshot_client: try: await self._screenshot_client.close() except Exception as e: logger.debug(f"Failed to close screenshot client: {e}") finally: self._screenshot_client = None def __del__(self): """Ensure resources are cleaned up even if finalize() is not called""" # Note: This is a safety net. Best practice is to call finalize() explicitly. if hasattr(self, '_video_recorder') and self._video_recorder: logger.warning( f"TrajectoryRecorder for {self.trajectory_dir} was not finalized properly. " "Consider calling finalize() or using async context manager." ) def get_trajectory_dir(self) -> str: """Get trajectory directory path""" return str(self.trajectory_dir) async def __aenter__(self): """Async context manager entry""" return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit - ensures finalize() is called""" await self.finalize() return False async def record_gui_step( recorder: TrajectoryRecorder, command: str, task_description: str, result: Dict[str, Any] = None, screenshot: Optional[bytes] = None, max_steps: int = 10, tool: str = "gui_agent", ) -> Dict[str, Any]: """ Record GUI step Args: recorder: recorder instance command: actual executed pyautogui command (e.g. 
"pyautogui.moveTo(960, 540)") task_description: task description result: execution result screenshot: screenshot max_steps: maximum number of steps tool: tool name """ parameters = { "task_description": task_description, "max_steps": max_steps, } return await recorder.record_step( backend="gui", tool=tool, command=command, result=result, parameters=parameters, screenshot=screenshot, ) async def record_shell_step( recorder: TrajectoryRecorder, command: str, exit_code: int, stdout: Optional[str] = None, stderr: Optional[str] = None, screenshot: Optional[bytes] = None, tool: str = "shell_agent", ) -> Dict[str, Any]: """ Record Shell step Args: recorder: recorder instance command: command executed exit_code: exit code stdout: standard output (simplified version, not saved completely) stderr: standard error (simplified version) screenshot: screenshot tool: tool name """ stdout_brief = stdout[:200] + "..." if stdout and len(stdout) > 200 else stdout stderr_brief = stderr[:200] + "..." if stderr and len(stderr) > 200 else stderr result = { "status": "success" if exit_code == 0 else "error", "exit_code": exit_code, "stdout": stdout_brief, "stderr": stderr_brief, } return await recorder.record_step( backend="shell", tool=tool, command=command, result=result, screenshot=screenshot, ) async def record_mcp_step( recorder: TrajectoryRecorder, server: str, tool_name: str, parameters: Dict[str, Any], result: Any, screenshot: Optional[bytes] = None, ) -> Dict[str, Any]: """ Record MCP step Args: recorder: recorder instance server: MCP server name tool_name: tool name parameters: tool parameters result: execution result screenshot: screenshot """ command = f"{server}.{tool_name}" result_str = str(result) result_brief = result_str[:200] + "..." 
if len(result_str) > 200 else result_str return await recorder.record_step( backend="mcp", tool=tool_name, command=command, result={"status": "success", "output": result_brief}, parameters=parameters, screenshot=screenshot, extra={ "server": server, } ) async def record_web_step( recorder: TrajectoryRecorder, query: str, result: Dict[str, Any], screenshot: Optional[bytes] = None, tool: str = "deep_research_agent", ) -> Dict[str, Any]: """ Record Web step (deep research) Args: recorder: recorder instance query: search query result: execution result screenshot: screenshot tool: tool name """ command = query # directly use query as command return await recorder.record_step( backend="web", tool=tool, command=command, result=result, screenshot=screenshot, ) ================================================ FILE: anytool/recording/utils.py ================================================ import json import os from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) def load_trajectory_from_jsonl(jsonl_path: str) -> List[Dict[str, Any]]: trajectory = [] # Check if file exists first if not os.path.exists(jsonl_path): logger.debug(f"No trajectory file found at {jsonl_path} (this is normal for knowledge-only tasks)") return [] try: with open(jsonl_path, "r", encoding="utf-8") as f: for line in f: line = line.strip() if line: step = json.loads(line) trajectory.append(step) logger.info(f"Loaded {len(trajectory)} steps from {jsonl_path}") return trajectory except Exception as e: logger.error(f"Failed to load trajectory from {jsonl_path}: {e}") return [] def load_metadata(trajectory_dir: str) -> Optional[Dict[str, Any]]: metadata_path = os.path.join(trajectory_dir, "metadata.json") try: with open(metadata_path, "r", encoding="utf-8") as f: metadata = json.load(f) return metadata except Exception as e: logger.warning(f"Failed to load metadata from {metadata_path}: {e}") return None 
def format_trajectory_for_export( trajectory: List[Dict[str, Any]], format_type: str = "compact" ) -> str: if format_type == "compact": return _format_compact(trajectory) elif format_type == "detailed": return _format_detailed(trajectory) elif format_type == "markdown": return _format_markdown(trajectory) else: raise ValueError(f"Unknown format type: {format_type}") def _format_compact(trajectory: List[Dict[str, Any]]) -> str: """Compact format: one line per step.""" lines = [] for step in trajectory: step_num = step.get("step", "?") backend = step.get("backend", "?") server = step.get("server") tool = step.get("tool", "?") result_status = "success" if step.get("result", {}).get("status") == "success" else "error" # Include server name for MCP backend backend_str = f"{backend}@{server}" if server else backend lines.append(f"Step {step_num}: [{backend_str}] {tool} -> {result_status}") return "\n".join(lines) def _format_detailed(trajectory: List[Dict[str, Any]]) -> str: """Detailed format: multiple lines per step with parameters.""" lines = [] for step in trajectory: step_num = step.get("step", "?") timestamp = step.get("timestamp", "?") backend = step.get("backend", "?") server = step.get("server") tool = step.get("tool", "?") command = step.get("command", "?") parameters = step.get("parameters", {}) result = step.get("result", {}) from anytool.utils.display import Box, BoxStyle box = Box(width=66, style=BoxStyle.ROUNDED, color='bl') lines.append("") lines.append(box.top_line(0)) lines.append(box.text_line(f"Step {step_num} ({timestamp})", align='center', indent=0, text_color='c')) lines.append(box.separator_line(0)) lines.append(box.text_line(f"Backend: {backend}", indent=0)) if server: lines.append(box.text_line(f"Server: {server}", indent=0)) lines.append(box.text_line(f"Tool: {tool}", indent=0)) lines.append(box.text_line(f"Command: {command}", indent=0)) lines.append(box.separator_line(0)) # Parameters and result can be multi-line param_str = 
json.dumps(parameters, indent=2) for param_line in param_str.split('\n'): lines.append(box.text_line(param_line, indent=0)) lines.append(box.separator_line(0)) result_str = json.dumps(result, indent=2) for result_line in result_str.split('\n'): lines.append(box.text_line(result_line, indent=0)) lines.append(box.bottom_line(0)) return "\n".join(lines) def _format_markdown(trajectory: List[Dict[str, Any]]) -> str: """Markdown format: table format.""" lines = [ "# Trajectory", "", "| Step | Backend | Server | Tool | Status | Screenshot |", "|------|---------|--------|------|--------|------------|" ] for step in trajectory: step_num = step.get("step", "?") backend = step.get("backend", "?") server = step.get("server", "-") tool = step.get("tool", "?") result_status = "✓" if step.get("result", {}).get("status") == "success" else "✗" screenshot = "📷" if step.get("screenshot") else "" lines.append(f"| {step_num} | {backend} | {server} | {tool} | {result_status} | {screenshot} |") return "\n".join(lines) def analyze_trajectory(trajectory: List[Dict[str, Any]]) -> Dict[str, Any]: """ Analyze trajectory and return statistics. """ if not trajectory: return { "total_steps": 0, "success_rate": 0.0, "backends": {}, "action_types": {} } total_steps = len(trajectory) success_count = 0 backends = {} action_types = {} for step in trajectory: # Count successes if step.get("result", {}).get("status") == "success": success_count += 1 # Count backends backend = step.get("backend", "unknown") backends[backend] = backends.get(backend, 0) + 1 # Count tool types tool = step.get("tool", "unknown") action_types[tool] = action_types.get(tool, 0) + 1 return { "total_steps": total_steps, "success_count": success_count, "success_rate": success_count / total_steps if total_steps > 0 else 0.0, "backends": backends, "tools": action_types } def load_recording_session(recording_dir: str) -> Dict[str, Any]: """ Load complete recording session including trajectory, metadata, plans, and snapshots. 
Args: recording_dir: Path to recording directory Returns: Dictionary containing all session data: { "trajectory": List[Dict], "metadata": Dict, "plans": List[Dict], "decisions": List[str], "statistics": Dict } """ recording_path = Path(recording_dir) if not recording_path.exists(): logger.error(f"Recording directory not found: {recording_dir}") return {} session = { "trajectory": [], "metadata": None, "plans": [], "decisions": [], "statistics": {} } # Load trajectory traj_file = recording_path / "traj.jsonl" if traj_file.exists(): session["trajectory"] = load_trajectory_from_jsonl(str(traj_file)) session["statistics"] = analyze_trajectory(session["trajectory"]) # Load metadata metadata_file = recording_path / "metadata.json" if metadata_file.exists(): session["metadata"] = load_metadata(str(recording_path)) # Load plans plans_dir = recording_path / "plans" if plans_dir.exists(): for plan_file in sorted(plans_dir.glob("plan_*.json")): try: with open(plan_file, 'r', encoding='utf-8') as f: session["plans"].append(json.load(f)) except Exception as e: logger.warning(f"Failed to load plan {plan_file}: {e}") # Load decisions log decisions_file = recording_path / "decisions.log" if decisions_file.exists(): try: with open(decisions_file, 'r', encoding='utf-8') as f: session["decisions"] = f.readlines() except Exception as e: logger.warning(f"Failed to load decisions: {e}") return session def filter_trajectory( trajectory: List[Dict[str, Any]], backend: Optional[str] = None, tool: Optional[str] = None, status: Optional[str] = None, time_range: Optional[Tuple[str, str]] = None ) -> List[Dict[str, Any]]: filtered = trajectory if backend: filtered = [s for s in filtered if s.get("backend") == backend] if tool: filtered = [s for s in filtered if s.get("tool") == tool] if status: filtered = [s for s in filtered if s.get("result", {}).get("status") == status] if time_range: start_time, end_time = time_range filtered = [ s for s in filtered if start_time <= s.get("timestamp", "") 
<= end_time ] return filtered def extract_errors(trajectory: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return [ step for step in trajectory if step.get("result", {}).get("status") == "error" ] def generate_summary_report(recording_dir: str, output_file: Optional[str] = None) -> str: session = load_recording_session(recording_dir) if not session: return "Error: Could not load recording session" lines = [] lines.append("# Recording Session Summary\n") # Metadata section if session["metadata"]: lines.append("## Metadata") metadata = session["metadata"] lines.append(f"- **Task ID**: {metadata.get('task_id', 'N/A')}") lines.append(f"- **Start Time**: {metadata.get('start_time', 'N/A')}") lines.append(f"- **End Time**: {metadata.get('end_time', 'N/A')}") lines.append(f"- **Total Steps**: {metadata.get('total_steps', 0)}") lines.append(f"- **Backends**: {', '.join(metadata.get('backends', []))}") lines.append("") # Statistics section if session["statistics"]: lines.append("## Statistics") stats = session["statistics"] lines.append(f"- **Total Steps**: {stats.get('total_steps', 0)}") lines.append(f"- **Success Count**: {stats.get('success_count', 0)}") lines.append(f"- **Success Rate**: {stats.get('success_rate', 0):.2%}") lines.append("") lines.append("### Backend Distribution") for backend, count in stats.get('backends', {}).items(): lines.append(f"- {backend}: {count}") lines.append("") lines.append("### Tool Distribution") for tool, count in sorted(stats.get('tools', {}).items(), key=lambda x: x[1], reverse=True): lines.append(f"- {tool}: {count}") lines.append("") # Plans section if session["plans"]: lines.append(f"## Plans ({len(session['plans'])} total)") for i, plan in enumerate(session["plans"], 1): lines.append(f"### Plan {i}") lines.append(f"- Created: {plan.get('created_at', 'N/A')}") lines.append(f"- Created by: {plan.get('created_by', 'N/A')}") plan_data = plan.get('plan', {}) if 'task_updates' in plan_data: lines.append(f"- Tasks: 
{len(plan_data['task_updates'])}") lines.append("") # Errors section if session["trajectory"]: errors = extract_errors(session["trajectory"]) if errors: lines.append(f"## Errors ({len(errors)} total)") for error in errors[:5]: # Show first 5 errors lines.append(f"- Step {error.get('step')}: {error.get('backend')} - {error.get('tool')}") error_msg = error.get('result', {}).get('output', 'No error message') lines.append(f" ```\n {error_msg[:200]}\n ```") if len(errors) > 5: lines.append(f" ... and {len(errors) - 5} more errors") lines.append("") # Decisions section if session["decisions"]: lines.append(f"## Decisions ({len(session['decisions'])} total)") for decision in session["decisions"][:10]: # Show first 10 decisions lines.append(f" {decision.strip()}") if len(session["decisions"]) > 10: lines.append(f" ... and {len(session['decisions']) - 10} more decisions") lines.append("") report = "\n".join(lines) # Save to file if requested if output_file: try: with open(output_file, 'w', encoding='utf-8') as f: f.write(report) logger.info(f"Report saved to {output_file}") except Exception as e: logger.error(f"Failed to save report: {e}") return report def compare_recordings(recording_dir1: str, recording_dir2: str) -> Dict[str, Any]: session1 = load_recording_session(recording_dir1) session2 = load_recording_session(recording_dir2) stats1 = session1.get("statistics", {}) stats2 = session2.get("statistics", {}) return { "session1": { "path": recording_dir1, "total_steps": stats1.get("total_steps", 0), "success_rate": stats1.get("success_rate", 0), "backends": stats1.get("backends", {}) }, "session2": { "path": recording_dir2, "total_steps": stats2.get("total_steps", 0), "success_rate": stats2.get("success_rate", 0), "backends": stats2.get("backends", {}) }, "differences": { "step_diff": stats2.get("total_steps", 0) - stats1.get("total_steps", 0), "success_rate_diff": stats2.get("success_rate", 0) - stats1.get("success_rate", 0) } } 
================================================ FILE: anytool/recording/video.py ================================================ """ Video Recorder Communicates with local_server through platform.RecordingClient Supports local and remote recording (through configuration LOCAL_SERVER_URL) """ from pathlib import Path from typing import Optional from anytool.utils.logging import Logger from anytool.platform import RecordingClient logger = Logger.get_logger(__name__) class VideoRecorder: def __init__( self, output_path: str, base_url: Optional[str] = None, ): """ Initialize video recorder Args: output_path: output video path base_url: local_server address (None = read from config/environment variables) """ self.output_path = Path(output_path) self.base_url = base_url self.is_recording = False self._client: Optional[RecordingClient] = None async def start(self): """Start recording screen""" if self.is_recording: return False try: if self._client is None: self._client = RecordingClient(base_url=self.base_url) success = await self._client.start_recording() if success: self.is_recording = True logger.info(f"Video recording started") return True else: logger.warning("Video recording failed to start") return False except Exception as e: logger.warning(f"Video recording failed to start: {e}") return False async def stop(self): """Stop recording screen and save to local""" if not self.is_recording: return False try: if self._client: video_bytes = await self._client.end_recording(dest=str(self.output_path)) if video_bytes: video_size_mb = len(video_bytes) / (1024 * 1024) self.is_recording = False logger.info(f"Video recording stopped ({video_size_mb:.2f} MB)") return True else: logger.warning("Video recording failed to stop") return False except Exception as e: logger.warning(f"Video recording failed to stop: {e}") return False finally: if self._client: try: await self._client.close() except Exception: pass self._client = None __all__ = ['VideoRecorder'] 
================================================ FILE: anytool/recording/viewer.py ================================================ """ Recording Viewer Convenient tools for viewing and analyzing recording sessions. """ import json from pathlib import Path from typing import Optional, Dict, Any, List from anytool.utils.logging import Logger from .utils import load_recording_session, generate_summary_report from .action_recorder import load_agent_actions, analyze_agent_actions, format_agent_actions logger = Logger.get_logger(__name__) class RecordingViewer: """ Viewer for analyzing recording sessions. Provides convenient methods to: - Load and display recordings - Analyze agent behaviors - Generate reports """ def __init__(self, recording_dir: str): """ Initialize viewer with a recording directory. Args: recording_dir: Path to recording directory """ self.recording_dir = Path(recording_dir) if not self.recording_dir.exists(): raise ValueError(f"Recording directory not found: {recording_dir}") # Load session data self.session = load_recording_session(str(self.recording_dir)) logger.info(f"Loaded recording from {recording_dir}") def show_summary(self) -> str: """ Display a summary of the recording. 
Returns: Formatted summary string """ if not self.session.get("metadata"): return "No metadata available" metadata = self.session["metadata"] stats = self.session.get("statistics", {}) lines = [] lines.append("=" * 70) lines.append("RECORDING SUMMARY") lines.append("=" * 70) lines.append(f"Task ID: {metadata.get('task_id', 'N/A')}") lines.append(f"Start: {metadata.get('start_time', 'N/A')}") lines.append(f"End: {metadata.get('end_time', 'N/A')}") lines.append(f"Total Steps: {metadata.get('total_steps', 0)}") lines.append("") lines.append("Statistics:") lines.append(f" - Success Rate: {stats.get('success_rate', 0):.2%}") lines.append(f" - Success Count: {stats.get('success_count', 0)}/{stats.get('total_steps', 0)}") lines.append("") if stats.get("backends"): lines.append("Backend Usage:") for backend, count in sorted(stats["backends"].items(), key=lambda x: x[1], reverse=True): lines.append(f" - {backend}: {count}") lines.append("=" * 70) return "\n".join(lines) def show_agent_actions(self, format_type: str = "compact", agent_name: Optional[str] = None) -> str: actions = load_agent_actions(str(self.recording_dir)) if agent_name: actions = [a for a in actions if a.get("agent_name") == agent_name] if not actions: return f"No agent actions found{' for ' + agent_name if agent_name else ''}" # Add header header = f"\nAGENT ACTIONS ({len(actions)} total)" if agent_name: header += f" - {agent_name}" header += "\n" + "=" * 70 # Format actions formatted = format_agent_actions(actions, format_type) return header + "\n" + formatted def analyze_agents(self) -> str: actions = load_agent_actions(str(self.recording_dir)) stats = analyze_agent_actions(actions) lines = [] lines.append("\nAGENT ANALYSIS") lines.append("=" * 70) lines.append(f"Total Actions: {stats.get('total_actions', 0)}") lines.append("") lines.append("By Agent:") for agent, count in sorted(stats.get('by_agent', {}).items(), key=lambda x: x[1], reverse=True): percentage = (count / stats['total_actions'] * 100) if 
stats['total_actions'] > 0 else 0 lines.append(f" - {agent}: {count} ({percentage:.1f}%)") lines.append("") lines.append("By Action Type:") for action_type, count in sorted(stats.get('by_type', {}).items(), key=lambda x: x[1], reverse=True): percentage = (count / stats['total_actions'] * 100) if stats['total_actions'] > 0 else 0 lines.append(f" - {action_type}: {count} ({percentage:.1f}%)") return "\n".join(lines) def generate_full_report(self, output_file: Optional[str] = None) -> str: return generate_summary_report(str(self.recording_dir), output_file) def export_to_json(self, output_file: str): with open(output_file, 'w', encoding='utf-8') as f: json.dump(self.session, f, indent=2, ensure_ascii=False) logger.info(f"Exported session to {output_file}") def show_timeline(self, max_events: int = 50) -> str: # Load all events actions = load_agent_actions(str(self.recording_dir)) trajectory = self.session.get("trajectory", []) # Combine all events with unified format timeline = [] # Add agent actions for action in actions: timeline.append({ "timestamp": action.get("timestamp", ""), "type": "agent_action", "agent_name": action.get("agent_name", ""), "agent_type": action.get("agent_type", "unknown"), "action_type": action.get("action_type", ""), "step": action.get("step"), "correlation_id": action.get("correlation_id", ""), "description": f"[{action.get('agent_type', '?').upper()}] {action.get('action_type', '?')}", "related_tool_steps": action.get("related_tool_steps", []), }) # Add tool executions for traj_step in trajectory: timeline.append({ "timestamp": traj_step.get("timestamp", ""), "type": "tool_execution", "backend": traj_step.get("backend", ""), "tool": traj_step.get("tool", ""), "step": traj_step.get("step"), "agent_name": traj_step.get("agent_name", ""), "description": f"[TOOL:{traj_step.get('backend', '?').upper()}] {traj_step.get('tool', '?')}", "status": traj_step.get("result", {}).get("status", ""), }) # Sort by timestamp timeline.sort(key=lambda x: 
x.get("timestamp", "")) # Format output lines = [] lines.append("\nUNIFIED TIMELINE") lines.append("=" * 100) lines.append(f"Total events: {len(timeline)} (showing first {max_events})") lines.append("") for i, item in enumerate(timeline[:max_events]): timestamp = item.get("timestamp", "N/A") time_str = timestamp.split("T")[1][:8] if "T" in timestamp else timestamp[-8:] # Format line with type indicator type_marker = { "agent_action": "🤖", "tool_execution": "🔧" }.get(item.get("type"), "•") desc = item.get("description", "") agent = item.get("agent_name", "") agent_type = item.get("agent_type", "") line = f"{time_str} {type_marker} {desc}" # Add agent info if available if agent and agent_type: line += f" (by {agent}/{agent_type})" elif agent: line += f" (by {agent})" lines.append(line) # Show correlations correlations = [] if item.get("related_tool_steps"): correlations.append(f"→ tool steps: {item['related_tool_steps']}") if item.get("related_action_step"): correlations.append(f"→ action step: {item['related_action_step']}") if correlations: for corr in correlations: lines.append(f" {corr}") if len(timeline) > max_events: lines.append(f"\n... and {len(timeline) - max_events} more events") return "\n".join(lines) def show_agent_flow(self, agent_name: Optional[str] = None) -> str: """ Show the flow of a specific agent's actions and related events. 
""" actions = load_agent_actions(str(self.recording_dir)) if agent_name: actions = [a for a in actions if a.get("agent_name") == agent_name] lines = [] lines.append(f"\nAGENT FLOW{' - ' + agent_name if agent_name else ''}") lines.append("=" * 100) # Sort by timestamp actions.sort(key=lambda x: x.get("timestamp", "")) for action in actions: timestamp = action.get("timestamp", "N/A").split("T")[1][:8] if "T" in action.get("timestamp", "") else "N/A" agent_type = action.get("agent_type", "?").upper() action_type = action.get("action_type", "?") step = action.get("step", "?") lines.append(f"{timestamp} [{agent_type}] Action #{step}: {action_type}") # Show reasoning if available if action.get("reasoning"): thought = action["reasoning"].get("thought", "") if thought: lines.append(f" 💭 {thought[:80]}...") # Show output if action.get("output"): output = action["output"] if isinstance(output, dict): for key in ["message", "status", "evaluation"]: if key in output: lines.append(f" 📤 {key}: {str(output[key])[:60]}") lines.append("") return "\n".join(lines) def view_recording(recording_dir: str): """ Quick interactive viewer for a recording. """ try: viewer = RecordingViewer(recording_dir) print(viewer.show_summary()) print("\n") print(viewer.analyze_agents()) print("\n") print("Agent Actions (compact):") print(viewer.show_agent_actions(format_type="compact")) except Exception as e: logger.error(f"Failed to view recording: {e}") print(f"Error: {e}") def compare_recordings(recording_dir1: str, recording_dir2: str) -> str: """ Compare two recordings side by side. 
""" try: viewer1 = RecordingViewer(recording_dir1) viewer2 = RecordingViewer(recording_dir2) lines = [] lines.append("=" * 70) lines.append("RECORDING COMPARISON") lines.append("=" * 70) lines.append("") # Compare metadata meta1 = viewer1.session.get("metadata", {}) meta2 = viewer2.session.get("metadata", {}) lines.append("Recording 1:") lines.append(f" Task: {meta1.get('task_id', 'N/A')}") lines.append(f" Steps: {meta1.get('total_steps', 0)}") lines.append("") lines.append("Recording 2:") lines.append(f" Task: {meta2.get('task_id', 'N/A')}") lines.append(f" Steps: {meta2.get('total_steps', 0)}") lines.append("") # Compare statistics stats1 = viewer1.session.get("statistics", {}) stats2 = viewer2.session.get("statistics", {}) lines.append("Differences:") lines.append(f" Steps: {meta2.get('total_steps', 0) - meta1.get('total_steps', 0):+d}") lines.append(f" Success Rate: {stats2.get('success_rate', 0) - stats1.get('success_rate', 0):+.2%}") return "\n".join(lines) except Exception as e: logger.error(f"Failed to compare recordings: {e}") return f"Error: {e}" # CLI interface if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python -m anytool.recording.viewer ") sys.exit(1) recording_dir = sys.argv[1] view_recording(recording_dir) ================================================ FILE: anytool/tool_layer.py ================================================ from __future__ import annotations import asyncio import traceback import uuid from dataclasses import dataclass, field from typing import Any, Dict, List, Optional from anytool.agents import GroundingAgent from anytool.llm import LLMClient from anytool.grounding.core.grounding_client import GroundingClient from anytool.config import get_config, load_config from anytool.config.loader import get_agent_config from anytool.recording import RecordingManager from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) @dataclass class AnyToolConfig: # LLM Configuration llm_model: str 
class AnyTool:
    """Facade that wires the LLM client, grounding client, recorder and agent.

    Components are created by :meth:`initialize` and released by
    :meth:`cleanup`; the class is also usable as an async context manager
    (``async with AnyTool(config) as tool: ...``).
    """

    def __init__(self, config: Optional["AnyToolConfig"] = None):
        """Store *config* (a default ``AnyToolConfig`` when omitted).

        Nothing is connected here; all components stay ``None`` until
        :meth:`initialize` runs.
        """
        self.config = config or AnyToolConfig()

        self._llm_client: Optional[LLMClient] = None
        self._grounding_client: Optional[GroundingClient] = None
        self._grounding_agent: Optional[GroundingAgent] = None
        self._recording_manager: Optional[RecordingManager] = None

        self._initialized = False
        self._running = False

        logger.debug("AnyTool instance created")

    async def initialize(self) -> None:
        """Build the LLM client, grounding client, optional recorder and agent.

        Calling it again once initialized only logs a warning.

        Raises:
            Exception: whatever a component raised; :meth:`cleanup` is run
                before the error is re-raised.
        """
        if self._initialized:
            logger.warning("AnyTool already initialized")
            return

        logger.info("Initializing AnyTool...")

        try:
            self._llm_client = LLMClient(
                model=self.config.llm_model,
                enable_thinking=self.config.llm_enable_thinking,
                rate_limit_delay=self.config.llm_rate_limit_delay,
                max_retries=self.config.llm_max_retries,
                timeout=self.config.llm_timeout,
                **self.config.llm_kwargs
            )
            logger.info(f"✓ LLM Client: {self.config.llm_model}")

            # Load grounding config.
            # A custom config is merged on top of the defaults: load_config is
            # given the default files first and the custom file last.
            if self.config.grounding_config_path:
                from anytool.config.loader import CONFIG_DIR
                from anytool.config.constants import CONFIG_GROUNDING, CONFIG_SECURITY

                grounding_config = load_config(
                    CONFIG_DIR / CONFIG_GROUNDING,
                    CONFIG_DIR / CONFIG_SECURITY,
                    self.config.grounding_config_path
                )
                logger.info(f"Merged custom grounding config: {self.config.grounding_config_path}")
            else:
                # Load default configs only
                grounding_config = get_config()

            self._grounding_client = GroundingClient(config=grounding_config)
            await self._grounding_client.initialize_all_providers()

            backends = list(self._grounding_client.list_providers().keys())
            logger.info(f"✓ Grounding Client: {len(backends)} backends")
            logger.debug(f" Available backends: {[b.value for b in backends]}")

            if self.config.enable_recording:
                self._recording_manager = RecordingManager(
                    enabled=True,
                    task_id="",
                    log_dir=self.config.recording_log_dir,
                    backends=self.config.recording_backends,
                    enable_screenshot=self.config.enable_screenshot,
                    enable_video=self.config.enable_video,
                    enable_conversation_log=self.config.enable_conversation_log,
                    agent_name="AnyTool",
                )
                # Inject recording_manager to grounding_client for GUI intermediate steps
                self._grounding_client.recording_manager = self._recording_manager
                # Register to LLM client for auto-recording tool results
                self._recording_manager.register_to_llm(self._llm_client)
                logger.info(f"✓ Recording enabled: {len(self._recording_manager.backends or [])} backends")

            agent_config = get_agent_config("GroundingAgent")
            if agent_config:
                # max_iterations: the config-file value wins when present and
                # self.config only supplies the fallback.
                max_iterations = agent_config.get("max_iterations", self.config.grounding_max_iterations)
                # backend_scope: self.config > config file > default
                backend_scope = self.config.backend_scope or agent_config.get("backend_scope") or ["gui", "shell", "mcp", "web", "system"]
                visual_analysis_timeout = agent_config.get("visual_analysis_timeout", 30.0)
                # Keep self.config in sync with the value actually used
                self.config.grounding_max_iterations = max_iterations
                logger.info(f"Loaded GroundingAgent config from config_agents.json (max_iterations={max_iterations}, visual_analysis_timeout={visual_analysis_timeout}s)")
            else:
                # Fall back to AnyToolConfig values
                max_iterations = self.config.grounding_max_iterations
                backend_scope = self.config.backend_scope or ["gui", "shell", "mcp", "web", "system"]
                visual_analysis_timeout = 30.0
                logger.warning(f"config_agents.json not found, using default config (max_iterations={max_iterations})")

            # Create separate LLM client for tool retrieval if configured
            tool_retrieval_llm = None
            if self.config.tool_retrieval_model:
                tool_retrieval_llm = LLMClient(
                    model=self.config.tool_retrieval_model,
                    timeout=self.config.llm_timeout,
                    max_retries=self.config.llm_max_retries,
                )
                logger.info(f"✓ Tool retrieval LLM: {self.config.tool_retrieval_model}")

            self._grounding_agent = GroundingAgent(
                name="AnyTool-GroundingAgent",
                backend_scope=backend_scope,
                llm_client=self._llm_client,
                grounding_client=self._grounding_client,
                recording_manager=self._recording_manager,
                system_prompt=self.config.grounding_system_prompt,
                max_iterations=max_iterations,
                visual_analysis_timeout=visual_analysis_timeout,
                tool_retrieval_llm=tool_retrieval_llm,
                visual_analysis_model=self.config.visual_analysis_model,
            )
            logger.info(f"✓ GroundingAgent: {', '.join(backend_scope)}")

            self._initialized = True
            logger.info("="*60)
            logger.info("AnyTool ready to use!")
            logger.info("="*60)

        except Exception as e:
            logger.error(f"Failed to initialize AnyTool: {e}")
            await self.cleanup()
            raise

    async def execute(
        self,
        task: str,
        context: Optional[Dict[str, Any]] = None,
        workspace_dir: Optional[str] = None,
        max_iterations: Optional[int] = None,
        task_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Execute a task with AnyTool.

        Args:
            task: Task instruction
            context: Additional context. It is copied, so the caller's dict
                is never modified.
            workspace_dir: Working directory
            max_iterations: Max iterations override
            task_id: External task ID for recording/logging. If None, generates a random one.
                This allows external callers to specify their own task ID so
                recordings can be matched with their own results.

        Returns:
            The agent's result dict extended with ``task_id`` and
            ``execution_time``. If processing raises, a dict with
            ``status == "error"`` is returned instead of propagating.

        Raises:
            RuntimeError: if not initialized or another task is running.
        """
        if not self._initialized:
            raise RuntimeError(
                "AnyTool not initialized. "
                "Call await tool_layer.initialize() first or use async with."
            )

        if self._running:
            raise RuntimeError("AnyTool is already running a task.")

        logger.info("="*60)
        logger.info(f"Task: {task[:100]}...")
        logger.info("="*60)

        self._running = True
        # We are inside a coroutine, so the running loop is the right clock.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        # Use external task_id if provided, otherwise generate one
        if task_id is None:
            task_id = f"task_{uuid.uuid4().hex[:8]}"
        logger.info(f"Task ID: {task_id}")

        try:
            # Shallow copy: the keys added below must not leak into the
            # dict object owned by the caller.
            execution_context = dict(context) if context else {}
            execution_context["task_id"] = task_id
            execution_context["instruction"] = task

            if max_iterations is not None:
                execution_context["max_iterations"] = max_iterations

            if self._recording_manager:
                if self._recording_manager.recording_status:
                    await self._recording_manager.stop()
                    logger.debug("Stopped previous recording session")

                self._recording_manager.task_id = task_id
                await self._recording_manager.start()
                logger.info(f"Recording started: {task_id}")

            # Workspace priority: argument > config > recording dir > temp dir
            if workspace_dir:
                execution_context["workspace_dir"] = workspace_dir
                logger.info(f"Workspace: {workspace_dir}")
            elif self.config.workspace_dir:
                execution_context["workspace_dir"] = self.config.workspace_dir
                logger.info(f"Workspace: {self.config.workspace_dir}")
            elif self._recording_manager and self._recording_manager.trajectory_dir:
                execution_context["workspace_dir"] = self._recording_manager.trajectory_dir
                logger.info(f"Workspace: {execution_context['workspace_dir']}")
            else:
                import tempfile
                from pathlib import Path

                workspace = Path(tempfile.gettempdir()) / "anytool_workspace" / task_id
                workspace.mkdir(parents=True, exist_ok=True)
                execution_context["workspace_dir"] = str(workspace)
                logger.info(f"Workspace: {execution_context['workspace_dir']}")

            logger.info(f"Executing with GroundingAgent (max {max_iterations or self.config.grounding_max_iterations} iterations)...")
            result = await self._grounding_agent.process(execution_context)

            execution_time = loop.time() - start_time
            final_result = {
                **result,
                "task_id": task_id,
                "execution_time": execution_time,
            }

            status = result.get('status', 'unknown')
            iterations = result.get('iterations', 0)
            tool_count = len(result.get('tool_executions', []))

            logger.info("="*60)
            if status == "success":
                logger.info(
                    f"Task completed successfully! "
                    f"({iterations} iterations, {tool_count} tool calls, {execution_time:.2f}s)"
                )
            elif status == "incomplete":
                logger.warning(
                    f"Task incomplete after {iterations} iterations. "
                    f"Consider increasing max_iterations."
                )
            else:
                logger.error(f"Task failed: {result.get('error', 'Unknown error')}")
            logger.info("="*60)

            return final_result

        except Exception as e:
            execution_time = loop.time() - start_time
            tb = traceback.format_exc(limit=10)
            logger.error(f"Task execution failed: {e}", exc_info=True)

            return {
                "status": "error",
                "error": str(e),
                "traceback": tb,
                "response": f"Task execution error: {str(e)}",
                "execution_time": execution_time,
                "task_id": task_id,
                "iterations": 0,
                "tool_executions": [],
            }

        finally:
            try:
                if self._recording_manager and self._recording_manager.recording_status:
                    try:
                        await self._recording_manager.stop()
                        logger.debug(f"Recording stopped: {task_id}")
                    except Exception as e:
                        logger.warning(f"Failed to stop recording: {e}")

                # Trigger quality evolution periodically
                await self._maybe_evolve_quality()
            finally:
                # Always release the busy flag, even if the hooks above fail,
                # otherwise every later execute() would be rejected.
                self._running = False

    async def _maybe_evolve_quality(self) -> None:
        """Run quality evolution when the quality manager asks for it.

        Does nothing without a grounding client or quality manager. Failures
        of the evolution call itself are logged at debug level and ignored.
        """
        if not self._grounding_client or not self._grounding_client.quality_manager:
            return

        if self._grounding_client.quality_manager.should_evolve():
            try:
                report = await self._grounding_client.evolve_quality()
                if report.get("recommendations"):
                    logger.info(f"Quality evolution: {report['recommendations']}")
            except Exception as e:
                logger.debug(f"Quality evolution skipped: {e}")

    async def cleanup(self) -> None:
        """
        Close all sessions and release resources.
        Automatically called when using context manager.

        Never raises: each step is attempted independently and failures are
        logged, and the initialized/running flags are always cleared.
        """
        logger.info("Cleaning up AnyTool resources...")

        try:
            if self._grounding_client:
                try:
                    await self._grounding_client.close_all_sessions()
                    logger.debug("All grounding sessions closed")
                except Exception as e:
                    # Keep going: the recorder still has to be stopped.
                    logger.error(f"Error during cleanup: {e}", exc_info=True)

            if self._recording_manager and self._recording_manager.recording_status:
                try:
                    await self._recording_manager.stop()
                    logger.debug("Recording manager stopped")
                except Exception as e:
                    logger.warning(f"Failed to stop recording: {e}")

            logger.info("AnyTool cleanup complete")

        except Exception as e:
            logger.error(f"Error during cleanup: {e}", exc_info=True)
        finally:
            self._initialized = False
            self._running = False

    def is_initialized(self) -> bool:
        """Return True once :meth:`initialize` has completed."""
        return self._initialized

    def is_running(self) -> bool:
        """Return True while :meth:`execute` is processing a task."""
        return self._running

    def get_config(self) -> "AnyToolConfig":
        """Return the configuration object this instance was built with."""
        return self.config

    def list_backends(self) -> List[str]:
        """Return the ``value`` of every provider key of the grounding client.

        Raises:
            RuntimeError: if called before initialization.
        """
        if not self._initialized:
            raise RuntimeError("AnyTool not initialized")
        return [backend.value for backend in self._grounding_client.list_providers().keys()]

    def list_sessions(self) -> List[str]:
        """Return the grounding client's session list.

        Raises:
            RuntimeError: if called before initialization.
        """
        if not self._initialized:
            raise RuntimeError("AnyTool not initialized")
        return self._grounding_client.list_sessions()

    async def __aenter__(self):
        """Context manager entry"""
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        await self.cleanup()
        return False  # never suppress the exception from the with-body

    def __repr__(self) -> str:
        """Return a short debug string with the lifecycle status and backend scope."""
        status = "initialized" if self._initialized else "not initialized"
        if self._running:
            status = "running"
        backends = ", ".join(self.config.backend_scope) if self.config.backend_scope else "all"
        return f"<AnyTool(status={status}, backends={backends})>"
class CLIDisplay:
    """Static helpers that print boxed, colorized panels for the CLI.

    Every method builds a ``Box`` of width 70 with rounded corners, prints
    to stdout via ``print`` and returns ``None``.
    """

    @staticmethod
    def print_banner():
        """Print the application banner (title and subtitle) in a full box."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='c')

        print()
        print(box.top_line(indent=4))
        print(box.empty_line(indent=4))

        # The title is pre-colored, so no extra text_color is applied.
        title = colorize("AnyTool", 'c', bold=True)
        print(box.text_line(title, align='center', indent=4, text_color=''))

        subtitle = "Universal Tool-Use Layer for AI Agents"
        print(box.text_line(subtitle, align='center', indent=4, text_color='gr'))

        print(box.empty_line(indent=4))
        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_configuration(config: AnyToolConfig):
        """Print the model name, max iterations and LLM timeout from *config*."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='bl')

        print(box.text_line(colorize("◉ System Configuration", 'c', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        # (label, value, color key) rows shown in the panel
        configs = [
            ("AI Model", config.llm_model, 'bl'),
            ("Max Iterations", str(config.grounding_max_iterations), 'c'),
            ("LLM Timeout", f"{config.llm_timeout}s", 'c'),
        ]

        for label, value, color in configs:
            line = f" {label:20s} {colorize(value, color)}"
            print(box.text_line(line, indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_initialization_progress(steps: list, show_header: bool = True):
        """Print one line per initialization step with a status icon.

        Args:
            steps: iterable of ``(step, status)`` pairs; ``"ok"`` and
                ``"error"`` get dedicated icons, anything else is shown as
                pending (``[...]``).
            show_header: whether to print the panel header first.
        """
        box = Box(width=70, style=BoxStyle.ROUNDED, color='g')

        if show_header:
            print(box.text_line(colorize("► Initializing Components", 'g', bold=True),
                                align='center', indent=4, text_color=''))
            print(box.separator_line(indent=4))

        for step, status in steps:
            if status == "ok":
                icon = colorize("✓", 'g')
            elif status == "error":
                icon = colorize("✗", 'rd')
            else:
                icon = colorize("[...]", 'y')

            line = f" {icon} {step}"
            print(box.text_line(line, indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_result_summary(result: dict):
        """Print a summary panel for *result*, plus the response text if any.

        Reads the keys ``status``, ``execution_time``, ``iterations``,
        ``completed_tasks``, ``evaluation_results`` and ``user_response``;
        all are optional.
        """
        box = Box(width=70, style=BoxStyle.ROUNDED, color='c')

        print()
        print(box.text_line(colorize("◈ Execution Summary", 'c', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        status = result.get("status", "unknown")
        # Statuses missing from this table are rendered in gray ('gr').
        # NOTE(review): confirm these keys match the statuses the executor
        # actually reports.
        status_colors = {
            "completed": 'g',
            "timeout": 'y',
            "error": 'rd',
            "max_iterations_reached": 'y',
        }
        status_color = status_colors.get(status, 'gr')
        status_display = colorize(status.upper(), status_color, bold=True)

        exec_time = result.get('execution_time', 0)

        result_lines = [
            f" Status: {status_display}",
            f" Execution Time: {colorize(f'{exec_time:.2f}s', 'c')}",
            f" Iterations: {colorize(str(result.get('iterations', 0)), 'y')}",
            f" Completed Tasks: {colorize(str(result.get('completed_tasks', 0)), 'g')}",
        ]

        if result.get('evaluation_results'):
            result_lines.append(f" Evaluations: {colorize(str(len(result['evaluation_results'])), 'bl')}")

        for line in result_lines:
            print(box.text_line(line, indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()

        # Print user response (the actual answer/result)
        if result.get('user_response'):
            response_box = Box(width=70, style=BoxStyle.ROUNDED, color='g')
            print(response_box.text_line(colorize("◈ Result", 'g', bold=True),
                                         align='center', indent=4, text_color=''))
            print(response_box.separator_line(indent=4))

            user_response = result['user_response']
            # Blank lines are dropped; each remaining line is stripped.
            for line in user_response.split('\n'):
                if line.strip():
                    display_line = f" {line.strip()}"
                    print(response_box.text_line(display_line, indent=4, text_color=''))

            print(response_box.bottom_line(indent=4))
            print()

    @staticmethod
    def print_interactive_header():
        """Print the interactive-mode panel listing the built-in commands."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='c')

        print(box.text_line(colorize("⌨ Interactive Mode", 'c', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        help_lines = [
            "",
            colorize(" Ready to execute your tasks!", 'g'),
            "",
            colorize(" Available Commands:", 'c', bold=True),
            " " + colorize("status", 'bl') + " → View system status",
            " " + colorize("help", 'bl') + " → Show available commands",
            " " + colorize("quit", 'bl') + " → Exit interactive mode",
            "",
            colorize(" ▸ Enter your task description below:", 'gr'),
            "",
        ]

        for line in help_lines:
            print(box.text_line(line, indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_task_header(query: str, title: str = "▶ Executing Task"):
        """Print a panel titled *title* that shows the task text *query*."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='g')

        print()
        print(box.text_line(colorize(title, 'g', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))
        print(box.text_line("", indent=4, text_color=''))
        print(box.text_line(f" {query}", indent=4, text_color=''))
        print(box.text_line("", indent=4, text_color=''))
        print(box.bottom_line(indent=4))

    @staticmethod
    def print_system_ready():
        """Print the "System Ready" panel describing what the live UI shows."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='g')

        print(box.text_line(colorize("◈ System Ready", 'g', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))
        print(box.text_line("", indent=4, text_color=''))
        print(box.text_line(colorize(" Real-time UI will display:", 'c'), indent=4, text_color=''))
        print(box.text_line(" § Agent activities and status", indent=4, text_color=''))
        print(box.text_line(" ⊕ Grounding backend operations", indent=4, text_color=''))
        print(box.text_line(" ⊞ Execution logs", indent=4, text_color=''))
        print(box.text_line("", indent=4, text_color=''))
        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_status(agent):
        """Print the initialized/running state reported by ``agent.get_status()``.

        NOTE(review): *agent* must expose ``get_status()`` returning a dict
        with ``'initialized'`` and ``'running'`` keys (and optionally
        ``'agents'``) — confirm the object passed by callers provides it.
        """
        box = Box(width=70, style=BoxStyle.ROUNDED, color='bl')

        print()
        print(box.text_line(colorize("System Status", 'bl', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        status = agent.get_status()
        status_lines = [
            f"Initialized: {colorize('Yes' if status['initialized'] else 'No', 'g' if status['initialized'] else 'rd')}",
            f"Running: {colorize('Yes' if status['running'] else 'No', 'y' if status['running'] else 'g')}",
        ]

        if "agents" in status:
            status_lines.append(f"Agents: {colorize(', '.join(status['agents']), 'c')}")

        for line in status_lines:
            print(box.text_line(line, indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()

    @staticmethod
    def print_help():
        """Print the table of interactive commands and their descriptions."""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='y')

        print()
        print(box.text_line(colorize("Available Commands", 'y', bold=True),
                            align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        # An empty command string marks a separator row.
        help_items = [
            (colorize("status", 'c'), "Show system status"),
            (colorize("help", 'c'), "Show this help message"),
            (colorize("quit/exit", 'c'), "Exit interactive mode"),
            ("", ""),
            (colorize("Other input", 'gr'), "Execute as task"),
        ]

        for cmd, desc in help_items:
            if cmd:
                print(box.text_line(f" {cmd:20s} {desc}", indent=4, text_color=''))
            else:
                print(box.separator_line(indent=4))

        print(box.bottom_line(indent=4))
        print()
class Colors:
    """ANSI escape sequences used for terminal styling."""

    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
    WHITE = "\033[97m"
    GRAY = "\033[90m"

    # 256-color variants; these are the ones the short color keys map to
    GREEN_SOFT = '\033[38;5;78m'
    BLUE_SOFT = '\033[38;5;39m'
    CYAN_SOFT = '\033[38;5;51m'
    YELLOW_SOFT = '\033[38;5;222m'
    RED_SOFT = '\033[38;5;204m'
    MAGENTA_SOFT = '\033[38;5;141m'
    GRAY_SOFT = '\033[38;5;246m'


class BoxStyle(Enum):
    """Border style of a :class:`Box`."""

    ROUNDED = "rounded"  # Rounded corner box ╭─╮╰╯
    SQUARE = "square"    # Square corner box ┌─┐└┘
    DOUBLE = "double"    # Double line box ╔═╗╚╝
    SIMPLE = "simple"    # Simple box ===


# Border characters per style. SIMPLE has no entry: Box special-cases it and
# falls back to the ROUNDED set for the lookup.
BOX_CHARS = {
    BoxStyle.ROUNDED: {
        'tl': '╭', 'tr': '╮', 'bl': '╰', 'br': '╯',
        'h': '─', 'v': '│'
    },
    BoxStyle.SQUARE: {
        'tl': '┌', 'tr': '┐', 'bl': '└', 'br': '┘',
        'h': '─', 'v': '│'
    },
    BoxStyle.DOUBLE: {
        'tl': '╔', 'tr': '╗', 'bl': '╚', 'br': '╝',
        'h': '═', 'v': '║'
    },
}

# Compiled once at import instead of on every strip_ansi() call.
_ANSI_ESCAPE_RE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

# Short color keys accepted by colorize(); built once instead of per call.
_COLOR_MAP = {
    'r': Colors.RESET,
    'b': Colors.BOLD,
    'd': Colors.DIM,
    'g': Colors.GREEN_SOFT,
    'bl': Colors.BLUE_SOFT,
    'c': Colors.CYAN_SOFT,
    'y': Colors.YELLOW_SOFT,
    'rd': Colors.RED_SOFT,
    'm': Colors.MAGENTA_SOFT,
    'gr': Colors.GRAY_SOFT,
}


def strip_ansi(text: str) -> str:
    """
    Strip ANSI color codes from text

    Args:
        text: Text with potential ANSI codes

    Returns:
        Clean text without ANSI codes
    """
    return _ANSI_ESCAPE_RE.sub('', text)


def colorize(text: str, color: str = '', bold: bool = False) -> str:
    """Wrap *text* in the escape code for *color* followed by a reset.

    Args:
        text: Text to style.
        color: A short key such as ``'g'`` or ``'bl'``; any other string is
            emitted verbatim, so a raw escape sequence may be passed too.
        bold: Prefix the bold escape code.

    Returns:
        The styled string, or *text* unchanged if styling fails.
    """
    try:
        prefix = Colors.BOLD if bold else ''
        code = _COLOR_MAP.get(color, color)
        return f"{prefix}{code}{text}{Colors.RESET}"
    except Exception:
        # Best-effort styling (e.g. an unhashable color key). Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return text


class Box:
    """Renders the individual lines of a fixed-width text box."""

    def __init__(self,
                 width: int = 68,
                 style: BoxStyle = BoxStyle.ROUNDED,
                 color: str = 'bl',
                 padding: int = 2):
        """
        Args:
            width: Inner width, i.e. the number of horizontal border characters.
            style: Border style.
            color: Color key (see :func:`colorize`) used for the border.
            padding: Spaces kept between the border and the text on each side.
        """
        self.width = width
        self.style = style
        self.color = color
        self.padding = padding
        self.chars = BOX_CHARS.get(style, BOX_CHARS[BoxStyle.ROUNDED])

    def top_line(self, indent: int = 2) -> str:
        """Return the top border, indented by *indent* spaces."""
        indent_str = " " * indent
        if self.style == BoxStyle.SIMPLE:
            return colorize(indent_str + "=" * self.width, self.color)
        return colorize(
            indent_str + self.chars['tl'] + self.chars['h'] * self.width + self.chars['tr'],
            self.color
        )

    def bottom_line(self, indent: int = 2) -> str:
        """Return the bottom border, indented by *indent* spaces."""
        indent_str = " " * indent
        if self.style == BoxStyle.SIMPLE:
            return colorize(indent_str + "=" * self.width, self.color)
        return colorize(
            indent_str + self.chars['bl'] + self.chars['h'] * self.width + self.chars['br'],
            self.color
        )

    def separator_line(self, indent: int = 2) -> str:
        """Return a horizontal rule (drawn without corner characters)."""
        indent_str = " " * indent
        if self.style == BoxStyle.SIMPLE:
            return colorize(indent_str + "-" * self.width, self.color)
        return colorize(indent_str + " " + self.chars['h'] * self.width, self.color)

    def empty_line(self, indent: int = 2) -> str:
        """Return a bordered blank row (an empty string for SIMPLE boxes)."""
        indent_str = " " * indent
        if self.style == BoxStyle.SIMPLE:
            return ""
        return colorize(
            indent_str + self.chars['v'] + " " * self.width + self.chars['v'],
            self.color
        )

    def text_line(self, text: str, align: str = 'left', indent: int = 2, text_color: str = '') -> str:
        """Return one bordered row containing *text*.

        Args:
            text: Content; may already contain ANSI codes.
            align: ``'center'``, ``'right'``, or anything else for left.
            indent: Leading spaces before the box.
            text_color: Optional color key applied to *text*.
        """
        indent_str = " " * indent
        content_width = self.width - 2 * self.padding

        # Strip ANSI codes to get actual display length
        clean_text = strip_ansi(text)
        text_len = len(clean_text)

        # Use original text (may contain colors) or apply new color
        display_text = colorize(text, text_color) if text_color else text

        # Over-long text yields a negative pad, which multiplies to "".
        if align == 'center':
            left_pad = (content_width - text_len) // 2
            right_pad = content_width - text_len - left_pad
            content = " " * left_pad + display_text + " " * right_pad
        elif align == 'right':
            left_pad = content_width - text_len
            content = " " * left_pad + display_text
        else:  # left
            right_pad = content_width - text_len
            content = display_text + " " * right_pad

        if self.style == BoxStyle.SIMPLE:
            return indent_str + " " * self.padding + content

        padding_str = " " * self.padding
        return colorize(indent_str + self.chars['v'], self.color) + \
            padding_str + content + padding_str + \
            colorize(self.chars['v'], self.color)

    def build(self,
              title: Optional[str] = None,
              lines: Optional[List[str]] = None,
              footer: Optional[str] = None,
              indent: int = 2) -> str:
        """Return a complete box as one newline-joined string.

        Args:
            title: Optional centered heading, framed by empty rows.
            lines: Optional body rows, left-aligned.
            footer: Optional centered footer, preceded by an empty row.
            indent: Leading spaces before the box.
        """
        result = []
        result.append(self.top_line(indent))

        if title:
            result.append(self.empty_line(indent))
            result.append(self.text_line(title, align='center', indent=indent, text_color='c'))
            result.append(self.empty_line(indent))

        if lines:
            for line in lines:
                result.append(self.text_line(line, indent=indent))

        if footer:
            result.append(self.empty_line(indent))
            result.append(self.text_line(footer, align='center', indent=indent, text_color='gr'))

        result.append(self.bottom_line(indent))
        return "\n".join(result)


def print_box(title: Optional[str] = None,
              lines: Optional[List[str]] = None,
              footer: Optional[str] = None,
              width: int = 68,
              style: BoxStyle = BoxStyle.ROUNDED,
              color: str = 'bl',
              indent: int = 2):
    """Build a :class:`Box` from the arguments and print it."""
    box = Box(width=width, style=style, color=color)
    print(box.build(title=title, lines=lines, footer=footer, indent=indent))


def print_banner(title: str,
                 subtitle: Optional[str] = None,
                 width: int = 66,
                 style: BoxStyle = BoxStyle.ROUNDED,
                 color: str = 'bl',
                 indent: int = 2):
    """Print a boxed banner with a centered title and optional subtitle."""
    box = Box(width=width, style=style, color=color)
    print()
    print(box.top_line(indent))
    print(box.empty_line(indent))
    print(box.text_line(title, align='center', indent=indent, text_color='c'))
    if subtitle:
        print(box.text_line(subtitle, align='center', indent=indent, text_color='gr'))
    print(box.empty_line(indent))
    print(box.bottom_line(indent))
    print()


def print_section(title: str, content: List[str], color: str = 'c', indent: int = 2):
    """Print a bold ``- title`` heading followed by the *content* lines."""
    indent_str = " " * indent
    print(f"\n{indent_str}{colorize('- ' + title, color, bold=True)}")
    for line in content:
        print(f"{indent_str} {line}")


def print_separator(width: int = 68, color: str = 'bl', indent: int = 2):
    """Print a horizontal rule of *width* ``─`` characters."""
    indent_str = " " * indent
    print(colorize(indent_str + "─" * width, color))
print_section(title: str, content: List[str], color: str = 'c', indent: int = 2): indent_str = " " * indent print(f"\n{indent_str}{colorize('- ' + title, color, bold=True)}") for line in content: print(f"{indent_str} {line}") def print_separator(width: int = 68, color: str = 'bl', indent: int = 2): indent_str = " " * indent print(colorize(indent_str + "─" * width, color)) ================================================ FILE: anytool/utils/logging.py ================================================ import logging import os import sys import threading import json from pathlib import Path from datetime import datetime from typing import Optional from colorama import init init(autoreset=True) def _load_log_level_from_config() -> int: """ Load log_level from config_grounding.json and convert to ANYTOOL_DEBUG value. Returns: 0 (WARNING), 1 (INFO), or 2 (DEBUG) """ try: config_path = Path(__file__).parent.parent / "config" / "config_grounding.json" if config_path.exists(): with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) log_level = config.get("log_level", "INFO").upper() # Convert log level string to ANYTOOL_DEBUG value level_map = { "DEBUG": 2, "INFO": 1, "WARNING": 0, "ERROR": 0, "CRITICAL": 0 } return level_map.get(log_level, 1) # Default to INFO except Exception: # If any error occurs, silently return default INFO level pass return 1 # Default to INFO # 0=WARNING, 1=INFO, 2=DEBUG; can be overridden by set_debug / environment variable # Load from config_grounding.json to ensure consistency ANYTOOL_DEBUG = _load_log_level_from_config() # Default log directory and file pattern # Use absolute path to anytool/logs directory DEFAULT_LOG_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "logs") DEFAULT_LOG_FILE_PATTERN = "anytool_{timestamp}.log" class FlushFileHandler(logging.FileHandler): """File handler that flushes after each emit for real-time logging""" def emit(self, record): super().emit(record) self.flush() # Immediately flush to 
class ColoredFormatter(logging.Formatter):
    """Formatter that wraps the whole formatted record in a per-level ANSI color."""

    COLORS = {
        'DEBUG': '\033[1;36m',     # Bold cyan
        'INFO': '\033[1;32m',      # Bold green
        'WARNING': '\033[1;33m',   # Bold yellow
        'ERROR': '\033[1;31m',     # Bold red
        'CRITICAL': '\033[1;35m',  # Bold magenta
        'RESET': '\033[0m',
    }

    def format(self, record: logging.LogRecord) -> str:
        """Format *record* and surround it with its level color and a reset."""
        formatted = super().format(record)
        # Unknown level names get the reset code, i.e. no color.
        level_color = self.COLORS.get(record.levelname, self.COLORS["RESET"])
        return f"{level_color}{formatted}{self.COLORS['RESET']}"


class Logger:
    """
    Thread-safe logger facade that:
    1. Configures handlers only once (lazy initialization).
    2. Ensures all subsequent loggers obtained via ``Logger.get_logger()``
       inherit the configured handlers.
    3. Dynamically adapts log levels according to ``ANYTOOL_DEBUG``.
    """

    _ROOT_NAME = "anytool"  # Package root name
    # Standard format: time with milliseconds | level | file:line number | message
    _LOG_FORMAT = (
        "%(asctime)s.%(msecs)03d [%(levelname)-8s] %(filename)s:%(lineno)d - %(message)s"
    )

    _lock = threading.Lock()
    _configured = False
    _registered: dict[str, logging.Logger] = {}
    # Logger that currently owns the handlers installed by configure(). It is
    # tracked separately because it may not be in ``_registered`` (for
    # example the root logger when ``attach_to_root=True``).
    _target: Optional[logging.Logger] = None

    @staticmethod
    def _get_default_log_file() -> str:
        """Generate default log file path with timestamp (to seconds)

        Log files are organized by the running script name:
        - logs/<script_name>/anytool_2025-10-24_15-30-00.log
        """
        # Get the name of the main script
        script_name = "anytool"  # Default name
        try:
            import __main__
            if hasattr(__main__, "__file__") and __main__.__file__:
                # Extract script name without extension
                script_path = os.path.basename(__main__.__file__)
                script_name = os.path.splitext(script_path)[0]
        except Exception:
            # If can't get script name, use default
            pass

        # Create log directory path: logs/<script_name>/
        log_dir = os.path.join(DEFAULT_LOG_DIR, script_name)

        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = DEFAULT_LOG_FILE_PATTERN.format(timestamp=timestamp)
        return os.path.abspath(os.path.join(log_dir, filename))

    @staticmethod
    def _detach_handlers(lg: logging.Logger) -> None:
        """Remove every handler from *lg* and close it to release its stream/file."""
        for h in lg.handlers[:]:
            lg.removeHandler(h)
            try:
                h.close()
            except Exception:
                # Closing is best effort; a broken handler must not prevent
                # the logger from being reconfigured.
                pass

    @classmethod
    def get_logger(cls, name: Optional[str] = None) -> logging.Logger:
        """Return a logger with *name* (defaults to ``anytool``).
        The first call triggers :meth:`configure` automatically."""
        if name is None:
            name = cls._ROOT_NAME

        # Decide under the lock, but call configure() after releasing it:
        # configure() takes the same non-reentrant lock.
        need_config = False
        with cls._lock:
            logger = cls._registered.get(name)
            if logger is None:
                logger = logging.getLogger(name)
                logger.propagate = True
                cls._registered[name] = logger

            if not cls._configured:
                need_config = True

        if need_config:
            cls.configure()

        return logger

    @classmethod
    def configure(
        cls,
        *,
        level: Optional[int] = None,
        fmt: Optional[str] = None,
        log_to_console: bool = True,
        log_to_file: Optional[str] = "auto",
        use_colors: bool = True,
        force_color: bool = False,
        force: bool = False,
        attach_to_root: bool = False,
    ) -> None:
        """
        Configure the logging system. Usually called automatically on first use;
        pass ``force=True`` to reconfigure explicitly.

        Args:
            level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            fmt: Log format string
            log_to_console: Whether to output to console
            log_to_file: Log file path ("auto" auto-generate by date, None disable, or specify path)
            use_colors: Whether to use colors on console
            force_color: Force use of colors (even if not supported)
            force: Whether to force reconfiguration
            attach_to_root: Whether to attach to root logger

        If *attach_to_root* is ``True``, handlers are attached to the *root*
        logger (``""``). This makes every logger—regardless of its name—
        inherit the handlers (handy for standalone scripts) but will also
        surface logs from third-party libraries. Choose with care.

        Handlers previously attached to the target logger are removed and
        closed before the new ones are installed.
        """
        with cls._lock:
            if cls._configured and not force:
                # Already configured and no need to force reconfiguration, only update level.
                if level is not None:
                    cls._update_level(cls._resolve_level(level))
                return

            resolved_level = cls._resolve_level(level)
            fmt_str = fmt or cls._LOG_FORMAT

            # Handle log_to_file parameter
            actual_log_file = None
            if log_to_file == "auto":
                actual_log_file = cls._get_default_log_file()
            elif log_to_file is not None:
                actual_log_file = log_to_file

            # Select the logger to attach handlers to (root logger or anytool).
            target_logger = (
                logging.getLogger() if attach_to_root else logging.getLogger(cls._ROOT_NAME)
            )
            target_logger.setLevel(resolved_level)

            # Clean up old handlers (closing them releases open log files).
            # If a forced reconfiguration moved the target, the previous
            # target is cleaned as well so records are not emitted twice.
            if cls._target is not None and cls._target is not target_logger:
                cls._detach_handlers(cls._target)
            cls._detach_handlers(target_logger)

            # Construct Formatter
            date_fmt = "%Y-%m-%d %H:%M:%S"
            color_supported = force_color or (use_colors and cls._stdout_supports_color())
            console_formatter = (
                ColoredFormatter(fmt_str, datefmt=date_fmt)
                if color_supported
                else logging.Formatter(fmt_str, datefmt=date_fmt)
            )
            file_formatter = logging.Formatter(fmt_str, datefmt=date_fmt)

            # Console Handler
            if log_to_console:
                ch = logging.StreamHandler(sys.stdout)
                ch.setLevel(resolved_level)
                ch.setFormatter(console_formatter)
                target_logger.addHandler(ch)

            # File Handler (with real-time flush)
            if actual_log_file:
                dir_path = os.path.dirname(actual_log_file)
                if dir_path:
                    os.makedirs(dir_path, exist_ok=True)

                fh = FlushFileHandler(actual_log_file, encoding="utf-8")
                fh.setLevel(resolved_level)
                fh.setFormatter(file_formatter)
                target_logger.addHandler(fh)

                # Record log file location
                if not cls._configured:
                    print(f"Log file enabled: {actual_log_file}")

            cls._target = target_logger
            cls._configured = True

    @classmethod
    def set_debug(cls, debug_level: int = 2) -> None:
        """Dynamically switch debug level: 0 = WARNING, 1 = INFO, 2 = DEBUG."""
        global ANYTOOL_DEBUG
        ANYTOOL_DEBUG = max(0, min(debug_level, 2))  # clamp into 0..2
        cls._update_level(cls._resolve_level(None))

    @classmethod
    def add_file_handler(
        cls, filepath: str, logger_name: Optional[str] = None
    ) -> None:
        """
        Append a file handler to the given (default ``anytool``) logger.

        Args:
            filepath: Log file path
            logger_name: Log logger name
        """
        logger = cls.get_logger(logger_name or cls._ROOT_NAME)
        dir_path = os.path.dirname(filepath)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        fh = FlushFileHandler(filepath, encoding="utf-8")
        fh.setLevel(logger.level)
        fh.setFormatter(logging.Formatter(cls._LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S"))
        logger.addHandler(fh)

    @classmethod
    def reset_configuration(cls) -> None:
        """Remove (and close) all handlers and clear registered loggers."""
        with cls._lock:
            for lg in cls._registered.values():
                cls._detach_handlers(lg)
            if cls._target is not None:
                cls._detach_handlers(cls._target)
            cls._registered.clear()
            cls._target = None
            cls._configured = False

    @staticmethod
    def _stdout_supports_color() -> bool:
        """Return True if stdout is a TTY and ``NO_COLOR`` is not set."""
        return sys.stdout.isatty() and not os.getenv("NO_COLOR")

    @classmethod
    def _resolve_level(cls, level: Optional[int]) -> int:
        """Map *level* (or, when None, ``ANYTOOL_DEBUG``) to a logging level."""
        if level is not None:
            # Allow passing logging.INFO / "INFO" / 20 etc.
            return getattr(logging, str(level).upper(), level)
        return {2: logging.DEBUG, 1: logging.INFO}.get(ANYTOOL_DEBUG, logging.WARNING)

    @classmethod
    def _update_level(cls, level: int) -> None:
        """Apply *level* to every registered logger, the handler-owning
        logger, and all of their handlers."""
        loggers = list(cls._registered.values())
        # The logger holding the handlers may not be registered (for example
        # the root logger); without it the handlers would keep filtering at
        # the old level.
        if cls._target is not None and all(lg is not cls._target for lg in loggers):
            loggers.append(cls._target)
        for lg in loggers:
            lg.setLevel(level)
            for h in lg.handlers:
                h.setLevel(level)


# Adjust debug level automatically according to the
# ``ANYTOOL_DEBUG`` (preferred) or legacy ``DEBUG`` environment variable.
_env_debug = os.getenv("ANYTOOL_DEBUG") or os.getenv("DEBUG") if _env_debug is not None: try: Logger.set_debug(int(_env_debug)) except ValueError: # When not a number, use common format: DEBUG=1/true Logger.set_debug(2 if _env_debug.strip().lower() in {"1", "true", "yes"} else 0) # Initialize logger system, attach to root so all loggers inherit the configuration # This ensures any logger obtained via Logger.get_logger() will work correctly Logger.configure(attach_to_root=True) # Get anytool logger for internal logging logger = Logger.get_logger() logger.debug("AnyTool logging initialized") ================================================ FILE: anytool/utils/telemetry/__init__.py ================================================ ================================================ FILE: anytool/utils/telemetry/events.py ================================================ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any class BaseTelemetryEvent(ABC): """Base class for all telemetry events""" @property @abstractmethod def name(self) -> str: """Event name for tracking""" pass @property @abstractmethod def properties(self) -> dict[str, Any]: """Event properties to send with the event""" pass @dataclass class MCPAgentExecutionEvent(BaseTelemetryEvent): """Comprehensive event for tracking complete MCP agent execution""" # Execution method and context execution_method: str # "run" or "astream" query: str # The actual user query success: bool # Agent configuration model_provider: str model_name: str server_count: int server_identifiers: list[dict[str, str]] total_tools_available: int tools_available_names: list[str] max_steps_configured: int memory_enabled: bool use_server_manager: bool # Execution PARAMETERS max_steps_used: int | None manage_connector: bool external_history_used: bool # Execution results steps_taken: int | None = None tools_used_count: int | None = None tools_used_names: list[str] | None = None response: str | None = 
None # The actual response execution_time_ms: int | None = None error_type: str | None = None # Context conversation_history_length: int | None = None @property def name(self) -> str: return "mcp_agent_execution" @property def properties(self) -> dict[str, Any]: return { # Core execution info "execution_method": self.execution_method, "query": self.query, "query_length": len(self.query), "success": self.success, # Agent configuration "model_provider": self.model_provider, "model_name": self.model_name, "server_count": self.server_count, "server_identifiers": self.server_identifiers, "total_tools_available": self.total_tools_available, "tools_available_names": self.tools_available_names, "max_steps_configured": self.max_steps_configured, "memory_enabled": self.memory_enabled, "use_server_manager": self.use_server_manager, # Execution parameters (always include, even if None) "max_steps_used": self.max_steps_used, "manage_connector": self.manage_connector, "external_history_used": self.external_history_used, # Execution results (always include, even if None) "steps_taken": self.steps_taken, "tools_used_count": self.tools_used_count, "tools_used_names": self.tools_used_names, "response": self.response, "response_length": len(self.response) if self.response else None, "execution_time_ms": self.execution_time_ms, "error_type": self.error_type, "conversation_history_length": self.conversation_history_length, } ================================================ FILE: anytool/utils/telemetry/telemetry.py ================================================ import logging import os import platform import uuid from collections.abc import Callable from functools import wraps from pathlib import Path from typing import Any from posthog import Posthog from scarf import ScarfEventLogger from mcp_use.logging import MCP_USE_DEBUG from mcp_use.telemetry.events import ( BaseTelemetryEvent, MCPAgentExecutionEvent, ) from mcp_use.telemetry.utils import get_package_version logger = 
@singleton
class Telemetry:
    """
    Service for capturing anonymized telemetry data via PostHog and Scarf.
    If the environment variable `MCP_USE_ANONYMIZED_TELEMETRY=false`, telemetry will be disabled.
    """

    USER_ID_PATH = str(get_cache_home() / "mcp_use_3" / "telemetry_user_id")
    VERSION_DOWNLOAD_PATH = str(get_cache_home() / "mcp_use" / "download_version")
    PROJECT_API_KEY = "phc_lyTtbYwvkdSbrcMQNPiKiiRWrrM1seyKIMjycSvItEI"
    HOST = "https://eu.i.posthog.com"
    SCARF_GATEWAY_URL = "https://mcpuse.gateway.scarf.sh/events"
    UNKNOWN_USER_ID = "UNKNOWN_USER_ID"

    _curr_user_id = None

    def __init__(self):
        """Create the PostHog and Scarf clients unless telemetry is disabled.

        Each client is initialized independently; a failure in one is logged
        as a warning and leaves that client set to ``None``.
        """
        telemetry_disabled = os.getenv("MCP_USE_ANONYMIZED_TELEMETRY", "true").lower() == "false"

        if telemetry_disabled:
            self._posthog_client = None
            self._scarf_client = None
            logger.debug("Telemetry disabled")
        else:
            logger.info("Anonymized telemetry enabled. Set MCP_USE_ANONYMIZED_TELEMETRY=false to disable.")

            # Initialize PostHog
            try:
                self._posthog_client = Posthog(
                    project_api_key=self.PROJECT_API_KEY,
                    host=self.HOST,
                    disable_geoip=False,
                    enable_exception_autocapture=True,
                )

                # Silence posthog's logging unless debug mode (level 2)
                if MCP_USE_DEBUG < 2:
                    posthog_logger = logging.getLogger("posthog")
                    posthog_logger.disabled = True

            except Exception as e:
                logger.warning(f"Failed to initialize PostHog telemetry: {e}")
                self._posthog_client = None

            # Initialize Scarf
            try:
                self._scarf_client = ScarfEventLogger(
                    endpoint_url=self.SCARF_GATEWAY_URL,
                    timeout=3.0,
                    verbose=MCP_USE_DEBUG >= 2,
                )

                # Silence scarf's logging unless debug mode (level 2)
                if MCP_USE_DEBUG < 2:
                    scarf_logger = logging.getLogger("scarf")
                    scarf_logger.disabled = True

            except Exception as e:
                logger.warning(f"Failed to initialize Scarf telemetry: {e}")
                self._scarf_client = None

    @staticmethod
    def _is_newer_version(current: str, saved: str) -> bool:
        """Return True when ``current`` is a newer version than ``saved``.

        Purely numeric dotted versions are compared component-wise as integers,
        so "0.10.0" is newer than "0.9.0". If either value is not purely
        numeric (for example "unknown" or a pre-release tag), the comparison
        falls back to plain string ordering.
        """

        def parse(version: str) -> tuple[int, ...] | None:
            try:
                return tuple(int(part) for part in version.strip().split("."))
            except ValueError:
                return None

        current_key = parse(current)
        saved_key = parse(saved)
        if current_key is not None and saved_key is not None:
            return current_key > saved_key
        return current > saved

    @property
    def user_id(self) -> str:
        """Get or create a persistent anonymous user ID.

        The ID is cached on the instance after the first successful lookup.
        A missing or empty ID file is (re)created with a fresh UUID. Any
        failure falls back to ``UNKNOWN_USER_ID``.
        """
        if self._curr_user_id:
            return self._curr_user_id

        try:
            stored_user_id = ""
            if os.path.exists(self.USER_ID_PATH):
                with open(self.USER_ID_PATH) as f:
                    stored_user_id = f.read().strip()

            if stored_user_id:
                self._curr_user_id = stored_user_id
            else:
                # First run, or an empty ID file: never hand out "" as an ID.
                logger.debug(f"Creating user ID path: {self.USER_ID_PATH}")
                os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
                new_user_id = str(uuid.uuid4())
                with open(self.USER_ID_PATH, "w") as f:
                    f.write(new_user_id)
                self._curr_user_id = new_user_id
                logger.debug(f"User ID path created: {self.USER_ID_PATH}")

            # Always check for version-based download tracking
            self.track_package_download(
                {
                    "triggered_by": "user_id_property",
                }
            )
        except Exception as e:
            logger.debug(f"Failed to get/create user ID: {e}")
            self._curr_user_id = self.UNKNOWN_USER_ID

        return self._curr_user_id

    @requires_telemetry
    def capture(self, event: BaseTelemetryEvent) -> None:
        """Capture a telemetry event.

        PostHog receives the full event properties plus the package version.
        Scarf receives only the package version, user ID and event name.
        Failures are logged at debug level and never raised.
        """
        # Send to PostHog
        if self._posthog_client:
            try:
                # Add package version to all events
                properties = event.properties.copy()
                properties["mcp_use_version"] = get_package_version()

                self._posthog_client.capture(distinct_id=self.user_id, event=event.name, properties=properties)
            except Exception as e:
                logger.debug(f"Failed to track PostHog event {event.name}: {e}")

        # Send to Scarf
        if self._scarf_client:
            try:
                # Scarf only gets flat identifying fields, not event.properties
                properties = {
                    "mcp_use_version": get_package_version(),
                    "user_id": self.user_id,
                    "event": event.name,
                }

                self._scarf_client.log_event(properties=properties)
            except Exception as e:
                logger.debug(f"Failed to track Scarf event {event.name}: {e}")

    @requires_telemetry
    def track_package_download(self, properties: dict[str, Any] | None = None) -> None:
        """Track package download event specifically for Scarf analytics.

        An event is sent when no version file exists yet (first download) or
        when the installed version is newer than the saved one; in both cases
        the version file is updated.

        Args:
            properties: Extra properties merged into the event; not mutated.
        """
        if self._scarf_client:
            try:
                current_version = get_package_version()
                should_track = False
                first_download = False

                # Check if version file exists
                if not os.path.exists(self.VERSION_DOWNLOAD_PATH):
                    # First download
                    should_track = True
                    first_download = True

                    # Create directory and save version
                    os.makedirs(os.path.dirname(self.VERSION_DOWNLOAD_PATH), exist_ok=True)
                    with open(self.VERSION_DOWNLOAD_PATH, "w") as f:
                        f.write(current_version)
                else:
                    # Read saved version
                    with open(self.VERSION_DOWNLOAD_PATH) as f:
                        saved_version = f.read().strip()

                    # Numeric comparison, so "0.10.0" counts as newer than "0.9.0"
                    if self._is_newer_version(current_version, saved_version):
                        should_track = True
                        first_download = False

                        # Update saved version
                        with open(self.VERSION_DOWNLOAD_PATH, "w") as f:
                            f.write(current_version)

                if should_track:
                    logger.debug(f"Tracking package download event with properties: {properties}")
                    # Add package version and user_id to event
                    event_properties = (properties or {}).copy()
                    event_properties["mcp_use_version"] = current_version
                    event_properties["user_id"] = self.user_id
                    event_properties["event"] = "package_download"
                    event_properties["first_download"] = first_download

                    self._scarf_client.log_event(properties=event_properties)
            except Exception as e:
                logger.debug(f"Failed to track Scarf package_download event: {e}")

    @requires_telemetry
    def track_agent_execution(
        self,
        execution_method: str,
        query: str,
        success: bool,
        model_provider: str,
        model_name: str,
        server_count: int,
        server_identifiers: list[dict[str, str]],
        total_tools_available: int,
        tools_available_names: list[str],
        max_steps_configured: int,
        memory_enabled: bool,
        use_server_manager: bool,
        max_steps_used: int | None,
        manage_connector: bool,
        external_history_used: bool,
        steps_taken: int | None = None,
        tools_used_count: int | None = None,
        tools_used_names: list[str] | None = None,
        response: str | None = None,
        execution_time_ms: int | None = None,
        error_type: str | None = None,
        conversation_history_length: int | None = None,
    ) -> None:
        """Track comprehensive agent execution.

        Builds an ``MCPAgentExecutionEvent`` from the arguments and passes it
        to ``capture``.
        """
        event = MCPAgentExecutionEvent(
            execution_method=execution_method,
            query=query,
            success=success,
            model_provider=model_provider,
            model_name=model_name,
            server_count=server_count,
            server_identifiers=server_identifiers,
            total_tools_available=total_tools_available,
            tools_available_names=tools_available_names,
            max_steps_configured=max_steps_configured,
            memory_enabled=memory_enabled,
            use_server_manager=use_server_manager,
            max_steps_used=max_steps_used,
            manage_connector=manage_connector,
            external_history_used=external_history_used,
            steps_taken=steps_taken,
            tools_used_count=tools_used_count,
            tools_used_names=tools_used_names,
            response=response,
            execution_time_ms=execution_time_ms,
            error_type=error_type,
            conversation_history_length=conversation_history_length,
        )
        self.capture(event)

    @requires_telemetry
    def flush(self) -> None:
        """Flush any queued telemetry events"""
        # Flush PostHog
        if self._posthog_client:
            try:
                self._posthog_client.flush()
                logger.debug("PostHog client telemetry queue flushed")
            except Exception as e:
                logger.debug(f"Failed to flush PostHog client: {e}")

        # Scarf events are sent immediately, no flush needed
        if self._scarf_client:
            logger.debug("Scarf telemetry events sent immediately (no flush needed)")

    @requires_telemetry
    def shutdown(self) -> None:
        """Shutdown telemetry clients and flush remaining events"""
        # Shutdown PostHog
        if self._posthog_client:
            try:
                self._posthog_client.shutdown()
                logger.debug("PostHog client shutdown successfully")
            except Exception as e:
                logger.debug(f"Error shutting down PostHog client: {e}")

        # Scarf doesn't require explicit shutdown
        if self._scarf_client:
            logger.debug("Scarf telemetry client shutdown (no action needed)")
class AnyToolUI:
    """
    AnyTool Terminal UI

    Provides real-time visualization of:
    - Agent activities and status
    - Grounding backend operations
    - Execution logs
    - System metrics

    Design Philosophy:
    - Lightweight and fast (no heavy dependencies)
    - Clean ANSI-based rendering
    - Minimal CPU overhead
    - Easy to customize
    """

    def __init__(self, enable_live: bool = True, compact: bool = False):
        """
        Initialize UI

        Args:
            enable_live: Whether to enable live display updates
            compact: Use compact layout (for smaller terminals)
        """
        self.enable_live = enable_live
        self.compact = compact

        # Terminal dimensions
        self.term_width, self.term_height = self._get_terminal_size()

        # State tracking
        self.agent_status: Dict[str, AgentStatus] = {}
        self.agent_activities: Dict[str, List[str]] = {}
        self.grounding_operations: List[Dict[str, Any]] = []
        self.grounding_backends: List[Dict[str, Any]] = []  # Backend info (type, servers, etc.)
        self.log_buffer: List[Tuple[str, str, datetime]] = []  # (message, level, timestamp)

        # Metrics
        self.metrics: Dict[str, Any] = {
            "start_time": None,
            "iterations": 0,
            "completed_tasks": 0,
            "llm_calls": 0,
            "grounding_calls": 0,
        }

        # Live display state
        self._live_running = False
        self._live_task: Optional[asyncio.Task] = None
        self._last_render: List[str] = []

    def _get_terminal_size(self) -> Tuple[int, int]:
        """Get terminal size as (columns, lines), falling back to 80x24."""
        try:
            size = shutil.get_terminal_size((80, 24))
            return size.columns, size.lines
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            return 80, 24

    def _clear_screen(self):
        """Clear screen (no-op when live display is disabled)"""
        if self.enable_live:
            # Clear entire screen and move cursor to top-left
            sys.stdout.write('\033[2J\033[H')
            sys.stdout.flush()

    def _move_cursor_home(self):
        """Move cursor to home position"""
        sys.stdout.write('\033[H')
        sys.stdout.flush()

    def _hide_cursor(self):
        """Hide cursor"""
        sys.stdout.write('\033[?25l')
        sys.stdout.flush()

    def _show_cursor(self):
        """Show cursor"""
        sys.stdout.write('\033[?25h')
        sys.stdout.flush()

    # Banner and Startup
    def print_banner(self):
        """Print startup banner"""
        box = Box(width=70, style=BoxStyle.ROUNDED, color='c')

        print()
        print(box.top_line(indent=4))
        print(box.empty_line(indent=4))

        # Title
        title = colorize("AnyTool", 'c', bold=True)
        print(box.text_line(title, align='center', indent=4, text_color=''))

        # Subtitle
        subtitle = "Universal Tool-Use Layer for AI Agents"
        print(box.text_line(subtitle, align='center', indent=4, text_color='gr'))

        print(box.empty_line(indent=4))
        print(box.bottom_line(indent=4))
        print()

    def print_initialization(self, steps: List[Tuple[str, str]]):
        """
        Print initialization steps

        Args:
            steps: List of (component_name, status) tuples; a status of
                "ok" is shown with a check mark, anything else with a cross
        """
        box = Box(width=70, style=BoxStyle.ROUNDED, color='bl')
        print(box.text_line("Initializing Components", align='center', indent=4, text_color='c'))
        print(box.separator_line(indent=4))

        for component, status in steps:
            icon = colorize("✓", 'g') if status == "ok" else colorize("✗", 'rd')
            line = f"{icon} {component}"
            print(box.text_line(line, indent=4))

        print(box.bottom_line(indent=4))
        print()

    async def start_live_display(self):
        """Start live display (no-op if disabled or already running)"""
        if not self.enable_live or self._live_running:
            return

        self._live_running = True
        self.metrics["start_time"] = datetime.now()
        self._clear_screen()
        self._hide_cursor()

        # Start update loop
        self._live_task = asyncio.create_task(self._live_update_loop())

    async def stop_live_display(self):
        """Stop live display and restore the cursor"""
        if not self._live_running:
            return

        self._live_running = False

        if self._live_task:
            self._live_task.cancel()
            try:
                await self._live_task
            except asyncio.CancelledError:
                pass

        self._show_cursor()
        print()  # Add newline after live display

    async def _live_update_loop(self):
        """Live update loop: re-render every 2 seconds until stopped or cancelled."""
        try:
            while self._live_running:
                try:
                    self.render()
                except Exception as e:
                    print(f"UI render error: {e}")
                # Sleep outside the render try-block so a persistent render
                # error is retried at the normal cadence, not in a tight loop.
                await asyncio.sleep(2.0)
        except asyncio.CancelledError:
            pass

    def render(self):
        """Render entire UI (no-op unless the live display is running)"""
        if not self.enable_live or not self._live_running:
            return

        # Clear and redraw
        self._clear_screen()

        lines = []

        # Header
        lines.extend(self._render_header())
        lines.append("")

        # Stack all panels vertically
        lines.extend(self._render_agents())
        lines.append("")
        lines.extend(self._render_grounding())
        lines.append("")
        lines.extend(self._render_logs())

        output = "\n".join(lines)
        sys.stdout.write(output)
        sys.stdout.flush()

    def update_display(self):
        """Update display (alias for render())"""
        self.render()

    def _render_header(self) -> List[str]:
        """Render header section (status line with elapsed time and counters)"""
        lines = []

        # Calculate elapsed time
        elapsed = "0s"
        if self.metrics["start_time"]:
            delta = datetime.now() - self.metrics["start_time"]
            # total_seconds() keeps counting past 24h; .seconds would wrap to 0.
            total_seconds = max(0, int(delta.total_seconds()))
            minutes, seconds = divmod(total_seconds, 60)
            if minutes > 0:
                elapsed = f"{minutes}m{seconds}s"
            else:
                elapsed = f"{seconds}s"

        status_text = (
            f"▶ {colorize('RUNNING', 'g')} | "
            f"Time: {colorize(elapsed, 'c')} | "
            f"Iter: {colorize(str(self.metrics['iterations']), 'y')} | "
            f"Tasks: {colorize(str(self.metrics['completed_tasks']), 'g')} | "
            f"LLM: {colorize(str(self.metrics['llm_calls']), 'bl')} | "
            f"Grounding: {colorize(str(self.metrics['grounding_calls']), 'm')}"
        )

        lines.append(" " + status_text)
        lines.append(" " + "─" * 60)

        return lines

    def _render_agents(self) -> List[str]:
        """Render agents section"""
        lines = []
        lines.append(" " + colorize("§ Agents", 'c', bold=True))

        # Status icon per agent state
        status_icons = {
            AgentStatus.IDLE: "○",
            AgentStatus.THINKING: "◐",
            AgentStatus.EXECUTING: "◉",
            AgentStatus.WAITING: "◷",
        }

        # Agent info
        agents = [
            ("GroundingAgent", 'c', self.agent_status.get("GroundingAgent", AgentStatus.IDLE)),
        ]

        for agent_name, color, status in agents:
            icon = status_icons.get(status, "○")

            # Most recent activity, truncated to 40 characters
            activities = self.agent_activities.get(agent_name, [])
            activity = activities[-1][:40] if activities else "idle"

            # Format line
            line = f" {colorize(icon, 'y')} {colorize(agent_name, color):<20s} {activity}"
            lines.append(line)

        return lines

    def _render_grounding(self) -> List[str]:
        """Render grounding operations section"""
        lines = []
        lines.append(" " + colorize("⊕ Grounding Backends", 'c', bold=True))

        # Backend type icon
        type_icons = {
            "gui": "■",
            "shell": "$",
            "mcp": "◆",
            "system": "●",
            "web": "◉",
        }

        # Show backend types and servers
        for backend_info in self.grounding_backends:
            backend_name = backend_info.get("name", "unknown")
            backend_type = backend_info.get("type", "unknown")
            servers = backend_info.get("servers", [])

            icon = type_icons.get(backend_type, "○")

            # Format backend line; MCP backends also list their servers
            if backend_type == "mcp" and servers:
                servers_str = ", ".join(s[:15] for s in servers)
                line = f" {icon} {colorize(backend_name, 'y')} ({backend_type}): {colorize(servers_str, 'gr')}"
            else:
                line = f" {icon} {colorize(backend_name, 'y')} ({backend_type})"
            lines.append(line)

        # Show last 3 operations
        recent_ops = self.grounding_operations[-3:]
        if recent_ops:
            lines.append(" " + colorize("Recent Operations:", 'gr'))
            for op in recent_ops:
                backend = op.get("backend", "unknown")
                action = op.get("action", "unknown")[:40]
                status = op.get("status", "pending")

                # Status icon
                if status == "success":
                    icon = colorize("✓", 'g')
                elif status == "pending":
                    icon = colorize("⏳", 'y')
                else:
                    icon = colorize("✗", 'rd')

                line = f" {icon} {colorize(backend, 'bl')}: {action}"
                lines.append(line)

        return lines

    def _render_logs(self) -> List[str]:
        """Render logs section"""
        lines = []
        lines.append(" " + colorize("⊞ Recent Events", 'c', bold=True))

        # Show last 5 logs
        for message, _level, timestamp in self.log_buffer[-5:]:
            time_str = timestamp.strftime("%H:%M:%S")

            # Truncate long messages
            msg_display = message[:55]
            log_line = f" {colorize(time_str, 'gr')} | {msg_display}"
            lines.append(log_line)

        return lines

    def update_agent_status(self, agent_name: str, status: AgentStatus):
        """Update agent status"""
        self.agent_status[agent_name] = status

    def add_agent_activity(self, agent_name: str, activity: str):
        """Add agent activity"""
        if agent_name not in self.agent_activities:
            self.agent_activities[agent_name] = []

        self.agent_activities[agent_name].append(activity)

        # Keep only last 10 activities
        if len(self.agent_activities[agent_name]) > 10:
            self.agent_activities[agent_name] = self.agent_activities[agent_name][-10:]

    def update_grounding_backends(self, backends: List[Dict[str, Any]]):
        """
        Update grounding backends information

        Args:
            backends: List of backend info dicts with keys:
                - name: backend name
                - type: backend type (gui, shell, mcp, system, web)
                - servers: list of server names (for mcp)
        """
        self.grounding_backends = backends

    def add_grounding_operation(self, backend: str, action: str, status: str = "pending"):
        """Add grounding operation and increment the grounding call counter"""
        self.grounding_operations.append({
            "backend": backend,
            "action": action,
            "status": status,
            "timestamp": datetime.now(),
        })

        self.metrics["grounding_calls"] += 1

    def add_log(self, message: str, level: str = "info"):
        """Add log message"""
        self.log_buffer.append((message, level, datetime.now()))

        # Keep only last 100 logs
        if len(self.log_buffer) > 100:
            self.log_buffer = self.log_buffer[-100:]

    def update_metrics(self, **kwargs):
        """Update metrics"""
        self.metrics.update(kwargs)

    def print_summary(self, result: Dict[str, Any]):
        """
        Print execution summary

        Args:
            result: Result dict; reads the keys "status", "execution_time",
                "iterations", "completed_tasks" and "evaluation_results"
        """
        box = Box(width=70, style=BoxStyle.ROUNDED, color='c')

        print()
        print(box.text_line(colorize("◈ Execution Summary", 'c', bold=True), align='center', indent=4, text_color=''))
        print(box.separator_line(indent=4))

        # Status
        status = result.get("status", "unknown")
        status_display = {
            "completed": colorize("COMPLETED", 'g', bold=True),
            "timeout": colorize("TIMEOUT", 'y', bold=True),
            "error": colorize("ERROR", 'rd', bold=True),
        }
        status_text = status_display.get(status, status)

        # Format the time separately: nesting an f-string that reuses the same
        # quote character is a SyntaxError before Python 3.12.
        execution_time = f"{result.get('execution_time', 0):.2f}s"

        print(box.text_line(f" Status: {status_text}", indent=4, text_color=''))
        print(box.text_line(f" Execution Time: {colorize(execution_time, 'c')}", indent=4, text_color=''))
        print(box.text_line(f" Iterations: {colorize(str(result.get('iterations', 0)), 'y')}", indent=4, text_color=''))
        print(box.text_line(f" Completed Tasks: {colorize(str(result.get('completed_tasks', 0)), 'g')}", indent=4, text_color=''))

        if result.get('evaluation_results'):
            print(box.text_line(f" Evaluations: {colorize(str(len(result['evaluation_results'])), 'bl')}", indent=4, text_color=''))

        print(box.bottom_line(indent=4))
        print()
""" import asyncio from typing import Optional from anytool.utils.ui import AnyToolUI, AgentStatus from anytool.utils.logging import Logger logger = Logger.get_logger(__name__) class UIIntegration: """ UI Integration for AnyTool Connects AnyTool components with the UI system to provide real-time visualization of agent activities and execution flow. """ def __init__(self, ui: AnyToolUI): """ Initialize UI integration Args: ui: AnyToolUI instance """ self.ui = ui self._update_task: Optional[asyncio.Task] = None self._running = False # Tracked components self._llm_client = None self._grounding_client = None def attach_llm_client(self, llm_client): """ Attach LLM client Args: llm_client: LLMClient instance """ self._llm_client = llm_client logger.debug("UI attached to LLMClient") def attach_grounding_client(self, grounding_client): """ Attach grounding client Args: grounding_client: GroundingClient instance """ self._grounding_client = grounding_client logger.debug("UI attached to GroundingClient") async def start_monitoring(self, poll_interval: float = 0.5): """ Start monitoring and updating UI Args: poll_interval: Update interval in seconds """ if self._running: logger.warning("UI monitoring already running") return self._running = True # Immediately update UI once before starting the loop await self._update_ui() self._update_task = asyncio.create_task( self._monitor_loop(poll_interval) ) logger.info("UI monitoring started") async def stop_monitoring(self): """Stop monitoring""" if not self._running: return self._running = False if self._update_task: self._update_task.cancel() try: await self._update_task except asyncio.CancelledError: pass logger.info("UI monitoring stopped") async def _monitor_loop(self, poll_interval: float): """ Main monitoring loop Args: poll_interval: Update interval in seconds """ while self._running: try: await self._update_ui() await asyncio.sleep(poll_interval) except asyncio.CancelledError: break except Exception as e: logger.error(f"UI 
update error: {e}", exc_info=True) async def _update_ui(self): """Update UI with current state""" # Update grounding backends info if self._grounding_client: backends = [] try: # Get list of providers providers = self._grounding_client.list_providers() for backend_type, provider in providers.items(): backend_name = backend_type.value if hasattr(backend_type, 'value') else str(backend_type) backend_info = { "name": backend_name, "type": backend_name, # gui, shell, mcp, system, web "servers": [] } # For MCP provider, get server names if backend_name == "mcp": try: # Try to get MCP sessions from provider if hasattr(provider, '_sessions'): backend_info["servers"] = list(provider._sessions.keys()) except Exception: pass backends.append(backend_info) self.ui.update_grounding_backends(backends) except Exception as e: logger.debug(f"Failed to update grounding backends: {e}") # Refresh display self.ui.update_display() # Event handlers - to be called by agents def on_agent_start(self, agent_name: str, activity: str): """ Called when agent starts an activity Args: agent_name: Agent name activity: Activity description """ self.ui.update_agent_status(agent_name, AgentStatus.EXECUTING) self.ui.add_agent_activity(agent_name, activity) self.ui.add_log(f"{agent_name}: {activity}", level="info") def on_agent_thinking(self, agent_name: str): """ Called when agent is thinking Args: agent_name: Agent name """ self.ui.update_agent_status(agent_name, AgentStatus.THINKING) def on_agent_complete(self, agent_name: str, result: str = ""): """ Called when agent completes an activity Args: agent_name: Agent name result: Result description """ self.ui.update_agent_status(agent_name, AgentStatus.IDLE) if result: self.ui.add_log(f"{agent_name}: {result}", level="success") def on_llm_call(self, model: str, prompt_length: int): """ Called when LLM is called Args: model: Model name prompt_length: Prompt length """ self.ui.update_metrics( llm_calls=self.ui.metrics.get("llm_calls", 0) + 1 ) 
self.ui.add_log(f"LLM call: {model} (prompt: {prompt_length} chars)", level="debug") def on_grounding_call(self, backend: str, action: str): """ Called when grounding backend is called Args: backend: Backend name action: Action description """ self.ui.add_grounding_operation(backend, action, status="pending") self.ui.add_log(f"Grounding [{backend}]: {action}", level="info") def on_grounding_complete(self, backend: str, action: str, success: bool): """ Called when grounding operation completes Args: backend: Backend name action: Action description success: Whether operation succeeded """ status = "success" if success else "error" # Update last operation status for op in reversed(self.ui.grounding_operations): if op["backend"] == backend and op["action"] == action and op["status"] == "pending": op["status"] = status break level = "success" if success else "error" result = "✓" if success else "✗" self.ui.add_log(f"Grounding [{backend}]: {action} {result}", level=level) def on_iteration(self, iteration: int): """ Called on each iteration Args: iteration: Iteration number """ self.ui.update_metrics(iterations=iteration) def on_error(self, message: str): """ Called when an error occurs Args: message: Error message """ self.ui.add_log(f"ERROR: {message}", level="error") class UILoggingHandler: """ Logging handler that forwards logs to UI """ def __init__(self, ui: AnyToolUI): """ Initialize logging handler Args: ui: AnyToolUI instance """ self.ui = ui def emit(self, record): """ Emit a log record to UI Args: record: Log record """ level_map = { "DEBUG": "debug", "INFO": "info", "WARNING": "warning", "ERROR": "error", "CRITICAL": "error", } level = level_map.get(record.levelname, "info") message = record.getMessage() # Filter out noisy logs if any(skip in message.lower() for skip in ["processing card", "workflow poll"]): return self.ui.add_log(message, level=level) def create_integration(ui: AnyToolUI) -> UIIntegration: """ Create UI integration instance Args: ui: 
AnyToolUI instance Returns: UIIntegration instance """ return UIIntegration(ui) ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=68.0", "wheel"] build-backend = "setuptools.build_meta" [project] name = "anytool" version = "0.1.0" description = "AnyTool: Universal Tool-Use Layer for AI Agents" readme = "README.md" requires-python = ">=3.10" license = {text = "MIT"} authors = [ {name = "lingruixu@HKUDS", email = "lingruixu.db@gmail.com"} ] dependencies = [ "litellm>=1.70.0", "python-dotenv>=1.0.0", "openai>=1.0.0", "jsonschema>=4.25.0", "mcp>=1.0.0", "anthropic>=0.71.0", "pillow>=12.0.0", "flask>=3.1.0", "pyautogui>=0.9.54", "pydantic>=2.12.0", "requests>=2.32.0", ] [project.optional-dependencies] macos = [ "pyobjc-core>=12.0", "pyobjc-framework-cocoa>=12.0", "pyobjc-framework-quartz>=12.0", "atomacos>=3.2.0", ] linux = [ "python-xlib>=0.33", "pyatspi>=2.38.0", "numpy>=1.24.0", ] windows = [ "pywinauto>=0.6.8", "pywin32>=306", "PyGetWindow>=0.0.9", ] dev = [ "pytest>=7.0.0", "pytest-asyncio>=0.21.0", "black>=23.0.0", "flake8>=6.0.0", "mypy>=1.0.0", ] all = [ "anytool[macos,linux,windows,dev]", ] [project.urls] Repository = "https://github.com/HKUDS/AnyTool" "Bug Tracker" = "https://github.com/HKUDS/AnyTool/issues" [project.scripts] anytool = "anytool.__main__:run_main" anytool-server = "anytool.local_server.main:main" [tool.setuptools] packages = {find = {where = ["."], include = ["anytool*"]}} [tool.setuptools.package-data] anytool = [ "config/*.json", "config/*.json.example", "local_server/config.json", "local_server/README.md", ] ================================================ FILE: requirements.txt ================================================ # AnyTool core dependencies litellm>=1.70.0 python-dotenv>=1.0.0 openai>=1.0.0 jsonschema>=4.25.0 mcp>=1.0.0 anthropic>=0.71.0 pillow>=12.0.0 colorama # Local server dependencies (cross-platform) flask>=3.1.0 
pyautogui>=0.9.54 pydantic>=2.12.0 requests>=2.32.0 # # macOS-specific dependencies (local server) # pyobjc-core>=12.0; sys_platform == 'darwin' # pyobjc-framework-cocoa>=12.0; sys_platform == 'darwin' # pyobjc-framework-quartz>=12.0; sys_platform == 'darwin' # atomacos>=3.2.0; sys_platform == 'darwin' # # Linux-specific dependencies (local server) # python-xlib>=0.33; sys_platform == 'linux' # pyatspi>=2.38.0; sys_platform == 'linux' # numpy>=1.24.0; sys_platform == 'linux' # # Windows-specific dependencies (local server) # pywinauto>=0.6.8; sys_platform == 'win32' # pywin32>=306; sys_platform == 'win32' # PyGetWindow>=0.0.9; sys_platform == 'win32'