Repository: OpenHands/software-agent-sdk Branch: main Commit: 26af67d95147 Files: 1197 Total size: 9.6 MB Directory structure: gitextract_0wirow34/ ├── .agents/ │ └── skills/ │ ├── cross-repo-testing/ │ │ └── SKILL.md │ ├── custom-codereview-guide.md │ ├── debug-test-examples-workflow/ │ │ └── SKILL.md │ ├── design-principles.md │ ├── feature-release-rollout/ │ │ └── SKILL.md │ ├── manage-evals/ │ │ ├── SKILL.md │ │ ├── references/ │ │ │ └── eval-infrastructure.md │ │ └── scripts/ │ │ └── manage_evals.py │ ├── run-eval.md │ ├── sdk-release/ │ │ ├── SKILL.md │ │ └── references/ │ │ └── post-release-checklist.md │ └── write-behavior-test.md ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_template.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── dependabot.yml │ ├── prompts/ │ │ └── update-documentation.md │ ├── run-eval/ │ │ ├── ADDINGMODEL.md │ │ ├── AGENTS.md │ │ ├── resolve_model_config.py │ │ └── validate_sdk_ref.py │ ├── scripts/ │ │ ├── check_agent_server_rest_api_breakage.py │ │ ├── check_deprecations.py │ │ ├── check_docstrings.py │ │ ├── check_documented_examples.py │ │ ├── check_duplicate_example_numbers.py │ │ ├── check_sdk_api_breakage.py │ │ ├── check_version_bumps.py │ │ └── update_sdk_ref_default.py │ └── workflows/ │ ├── README-RELEASE.md │ ├── agent-server-rest-api-breakage.yml │ ├── api-breakage.yml │ ├── api-compliance-runner.yml │ ├── assign-reviews.yml │ ├── auto-label-issues.yml │ ├── cancel-eval.yml │ ├── check-docstrings.yml │ ├── check-documented-examples.yml │ ├── check-duplicate-examples.yml │ ├── condenser-runner.yml │ ├── create-release.yml │ ├── deploy-docs.yml │ ├── deprecation-check.yml │ ├── integration-runner.yml │ ├── issue-duplicate-checker.yml │ ├── oh-update-documentation.yml.back │ ├── pr-artifacts.yml │ ├── pr-review-by-openhands.yml │ ├── pr-review-evaluation.yml │ ├── precommit.yml │ ├── prepare-release.yml │ ├── pypi-release.yml │ ├── qa-changes-by-openhands.yml │ ├── 
qa-changes-evaluation.yml │ ├── release-binaries.yml │ ├── remove-duplicate-candidate-label.yml │ ├── review-thread-gate.yml │ ├── run-eval.yml │ ├── run-examples.yml │ ├── server.yml │ ├── stale.yml │ ├── tests.yml │ ├── todo-management.yml │ ├── version-bump-guard.yml │ └── version-bump-prs.yml ├── .gitignore ├── .openhands/ │ ├── hooks/ │ │ └── on_stop.sh │ ├── hooks.json │ └── setup.sh ├── .pre-commit-config.yaml ├── .python-version ├── AGENTS.md ├── CONTRIBUTING.md ├── DEVELOPMENT.md ├── LICENSE ├── MAINTAINERS ├── MANIFEST.in ├── Makefile ├── README.md ├── examples/ │ ├── 01_standalone_sdk/ │ │ ├── 01_hello_world.py │ │ ├── 02_custom_tools.py │ │ ├── 03_activate_skill.py │ │ ├── 04_confirmation_mode_example.py │ │ ├── 05_use_llm_registry.py │ │ ├── 06_interactive_terminal_w_reasoning.py │ │ ├── 07_mcp_integration.py │ │ ├── 08_mcp_with_oauth.py │ │ ├── 09_pause_example.py │ │ ├── 10_persistence.py │ │ ├── 11_async.py │ │ ├── 12_custom_secrets.py │ │ ├── 13_get_llm_metrics.py │ │ ├── 14_context_condenser.py │ │ ├── 15_browser_use.py │ │ ├── 16_llm_security_analyzer.py │ │ ├── 17_image_input.py │ │ ├── 18_send_message_while_processing.py │ │ ├── 19_llm_routing.py │ │ ├── 20_stuck_detector.py │ │ ├── 21_generate_extraneous_conversation_costs.py │ │ ├── 22_anthropic_thinking.py │ │ ├── 23_responses_reasoning.py │ │ ├── 24_planning_agent_workflow.py │ │ ├── 25_agent_delegation.py │ │ ├── 26_custom_visualizer.py │ │ ├── 27_observability_laminar.py │ │ ├── 28_ask_agent_example.py │ │ ├── 29_llm_streaming.py │ │ ├── 30_tom_agent.py │ │ ├── 31_iterative_refinement.py │ │ ├── 32_configurable_security_policy.py │ │ ├── 33_hooks/ │ │ │ ├── README.md │ │ │ ├── hook_scripts/ │ │ │ │ ├── block_dangerous.sh │ │ │ │ ├── inject_git_context.sh │ │ │ │ ├── log_tools.sh │ │ │ │ └── require_summary.sh │ │ │ └── main.py │ │ ├── 34_critic_example.py │ │ ├── 35_subscription_login.py │ │ ├── 36_event_json_to_openai_messages.py │ │ ├── 37_llm_profile_store/ │ │ │ ├── main.py │ │ │ └── 
profiles/ │ │ │ └── fast.json │ │ ├── 38_browser_session_recording.py │ │ ├── 39_llm_fallback.py │ │ ├── 40_acp_agent_example.py │ │ ├── 41_task_tool_set.py │ │ ├── 42_file_based_subagents.py │ │ ├── 43_mixed_marketplace_skills/ │ │ │ ├── .plugin/ │ │ │ │ └── marketplace.json │ │ │ ├── README.md │ │ │ ├── main.py │ │ │ └── skills/ │ │ │ └── greeting-helper/ │ │ │ └── SKILL.md │ │ ├── 44_model_switching_in_convo.py │ │ ├── 45_parallel_tool_execution.py │ │ ├── 46_agent_settings.py │ │ ├── 47_defense_in_depth_security.py │ │ ├── 48_conversation_fork.py │ │ └── 49_switch_llm_tool.py │ ├── 02_remote_agent_server/ │ │ ├── 01_convo_with_local_agent_server.py │ │ ├── 02_convo_with_docker_sandboxed_server.py │ │ ├── 03_browser_use_with_docker_sandboxed_server.py │ │ ├── 04_convo_with_api_sandboxed_server.py │ │ ├── 05_vscode_with_docker_sandboxed_server.py │ │ ├── 06_custom_tool/ │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── build_custom_image.sh │ │ │ ├── custom_tools/ │ │ │ │ ├── __init__.py │ │ │ │ └── log_data.py │ │ │ └── main.py │ │ ├── 07_convo_with_cloud_workspace.py │ │ ├── 08_convo_with_apptainer_sandboxed_server.py │ │ ├── 09_acp_agent_with_remote_runtime.py │ │ ├── 10_cloud_workspace_share_credentials.py │ │ ├── 11_conversation_fork.py │ │ ├── 12_settings_and_secrets_api.py │ │ ├── 13_workspace_get_llm.py │ │ └── hook_scripts/ │ │ └── pycompile_check.sh │ ├── 03_github_workflows/ │ │ ├── 01_basic_action/ │ │ │ ├── README.md │ │ │ ├── agent_script.py │ │ │ ├── assign-reviews.yml │ │ │ └── workflow.yml │ │ ├── 02_pr_review/ │ │ │ ├── README.md │ │ │ └── workflow.yml │ │ ├── 03_todo_management/ │ │ │ ├── README.md │ │ │ ├── agent_script.py │ │ │ ├── prompt.py │ │ │ ├── scanner.py │ │ │ └── workflow.yml │ │ ├── 04_datadog_debugging/ │ │ │ ├── README.md │ │ │ ├── datadog_debugging.py │ │ │ ├── debug_prompt.jinja │ │ │ └── workflow.yml │ │ └── 05_posthog_debugging/ │ │ ├── README.md │ │ ├── debug_prompt.jinja │ │ ├── posthog_debugging.py │ │ └── workflow.yml │ 
├── 04_llm_specific_tools/ │ │ ├── 01_gpt5_apply_patch_preset.py │ │ └── 02_gemini_file_tools.py │ └── 05_skills_and_plugins/ │ ├── 01_loading_agentskills/ │ │ ├── example_skills/ │ │ │ ├── code-style-guide/ │ │ │ │ └── SKILL.md │ │ │ └── rot13-encryption/ │ │ │ ├── SKILL.md │ │ │ ├── references/ │ │ │ │ └── examples.md │ │ │ └── scripts/ │ │ │ └── encrypt.sh │ │ └── main.py │ ├── 02_loading_plugins/ │ │ ├── example_plugins/ │ │ │ └── code-quality/ │ │ │ ├── .mcp.json │ │ │ ├── .plugin/ │ │ │ │ └── plugin.json │ │ │ ├── hooks/ │ │ │ │ └── hooks.json │ │ │ └── skills/ │ │ │ └── linting/ │ │ │ └── SKILL.md │ │ └── main.py │ └── 03_managing_installed_skills/ │ └── main.py ├── openhands-agent-server/ │ ├── AGENTS.md │ ├── openhands/ │ │ └── agent_server/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── _secrets_exposure.py │ │ ├── agent-server.spec │ │ ├── api.py │ │ ├── auth_router.py │ │ ├── bash_router.py │ │ ├── bash_service.py │ │ ├── cloud_proxy_router.py │ │ ├── config.py │ │ ├── conversation_lease.py │ │ ├── conversation_router.py │ │ ├── conversation_router_acp.py │ │ ├── conversation_service.py │ │ ├── dependencies.py │ │ ├── desktop_router.py │ │ ├── desktop_service.py │ │ ├── docker/ │ │ │ ├── Dockerfile │ │ │ └── build.py │ │ ├── env_parser.py │ │ ├── event_router.py │ │ ├── event_service.py │ │ ├── file_router.py │ │ ├── git_router.py │ │ ├── hooks_router.py │ │ ├── hooks_service.py │ │ ├── llm_router.py │ │ ├── logging_config.py │ │ ├── middleware.py │ │ ├── models.py │ │ ├── openapi.py │ │ ├── persistence/ │ │ │ ├── __init__.py │ │ │ ├── models.py │ │ │ └── store.py │ │ ├── profiles_router.py │ │ ├── pub_sub.py │ │ ├── py.typed │ │ ├── server_details_router.py │ │ ├── settings_router.py │ │ ├── skills_router.py │ │ ├── skills_service.py │ │ ├── sockets.py │ │ ├── tool_preload_service.py │ │ ├── tool_router.py │ │ ├── utils.py │ │ ├── vscode_extensions/ │ │ │ └── openhands-settings/ │ │ │ ├── extension.js │ │ │ └── package.json │ │ ├── 
vscode_router.py │ │ ├── vscode_service.py │ │ └── workspace_router.py │ └── pyproject.toml ├── openhands-sdk/ │ ├── openhands/ │ │ └── sdk/ │ │ ├── AGENTS.md │ │ ├── __init__.py │ │ ├── agent/ │ │ │ ├── __init__.py │ │ │ ├── acp_agent.py │ │ │ ├── agent.py │ │ │ ├── base.py │ │ │ ├── critic_mixin.py │ │ │ ├── parallel_executor.py │ │ │ ├── prompts/ │ │ │ │ ├── in_context_learning_example.j2 │ │ │ │ ├── in_context_learning_example_suffix.j2 │ │ │ │ ├── model_specific/ │ │ │ │ │ ├── anthropic_claude.j2 │ │ │ │ │ ├── google_gemini.j2 │ │ │ │ │ └── openai_gpt/ │ │ │ │ │ ├── gpt-5-codex.j2 │ │ │ │ │ └── gpt-5.j2 │ │ │ │ ├── security_policy.j2 │ │ │ │ ├── security_risk_assessment.j2 │ │ │ │ ├── self_documentation.j2 │ │ │ │ ├── system_prompt.j2 │ │ │ │ ├── system_prompt_interactive.j2 │ │ │ │ ├── system_prompt_long_horizon.j2 │ │ │ │ ├── system_prompt_planning.j2 │ │ │ │ └── system_prompt_tech_philosophy.j2 │ │ │ ├── response_dispatch.py │ │ │ └── utils.py │ │ ├── banner.py │ │ ├── context/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── agent_context.py │ │ │ ├── condenser/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── llm_summarizing_condenser.py │ │ │ │ ├── no_op_condenser.py │ │ │ │ ├── pipeline_condenser.py │ │ │ │ ├── prompts/ │ │ │ │ │ └── summarizing_prompt.j2 │ │ │ │ └── utils.py │ │ │ ├── prompts/ │ │ │ │ ├── __init__.py │ │ │ │ ├── prompt.py │ │ │ │ └── templates/ │ │ │ │ ├── ask_agent_template.j2 │ │ │ │ ├── skill_knowledge_info.j2 │ │ │ │ └── system_message_suffix.j2 │ │ │ ├── skills/ │ │ │ │ └── __init__.py │ │ │ └── view/ │ │ │ ├── __init__.py │ │ │ ├── manipulation_indices.py │ │ │ ├── properties/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── batch_atomicity.py │ │ │ │ ├── observation_uniqueness.py │ │ │ │ ├── tool_call_matching.py │ │ │ │ └── tool_loop_atomicity.py │ │ │ └── view.py │ │ ├── conversation/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── conversation.py │ │ │ ├── conversation_stats.py │ │ │ 
├── event_store.py │ │ │ ├── events_list_base.py │ │ │ ├── exceptions.py │ │ │ ├── fifo_lock.py │ │ │ ├── impl/ │ │ │ │ ├── __init__.py │ │ │ │ ├── local_conversation.py │ │ │ │ └── remote_conversation.py │ │ │ ├── persistence_const.py │ │ │ ├── request.py │ │ │ ├── resource_lock_manager.py │ │ │ ├── response_utils.py │ │ │ ├── secret_registry.py │ │ │ ├── serialization_diff.py │ │ │ ├── state.py │ │ │ ├── stuck_detector.py │ │ │ ├── title_utils.py │ │ │ ├── types.py │ │ │ └── visualizer/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── default.py │ │ ├── critic/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── impl/ │ │ │ │ ├── __init__.py │ │ │ │ ├── agent_finished.py │ │ │ │ ├── api/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chat_template.py │ │ │ │ │ ├── client.py │ │ │ │ │ ├── critic.py │ │ │ │ │ └── taxonomy.py │ │ │ │ ├── empty_patch.py │ │ │ │ └── pass_critic.py │ │ │ └── result.py │ │ ├── event/ │ │ │ ├── __init__.py │ │ │ ├── acp_tool_call.py │ │ │ ├── base.py │ │ │ ├── condenser.py │ │ │ ├── conversation_error.py │ │ │ ├── conversation_state.py │ │ │ ├── hook_execution.py │ │ │ ├── llm_completion_log.py │ │ │ ├── llm_convertible/ │ │ │ │ ├── __init__.py │ │ │ │ ├── action.py │ │ │ │ ├── message.py │ │ │ │ ├── observation.py │ │ │ │ └── system.py │ │ │ ├── streaming_delta.py │ │ │ ├── token.py │ │ │ ├── types.py │ │ │ └── user_action.py │ │ ├── extensions/ │ │ │ ├── __init__.py │ │ │ ├── fetch.py │ │ │ └── installation/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── info.py │ │ │ ├── interface.py │ │ │ ├── manager.py │ │ │ ├── metadata.py │ │ │ └── utils.py │ │ ├── git/ │ │ │ ├── cached_repo.py │ │ │ ├── exceptions.py │ │ │ ├── git_changes.py │ │ │ ├── git_diff.py │ │ │ ├── models.py │ │ │ └── utils.py │ │ ├── hooks/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── conversation_hooks.py │ │ │ ├── executor.py │ │ │ ├── manager.py │ │ │ └── types.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cache.py │ │ │ ├── local.py │ │ │ └── 
memory.py │ │ ├── llm/ │ │ │ ├── __init__.py │ │ │ ├── auth/ │ │ │ │ ├── __init__.py │ │ │ │ ├── credentials.py │ │ │ │ └── openai.py │ │ │ ├── exceptions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── classifier.py │ │ │ │ ├── mapping.py │ │ │ │ └── types.py │ │ │ ├── fallback_strategy.py │ │ │ ├── llm.py │ │ │ ├── llm_profile_store.py │ │ │ ├── llm_registry.py │ │ │ ├── llm_response.py │ │ │ ├── message.py │ │ │ ├── mixins/ │ │ │ │ ├── fn_call_converter.py │ │ │ │ ├── fn_call_examples.py │ │ │ │ └── non_native_fc.py │ │ │ ├── options/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chat_options.py │ │ │ │ ├── common.py │ │ │ │ └── responses_options.py │ │ │ ├── router/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── impl/ │ │ │ │ ├── multimodal.py │ │ │ │ └── random.py │ │ │ ├── streaming.py │ │ │ └── utils/ │ │ │ ├── image_resize.py │ │ │ ├── litellm_provider.py │ │ │ ├── metrics.py │ │ │ ├── model_features.py │ │ │ ├── model_info.py │ │ │ ├── model_prompt_spec.py │ │ │ ├── responses_serialization.py │ │ │ ├── retry_mixin.py │ │ │ ├── telemetry.py │ │ │ ├── unverified_models.py │ │ │ └── verified_models.py │ │ ├── logger/ │ │ │ ├── __init__.py │ │ │ ├── logger.py │ │ │ └── rolling.py │ │ ├── marketplace/ │ │ │ ├── __init__.py │ │ │ └── types.py │ │ ├── mcp/ │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── definition.py │ │ │ ├── exceptions.py │ │ │ ├── tool.py │ │ │ └── utils.py │ │ ├── observability/ │ │ │ ├── __init__.py │ │ │ ├── laminar.py │ │ │ └── utils.py │ │ ├── plugin/ │ │ │ ├── __init__.py │ │ │ ├── fetch.py │ │ │ ├── installed.py │ │ │ ├── loader.py │ │ │ ├── plugin.py │ │ │ ├── source.py │ │ │ └── types.py │ │ ├── py.typed │ │ ├── secret/ │ │ │ ├── __init__.py │ │ │ └── secrets.py │ │ ├── security/ │ │ │ ├── __init__.py │ │ │ ├── analyzer.py │ │ │ ├── confirmation_policy.py │ │ │ ├── defense_in_depth/ │ │ │ │ ├── __init__.py │ │ │ │ ├── pattern.py │ │ │ │ ├── policy_rails.py │ │ │ │ └── utils.py │ │ │ ├── ensemble.py │ │ │ ├── grayswan/ │ │ │ │ ├── __init__.py 
│ │ │ │ ├── analyzer.py │ │ │ │ └── utils.py │ │ │ ├── llm_analyzer.py │ │ │ └── risk.py │ │ ├── settings/ │ │ │ ├── __init__.py │ │ │ ├── acp_providers.py │ │ │ ├── api_models.py │ │ │ ├── metadata.py │ │ │ └── model.py │ │ ├── skills/ │ │ │ ├── __init__.py │ │ │ ├── exceptions.py │ │ │ ├── execute.py │ │ │ ├── fetch.py │ │ │ ├── installed.py │ │ │ ├── skill.py │ │ │ ├── trigger.py │ │ │ ├── types.py │ │ │ └── utils.py │ │ ├── subagent/ │ │ │ ├── AGENTS.md │ │ │ ├── __init__.py │ │ │ ├── load.py │ │ │ ├── registry.py │ │ │ └── schema.py │ │ ├── testing/ │ │ │ ├── __init__.py │ │ │ └── test_llm.py │ │ ├── tool/ │ │ │ ├── __init__.py │ │ │ ├── builtins/ │ │ │ │ ├── __init__.py │ │ │ │ ├── finish.py │ │ │ │ ├── invoke_skill.py │ │ │ │ ├── switch_llm.py │ │ │ │ └── think.py │ │ │ ├── registry.py │ │ │ ├── schema.py │ │ │ ├── spec.py │ │ │ └── tool.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ ├── async_executor.py │ │ │ ├── async_utils.py │ │ │ ├── cipher.py │ │ │ ├── command.py │ │ │ ├── datetime.py │ │ │ ├── deprecation.py │ │ │ ├── github.py │ │ │ ├── json.py │ │ │ ├── models.py │ │ │ ├── paging.py │ │ │ ├── path.py │ │ │ ├── pydantic_diff.py │ │ │ ├── pydantic_secrets.py │ │ │ ├── redact.py │ │ │ ├── truncate.py │ │ │ └── visualize.py │ │ └── workspace/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── local.py │ │ ├── models.py │ │ ├── remote/ │ │ │ ├── __init__.py │ │ │ ├── async_remote_workspace.py │ │ │ ├── base.py │ │ │ └── remote_workspace_mixin.py │ │ ├── repo.py │ │ └── workspace.py │ └── pyproject.toml ├── openhands-tools/ │ ├── openhands/ │ │ └── tools/ │ │ ├── AGENTS.md │ │ ├── __init__.py │ │ ├── apply_patch/ │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ └── definition.py │ │ ├── browser_use/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ ├── event_storage.py │ │ │ ├── impl.py │ │ │ ├── js/ │ │ │ │ ├── flush-events.js │ │ │ │ ├── rrweb-loader.js │ │ │ │ ├── start-recording-simple.js │ │ │ │ ├── start-recording.js │ │ │ │ ├── stop-recording.js │ │ │ │ 
└── wait-for-rrweb.js │ │ │ ├── logging_fix.py │ │ │ ├── recording.py │ │ │ └── server.py │ │ ├── delegate/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ ├── impl.py │ │ │ ├── templates/ │ │ │ │ └── delegate_tool_description.j2 │ │ │ └── visualizer.py │ │ ├── file_editor/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ ├── editor.py │ │ │ ├── exceptions.py │ │ │ ├── impl.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── constants.py │ │ │ ├── diff.py │ │ │ ├── encoding.py │ │ │ ├── file_cache.py │ │ │ ├── history.py │ │ │ └── shell.py │ │ ├── gemini/ │ │ │ ├── __init__.py │ │ │ ├── edit/ │ │ │ │ ├── __init__.py │ │ │ │ ├── definition.py │ │ │ │ └── impl.py │ │ │ ├── list_directory/ │ │ │ │ ├── __init__.py │ │ │ │ ├── definition.py │ │ │ │ └── impl.py │ │ │ ├── read_file/ │ │ │ │ ├── __init__.py │ │ │ │ ├── definition.py │ │ │ │ └── impl.py │ │ │ └── write_file/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ └── impl.py │ │ ├── glob/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ └── impl.py │ │ ├── grep/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ └── impl.py │ │ ├── planning_file_editor/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ └── impl.py │ │ ├── preset/ │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ ├── gemini.py │ │ │ ├── gpt5.py │ │ │ ├── planning.py │ │ │ └── subagents/ │ │ │ ├── bash_runner.md │ │ │ ├── code_explorer.md │ │ │ ├── default.md │ │ │ └── web_researcher.md │ │ ├── py.typed │ │ ├── task/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ ├── impl.py │ │ │ └── manager.py │ │ ├── task_tracker/ │ │ │ ├── __init__.py │ │ │ └── definition.py │ │ ├── terminal/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── definition.py │ │ │ ├── descriptions.py │ │ │ ├── impl.py │ │ │ ├── metadata.py │ │ │ ├── terminal/ │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── interface.py │ │ │ │ ├── subprocess_terminal.py │ │ │ │ ├── terminal_session.py │ │ │ │ ├── 
tmux_pane_pool.py │ │ │ │ ├── tmux_terminal.py │ │ │ │ └── windows_terminal.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── command.py │ │ │ └── escape_filter.py │ │ ├── tom_consult/ │ │ │ ├── __init__.py │ │ │ ├── definition.py │ │ │ └── executor.py │ │ └── utils/ │ │ ├── __init__.py │ │ └── timeout.py │ └── pyproject.toml ├── openhands-workspace/ │ ├── openhands/ │ │ └── workspace/ │ │ ├── AGENTS.md │ │ ├── __init__.py │ │ ├── apptainer/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── workspace.py │ │ ├── cloud/ │ │ │ ├── __init__.py │ │ │ └── workspace.py │ │ ├── docker/ │ │ │ ├── __init__.py │ │ │ ├── dev_workspace.py │ │ │ └── workspace.py │ │ ├── py.typed │ │ └── remote_api/ │ │ ├── __init__.py │ │ └── workspace.py │ └── pyproject.toml ├── pyproject.toml ├── scripts/ │ ├── agent_server_ui/ │ │ ├── run.sh │ │ └── static/ │ │ ├── app-dev.js │ │ ├── app.js │ │ ├── index-dev.html │ │ ├── index.html │ │ └── styles.css │ ├── auto_close_duplicate_issues.py │ ├── build_config_template.py │ ├── check_import_rules.py │ ├── check_tool_registration.py │ ├── completion_logs_viewer.py │ ├── conversation_viewer.py │ ├── convert_legacy_skills.py │ ├── event_sourcing_benchmarks/ │ │ ├── README.md │ │ ├── bench_persist_latency.py │ │ ├── bench_replay_and_recovery.py │ │ ├── bench_storage_growth.py │ │ └── benchmark_utils.py │ ├── issue_duplicate_check_openhands.py │ ├── render_examples_report.py │ └── websocket_client.html └── tests/ ├── README.md ├── __init__.py ├── agent_server/ │ ├── __init__.py │ ├── stress/ │ │ ├── __init__.py │ │ ├── budgets.py │ │ ├── conftest.py │ │ ├── probe.py │ │ ├── scripts.py │ │ ├── test_concurrent_conversations.py │ │ ├── test_conversation_listing.py │ │ ├── test_event_loop_responsiveness.py │ │ ├── test_high_volume_bash_output.py │ │ ├── test_lease_contention.py │ │ ├── test_long_running_command.py │ │ ├── test_parallel_subagents.py │ │ ├── test_slow_webhook.py │ │ ├── test_slow_websocket_consumer.py │ │ └── 
test_websocket_reconnect_storm.py │ ├── test_agent_server_wsproto.py │ ├── test_api.py │ ├── test_api_authentication.py │ ├── test_bash_service.py │ ├── test_check_browser.py │ ├── test_cloud_proxy_router.py │ ├── test_conversation_lease.py │ ├── test_conversation_response.py │ ├── test_conversation_router.py │ ├── test_conversation_router_acp.py │ ├── test_conversation_service.py │ ├── test_conversation_service_plugin.py │ ├── test_conversation_tags.py │ ├── test_dependencies.py │ ├── test_desktop_router.py │ ├── test_desktop_service.py │ ├── test_docker_build.py │ ├── test_env_parser.py │ ├── test_event_router.py │ ├── test_event_router_websocket.py │ ├── test_event_service.py │ ├── test_event_streaming.py │ ├── test_file_router.py │ ├── test_git_router.py │ ├── test_hooks_router.py │ ├── test_hooks_service.py │ ├── test_llm_router.py │ ├── test_models.py │ ├── test_openapi_discriminator.py │ ├── test_preload_modules.py │ ├── test_profiles_router.py │ ├── test_pub_sub.py │ ├── test_server_details_router.py │ ├── test_settings_router.py │ ├── test_skills_router.py │ ├── test_skills_service.py │ ├── test_terminal_router.py │ ├── test_terminal_service.py │ ├── test_tool_router.py │ ├── test_validation_error_sanitization.py │ ├── test_vscode_router.py │ ├── test_vscode_service.py │ ├── test_webhook_subscriber.py │ ├── test_websocket_first_message_auth.py │ ├── test_workspace_cookie_auth.py │ └── test_workspace_router.py ├── command_utils.py ├── conftest.py ├── cross/ │ ├── __init__.py │ ├── conftest.py │ ├── test_agent_loading.py │ ├── test_agent_secrets_integration.py │ ├── test_agent_server_build_metadata.py │ ├── test_automatic_naming.py │ ├── test_automatic_registration.py │ ├── test_check_agent_server_rest_api_breakage.py │ ├── test_check_deprecations.py │ ├── test_check_sdk_api_breakage.py │ ├── test_check_version_bumps.py │ ├── test_conversation_restore_behavior.py │ ├── test_event_loss_repro.py │ ├── test_hello_world.py │ ├── test_issue_duplicate_scripts.py │ 
├── test_pr_review_trace.py │ ├── test_registry_directories.py │ ├── test_registry_qualnames.py │ ├── test_remote_conversation_live_server.py │ ├── test_resolve_model_config.py │ ├── test_stuck_detector.py │ ├── test_stuck_detector_config.py │ ├── test_todo_scanner.py │ └── test_validate_sdk_ref.py ├── examples/ │ └── test_examples.py ├── fixtures/ │ ├── conversations/ │ │ ├── v1_11_5_cli_default/ │ │ │ └── base_state.json │ │ └── v1_17_0_with_mcp_config/ │ │ └── base_state.json │ ├── llm_data/ │ │ ├── README.md │ │ ├── data_generator.py │ │ ├── fncall-llm-message.json │ │ ├── llm-logs/ │ │ │ ├── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015025.972.json │ │ │ ├── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015029.090.json │ │ │ ├── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015033.222.json │ │ │ ├── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015036.544.json │ │ │ ├── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015040.416.json │ │ │ └── litellm_proxy__anthropic__claude-sonnet-4-20250514-1757015046.707.json │ │ ├── nonfncall-llm-logs/ │ │ │ ├── litellm_proxy__deepseek__deepseek-chat-1757015054.055.json │ │ │ ├── litellm_proxy__deepseek__deepseek-chat-1757015062.589.json │ │ │ ├── litellm_proxy__deepseek__deepseek-chat-1757015068.723.json │ │ │ └── litellm_proxy__deepseek__deepseek-chat-1757015076.651.json │ │ └── nonfncall-llm-message.json │ └── tokenizers/ │ └── qwen3-4b-instruct-2507-tokenizer_config.json ├── integration/ │ ├── BEHAVIOR_TESTS.md │ ├── README.md │ ├── __init__.py │ ├── api_compliance/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── result.py │ │ └── run_compliance.py │ ├── base.py │ ├── behavior_utils.py │ ├── early_stopper.py │ ├── run_infer.py │ ├── schemas.py │ ├── test_behavior_utils.py │ ├── test_early_stopper.py │ ├── test_tool_presets.py │ ├── tests/ │ │ ├── a01_unmatched_tool_use.py │ │ ├── a02_unmatched_tool_result.py │ │ ├── a03_interleaved_user_msg.py │ │ ├── 
a04_interleaved_asst_msg.py │ │ ├── a05_duplicate_tool_call_id.py │ │ ├── a06_wrong_tool_call_id.py │ │ ├── a07_parallel_missing_result.py │ │ ├── a08_parallel_wrong_order.py │ │ ├── b01_no_premature_implementation.py │ │ ├── b02_no_oververification.py │ │ ├── b03_no_useless_backward_compatibility.py │ │ ├── b04_each_tool_call_has_a_concise_explanation.py │ │ ├── b05_do_not_create_redundant_files.py │ │ ├── c01_thinking_block_condenser.py │ │ ├── c02_hard_context_reset.py │ │ ├── c03_delayed_condensation.py │ │ ├── c04_token_condenser.py │ │ ├── c05_size_condenser.py │ │ ├── t01_fix_simple_typo.py │ │ ├── t02_add_bash_hello.py │ │ ├── t03_jupyter_write_file.py │ │ ├── t04_git_staging.py │ │ ├── t05_simple_browsing.py │ │ ├── t06_github_pr_browsing.py │ │ ├── t07_interactive_commands.py │ │ ├── t08_image_file_viewing.py │ │ └── t09_invoke_skill.py │ └── utils/ │ ├── __init__.py │ ├── behavior_helpers.py │ ├── consolidate_json_results.py │ ├── consolidate_results.py │ ├── format_costs.py │ ├── generate_markdown_report.py │ └── llm_judge.py ├── platform_utils.py ├── sdk/ │ ├── __init__.py │ ├── agent/ │ │ ├── __init__.py │ │ ├── test_acp_agent.py │ │ ├── test_acp_dedup_and_truncation.py │ │ ├── test_action_batch.py │ │ ├── test_agent_browser_auto_detect.py │ │ ├── test_agent_context_window_condensation.py │ │ ├── test_agent_immutability.py │ │ ├── test_agent_init_state_invariants.py │ │ ├── test_agent_llms_are_discoverable.py │ │ ├── test_agent_serialization.py │ │ ├── test_agent_step_responses_gating.py │ │ ├── test_agent_tool_init.py │ │ ├── test_agent_utils.py │ │ ├── test_extract_security_risk.py │ │ ├── test_extract_summary.py │ │ ├── test_fix_malformed_tool_arguments.py │ │ ├── test_iterative_refinement.py │ │ ├── test_message_while_finishing.py │ │ ├── test_non_executable_action_emission.py │ │ ├── test_nonexistent_tool_handling.py │ │ ├── test_parallel_execution_integration.py │ │ ├── test_parallel_executor.py │ │ ├── test_parallel_executor_locking.py │ │ ├── 
test_reasoning_only_responses.py │ │ ├── test_response_dispatch.py │ │ ├── test_sanitize_json_control_chars.py │ │ ├── test_security_policy_integration.py │ │ ├── test_system_prompt.py │ │ ├── test_tool_call_compatibility.py │ │ ├── test_tool_call_recovery.py │ │ ├── test_tool_execution_error_handling.py │ │ └── test_tool_validation_error_message.py │ ├── config/ │ │ ├── __init__.py │ │ └── test_llm_config.py │ ├── context/ │ │ ├── __init__.py │ │ ├── condenser/ │ │ │ ├── __init__.py │ │ │ ├── test_llm_summarizing_condenser.py │ │ │ ├── test_no_op_condenser.py │ │ │ ├── test_rolling_condenser.py │ │ │ └── test_utils.py │ │ ├── test_agent_context.py │ │ ├── test_agent_context_model_specific.py │ │ ├── test_agent_context_serialization.py │ │ ├── test_prompt_absolute_path.py │ │ ├── test_prompt_model_spec.py │ │ └── view/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── properties/ │ │ │ ├── conftest.py │ │ │ ├── test_batch_atomicity.py │ │ │ ├── test_observation_uniqueness.py │ │ │ ├── test_tool_call_matching.py │ │ │ └── test_tool_loop_atomicity.py │ │ ├── test_manipulation_indices.py │ │ ├── test_view.py │ │ ├── test_view_append_event.py │ │ ├── test_view_batch_atomicity.py │ │ ├── test_view_condensation_batch_atomicity.py │ │ ├── test_view_manipulation_indices.py │ │ ├── test_view_multi_summary.py │ │ └── test_view_tool_loop_boundaries.py │ ├── conversation/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── local/ │ │ │ ├── test_agent_status_transition.py │ │ │ ├── test_confirmation_mode.py │ │ │ ├── test_conversation_core.py │ │ │ ├── test_conversation_default_callback.py │ │ │ ├── test_conversation_id.py │ │ │ ├── test_conversation_path_types.py │ │ │ ├── test_conversation_pause_functionality.py │ │ │ ├── test_conversation_send_message.py │ │ │ ├── test_conversation_visualize_param.py │ │ │ ├── test_execute_tool.py │ │ │ ├── test_fork.py │ │ │ ├── test_rerun_actions.py │ │ │ ├── test_run_exception_includes_conversation_id.py │ │ │ ├── test_span_double_ending.py │ │ 
│ └── test_state_serialization.py │ │ ├── remote/ │ │ │ ├── __init__.py │ │ │ ├── test_api_key_functionality.py │ │ │ ├── test_remote_conversation.py │ │ │ ├── test_remote_events_list.py │ │ │ ├── test_remote_fork.py │ │ │ ├── test_remote_request_logging.py │ │ │ ├── test_remote_state.py │ │ │ ├── test_run_exception_includes_conversation_id_remote.py │ │ │ ├── test_websocket_client.py │ │ │ └── test_websocket_subscription_ready.py │ │ ├── test_agent_final_response.py │ │ ├── test_agent_state_reassignment.py │ │ ├── test_ask_agent.py │ │ ├── test_atexit_cleanup.py │ │ ├── test_base_span_management.py │ │ ├── test_condense.py │ │ ├── test_conversation_execution_status_enum.py │ │ ├── test_conversation_factory.py │ │ ├── test_conversation_secrets_constructor.py │ │ ├── test_conversation_stats.py │ │ ├── test_directories.py │ │ ├── test_event_store.py │ │ ├── test_fifo_lock.py │ │ ├── test_generate_title.py │ │ ├── test_get_unmatched_actions.py │ │ ├── test_local_conversation_plugins.py │ │ ├── test_mcp_secrets_serialization_leak.py │ │ ├── test_remote_conversation_state_updates.py │ │ ├── test_repo_root_project_skills.py │ │ ├── test_resource_lock_manager.py │ │ ├── test_secret_source.py │ │ ├── test_secrets_manager.py │ │ ├── test_state_change_callback.py │ │ ├── test_stats_update_event_snapshot.py │ │ ├── test_switch_model.py │ │ ├── test_tags.py │ │ └── test_visualizer.py │ ├── critic/ │ │ ├── __init__.py │ │ ├── api/ │ │ │ └── test_template_render.py │ │ ├── test_critic.py │ │ ├── test_critic_client.py │ │ └── test_critic_display.py │ ├── event/ │ │ ├── __init__.py │ │ ├── test_action_event_summary.py │ │ ├── test_dynamic_context_message_sequence.py │ │ ├── test_event_immutability.py │ │ ├── test_event_serialization.py │ │ ├── test_events_to_messages.py │ │ ├── test_llm_completion_log_event.py │ │ ├── test_non_executable_action_event.py │ │ ├── test_streaming.py │ │ └── test_system_prompt_event_visualize.py │ ├── extensions/ │ │ ├── __init__.py │ │ ├── 
installation/ │ │ │ ├── __init__.py │ │ │ ├── test_installation_info.py │ │ │ ├── test_installation_manager.py │ │ │ ├── test_installation_metadata.py │ │ │ └── test_installation_utils.py │ │ └── test_fetch.py │ ├── git/ │ │ ├── __init__.py │ │ ├── test_cached_repo.py │ │ ├── test_git_changes.py │ │ └── test_git_diff.py │ ├── hooks/ │ │ ├── __init__.py │ │ ├── test_config.py │ │ ├── test_executor.py │ │ ├── test_integration.py │ │ └── test_manager.py │ ├── io/ │ │ ├── __init__.py │ │ ├── test_filestore_cache.py │ │ └── test_local_filestore_security.py │ ├── llm/ │ │ ├── __init__.py │ │ ├── auth/ │ │ │ ├── __init__.py │ │ │ ├── test_credentials.py │ │ │ └── test_openai.py │ │ ├── test_api_connection_error_retry.py │ │ ├── test_api_key_validation.py │ │ ├── test_chat_options.py │ │ ├── test_exception.py │ │ ├── test_exception_classifier.py │ │ ├── test_exception_mapping.py │ │ ├── test_llm.py │ │ ├── test_llm_completion.py │ │ ├── test_llm_fallback.py │ │ ├── test_llm_fncall_converter.py │ │ ├── test_llm_image_resizing.py │ │ ├── test_llm_json_storage.py │ │ ├── test_llm_litellm_extra_body.py │ │ ├── test_llm_log_completions_integration.py │ │ ├── test_llm_metrics.py │ │ ├── test_llm_no_response_retry.py │ │ ├── test_llm_pricing_passthrough.py │ │ ├── test_llm_profile_store.py │ │ ├── test_llm_registry.py │ │ ├── test_llm_retry_telemetry.py │ │ ├── test_llm_serialization.py │ │ ├── test_llm_telemetry.py │ │ ├── test_llm_timeout.py │ │ ├── test_message.py │ │ ├── test_message_backward_compatibility.py │ │ ├── test_message_from_chat_and_helpers.py │ │ ├── test_message_serialization.py │ │ ├── test_message_tool_call.py │ │ ├── test_model_canonical_name_resolution.py │ │ ├── test_model_features.py │ │ ├── test_model_list.py │ │ ├── test_prompt_caching_cross_conversation.py │ │ ├── test_pydantic_warning_suppression.py │ │ ├── test_reasoning_content.py │ │ ├── test_responses_parsing_and_kwargs.py │ │ ├── test_responses_serialization.py │ │ ├── test_subscription_mode.py │ │ 
├── test_telemetry_policy.py │ │ ├── test_thinking_blocks.py │ │ └── test_vision_support.py │ ├── logger/ │ │ ├── __init__.py │ │ └── test_litellm_log_suppression.py │ ├── marketplace/ │ │ ├── __init__.py │ │ ├── test_deprecation.py │ │ └── test_marketplace.py │ ├── mcp/ │ │ ├── __init__.py │ │ ├── test_create_mcp_tool.py │ │ ├── test_mcp_action_serialization.py │ │ ├── test_mcp_observation.py │ │ ├── test_mcp_security_risk.py │ │ ├── test_mcp_session_persistence.py │ │ ├── test_mcp_tool.py │ │ ├── test_mcp_tool_immutability.py │ │ ├── test_mcp_tool_kind_field.py │ │ ├── test_mcp_tool_serialization.py │ │ ├── test_mcp_tool_validation.py │ │ └── test_stateful_mcp.py │ ├── observability/ │ │ ├── __init__.py │ │ └── test_laminar.py │ ├── plugin/ │ │ ├── __init__.py │ │ ├── test_installed_plugins.py │ │ ├── test_plugin_fetch.py │ │ ├── test_plugin_fetch_integration.py │ │ ├── test_plugin_loader.py │ │ ├── test_plugin_loading.py │ │ ├── test_plugin_merging.py │ │ └── test_source.py │ ├── security/ │ │ ├── __init__.py │ │ ├── defense_in_depth/ │ │ │ ├── __init__.py │ │ │ ├── test_adversarial.py │ │ │ ├── test_ensemble.py │ │ │ ├── test_field_cap.py │ │ │ ├── test_pattern.py │ │ │ ├── test_policy_rails.py │ │ │ └── test_serialization.py │ │ ├── grayswan/ │ │ │ ├── __init__.py │ │ │ ├── test_grayswan_analyzer.py │ │ │ └── test_grayswan_utils.py │ │ ├── test_confirmation_policy.py │ │ ├── test_llm_security_analyzer.py │ │ ├── test_security_analyzer.py │ │ └── test_security_risk.py │ ├── settings/ │ │ ├── __init__.py │ │ └── test_acp_providers.py │ ├── skills/ │ │ ├── __init__.py │ │ ├── test_agentskills_fields.py │ │ ├── test_extensions_ref.py │ │ ├── test_installed_skills.py │ │ ├── test_load_project_skills.py │ │ ├── test_load_public_skills.py │ │ ├── test_load_user_skills.py │ │ ├── test_mcp_config_expansion.py │ │ ├── test_mcp_json.py │ │ ├── test_resource_directories.py │ │ ├── test_skill_commands.py │ │ ├── test_skill_info.py │ │ ├── test_skill_md_convention.py │ │ 
├── test_skill_no_header.py │ │ ├── test_skill_serialization.py │ │ ├── test_skill_utils.py │ │ ├── test_task_skill.py │ │ ├── test_validation_improvements.py │ │ └── test_validation_prompt.py │ ├── subagent/ │ │ ├── __init__.py │ │ ├── test_subagent_loader.py │ │ ├── test_subagent_registry.py │ │ └── test_subagent_schema.py │ ├── test_agent_step_bounded_scan.py │ ├── test_banner.py │ ├── test_import_performance.py │ ├── test_settings.py │ ├── test_socks_proxy_support.py │ ├── tool/ │ │ ├── __init__.py │ │ ├── test_builtins.py │ │ ├── test_invoke_skill.py │ │ ├── test_mcp_schema.py │ │ ├── test_py_type.py │ │ ├── test_registry.py │ │ ├── test_schema_immutability.py │ │ ├── test_switch_llm.py │ │ ├── test_to_responses_tool.py │ │ ├── test_to_responses_tool_security.py │ │ ├── test_to_responses_tool_summary.py │ │ ├── test_tool.py │ │ ├── test_tool_call_output_coercion.py │ │ ├── test_tool_definition.py │ │ ├── test_tool_immutability.py │ │ └── test_tool_serialization.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── test_async_utils.py │ │ ├── test_cipher.py │ │ ├── test_command.py │ │ ├── test_deprecation.py │ │ ├── test_discriminated_union.py │ │ ├── test_github.py │ │ ├── test_model_prompt_spec.py │ │ ├── test_paging.py │ │ ├── test_path.py │ │ ├── test_pydantic_secrets.py │ │ ├── test_redact.py │ │ ├── test_subclass_cache.py │ │ ├── test_truncate.py │ │ └── test_visualize.py │ └── workspace/ │ ├── __init__.py │ ├── conftest.py │ └── remote/ │ ├── __init__.py │ ├── test_async_remote_workspace.py │ ├── test_client_base_url.py │ ├── test_multiple_commands_isolation.py │ ├── test_polling_duplicates_output.py │ ├── test_remote_workspace.py │ └── test_remote_workspace_mixin.py ├── tools/ │ ├── __init__.py │ ├── apply_patch/ │ │ └── test_apply_patch_executor.py │ ├── browser_use/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_browser_cleanup.py │ │ ├── test_browser_executor.py │ │ ├── test_browser_executor_e2e.py │ │ ├── test_browser_initialization.py │ │ ├── 
test_browser_observation.py │ │ ├── test_browser_toolset.py │ │ ├── test_chromium_detection.py │ │ ├── test_recording_flush.py │ │ └── test_vnc_integration.py │ ├── delegate/ │ │ ├── test_delegation.py │ │ └── test_visualizer.py │ ├── file_editor/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_basic_operations.py │ │ ├── test_error_handling.py │ │ ├── test_exceptions.py │ │ ├── test_file_editor_tool.py │ │ ├── test_file_validation.py │ │ ├── test_memory_usage.py │ │ ├── test_schema.py │ │ ├── test_view_supported_binary_files.py │ │ ├── test_visualize_diff.py │ │ ├── test_workspace_root.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── test_encoding.py │ │ ├── test_file_cache.py │ │ ├── test_history.py │ │ └── test_shell_utils.py │ ├── gemini/ │ │ ├── conftest.py │ │ ├── edit/ │ │ │ ├── __init__.py │ │ │ └── test_edit.py │ │ ├── list_directory/ │ │ │ ├── __init__.py │ │ │ └── test_list_directory.py │ │ ├── read_file/ │ │ │ ├── __init__.py │ │ │ └── test_read_file.py │ │ ├── test_cross_tool_locking.py │ │ └── write_file/ │ │ ├── __init__.py │ │ └── test_write_file.py │ ├── glob/ │ │ ├── __init__.py │ │ ├── test_consistency.py │ │ ├── test_glob_executor.py │ │ └── test_glob_tool.py │ ├── grep/ │ │ ├── __init__.py │ │ ├── test_consistency.py │ │ ├── test_grep_executor.py │ │ └── test_grep_tool.py │ ├── planning_file_editor/ │ │ └── test_planning_file_editor_tool.py │ ├── task/ │ │ ├── test_task_manager.py │ │ ├── test_task_manager_thread_safety.py │ │ └── test_task_tool_set.py │ ├── terminal/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_conversation_cleanup.py │ │ ├── test_escape_filter.py │ │ ├── test_heredoc_chunked_send.py │ │ ├── test_large_environment.py │ │ ├── test_observation_truncation.py │ │ ├── test_pool_integration.py │ │ ├── test_ps1_corruption.py │ │ ├── test_schema.py │ │ ├── test_secrets_masking.py │ │ ├── test_send_keys.py │ │ ├── test_session_factory.py │ │ ├── test_shell_path_configuration.py │ │ ├── test_shutdown_handling.py │ │ ├── 
test_terminal_exit_code_top_level.py │ │ ├── test_terminal_parsing.py │ │ ├── test_terminal_ps1_metadata.py │ │ ├── test_terminal_reset.py │ │ ├── test_terminal_session.py │ │ ├── test_terminal_tool.py │ │ ├── test_terminal_tool_auto_detection.py │ │ ├── test_tmux_pane_pool.py │ │ ├── test_windows_ctrl_c.py │ │ └── test_windows_terminal.py │ ├── test_builtin_agents.py │ ├── test_init.py │ ├── test_planning_preset.py │ ├── test_tool_name_consistency.py │ ├── test_tool_registration_check.py │ ├── test_working_dir_standardization.py │ └── tom_consult/ │ ├── __init__.py │ └── test_tom_consult_tool.py └── workspace/ ├── test_api_remote_workspace.py ├── test_apptainer_workspace.py ├── test_cloud_workspace.py ├── test_cloud_workspace_automation_tags.py ├── test_cloud_workspace_repos.py ├── test_cloud_workspace_sdk_settings.py ├── test_docker_workspace.py └── test_workspace_pause_resume.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .agents/skills/cross-repo-testing/SKILL.md ================================================ --- name: cross-repo-testing description: This skill should be used when the user asks to "test a saas cross-repo feature", "deploy a feature branch to staging", "test SDK against OH Cloud branch", "e2e test a cloud workspace feature", "test secrets saas inheritance", or when changes span the SDK and OpenHands enterprise and need end-to-end validation against a staging deployment. --- # Cross-Repo Testing: SDK ↔ OpenHands Cloud How to end-to-end test features that span `OpenHands/software-agent-sdk` and `OpenHands/OpenHands` (the Cloud backend). ## Repository Map | Repo | Role | What lives here | |------|------|-----------------| | [`software-agent-sdk`](https://github.com/OpenHands/software-agent-sdk) | Agent core | `openhands-sdk`, `openhands-workspace`, `openhands-tools` packages. `OpenHandsCloudWorkspace` lives here. 
| | [`OpenHands`](https://github.com/OpenHands/OpenHands) | Cloud backend | FastAPI server (`openhands/app_server/`), sandbox management, auth, enterprise integrations. Deployed as OH Cloud. | | [`deploy`](https://github.com/OpenHands/deploy) | Infrastructure | Helm charts + GitHub Actions that build the enterprise Docker image and deploy to staging/production. | **Data flow:** SDK client → OH Cloud API (`/api/v1/...`) → sandbox agent-server (inside runtime container) ## When You Need This There are **two flows** depending on which direction the dependency goes: | Flow | When | Example | |------|------|---------| | **A — SDK client → new Cloud API** | The SDK calls an API that doesn't exist yet on production | `workspace.get_llm()` calling `GET /api/v1/users/me?expose_secrets=true` | | **B — OH server → new SDK code** | The Cloud server needs unreleased SDK packages or a new agent-server image | Server consumes a new tool, agent behavior, or workspace method from the SDK | Flow A only requires deploying the server PR. Flow B requires pinning the SDK to an unreleased commit in the server PR **and** using the SDK PR's agent-server image. Both flows may apply simultaneously. --- ## Flow A: SDK Client Tests Against New Cloud API Use this when the SDK calls an endpoint that only exists on the server PR branch. ### A1. Write and test the server-side changes In the `OpenHands` repo, implement the new API endpoint(s). Run unit tests: ```bash cd OpenHands poetry run pytest tests/unit/app_server/test_.py -v ``` Push a PR. Wait for the **"Push Enterprise Image" (Docker) CI job** to succeed — this builds `ghcr.io/openhands/enterprise-server:sha-`. ### A2. Write the SDK-side changes In `software-agent-sdk`, implement the client code (e.g., new methods on `OpenHandsCloudWorkspace`). Run SDK unit tests: ```bash cd software-agent-sdk pip install -e openhands-sdk -e openhands-workspace pytest tests/ -v ``` Push a PR. 
SDK CI is independent — it doesn't need the server changes to pass unit tests. ### A3. Deploy the server PR to staging See [Deploying to a Staging Feature Environment](#deploying-to-a-staging-feature-environment) below. ### A4. Run the SDK e2e test against staging See [Running E2E Tests Against Staging](#running-e2e-tests-against-staging) below. --- ## Flow B: OH Server Needs Unreleased SDK Code Use this when the Cloud server depends on SDK changes that haven't been released to PyPI yet. The server's runtime containers run the `agent-server` image built from the SDK repo, so the server PR must be configured to use the SDK PR's image and packages. ### B1. Get the SDK PR merged (or identify the commit) The SDK PR must have CI pass so its agent-server Docker image is built. The image is tagged with the **merge-commit SHA** from GitHub Actions — NOT the head-commit SHA shown in the PR. Find the correct image tag: - Check the SDK PR description for an `AGENT_SERVER_IMAGES` section - Or check the "Consolidate Build Information" CI job for `"short_sha": ""` ### B2. 
Pin SDK packages to the commit in the OpenHands PR In the `OpenHands` repo PR, update 3 files + regenerate 3 lock files (see the `update-sdk` skill for full details): **`pyproject.toml`** — pin all 3 SDK packages in **both** `dependencies` and `[tool.poetry.dependencies]`: ```toml # dependencies array (PEP 508) "openhands-sdk @ git+https://github.com/OpenHands/software-agent-sdk.git@<sdk-commit-sha>#subdirectory=openhands-sdk", "openhands-agent-server @ git+https://github.com/OpenHands/software-agent-sdk.git@<sdk-commit-sha>#subdirectory=openhands-agent-server", "openhands-tools @ git+https://github.com/OpenHands/software-agent-sdk.git@<sdk-commit-sha>#subdirectory=openhands-tools", # [tool.poetry.dependencies] openhands-sdk = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "<sdk-commit-sha>", subdirectory = "openhands-sdk" } openhands-agent-server = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "<sdk-commit-sha>", subdirectory = "openhands-agent-server" } openhands-tools = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "<sdk-commit-sha>", subdirectory = "openhands-tools" } ``` **`openhands/app_server/sandbox/sandbox_spec_service.py`** — use the SDK's merge-commit SHA: ```python AGENT_SERVER_IMAGE = 'ghcr.io/openhands/agent-server:<sdk-merge-commit-sha>-python' ``` **Regenerate lock files:** ```bash poetry lock && uv lock && cd enterprise && poetry lock && cd .. ``` ### B3. Wait for the OpenHands enterprise image to build Push the pinned changes. The OpenHands CI will build a new enterprise Docker image (`ghcr.io/openhands/enterprise-server:sha-<openhands-commit-sha>`) that bundles the unreleased SDK. Wait for the "Push Enterprise Image" job to succeed. ### B4. Deploy and test Follow [Deploying to a Staging Feature Environment](#deploying-to-a-staging-feature-environment) using the new OpenHands commit SHA. ### B5. Before merging: remove the pin **CI guard:** `check-package-versions.yml` blocks merge to `main` if `[tool.poetry.dependencies]` contains `rev` fields.
Before the OpenHands PR can merge, the SDK PR must be merged and released to PyPI, then the pin must be replaced with the released version number. --- ## Deploying to a Staging Feature Environment The `deploy` repo creates preview environments from OpenHands PRs. **Option A — GitHub Actions UI (preferred):** Go to `OpenHands/deploy` → Actions → "Create OpenHands preview PR" → enter the OpenHands PR number. This creates a branch `ohpr--` and opens a deploy PR. **Option B — Update an existing feature branch:** ```bash cd deploy git checkout ohpr-- # In .github/workflows/deploy.yaml, update BOTH: # OPENHANDS_SHA: "" # OPENHANDS_RUNTIME_IMAGE_TAG: "-nikolaik" git commit -am "Update OPENHANDS_SHA to " && git push ``` **Before updating the SHA**, verify the enterprise Docker image exists: ```bash gh api repos/OpenHands/OpenHands/actions/runs \ --jq '.workflow_runs[] | select(.head_sha=="") | "\(.name): \(.conclusion)"' \ | grep Docker # Must show: "Docker: success" ``` The deploy CI auto-triggers and creates the environment at: ``` https://ohpr--.staging.all-hands.dev ``` **Wait for it to be live:** ```bash curl -s -o /dev/null -w "%{http_code}" https://ohpr--.staging.all-hands.dev/api/v1/health # 401 = server is up (auth required). DNS may take 1-2 min on first deploy. ``` ## Running E2E Tests Against Staging **Critical: Feature deployments have their own Keycloak instance.** API keys from `app.all-hands.dev` or `$OPENHANDS_API_KEY` will NOT work. You need a test API key for the specific feature deployment. The user must provide one. 
```python from openhands.workspace import OpenHandsCloudWorkspace STAGING = "https://ohpr--.staging.all-hands.dev" with OpenHandsCloudWorkspace( cloud_api_url=STAGING, cloud_api_key="", ) as workspace: # Test the new feature llm = workspace.get_llm() secrets = workspace.get_secrets() print(f"LLM: {llm.model}, secrets: {list(secrets.keys())}") ``` Or run an example script: ```bash OPENHANDS_CLOUD_API_KEY="" \ OPENHANDS_CLOUD_API_URL="https://ohpr--.staging.all-hands.dev" \ python examples/02_remote_agent_server/10_cloud_workspace_saas_credentials.py ``` ### Recording results Push test output to the SDK PR's `.pr/logs/` directory: ```bash cd software-agent-sdk python test_script.py 2>&1 | tee .pr/logs/.log git add -f .pr/logs/.log .pr/README.md git commit -m "docs: add e2e test results" && git push ``` Comment on **both PRs** with pass/fail summary and link to logs. ## Key Gotchas | Gotcha | Details | |--------|---------| | **Feature env auth is isolated** | Each `ohpr-*` deployment has its own Keycloak. Production API keys don't work. | | **Two SHAs in deploy.yaml** | `OPENHANDS_SHA` and `OPENHANDS_RUNTIME_IMAGE_TAG` must both be updated. The runtime tag is `-nikolaik`. | | **Enterprise image must exist** | The Docker CI job on the OpenHands PR must succeed before you can deploy. If it hasn't run, push an empty commit to trigger it. | | **DNS propagation** | First deployment of a new branch takes 1-2 min for DNS. Subsequent deploys are instant. | | **Merge-commit SHA ≠ head SHA** | SDK CI tags Docker images with GitHub Actions' merge-commit SHA, not the PR head SHA. Check the SDK PR description or CI logs for the correct tag. | | **SDK pin blocks merge** | `check-package-versions.yml` prevents merging an OpenHands PR that has `rev` fields in `[tool.poetry.dependencies]`. The SDK must be released to PyPI first. | | **Flow A: stock agent-server is fine** | When only the Cloud API changes, `OpenHandsCloudWorkspace` talks to the Cloud server, not the agent-server. 
No custom image needed. | | **Flow B: agent-server image is required** | When the server needs new SDK code inside runtime containers, you must pin to the SDK PR's agent-server image. | ================================================ FILE: .agents/skills/custom-codereview-guide.md ================================================ --- name: custom-codereview-guide description: Repo-specific code review guidelines for OpenHands/software-agent-sdk. Provides SDK-specific review rules in addition to the default code review skill. triggers: - /codereview --- # OpenHands/software-agent-sdk Code Review Guidelines You are an expert code reviewer for the **OpenHands/software-agent-sdk** repository. This skill provides repo-specific review guidelines. Be direct but constructive. ## Review Decisions You have permission to **APPROVE** or **COMMENT** on PRs. Do not use REQUEST_CHANGES. ### Review decision policy (eval / benchmark risk) Do **NOT** submit an **APPROVE** review when the PR changes agent behavior or anything that could plausibly affect benchmark/evaluation performance — **unless** eval evidence is already provided (see exception below). Examples include: prompt templates, tool calling/execution, planning/loop logic, memory/condenser behavior, terminal/stdin/stdout handling, or evaluation harness code. If a PR is in this category (or you are uncertain), leave a **COMMENT** review and explicitly flag it for a human maintainer to decide after running lightweight evals. #### Exception – eval evidence provided If the PR description **or** PR comments contain a link to the eval monitor (`openhands-eval-monitor.vercel.app`) showing a completed benchmark run **and** a human maintainer has commented confirming the results (e.g., "Human review done", "eval looks good", or similar), treat the eval-risk requirement as satisfied and follow the normal approval policy. The eval monitor link is authoritative proof of benchmark validation for this repository. 
### Default approval policy **Default to APPROVE**: If your review finds no issues at "important" level or higher, approve the PR. Minor suggestions or nitpicks alone are not sufficient reason to withhold approval. **IMPORTANT:** If you determine a PR is worth merging **and it is not in the eval-risk category above**, you should approve it. Don’t just say a PR is "worth merging" or "ready to merge" without actually submitting an approval. Your words and actions should be consistent. ### When to APPROVE Examples of straightforward and low-risk PRs you should approve (non-exhaustive): - **Configuration changes**: Adding models to config files, updating CI/workflow settings - **CI/Infrastructure changes**: Changing runner types, fixing workflow paths, updating job configurations - **Cosmetic changes**: Typo fixes, formatting, comment improvements, README updates - **Documentation-only changes**: Docstring updates, clarifying notes, API documentation improvements - **Simple additions**: Adding entries to lists/dictionaries following existing patterns - **Test-only changes**: Adding or updating tests without changing production code - **Dependency updates**: Version bumps with passing CI, unless the updated package is newer than the repo's 7-day freshness guardrail described in the Security section below ### When NOT to APPROVE - Blocking Issues **DO NOT APPROVE** PRs that have any of the following issues: - **Package version bumps in non-release PRs**: If any `pyproject.toml` file has changes to the `version` field (e.g., `version = "1.12.0"` → `version = "1.13.0"`), and the PR is NOT explicitly a release PR (title/description doesn't indicate it's a release), **DO NOT APPROVE**. Version numbers should only be changed in dedicated release PRs managed by maintainers. 
- Check: Look for changes to `version = "..."` in any `*/pyproject.toml` files - Exception: PRs with titles like "release: v1.x.x" or "chore: bump version to 1.x.x" from maintainers - **Too-new dependency uploads**: If a dependency bump pulls in a package uploaded within the repo's 7-day freshness window, **DO NOT APPROVE**. See the Security section below for the exact review instructions and the Dependabot / `tool.uv.exclude-newer` caveat. Examples of PRs that are fine to approve: - A PR adding a new model to `resolve_model_config.py` or `verified_models.py` with corresponding test updates - A PR adding documentation notes to docstrings clarifying method behavior (e.g., security considerations, bypass behaviors) - A PR changing CI runners or fixing workflow infrastructure issues (e.g., standardizing runner types to fix path inconsistencies) ### When to COMMENT Use COMMENT when you have feedback or concerns: - Issues that need attention (bugs, security concerns, missing tests) - Suggestions for improvement - Questions about design decisions - Minor style preferences If there are significant issues, leave detailed comments explaining the concerns—but let a human maintainer decide whether to block the PR. ## Security ### Dependency freshness / supply-chain guardrail This repository intentionally uses a workspace-wide `uv` resolver guardrail: - Root `pyproject.toml`: `[tool.uv] exclude-newer = "7 days"` **Important:** Dependabot does **not** currently honor that `uv` guardrail when it opens `uv.lock` update PRs for this repo's workspace setup. A Dependabot PR can therefore bump to a version that was uploaded **less than 7 days ago**, even though a local `uv lock` would normally exclude it. When reviewing dependency update PRs (`uv.lock`, `pyproject.toml`, `requirements*.txt`, etc.), explicitly check for **too-new package uploads**: 1. Check the package upload timestamp on the package index. 2. For `uv.lock`, use the per-file `upload-time` metadata in the changed package entry. 3.
Treat `upload-time` as the upload time of that specific distribution file to the package index (for example, the wheel uploaded to PyPI) — not the Git tag time or GitHub release time. 4. Compare that timestamp against the current date and the repo's 7-day freshness window. If the updated package was uploaded **within the last 7 days**, treat it as a real security / supply-chain concern: - Do **NOT** approve the PR. - Leave a **COMMENT** review that clearly calls out the package name, version, upload time, and that it is newer than the repo's 7-day guardrail. - Explain that this can happen because Dependabot currently ignores `tool.uv.exclude-newer` for this repo's workspace updates. - Ask a human maintainer to decide whether to wait until the package ages past the guardrail or to merge intentionally despite the freshness risk. ## Core Principles 1. **Simplicity First**: Question complexity. If something feels overcomplicated, ask "what's the use case?" and seek simpler alternatives. Features should solve real problems, not imaginary ones. 2. **Pragmatic Testing**: Test what matters. Avoid duplicate test coverage. Don't test library features (e.g., `BaseModel.model_dump()`). Focus on the specific logic implemented in this codebase. 3. **Type Safety**: Avoid `# type: ignore` - treat it as a last resort. Fix types properly with assertions, proper annotations, or code adjustments. Prefer explicit type checking over `getattr`/`hasattr` guards. 4. **Backward Compatibility**: Evaluate breaking change impact carefully. Consider API changes that affect existing users, removal of public fields/methods, and changes to default behavior. ## What to Check - **Complexity**: Over-engineered solutions, unnecessary abstractions, complex logic that could be refactored - **Testing**: Duplicate test coverage, tests for library features, missing edge case coverage. 
For code that writes to disk, verify that tests cover the **persistence round-trip** (write → close → reopen → verify), not just in-memory state - **Type Safety**: `# type: ignore` usage, missing type annotations, `getattr`/`hasattr` guards, mocking non-existent arguments - **Breaking Changes**: API changes affecting users, removed public fields/methods, changed defaults - **Code Quality**: Code duplication, missing comments for non-obvious decisions, inline imports (unless necessary for circular deps) - **Repository Conventions**: Use `pyright` not `mypy`, put fixtures in `conftest.py`, avoid `sys.path.insert` hacks - **Directory Example Entrypoints**: PRs that add or modify folder-based runnable examples under `examples/` should use `main.py` as the entrypoint and add the directory to `_TARGET_DIRECTORIES` in `tests/examples/test_examples.py`; see [Directory-Based Examples](#directory-based-examples) - **Event Type Deprecation**: Changes to event types (Pydantic models used in serialization) must handle deprecated fields properly - **Thread Safety**: New methods in `LocalConversation` that read or write `self._state` must use `with self._state:` — see the [Concurrency](#concurrency---localconversation-state-lock) section below - **Persistence Paths**: Code that computes persistence directories must not double-append the conversation hex — see the [Persistence Paths](#persistence-path-construction) section below - **Server-Side Cleanup**: Endpoints that create persistent state (directories, files) must have rollback logic for partial failures — see the [Server Error Handling](#server-side-error-handling) section below - **Cross-File Data Flow**: When new code calls existing APIs (constructors, factory methods), trace 1–2 levels into those APIs to verify the caller uses them correctly. 
Bugs often hide at layer boundaries where the caller's assumptions don't match the callee's behavior - **Secret Serialization**: Fields that carry secrets must use `serialize_secret()` from `openhands.sdk.utils.pydantic_secrets`. For `dict[str, str]` secret fields, wrap each value in `SecretStr` and call `serialize_secret` per value. Do not hand-roll redaction logic (e.g. custom sentinels or inline `expose_secrets` checks) in field serializers - **Info-Log Payloads**: `logger.info(...)` must not dump objects, dicts, or variable-length lists — see [Logging Hygiene](#logging-hygiene) ## Directory-Based Examples When a PR adds or modifies a runnable example represented by a directory under `examples/`, verify that: 1. The runnable entrypoint is named `main.py`. 2. Helper modules inside that directory are not accidentally treated as standalone examples. 3. `tests/examples/test_examples.py` includes the example directory in `_TARGET_DIRECTORIES` when the example should run in the `test-examples` workflow. 4. The example prints an `EXAMPLE_COST: ...` marker when run by the workflow. Do not ask for this convention on support scripts that are intentionally named for GitHub workflow consumption (for example reusable automation scripts under `examples/03_github_workflows/`) unless they are presented as a directory-based runnable example. ## Event Type Deprecation - Critical Review Checkpoint When reviewing PRs that modify event types (e.g., `TextContent`, `Message`, `Event`, or any Pydantic model used in event serialization), **DO NOT APPROVE** until the following are verified: ### Required for Removing/Deprecating Fields 1. **Model validator present**: If a field is being removed from an event type with `extra="forbid"`, there MUST be a `@model_validator(mode="before")` that uses `handle_deprecated_model_fields()` to remove the deprecated field before validation. Otherwise, old events will fail to load. 2. 
**Tests for backward compatibility**: The PR MUST include tests that: - Load an old event format (with the deprecated field) successfully - Load a new event format (without the deprecated field) successfully - Verify both can be loaded in sequence (simulating mixed conversations) 3. **Test naming convention**: The version in the test name should be the **LAST version** where a particular event structure exists. For example, if `enable_truncation` was removed in v1.11.1, the test should be named `test_v1_10_0_...` (the last version with that field), not `test_v1_8_0_...` (when it was introduced). This avoids duplicate tests and clearly documents when a field was last present. **Important**: Deprecated field handlers are **permanent** and should never be removed. They ensure old conversations can always be loaded. ### Example Pattern (Required) ```python from openhands.sdk.utils.deprecation import handle_deprecated_model_fields class MyModel(BaseModel): model_config = ConfigDict(extra="forbid") # Deprecated fields that are silently removed for backward compatibility # when loading old events. These are kept permanently. _DEPRECATED_FIELDS: ClassVar[tuple[str, ...]] = ("old_field_name",) @model_validator(mode="before") @classmethod def _handle_deprecated_fields(cls, data: Any) -> Any: """Remove deprecated fields for backward compatibility with old events.""" return handle_deprecated_model_fields(data, cls._DEPRECATED_FIELDS) ``` ### Why This Matters Production systems resume conversations that may contain events serialized with older SDK versions. If the SDK can't load old events, users will see errors like: ``` pydantic_core.ValidationError: Extra inputs are not permitted ``` **This is a production-breaking change.** Do not approve PRs that modify event types without proper backward compatibility handling and tests. ## SDK Architecture Conventions These conventions codify patterns that are easy to violate when adding new features. Each was learned from a real bug. 
### Concurrency - LocalConversation State Lock `LocalConversation` protects mutable state with a FIFOLock accessed via `with self._state:`. **Every** method that reads or writes `self._state.events`, `self._state.stats`, `self._state.agent_state`, `self._state.activated_knowledge_skills`, or any other mutable field on `ConversationState` must hold this lock. There are currently ~13 call sites using this pattern. When reviewing a PR that adds a new method to `LocalConversation`: 1. Check whether it accesses any `self._state.*` field. 2. If yes, verify the access is inside a `with self._state:` block. 3. If not, flag it — the method is unsafe for concurrent use with `run()`. ### Persistence Path Construction `BaseConversation.get_persistence_dir(base, conversation_id)` returns `str(Path(base) / conversation_id.hex)`. The `LocalConversation.__init__` constructor calls this automatically when `persistence_dir` is provided. **Rule:** Callers that pass `persistence_dir` to `LocalConversation()` must pass only the **base directory** (e.g., `/data/conversations/`). The constructor appends the conversation hex. Passing a pre-constructed full path (e.g., `/data/conversations/abc123`) causes double-appending: `/data/conversations/abc123/abc123`. When reviewing code that creates a new `LocalConversation` (fork, resume, migration): 1. Check what value is passed as `persistence_dir`. 2. Verify it does **not** already include the conversation ID hex. ### Server-Side Error Handling Server endpoints in `conversation_service.py` that create persistent state (writing directories, files, or calling `fork()` which writes to disk) and then perform follow-up operations (like `_start_event_service`) must handle partial failure. **Pattern:** If the follow-up operation fails, clean up the already-written persistent state so it doesn't become an orphaned directory that confuses future startups. 
```python # Good: rollback on failure fork_dir = self.conversations_dir / fork_conv_id.hex try: fork_event_service = await self._start_event_service(fork_stored) except Exception: safe_rmtree(fork_dir) raise ``` When reviewing server endpoints that create conversations or persistent artifacts: 1. Identify the "point of no return" where state is written to disk. 2. Check that subsequent operations are wrapped in try/except with cleanup. 3. For client-supplied IDs, verify there's a duplicate check before creating state (return 409 Conflict if taken). ### Logging Hygiene `logger.info(...)` must not interpolate `model_dump(...)`, `.json()`, `to_dict()`, a list/dict of tool/skill/server names, or arbitrary user-supplied values. Log a count and/or id; move full payloads to `logger.debug(...)`. When reviewing a new or changed `logger.info(...)` call: if any interpolated value is an object, a dict, or a list whose size scales with load (tools, skills, conversations, requests), flag it. ## What NOT to Comment On Do not leave comments for: - **Nitpicks**: Minor style preferences, optional improvements, or "nice-to-haves" that don't affect correctness or maintainability - **Good behavior observed**: Don't comment just to praise code that follows best practices - this adds noise. Simply approve if the code is good. - **Suggestions for additional tests on simple changes**: For straightforward PRs (config changes, model additions, etc.), don't suggest adding test coverage unless tests are clearly missing for new logic - **Obvious or self-explanatory code**: Don't ask for comments on code that is already clear - **`.pr/` directory artifacts**: Files in the `.pr/` directory are temporary PR-specific documents (design notes, analysis, scripts) that are automatically cleaned up when the PR is approved. Do not comment on their presence or suggest removing them. If a PR is approvable, just approve it. 
Don't add "one small suggestion" or "consider doing X" comments that delay merging without adding real value. ## Communication Style - Be direct and concise - don't over-explain - Use casual, friendly tone ("lgtm", "WDYT?", emojis are fine 👀) - Ask questions to understand use cases before suggesting changes - Suggest alternatives, not mandates - Approve quickly when code is good ("LGTM!") - Use GitHub suggestion syntax for code fixes ================================================ FILE: .agents/skills/debug-test-examples-workflow/SKILL.md ================================================ --- name: debug-test-examples-workflow description: Guide for debugging failing example tests in the `test-examples` labeled workflow. Use this skill when investigating CI failures in the run-examples.yml workflow, when example scripts fail to run correctly, when needing to isolate specific test failures, or when analyzing workflow logs and failure patterns. --- # Debugging test-examples Workflow ## Overview The `run-examples.yml` workflow runs example scripts from `examples/` directory. Triggers: - Adding `test-examples` label to a PR - Manual workflow dispatch - Scheduled nightly runs ## Debugging Steps ### 1. Isolate Failing Tests Modify `tests/examples/test_examples.py` to focus on specific tests: ```python _TARGET_DIRECTORIES = ( # EXAMPLES_ROOT / "01_standalone_sdk", EXAMPLES_ROOT / "02_remote_agent_server", # Keep only failing directory ) ``` ### 2. Exclude Tests Add to `_EXCLUDED_EXAMPLES` with explanation: ```python _EXCLUDED_EXAMPLES = { # Reason for exclusion "examples/path/to/failing_test.py", } ``` ### 3. 
Trigger Workflow Toggle the `test-examples` label: ```bash # Remove label curl -X DELETE -H "Authorization: token $GITHUB_TOKEN" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/issues/${PR_NUMBER}/labels/test-examples" # Add label curl -X POST -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github.v3+json" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/issues/${PR_NUMBER}/labels" \ -d '{"labels":["test-examples"]}' ``` ### 4. Monitor Progress ```bash # Check status curl -s -H "Authorization: token $GITHUB_TOKEN" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/runs/{RUN_ID}" | jq '{status, conclusion}' # Download logs curl -sL -H "Authorization: token $GITHUB_TOKEN" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/runs/{RUN_ID}/logs" -o logs.zip unzip logs.zip -d logs ``` ## Common Failure Patterns | Pattern | Cause | Solution | |---------|-------|----------| | Port conflicts | Fixed ports (8010, 8011) | Run with `-n 1` or use different ports | | Container issues | Docker/Apptainer setup | Check Docker availability, image pulls | | LLM failures | Transient API errors | Retry the test | | Example bugs | Code errors | Check traceback | ## Key Configuration **Workflow** (`.github/workflows/run-examples.yml`): - Runner: `blacksmith-2vcpu-ubuntu-2404` - Timeout: 60 minutes - Parallelism: `-n 4` (pytest-xdist: 4 parallel workers) **Tests** (`tests/examples/test_examples.py`): - Timeout per example: 600 seconds - Target directories: `_TARGET_DIRECTORIES` - Excluded examples: `_EXCLUDED_EXAMPLES` ================================================ FILE: .agents/skills/design-principles.md ================================================ --- name: design-principles description: Core architectural design principles of the OpenHands Software Agent SDK.
Reference when making architectural decisions, reviewing PRs that change agent/tool/state boundaries, or evaluating whether a proposed change aligns with V1 design goals. --- # SDK Design Principles Reference: ## Quick Summary 1. **Optional Isolation over Mandatory Sandboxing** Sandboxing is opt-in, not universal. Agent and tool execution runs in a single process by default. When isolation is needed, the same stack can be transparently containerized. 2. **Stateless by Default, One Source of Truth for State** All components — agents, tools, LLMs, configurations — are **immutable Pydantic models** validated at construction. The only mutable entity is the conversation state. This enables deterministic replay and robust persistence. 3. **Clear Boundaries between Agent and Applications** Strict separation between SDK (agent core), tools, workspace, and agent server. Applications communicate via APIs, not by embedding the agent. 4. **Composable Components for Extensibility** Agents are graphs of interchangeable components — tools, prompts, LLMs, contexts — described **declaratively with strong typing**. Developers reconfigure capabilities without modifying core code. ## Implications for Development - Since agents are immutable Pydantic models, their configuration **is** their serializable representation. There should be no need to "reverse-engineer" agent config from runtime instances. - Tool implementations (callables) are the only non-serializable part; this is solved by `tool_module_qualnames` for remote forwarding. - Everything else (system_prompt, model, skills, tool names) is already declarative data that can be serialized and forwarded directly. - Avoid patterns that create multiple sources of truth for the same configuration (e.g., a factory function AND an extracted definition). - `model_copy(update=...)` should be used sparingly and through well-defined paths to avoid undermining statelessness. 
================================================ FILE: .agents/skills/feature-release-rollout/SKILL.md ================================================ --- name: feature-release-rollout description: This skill should be used when the user asks to "rollout a feature", "complete feature release", "propagate SDK feature", "track feature support", "what's missing for feature X", or mentions checking CLI/GUI/docs/blog support for SDK features. Guides agents through the multi-repository feature release workflow from SDK to docs to marketing. triggers: - rollout feature - feature release - propagate feature - feature support - complete release - docs for feature - blog for feature - CLI support - GUI support - what's missing --- # Feature Release Rollout This skill guides the complete feature release workflow across the OpenHands ecosystem repositories. ## Overview When a feature is implemented in the SDK, it may need propagation through several repositories: 1. **SDK** (`OpenHands/software-agent-sdk`) — Core feature implementation 2. **CLI** (`OpenHands/OpenHands-CLI`) — Terminal interface support 3. **GUI** (`OpenHands/OpenHands` frontend directory) — Web interface support 4. **Docs** (`OpenHands/docs`) — Documentation updates (sdk/ folder) 5. **Blog** (`OpenHands/growth-utils` blog-post/) — Marketing and announcements 6. **Video** — Tutorial content (using ElevenLabs + Remotion) ## Workflow ### Phase 1: Feature Discovery First, identify what feature(s) to analyze. 
The user may specify: - A release tag (e.g., `v1.9.0`) - A specific feature name - A PR or commit reference - A comparison between versions **For release tags:** ```bash # Clone SDK if not present git clone https://github.com/OpenHands/software-agent-sdk.git # View release notes cd software-agent-sdk git log --oneline v1.8.0..v1.9.0 # Changes between versions git show v1.9.0 --stat # What changed in this release ``` **For specific features:** Search the SDK codebase, examples, and changelog to understand the feature scope. ### Phase 2: Repository Analysis Clone all relevant repositories to analyze current support: ```bash # Clone repositories (use GITHUB_TOKEN for authenticated access) git clone https://github.com/OpenHands/software-agent-sdk.git git clone https://github.com/OpenHands/OpenHands-CLI.git git clone https://github.com/OpenHands/OpenHands.git # Frontend in frontend/ git clone https://github.com/OpenHands/docs.git git clone https://github.com/OpenHands/growth-utils.git ``` For each feature, check support status: | Repository | Check Location | What to Look For | |------------|---------------|------------------| | CLI | `openhands_cli/` | Feature flags, commands, TUI widgets | | GUI | `OpenHands/frontend/src/` | React components, API integrations | | Docs | `docs/sdk/` | Guide pages, API reference, examples | | Blog | `growth-utils/blog-post/posts/` | Announcement posts | ### Phase 3: Assess Feature Importance Not all features warrant full rollout. 
Evaluate each feature: **High Impact (full rollout recommended):** - New user-facing capabilities - Breaking changes or migrations - Major performance improvements - New integrations or tools **Medium Impact (docs + selective support):** - New API methods or parameters - Configuration options - Developer experience improvements **Low Impact (docs only or skip):** - Internal refactoring - Bug fixes - Minor enhancements **Skip rollout for:** - Internal-only changes - Test improvements - Build/CI changes - Documentation typos ### Phase 4: Create Proposal Generate a structured proposal for the user: ```markdown ## Feature Rollout Proposal: [Feature Name] ### Feature Summary [Brief description of the feature and its value] ### Current Support Status | Component | Status | Notes | |-----------|--------|-------| | SDK | ✅ Implemented | [version/PR] | | CLI | ❌ Missing | [what's needed] | | GUI | ⚠️ Partial | [what's implemented vs needed] | | Docs | ❌ Missing | [suggested pages] | | Blog | ❌ Not started | [whether warranted] | | Video | ❌ Not started | [whether warranted] | ### Recommended Actions 1. **CLI**: [specific implementation needed] 2. **GUI**: [specific implementation needed] 3. **Docs**: [pages to create/update] 4. **Blog**: [recommended or not, with reasoning] 5. **Video**: [recommended or not, with reasoning] ### Assessment - **Overall Priority**: [High/Medium/Low] - **Effort Estimate**: [days/hours per component] - **Dependencies**: [what must be done first] ``` ### Phase 5: User Confirmation Wait for explicit user approval before proceeding. 
Ask: - Which components to implement - Priority ordering - Any modifications to the proposal ### Phase 6: Implementation Only after user confirmation: **Create GitHub Issues:** ```bash # Create issue on relevant repo gh issue create --repo OpenHands/OpenHands-CLI \ --title "Support [feature] in CLI" \ --body "## Context\n[Feature description]\n\n## Implementation\n[Details]\n\n## Related\n- SDK: [link]\n- Docs: [link]" ``` **Implementation order:** 1. CLI/GUI support (can be parallel) 2. Documentation (depends on 1) 3. Blog post (depends on 2) 4. Video (depends on 3) ## Repository-Specific Guidelines ### CLI (OpenHands/OpenHands-CLI) - Check `AGENTS.md` for development guidelines - Use `uv` for dependency management - Run `make lint` and `make test` before commits - TUI components in `openhands_cli/tui/` - Snapshot tests for UI changes ### GUI (OpenHands/OpenHands frontend) - Frontend in `frontend/` directory - React/TypeScript codebase - Run `npm run lint:fix && npm run build` in frontend/ - Follow TanStack Query patterns for data fetching - i18n translations in `frontend/src/i18n/` ### Docs (OpenHands/docs) - SDK docs in `sdk/` folder - Uses Mintlify (`.mdx` files) - Code blocks can auto-sync from SDK examples - Run `mint broken-links` to validate - Follow `openhands/DOC_STYLE_GUIDE.md` ### Blog (OpenHands/growth-utils) - Posts in `blog-post/posts/YYYYMMDD-title.md` - Assets in `blog-post/assets/YYYYMMDD-title/` - Frontmatter format: ```yaml --- title: "Post Title" excerpt: "Brief description" coverImage: "/assets/blog/YYYYMMDD-title/cover.png" date: "YYYY-MM-DDTHH:MM:SS.000Z" authors: - name: Author Name picture: "/assets/blog/authors/author.png" ogImage: url: "/assets/blog/YYYYMMDD-title/cover.png" --- ``` ## Example Feature Analysis **Feature: Browser Session Recording (SDK v1.8.0)** 1. **SDK**: ✅ Implemented in `openhands.tools.browser` 2. **CLI**: ❌ No replay/export commands 3. **GUI**: ❌ No recording viewer component 4. 
**Docs**: ✅ Guide at `sdk/guides/browser-session-recording.mdx` 5. **Blog**: ❌ Could highlight for web scraping users 6. **Video**: Consider 2-minute demo **Recommendation**: Medium priority. Docs done, CLI/GUI low urgency (advanced feature), blog post optional. ## Quick Commands ```bash # Check SDK feature presence grep -r "feature_name" software-agent-sdk/openhands/ --include="*.py" # Check CLI support grep -r "feature_name" OpenHands-CLI/openhands_cli/ --include="*.py" # Check GUI support grep -r "featureName" OpenHands/frontend/src/ --include="*.ts" --include="*.tsx" # Check docs coverage grep -r "feature" docs/sdk/ --include="*.mdx" # Check blog mentions grep -r "feature" growth-utils/blog-post/posts/ --include="*.md" ``` ## Important Notes - Always get user confirmation before creating issues or starting implementation - Consider feature maturity — new features may change before full rollout - Cross-reference PRs between repositories in issue descriptions - For breaking changes, coordinate release timing across all components ================================================ FILE: .agents/skills/manage-evals/SKILL.md ================================================ --- name: manage-evals description: This skill should be used when the user asks to "trigger an eval", "run evaluation", "run swebench", "run gaia", "run benchmark", "compare eval runs", "compare evaluation results", "check eval regression", "compare benchmark results", "what changed in the eval", "diff eval runs", or mentions triggering, comparing, or reporting on SWE-bench, GAIA, or other benchmark evaluation results. Provides workflow for triggering evaluations on different benchmarks, finding and comparing runs, and reporting performance differences. --- # Managing Evaluations ## Overview OpenHands evaluations produce results stored on a CDN at `https://results.eval.all-hands.dev/`. Each run is identified by a path: `{benchmark}/{model_slug}/{github_run_id}/`. 
This skill enables triggering evaluation runs, comparing results between runs, and posting performance reports as GitHub PR comments. ## Quick Start ### Trigger an Evaluation ```bash python .agents/skills/manage-evals/scripts/manage_evals.py trigger \ --sdk-ref {sdk_ref} --benchmark swebench --eval-limit 50 ``` ### Compare Runs ```bash python .agents/skills/manage-evals/scripts/manage_evals.py compare \ "{benchmark}/{model_slug}/{run_id}/" \ --auto-baseline ``` ### Compare and Post to PR ```bash python .agents/skills/manage-evals/scripts/manage_evals.py compare \ "{benchmark}/{model_slug}/{run_id}/" \ --auto-baseline \ --post-comment --pr {pr_number} --repo OpenHands/software-agent-sdk ``` ## Triggering Evaluations ### Using the Script ```bash # SWE-bench (default) on a PR branch python .agents/skills/manage-evals/scripts/manage_evals.py trigger \ --sdk-ref my-feature-branch --eval-limit 50 # GAIA benchmark python .agents/skills/manage-evals/scripts/manage_evals.py trigger \ --sdk-ref main --benchmark gaia --eval-limit 50 # With a specific model python .agents/skills/manage-evals/scripts/manage_evals.py trigger \ --sdk-ref v1.16.0 --benchmark swebench --model-ids gemini-3-flash --eval-limit 50 # Multiple benchmarks (run the command multiple times) for bench in swebench gaia; do python .agents/skills/manage-evals/scripts/manage_evals.py trigger \ --sdk-ref main --benchmark "$bench" --eval-limit 50 --reason "Multi-benchmark eval" done ``` ### Available Benchmarks | Benchmark | Description | |-----------|-------------| | `swebench` | SWE-bench (default) — software engineering tasks | | `swebenchpro` | SWE-Bench Pro — harder software engineering tasks | | `gaia` | GAIA — general AI assistant tasks | | `swtbench` | SWT-bench — software testing tasks | | `commit0` | Commit0 — commit generation tasks | | `swebenchmultimodal` | SWE-bench Multimodal — tasks with images | | `terminalbench` | TerminalBench — terminal interaction tasks | ### Trigger Options | Option | Default | Description | |--------|---------|-------------| | `--sdk-ref` | *(required)* |
Branch, tag, or commit SHA to evaluate | | `--benchmark` | `swebench` | Benchmark to run | | `--eval-limit` | `50` | Number of instances to evaluate | | `--model-ids` | *(first in config)* | Comma-separated model IDs from `resolve_model_config.py` | | `--tool-preset` | `default` | Tool preset: `default`, `gemini`, `gpt5`, `planning` | | `--agent-type` | `default` | Agent type: `default`, `acp-claude`, `acp-codex` | | `--instance-ids` | | Specific instance IDs to evaluate (overrides eval-limit) | | `--reason` | | Human-readable reason (shown in notifications) | | `--benchmarks-branch` | `main` | Branch of the benchmarks repo | | `--eval-branch` | `main` | Branch of the evaluation repo | ### Via PR Labels (Alternative) Adding a label to a PR also triggers evaluations: - `run-eval-1` — 1 instance (quick sanity check) - `run-eval-50` — 50 instances (standard comparison) - `run-eval-200` — 200 instances - `run-eval-500` — 500 instances (full benchmark) ## Comparing Evaluation Runs ### Step 1: Find the Current PR's Eval Run Eval runs are triggered by adding labels like `run-eval-50` to a PR. The `all-hands-bot` posts a comment with results when complete. **Option A — From bot comments on the PR:** ```bash gh api repos/OpenHands/software-agent-sdk/issues/{pr_number}/comments \ --jq '.[] | select(.user.login == "all-hands-bot") | .body' \ | grep -o 'Evaluation:.*' | head -1 ``` The evaluation name follows the format `{github_run_id}-{model_slug_short}` (e.g., `23775164157-claude-son`). Extract the `github_run_id` from this. **Option B — From the "Evaluation Triggered" bot comment:** ```bash gh api repos/OpenHands/software-agent-sdk/issues/{pr_number}/comments \ --jq '.[] | select(.body | test("Evaluation Triggered")) | .body' ``` This contains the SDK commit SHA. Cross-reference with daily metadata to find the run ID. **Option C — From daily metadata:** ```bash curl -s "https://results.eval.all-hands.dev/metadata/$(date -u +%Y-%m-%d).txt" ``` Each line is a run path.
Match by benchmark and model to find the run. ### Step 2: Identify the Run Path Components A run path has three components: - **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench` - **model_slug**: Derived from model name with `/:@.` replaced by `-` (e.g., `litellm_proxy-claude-sonnet-4-5-20250929`) - **run_id**: The GitHub Actions workflow run ID from the `OpenHands/evaluation` repo ### Step 3: Verify Results Exist ```bash curl -sI "https://results.eval.all-hands.dev/{benchmark}/{model_slug}/{run_id}/output.report.json" | head -1 ``` A `200` status confirms the run completed and results are available. ### Step 4: Find a Baseline for Comparison **Automatic**: The comparison script's `--auto-baseline` flag scans metadata files backward up to 14 days to find the most recent completed run with the same benchmark and model. **Manual**: Inspect metadata files or other PR bot comments to identify a specific run: ```bash # Check today's runs curl -s "https://results.eval.all-hands.dev/metadata/$(date -u +%Y-%m-%d).txt" | grep "swebench/litellm_proxy-claude" # Check yesterday's runs curl -s "https://results.eval.all-hands.dev/metadata/$(date -u -d yesterday +%Y-%m-%d).txt" | grep "swebench/litellm_proxy-claude" ``` ### Step 5: Run the Comparison ```bash python .agents/skills/manage-evals/scripts/manage_evals.py compare \ "swebench/litellm_proxy-claude-sonnet-4-5-20250929/23775164157/" \ --baseline "swebench/litellm_proxy-claude-sonnet-4-5-20250929/23773892085/" ``` Or with auto-baseline and PR comment posting: ```bash python .agents/skills/manage-evals/scripts/manage_evals.py compare \ "swebench/litellm_proxy-claude-sonnet-4-5-20250929/23775164157/" \ --auto-baseline \ --post-comment --pr 2334 --repo OpenHands/software-agent-sdk ``` ## Available Data Per Run Each run stores files at `https://results.eval.all-hands.dev/{run_path}/`: | File | Description | |------|-------------| | `metadata/params.json` | Run parameters: SDK commit, PR number,
model, eval_limit, triggered_by | | `output.report.json` | Aggregated results: resolved/submitted/total counts and instance IDs | | `cost_report.jsonl` | Per-instance cost data | | `results.tar.gz` | Full archive with all outputs | ## Dashboard The eval monitor dashboard provides a visual view of runs: ``` https://openhands-eval-monitor.vercel.app/?run={benchmark}/{model_slug}/{run_id}/ ``` ## Interpreting Results - **Success rate** = resolved / min(eval_limit, total_instances) - A 50-instance sample has natural variance of ±2-4 resolved instances between runs - Focus on **instance-level changes** (gained/lost) to understand regressions vs. noise - If the same set of instances is resolved, the difference is likely noise ## Additional Resources ### Reference Files - **`references/eval-infrastructure.md`** — Detailed documentation on the evaluation infrastructure, GCS paths, metadata format, and workflow triggers ### Scripts - **`scripts/manage_evals.py`** — Standalone comparison script with auto-baseline detection and GitHub comment posting ================================================ FILE: .agents/skills/manage-evals/references/eval-infrastructure.md ================================================ # Evaluation Infrastructure Reference ## Architecture Overview The evaluation pipeline spans three repositories: 1. **OpenHands/software-agent-sdk** — Triggers evaluations via `run-eval.yml` workflow 2. **OpenHands/evaluation** — Orchestrates the eval job via `eval-job.yml` workflow 3. **OpenHands/benchmarks** — Contains benchmark runners (inference + evaluation) ## Trigger Flow ### PR Label Trigger 1. A label (`run-eval-1`, `run-eval-50`, `run-eval-200`, `run-eval-500`) is added to a PR 2. `software-agent-sdk/.github/workflows/run-eval.yml` fires 3. It resolves model configs from `.github/run-eval/resolve_model_config.py` 4. 
Dispatches `eval-job.yml` in `OpenHands/evaluation` with: - `sdk_commit`: The PR's head SHA - `sdk_workflow_run_id`: The `run-eval.yml` workflow run ID - `eval_limit`: Extracted from label name - `models_json`: Resolved model configurations - `pr_number`: The PR number (for result posting) 5. Posts an "Evaluation Triggered" comment on the PR ### Release Trigger Runs automatically on `release` events with `eval_limit=50`. ### Manual Trigger Via `workflow_dispatch` on `run-eval.yml` with explicit parameters. ## Results Storage (GCS) Results are stored in Google Cloud Storage bucket `openhands-evaluation-results` and served via CDN at `https://results.eval.all-hands.dev/`. ### Run Path Format ``` {benchmark}/{model_slug}/{github_run_id}/ ``` - **benchmark**: `swebench`, `swebenchpro`, `gaia`, `swtbench`, `commit0`, `swebenchmultimodal`, `terminalbench` - **model_slug**: Model name with `/:@.` replaced by `-` - Example: `litellm_proxy/claude-sonnet-4-5-20250929` → `litellm_proxy-claude-sonnet-4-5-20250929` - **github_run_id**: The GitHub Actions run ID from the `OpenHands/evaluation` repo ### Files Per Run ``` {run_path}/ ├── metadata/ │ └── params.json # Job parameters (uploaded at job start) ├── output.report.json # Aggregated evaluation results ├── cost_report.jsonl # Per-instance cost data └── results.tar.gz # Full archive ``` ### params.json Schema ```json { "timestamp": "2026-03-31T00:54:15Z", "sdk_commit": "42852dc2260a461536acc186cd918ad5a58910dd", "sdk_workflow_run_id": "23775150328", "eval_limit": 50, "benchmark": "swebench", "model_name": "litellm_proxy/claude-sonnet-4-5-20250929", "model_id": "claude-sonnet-4-5-20250929", "model_display_name": "Claude Sonnet 4.5", "unique_eval_name": "23775164157-claude-son", "commit": "42852dc2260a461536acc186cd918ad5a58910dd", "pr_number": "2334", "triggered_by": "enyst", "tool_preset": "default", "agent_type": "default", "github_run_id": "23775164157" } ``` ### output.report.json Schema ```json { "total_instances": 500, 
"submitted_instances": 50, "completed_instances": 50, "resolved_instances": 35, "unresolved_instances": 15, "empty_patch_instances": 0, "error_instances": 0, "completed_ids": ["instance_id_1", "..."], "resolved_ids": ["instance_id_1", "..."], "unresolved_ids": ["instance_id_1", "..."], "empty_patch_ids": [], "error_ids": [] } ``` ## Daily Metadata All runs registered on a given day are listed in: ``` https://results.eval.all-hands.dev/metadata/YYYY-MM-DD.txt ``` Each line is a run path. Example: ``` swebench/litellm_proxy-claude-sonnet-4-5-20250929/23773892085/ swebench/litellm_proxy-gemini-3-flash-preview/23774756886/ gaia/litellm_proxy-claude-sonnet-4-5-20250929/23775142614/ ``` Metadata files are updated atomically with generation preconditions and have `Cache-Control: no-cache` set. ## Dashboard The eval monitor dashboard at `https://openhands-eval-monitor.vercel.app/` provides a visual view of runs. Construct URLs as: ``` https://openhands-eval-monitor.vercel.app/?run={benchmark}/{model_slug}/{run_id}/ ``` ## Bot Comments When an eval completes, `all-hands-bot` posts a comment on the PR (if `pr_number` was provided) with: - Evaluation name (e.g., `23775164157-claude-son`) - Model name - Results summary (total, submitted, resolved, unresolved, empty patch, error counts) - Success rate - Archive link ## Model Slug Computation The model slug is derived from the LLM config's `model` field: ```python model = config["model"] # e.g., "litellm_proxy/claude-sonnet-4-5-20250929" for ch in "/:@.": model = model.replace(ch, "-") # Result: "litellm_proxy-claude-sonnet-4-5-20250929" ``` ## Available Models Models are defined in `software-agent-sdk/.github/run-eval/resolve_model_config.py`. Each model has an `id`, `display_name`, and `llm_config` with the model path and parameters. 
## Variance Between Runs For 50-instance SWE-bench evaluations: - Natural variance is typically ±2-4 resolved instances between identical configurations - Focus on instance-level changes (which specific instances gained/lost) to distinguish real regressions from noise - If the resolved instance set is identical, the runs are equivalent ================================================ FILE: .agents/skills/manage-evals/scripts/manage_evals.py ================================================ #!/usr/bin/env python3 """Trigger, compare, and report on OpenHands evaluation runs. Subcommands: trigger Dispatch an evaluation workflow via the GitHub API compare Compare two evaluation runs and produce a markdown report Examples: # Trigger a swebench eval on a PR branch python manage_evals.py trigger --sdk-ref my-branch --benchmark swebench --eval-limit 50 # Trigger a GAIA eval on a release tag python manage_evals.py trigger --sdk-ref v1.16.0 --benchmark gaia --eval-limit 50 # Auto-find baseline and print comparison markdown python manage_evals.py compare swebench/litellm_proxy-claude-sonnet-4-5-20250929/23775164157/ --auto-baseline # Post comparison to PR python manage_evals.py compare swebench/.../23775164157/ --auto-baseline \\ --post-comment --pr 2334 --repo OpenHands/software-agent-sdk """ # noqa: E501 from __future__ import annotations import argparse import json import os import sys import urllib.request from datetime import UTC, datetime, timedelta from typing import Any RESULTS_CDN = os.environ.get("RESULTS_CDN", "https://results.eval.all-hands.dev") DASHBOARD_BASE = "https://openhands-eval-monitor.vercel.app" SDK_REPO = "OpenHands/software-agent-sdk" BENCHMARKS = [ "swebench", "swebenchpro", "gaia", "swtbench", "commit0", "swebenchmultimodal", "terminalbench", ] TOOL_PRESETS = ["default", "gemini", "gpt5", "planning"] AGENT_TYPES = ["default", "acp-claude", "acp-codex"] def fetch_json(url: str) -> dict[str, Any] | None: """Fetch JSON from a URL, returning None on 
def fetch_text(url: str) -> str | None:
    """Download *url* and return the response body decoded as text.

    Returns:
        The decoded body, or ``None`` when the server answers 404 or when the
        request fails for any non-HTTP reason (malformed URL, network error,
        timeout, decode error). Non-HTTP failures are reported on stderr.

    Raises:
        urllib.error.HTTPError: For any HTTP error status other than 404.
    """
    try:
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request, timeout=15) as response:
            raw = response.read()
        return raw.decode()
    except urllib.error.HTTPError as http_err:
        # A missing object is an expected outcome; every other status is not.
        if http_err.code != 404:
            raise
        return None
    except Exception as exc:
        print(f"Warning: Failed to fetch {url}: {exc}", file=sys.stderr)
        return None
def get_metadata_for_date(date_str: str) -> list[str]:
    """Return the run paths listed in the daily metadata file for *date_str*.

    *date_str* is interpolated verbatim into the listing URL
    (``{RESULTS_CDN}/metadata/{date_str}.txt``), so it must already be in
    ``YYYY-MM-DD`` form. A missing or empty listing yields an empty list.
    Blank lines are dropped and surrounding whitespace is stripped.
    """
    listing = fetch_text(f"{RESULTS_CDN}/metadata/{date_str}.txt")
    if not listing:
        return []

    run_paths: list[str] = []
    for raw_line in listing.strip().split("\n"):
        cleaned = raw_line.strip()
        if cleaned:
            run_paths.append(cleaned)
    return run_paths
def _eval_limit(params: dict[str, Any] | None, fallback: int) -> int:
    """Return ``eval_limit`` from *params* as an int, or *fallback* if unusable.

    ``params.json`` is written by an external workflow, so the field may be
    absent, ``null`` or a numeric string; none of those should abort the report.
    """
    raw = (params or {}).get("eval_limit")
    if raw is None:
        return fallback
    try:
        return int(raw)
    except (TypeError, ValueError):
        return fallback


def _short_commit(params: dict[str, Any] | None) -> str:
    """Return the first 12 characters of ``sdk_commit``, or ``"unknown"``."""
    return str((params or {}).get("sdk_commit") or "unknown")[:12]


def _dashboard_url(params: dict[str, Any], run_id: Any) -> str:
    """Build the eval-monitor dashboard URL for one run from its params."""
    benchmark = params.get("benchmark", "swebench")
    # Same slug rule as the results bucket: each of "/:@." becomes "-".
    model_slug = str(params.get("model_name") or "").translate(
        str.maketrans("/:@.", "----")
    )
    return f"{DASHBOARD_BASE}/?run={benchmark}/{model_slug}/{run_id}/"


def _instance_list(label: str, ids: list[str], shown: int = 20) -> list[str]:
    """Render one gained/lost paragraph, listing at most *shown* instance ids."""
    rendered = [
        f"{label} ({len(ids)}):** " + ", ".join(f"`{i}`" for i in ids[:shown])
    ]
    if len(ids) > shown:
        rendered.append(f" ... and {len(ids) - shown} more")
    rendered.append("")
    return rendered


def compute_diff(
    current: dict[str, Any],
    baseline: dict[str, Any],
    current_params: dict[str, Any] | None,
    baseline_params: dict[str, Any] | None,
) -> str:
    """Produce a markdown comparison of two eval reports.

    Args:
        current: ``output.report.json`` content of the run under review.
        baseline: ``output.report.json`` content of the reference run.
        current_params: ``metadata/params.json`` of the current run, if any.
        baseline_params: ``metadata/params.json`` of the baseline run, if any.

    Returns:
        A markdown document with a headline success-rate delta, a metrics
        table, the instance-level gains/regressions and dashboard links.
        Missing or ``null`` params fields degrade to placeholders instead of
        raising.
    """
    c_params = current_params or {}
    b_params = baseline_params or {}

    # Extract key metrics
    c_resolved = current.get("resolved_instances", 0)
    b_resolved = baseline.get("resolved_instances", 0)
    c_submitted = current.get("submitted_instances", 0)
    b_submitted = baseline.get("submitted_instances", 0)
    c_total = current.get("total_instances", 0)
    b_total = baseline.get("total_instances", 0)
    c_empty = current.get("empty_patch_instances", 0)
    b_empty = baseline.get("empty_patch_instances", 0)
    c_error = current.get("error_instances", 0)
    b_error = baseline.get("error_instances", 0)

    # Eval limit from params, falling back to the number of submitted instances
    c_limit = _eval_limit(c_params, c_submitted)
    b_limit = _eval_limit(b_params, b_submitted)

    # Success rate = resolved / min(eval_limit, total_instances)
    c_denom = min(c_limit, c_total) if c_total > 0 else c_limit
    b_denom = min(b_limit, b_total) if b_total > 0 else b_limit
    c_rate = (c_resolved / c_denom * 100) if c_denom else 0
    b_rate = (b_resolved / b_denom * 100) if b_denom else 0
    rate_delta = c_rate - b_rate

    # Instance-level diff
    c_resolved_ids = set(current.get("resolved_ids", []))
    b_resolved_ids = set(baseline.get("resolved_ids", []))
    gained = sorted(c_resolved_ids - b_resolved_ids)
    lost = sorted(b_resolved_ids - c_resolved_ids)

    def delta_str(val: float | int) -> str:
        # Only positive deltas get an explicit sign; zero stays "0".
        return f"+{val}" if val > 0 else str(val)

    lines: list[str] = ["## 📊 Evaluation Comparison", ""]

    # Summary line
    if rate_delta > 0:
        emoji = "📈"
        delta_pp = f"+{rate_delta:.1f}"
    elif rate_delta < 0:
        emoji = "📉"
        delta_pp = f"{rate_delta:.1f}"
    else:
        emoji = "➡️"
        delta_pp = "0.0"
    lines.append(
        f"{emoji} **Success rate: {c_rate:.1f}% "
        f"({delta_pp}pp vs baseline {b_rate:.1f}%)**"
    )
    lines.append("")

    # Metadata table
    c_pr = c_params.get("pr_number")
    b_pr = b_params.get("pr_number")
    c_commit = _short_commit(c_params)
    b_commit = _short_commit(b_params)
    c_run_id = c_params.get("github_run_id", "")
    b_run_id = b_params.get("github_run_id", "")

    lines.append("| | Current | Baseline |")
    lines.append("|---|---|---|")
    if c_run_id or b_run_id:
        lines.append(f"| **Run ID** | `{c_run_id}` | `{b_run_id}` |")
    lines.append(f"| **SDK Commit** | `{c_commit}` | `{b_commit}` |")
    if c_pr or b_pr:
        c_pr_str = f"#{c_pr}" if c_pr else "—"
        # A baseline without a PR number is labelled as a run on main.
        b_pr_str = f"#{b_pr}" if b_pr else "— (main)"
        lines.append(f"| **PR** | {c_pr_str} | {b_pr_str} |")
    lines.append(
        f"| **Resolved** | {c_resolved}/{c_denom} ({c_rate:.1f}%) "
        f"| {b_resolved}/{b_denom} ({b_rate:.1f}%) |"
    )
    lines.append(f"| **Δ Resolved** | {delta_str(c_resolved - b_resolved)} | — |")
    lines.append(f"| **Empty Patches** | {c_empty} | {b_empty} |")
    lines.append(f"| **Errors** | {c_error} | {b_error} |")
    lines.append("")

    # Instance-level changes
    if gained or lost:
        lines.append("### Instance-Level Changes")
        lines.append("")
        if gained:
            lines.extend(_instance_list("**✅ Newly resolved", gained))
        if lost:
            lines.extend(_instance_list("**❌ Regressions", lost))

    if not gained and not lost and c_resolved_ids and b_resolved_ids:
        lines.append(
            "*Identical set of resolved instances — no regressions or improvements.*"
        )
        lines.append("")

    # Dashboard links (only for runs whose GitHub run id is known)
    lines.append("### 🔗 Links")
    lines.append("")
    if c_run_id:
        lines.append(
            f"- [Current run dashboard]({_dashboard_url(c_params, c_run_id)})"
        )
    if b_run_id:
        lines.append(
            f"- [Baseline run dashboard]({_dashboard_url(b_params, b_run_id)})"
        )
    lines.append("")

    return "\n".join(lines)
Returns parsed JSON or None for 204.""" body = json.dumps(data).encode() if data else None req = urllib.request.Request( url, data=body, method=method, headers={ "Authorization": f"token {token}", "Accept": "application/vnd.github+json", "Content-Type": "application/json", }, ) with urllib.request.urlopen(req, timeout=30) as resp: if resp.status == 204: return None return json.loads(resp.read().decode()) def post_github_comment(repo: str, pr_number: int, body: str, token: str) -> None: """Post a comment on a GitHub PR.""" url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments" result = github_api_request(url, token, method="POST", data={"body": body}) if result: print(f"Posted comment: {result.get('html_url', 'unknown')}", file=sys.stderr) def trigger_eval( token: str, *, sdk_ref: str, benchmark: str = "swebench", eval_limit: int = 50, model_ids: str = "", reason: str = "", repo: str = SDK_REPO, allow_unreleased: bool = True, benchmarks_branch: str = "main", eval_branch: str = "main", tool_preset: str = "default", agent_type: str = "default", instance_ids: str = "", ) -> None: """Dispatch an evaluation workflow via the GitHub Actions API.""" inputs: dict[str, str] = { "benchmark": benchmark, "sdk_ref": sdk_ref, "eval_limit": str(eval_limit), "reason": reason, "benchmarks_branch": benchmarks_branch, "eval_branch": eval_branch, "tool_preset": tool_preset, "agent_type": agent_type, "allow_unreleased_branches": str(allow_unreleased).lower(), } if model_ids: inputs["model_ids"] = model_ids if instance_ids: inputs["instance_ids"] = instance_ids url = ( f"https://api.github.com/repos/{repo}/actions/workflows/run-eval.yml/dispatches" ) payload = {"ref": sdk_ref, "inputs": inputs} print(f"Dispatching eval workflow on {repo}...", file=sys.stderr) print(f" benchmark: {benchmark}", file=sys.stderr) print(f" sdk_ref: {sdk_ref}", file=sys.stderr) print(f" eval_limit: {eval_limit}", file=sys.stderr) print(f" model_ids: {model_ids or '(default)'}", 
def trigger_eval(
    token: str,
    *,
    sdk_ref: str,
    benchmark: str = "swebench",
    eval_limit: int = 50,
    model_ids: str = "",
    reason: str = "",
    repo: str = SDK_REPO,
    allow_unreleased: bool = True,
    benchmarks_branch: str = "main",
    eval_branch: str = "main",
    tool_preset: str = "default",
    agent_type: str = "default",
    instance_ids: str = "",
) -> None:
    """Dispatch the ``run-eval.yml`` workflow through the GitHub Actions API.

    All workflow inputs are sent as strings. ``model_ids`` and
    ``instance_ids`` are only included when non-empty. A summary of the
    request and a monitoring link are written to stderr.
    """
    workflow_inputs: dict[str, str] = {
        "benchmark": benchmark,
        "sdk_ref": sdk_ref,
        "eval_limit": str(eval_limit),
        "reason": reason,
        "benchmarks_branch": benchmarks_branch,
        "eval_branch": eval_branch,
        "tool_preset": tool_preset,
        "agent_type": agent_type,
        "allow_unreleased_branches": str(allow_unreleased).lower(),
    }
    # Optional inputs are omitted entirely rather than sent as "".
    for key, value in (("model_ids", model_ids), ("instance_ids", instance_ids)):
        if value:
            workflow_inputs[key] = value

    dispatch_url = (
        f"https://api.github.com/repos/{repo}/actions/workflows/run-eval.yml/dispatches"
    )
    # NOTE(review): the workflow is dispatched on ``sdk_ref`` itself —
    # confirm this is intended when ``sdk_ref`` is a commit SHA.
    request_body = {"ref": sdk_ref, "inputs": workflow_inputs}

    summary = [
        f"Dispatching eval workflow on {repo}...",
        f" benchmark: {benchmark}",
        f" sdk_ref: {sdk_ref}",
        f" eval_limit: {eval_limit}",
        f" model_ids: {model_ids or '(default)'}",
        f" tool_preset: {tool_preset}",
        f" agent_type: {agent_type}",
    ]
    if instance_ids:
        summary.append(f" instance_ids: {instance_ids}")
    if reason:
        summary.append(f" reason: {reason}")
    for entry in summary:
        print(entry, file=sys.stderr)

    github_api_request(dispatch_url, token, method="POST", data=request_body)

    print("✓ Workflow dispatched successfully.", file=sys.stderr)
    print(
        f" Monitor at: https://github.com/{repo}/actions/workflows/run-eval.yml",
        file=sys.stderr,
    )


def _require_token() -> str:
    """Return the value of ``GITHUB_TOKEN``; exit with status 1 if unset or empty."""
    github_token = os.environ.get("GITHUB_TOKEN", "")
    if github_token:
        return github_token
    print("ERROR: GITHUB_TOKEN environment variable not set", file=sys.stderr)
    sys.exit(1)


def cmd_trigger(args: argparse.Namespace) -> None:
    """Handle the 'trigger' subcommand by forwarding CLI options to trigger_eval."""
    # Optional string options may be None; normalize them to "".
    options = {
        "sdk_ref": args.sdk_ref,
        "benchmark": args.benchmark,
        "eval_limit": args.eval_limit,
        "model_ids": args.model_ids or "",
        "reason": args.reason or "",
        "repo": args.repo,
        "benchmarks_branch": args.benchmarks_branch,
        "eval_branch": args.eval_branch,
        "tool_preset": args.tool_preset,
        "agent_type": args.agent_type,
        "instance_ids": args.instance_ids or "",
    }
    trigger_eval(_require_token(), **options)
def cmd_compare(args: argparse.Namespace) -> None:
    """Handle the 'compare' subcommand.

    Fetches the current run's report, resolves a baseline (explicit or
    auto-discovered), prints the markdown comparison to stdout and, when
    requested, posts it as a PR comment. Progress and errors go to stderr;
    every failure exits with status 1.
    """

    def abort(message: str) -> None:
        """Report *message* on stderr and terminate with exit status 1."""
        print(message, file=sys.stderr)
        sys.exit(1)

    # Validate option combinations before doing any work.
    if args.post_comment and not (args.pr and args.repo):
        abort("ERROR: --post-comment requires --pr and --repo")
    if not (args.baseline or args.auto_baseline):
        abort("ERROR: Specify --baseline or --auto-baseline")

    run_path = args.current_run_path
    benchmark, model_slug, run_id = parse_run_path(run_path)
    print(f"Current run: {benchmark}/{model_slug}/{run_id}", file=sys.stderr)

    # Fetch current run data
    current_report = get_report(run_path)
    if not current_report:
        abort(f"ERROR: No report found for {run_path}")
    current_params = get_params(run_path)

    # Find baseline
    baseline_path = args.baseline
    if not baseline_path:
        current_eval_limit = None
        if current_params:
            current_eval_limit = current_params.get("eval_limit")
        print(
            f"Searching for baseline (lookback: {args.lookback_days} days, "
            f"eval_limit: {current_eval_limit})...",
            file=sys.stderr,
        )
        baseline_path = find_baseline_run(
            benchmark, model_slug, run_id, args.lookback_days, current_eval_limit
        )
        if not baseline_path:
            abort("No baseline run found. Cannot produce comparison.")

    print(f"Baseline run: {baseline_path}", file=sys.stderr)
    baseline_report = get_report(baseline_path)
    if not baseline_report:
        abort(f"ERROR: No report found for baseline {baseline_path}")
    baseline_params = get_params(baseline_path)

    # Generate comparison
    markdown = compute_diff(
        current_report, baseline_report, current_params, baseline_params
    )
    print(markdown)

    # Post comment if requested
    if not args.post_comment:
        return
    token = _require_token()
    footer = (
        "\n---\n"
        "*This comparison was generated by an AI assistant "
        "(OpenHands) on behalf of the user.*\n"
    )
    post_github_comment(args.repo, args.pr, markdown + footer, token)
def main() -> None:
    """Parse the command line and run the selected subcommand."""

    def add_trigger_parser(subparsers: Any) -> None:
        """Register the 'trigger' subcommand and its options."""
        trigger = subparsers.add_parser(
            "trigger",
            help="Dispatch an evaluation workflow",
            description=(
                "Trigger an eval run via the GitHub Actions workflow_dispatch API."
            ),
        )
        trigger.add_argument(
            "--sdk-ref",
            required=True,
            help=(
                "SDK branch, tag, or commit to evaluate "
                "(e.g., main, v1.16.0, my-branch)"
            ),
        )
        trigger.add_argument(
            "--benchmark",
            default="swebench",
            choices=BENCHMARKS,
            help="Benchmark to run (default: swebench)",
        )
        trigger.add_argument(
            "--eval-limit",
            type=int,
            default=50,
            help="Number of instances to evaluate (default: 50)",
        )
        trigger.add_argument(
            "--model-ids",
            default="",
            help=(
                "Comma-separated model IDs "
                "(see .github/run-eval/resolve_model_config.py; default: first model)"
            ),
        )
        trigger.add_argument(
            "--reason", default="", help="Human-readable trigger reason"
        )
        trigger.add_argument(
            "--repo",
            default=SDK_REPO,
            help=f"Repository to trigger on (default: {SDK_REPO})",
        )
        trigger.add_argument(
            "--benchmarks-branch",
            default="main",
            help="Benchmarks repo branch (default: main)",
        )
        trigger.add_argument(
            "--eval-branch",
            default="main",
            help="Evaluation repo branch (default: main)",
        )
        trigger.add_argument(
            "--tool-preset",
            default="default",
            choices=TOOL_PRESETS,
            help="Tool preset for file editing (default: default)",
        )
        trigger.add_argument(
            "--agent-type",
            default="default",
            choices=AGENT_TYPES,
            help="Agent type (default: default)",
        )
        trigger.add_argument(
            "--instance-ids",
            default="",
            help="Comma-separated instance IDs to evaluate (overrides eval-limit)",
        )

    def add_compare_parser(subparsers: Any) -> None:
        """Register the 'compare' subcommand and its options."""
        compare = subparsers.add_parser(
            "compare",
            help="Compare two evaluation runs",
            description="Fetch results for two eval runs and produce a diff report.",
        )
        compare.add_argument(
            "current_run_path",
            help="Run path (e.g., swebench/litellm_proxy-claude-.../23775164157/)",
        )
        compare.add_argument("--baseline", help="Explicit baseline run path")
        compare.add_argument(
            "--auto-baseline",
            action="store_true",
            help="Auto-find the most recent previous run as baseline",
        )
        compare.add_argument(
            "--lookback-days",
            type=int,
            default=14,
            help="Days to search for baseline (default: 14)",
        )
        compare.add_argument(
            "--post-comment",
            action="store_true",
            help="Post result as a GitHub PR comment",
        )
        compare.add_argument("--pr", type=int, help="PR number for commenting")
        compare.add_argument(
            "--repo", help="Repository (OWNER/REPO) for commenting"
        )

    parser = argparse.ArgumentParser(
        description="Trigger, compare, and report on OpenHands evaluation runs",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)
    add_trigger_parser(subparsers)
    add_compare_parser(subparsers)

    args = parser.parse_args()

    # Map each subcommand name to its handler.
    handlers = {"trigger": cmd_trigger, "compare": cmd_compare}
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)


if __name__ == "__main__":
    main()
name: run-eval description: Trigger and monitor evaluation runs for benchmarks like SWE-bench, GAIA, and others. Use when running evaluations via GitHub Actions or monitoring eval progress through Datadog and kubectl. triggers: - run eval - trigger eval - evaluation run - swebench eval --- # Running Evaluations ## Trigger via GitHub API ```bash curl -X POST \ -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/run-eval.yml/dispatches" \ -d '{ "ref": "main", "inputs": { "benchmark": "swebench", "sdk_ref": "main", "eval_limit": "50", "model_ids": "claude-sonnet-4-5-20250929", "reason": "Description of eval run", "benchmarks_branch": "main" } }' ``` **Key parameters:** - `benchmark`: `swebench`, `swebenchpro`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`, `terminalbench` - `eval_limit`: Any positive integer (e.g., `1`, `10`, `50`, `200`) - `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models - `benchmarks_branch`: Use feature branch from the benchmarks repo to test benchmark changes before merging **Note:** When running a full eval, you must select an `eval_limit` that is greater than or equal to the actual number of instances in the benchmark. If you specify a smaller limit, only that many instances will be evaluated (partial eval). 
## Monitoring **Datadog script** (requires `OpenHands/evaluation` repo; DD_API_KEY, DD_APP_KEY, and DD_SITE environment variables are set): ```bash DD_API_KEY=$DD_API_KEY DD_APP_KEY=$DD_APP_KEY DD_SITE=$DD_SITE \ python scripts/analyze_evals.py --job-prefix <EVAL_RUN_ID> --time-range 60 # EVAL_RUN_ID format: typically the workflow run ID from GitHub Actions ``` **kubectl** (for users with cluster access - the agent does not have kubectl access): ```bash kubectl logs -f job/eval-eval-<RUN_ID>-<MODEL_SLUG> -n evaluation-jobs ``` ## Common Errors | Error | Cause | Fix | |-------|-------|-----| | `503 Service Unavailable` | Infrastructure overloaded | Ask user to stop some evaluation runs | | `429 Too Many Requests` | Rate limiting | Wait or reduce concurrency | | `failed after 3 retries` | Instance failures | Check Datadog logs for root cause | ## Limits - Max 256 parallel runtimes (jobs will queue if this limit is exceeded) - Full evals typically take 1-3 hours depending on benchmark size ================================================ FILE: .agents/skills/sdk-release/SKILL.md ================================================ --- name: sdk-release description: >- This skill should be used when the user asks to "release the SDK", "prepare a release", "publish a new version", "cut a release", "do a release", or mentions the SDK release checklist or release process. Guides through the full software-agent-sdk release workflow from version bump to PyPI publication, emphasizing human checkpoints. --- # SDK Release Guide This skill walks through the software-agent-sdk release process step by step. > **🚨 CRITICAL**: NEVER merge the release PR or create/publish a GitHub > release without the human's explicit approval. Release is the last line > of human defense. Always present the current status and ask for > confirmation before performing any irreversible action. ## Phase 1: Trigger the Prepare-Release Workflow Determine the target version (SemVer `X.Y.Z`).
Then trigger the `prepare-release.yml` workflow, which creates a release branch and PR automatically. ### Via GitHub UI Navigate to , click **Run workflow**, enter the version (e.g. `1.16.0`), and run it. ### Via GitHub API ```bash curl -X POST \ -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/prepare-release.yml/dispatches" \ -d '{ "ref": "main", "inputs": { "version": "1.16.0" } }' ``` The workflow will: 1. Validate version format 2. Create branch `rel-` 3. Run `make set-package-version version=` across all packages 4. Update the `sdk_ref` default in the eval workflow 5. Open a PR titled **"Release v\"** with labels `integration-test`, `behavior-test`, and `test-examples` ### ⏸ Checkpoint — Confirm PR Created Verify the PR exists and the version changes look correct before continuing. ```bash gh pr list --repo OpenHands/software-agent-sdk \ --head "rel-" --json number,title,url ``` ## Phase 2: Address Deprecation Deadlines The `deprecation-check` CI job runs on every PR. If the release version crosses any deprecation deadline declared in the codebase, the check will fail. Review the failing check output and either: - Remove the deprecated code if the deadline has passed, **or** - Extend the deadline with justification. Push fixes to the release branch. The check must pass before merging. ## Phase 3: Wait for CI — Tests Must Pass The release PR triggers three labeled test suites. **All three must pass.** | Label | Suite | What it covers | |-------|-------|----------------| | `integration-test` | Integration tests | End-to-end agent scenarios | | `behavior-test` | Behavior tests | Agent behavioral guardrails | | `test-examples` | Example tests | All runnable examples in `examples/` | Monitor status: ```bash gh pr checks --repo OpenHands/software-agent-sdk ``` ### ⏸ Checkpoint — Human Judgment on Failures Some test failures may be pre-existing or flaky. 
Decide with the team whether each failure is: - **Blocking** — must fix before release - **Known / pre-existing** — acceptable to release with a follow-up issue - **Flaky** — re-run the workflow Re-run failed jobs: ```bash # Find the run ID gh run list --repo OpenHands/software-agent-sdk \ --branch "rel-" --limit 5 # Re-run failed jobs gh run rerun --repo OpenHands/software-agent-sdk --failed ``` ## Phase 4: Run Evaluation (Optional but Recommended) Trigger an evaluation run on SWE-bench (or another benchmark) against the release branch to catch regressions. See the `run-eval` skill for full details. ```bash curl -X POST \ -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/run-eval.yml/dispatches" \ -d '{ "ref": "main", "inputs": { "benchmark": "swebench", "sdk_ref": "v", "eval_limit": "50", "reason": "Pre-release eval for v", "allow_unreleased_branches": "true" } }' ``` ### ⏸ Checkpoint — Evaluate Results Compare the eval results against the previous release. Significant score drops should block the release. ## Phase 5: Merge the Release PR > **🚨 STOP — Do NOT merge without explicit human approval.** > Present the CI status summary and ask the human to confirm before merging. > Merging is effectively irreversible — it automatically triggers the full > release pipeline (GitHub release → PyPI publish → downstream version bumps). Once the human approves: ```bash gh pr merge --repo OpenHands/software-agent-sdk --merge ``` ## Phase 6: Automated Release Pipeline (no action needed) When the release PR is merged, the following happens automatically: 1. **`create-release.yml`** detects the merged `rel-*` branch, creates a GitHub release with tag `v` and auto-generated release notes. 2. 
**`pypi-release.yml`** triggers on the published release and publishes all four packages to PyPI: - `openhands-sdk` - `openhands-tools` - `openhands-workspace` - `openhands-agent-server` 3. **`version-bump-prs.yml`** triggers after successful PyPI publish and creates downstream version bump PRs. ### ⏸ Checkpoint — Verify PyPI Publication ```bash # Check each package is available (allow a few minutes for indexing) for pkg in openhands-sdk openhands-tools openhands-workspace openhands-agent-server; do curl -s -o /dev/null -w "$pkg: %{http_code}\n" \ "https://pypi.org/pypi/$pkg//json" done ``` All should return `200`. ## Phase 7: Post-Release Announcements After the automated pipeline completes, compose a Slack message for the human to post, including links to the downstream version bump PRs: ``` 🚀 *SDK v published to PyPI!* Version bump PRs: • |OpenHands> • |OpenHands-CLI> Release: |v> ``` See `references/post-release-checklist.md` for details on reviewing downstream PRs and handling any issues. ## Quick Reference — Full Checklist - [ ] Trigger `prepare-release.yml` with target version - [ ] Verify release PR is created - [ ] Fix deprecation deadline failures (if any) - [ ] Integration tests pass - [ ] Behavior tests pass - [ ] Example tests pass - [ ] (Optional) Evaluation run shows no regressions - [ ] **🚨 Get human approval**, then merge the release PR - [ ] _(Automated)_ GitHub release created with auto-generated notes - [ ] _(Automated)_ Packages published to PyPI - [ ] _(Automated)_ Downstream version bump PRs created - [ ] Verify packages appear on PyPI - [ ] Send Slack message with downstream version bump PR links ================================================ FILE: .agents/skills/sdk-release/references/post-release-checklist.md ================================================ # Post-Release Checklist After the GitHub release is published and PyPI packages are available, several automated and manual follow-up steps occur. 
## Automated: Downstream Version Bump PRs The `version-bump-prs.yml` workflow runs automatically after `pypi-release` succeeds. It creates PRs in two repositories: ### OpenHands-CLI (`OpenHands/openhands-cli`) - Branch: `bump-sdk-` - Updates `openhands-sdk` and `openhands-tools` via `uv add` - Verify the PR passes CLI tests before merging ```bash gh pr list --repo OpenHands/openhands-cli \ --search "bump-sdk-" --json number,title,url ``` ### OpenHands (`All-Hands-AI/OpenHands`) - Branch: `bump-sdk-` - Updates `openhands-sdk`, `openhands-tools`, and `openhands-agent-server` in `pyproject.toml` - Regenerates `poetry.lock` - Updates `AGENT_SERVER_IMAGE` in `sandbox_spec_service.py` - Verifies `enterprise/pyproject.toml` does not have explicit SDK pins ```bash gh pr list --repo All-Hands-AI/OpenHands \ --search "bump-sdk-" --json number,title,url ``` ## Manual Review of Downstream PRs Both PRs require human review: 1. **Check CI passes** on each downstream PR 2. **Verify compatibility** — especially if the release includes breaking changes or new features that need adoption 3. **Merge** once satisfied ## Evaluation on OpenHands Index If not already done pre-release, trigger a full evaluation run against the published version: ```bash curl -X POST \ -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/run-eval.yml/dispatches" \ -d '{ "ref": "main", "inputs": { "benchmark": "swebench", "sdk_ref": "v", "eval_limit": "300", "reason": "Post-release eval v" } }' ``` ## Documentation Updates If the release includes user-facing features, verify documentation is updated in `OpenHands/docs` (SDK docs live under `sdk/`). See the `feature-release-rollout` skill for the full downstream propagation workflow. ## Troubleshooting ### PyPI publication failed Re-run the `pypi-release.yml` workflow manually. 
It uses `--check-url` to skip already-published packages, so partial reruns are safe. ```bash gh workflow run pypi-release.yml --repo OpenHands/software-agent-sdk ``` ### Version bump PR has conflicts The automated PR may conflict if the downstream repo changed dependency pins since the workflow ran. Resolve conflicts manually on the bump branch, or re-trigger `version-bump-prs.yml` with the version input. ```bash gh workflow run version-bump-prs.yml \ --repo OpenHands/software-agent-sdk \ -f version=<version> ``` ### Downstream tests fail after bump If a downstream repo's tests fail on the version bump PR, investigate whether the failure is a breaking change in the SDK release. If so, either: - Fix the downstream code on the bump branch, or - Publish a patch release of the SDK with the fix ================================================ FILE: .agents/skills/write-behavior-test.md ================================================ --- name: write-behavior-test description: Guide for writing behavior tests that verify agents follow system message guidelines and avoid undesirable behaviors. Use when creating integration tests for agent behavior validation. triggers: - /write_behavior_test --- # Behavior Test Writing Guide You are helping to create **behavior tests** for the agent-sdk integration test suite. These tests verify that agents follow system message guidelines and avoid undesirable behaviors. The tests are for the agent powered by this SDK, so you may need to refer to the codebase for details on how the agent works in order to write effective tests. ## Behavior Tests vs Task Tests **Task Tests (t*.py)** - REQUIRED tests that verify task completion: - Focus: Can the agent successfully complete the task? - Example: Fix typos in a file, create a script, implement a feature **Behavior Tests (b*.py)** - OPTIONAL tests that verify proper behavior: - Focus: Does the agent follow best practices and system guidelines?
- Example: Don't implement when asked for advice, don't over-verify, avoid redundant files ## Key Principles for Writing Behavior Tests ### ✅ DO: 1. **Use Real Repositories** - Clone actual GitHub repositories that represent real-world scenarios - Pin to a specific historical commit (before a fix/feature was added) - Example: `clone_pinned_software_agent_repo(workspace)` helper 2. **Test Realistic Complex, Nuanced Behaviors** - Try to make the task as realistic as possible to real HUMAN interactions, from file naming, (somewhat lazy) instruction style, etc - Focus on subtle behavioral issues that require judgment - Test scenarios where the "right" behavior isn't immediately obvious - Examples: When to implement vs advise, when to stop testing, whether to add backward compatibility 3. **Clean Up Repository History** - Check out to a commit BEFORE the solution exists - Reset/remove future commits (see existing tests for examples) - Ensures the agent experiences the same context as real users 4. **Use Helper Functions** - `find_file_editing_operations(events)` - Find file create/edit operations - `find_tool_calls(events, tool_name)` - Find specific tool usage - `get_conversation_summary(events)` - Get summary for LLM judge - `judge_agent_behavior(...)` - Use LLM to evaluate behavior quality 5. **Leverage LLM Judges** - Use `judge_agent_behavior()` for subjective evaluations - Provide clear evaluation criteria in the judge prompt - Track judge usage costs: `self.add_judge_usage(prompt_tokens, completion_tokens, cost)` 6. **Adaptation of Problem Description to Task** - If you find the problem description is not easy to adapt to a behavior test, e.g. it requires complex environment setup like kubernetes, try to come up with a simpler problem description that still captures the essence of the behavior you want to test but is easier to implement in the test framework. - Ensure the instructions naturally lead to the behavior you want to evaluate ### ❌ DO NOT: 1. 
**Avoid Simple Synthetic Tests** - Don't create artificial scenarios with minimal setup - Don't test behaviors that are too obvious or straightforward - Example: Don't create a single-file test with trivial content 2. **Don't Test Basic Functionality** - Behavior tests are NOT for testing if the agent can use tools - Task tests handle basic capability verification - Focus on HOW the agent approaches problems, not IF it can solve them 3. **Don't Overcomplicate Static Assertions** - Use assertions for clear-cut checks (e.g., no file edits) - Rely on LLM judges for nuanced behavior evaluations - Avoid trying to encode subjective judgments purely in code or with too much static logic ## Tips for Test Difficulty Calibration **Make tests challenging, but neither impossible nor too long:** 1. **Context Complexity**: Use real codebases with multiple files and dependencies, either the software-agent-sdk or other popular open-source repos you find suitable 2. **Ambiguity**: Prefer instructions that could be interpreted in multiple ways 3. **Temptation**: Set up scenarios where the "easy wrong path" is tempting 4. **Realism**: Mirror real user interactions and expectations **Examples of Good Complexity:** - "How to implement X?" (tests if agent implements vs advises) - "Update constant Y" (tests if agent over-verifies with excessive test runs) - "Rename method A to B" (tests if agent adds unnecessary backward compatibility) ## Example Behavior Test Patterns 1. **Premature Implementation** - Tests if agent implements when asked for advice only 2. **Over-verification** - Tests if agent runs excessive tests beyond what's needed 3. **Unnecessary Compatibility** - Tests if agent adds backward compatibility shims when not needed 4. **Redundant Artifacts** - Tests if agent creates extra files (docs, READMEs) without being asked 5.
**Communication Quality** - Tests if agent provides explanations for actions ## File Naming Convention Name your test file: `b##_descriptive_name.py` - `b` prefix indicates behavior test (auto-detected) - `##` is a zero-padded number (e.g., 01, 02, 03) - Use snake_case for the descriptive name ## Final Checklist Before submitting your behavior test, verify: - [ ] Uses a real repository or complex codebase - [ ] Tests a nuanced behavior, not basic functionality - [ ] Includes clear and not overly complex verification logic (assertions or LLM judge) - [ ] Has a descriptive docstring explaining what behavior is tested - [ ] Properly tracks judge usage costs if using LLM evaluation - [ ] Follows naming convention: `b##_descriptive_name.py` - [ ] Test is realistic and based on actual behavioral issues observed Remember: The goal is to catch subtle behavioral issues that would appear in real-world usage, serving as regression tests for system message improvements. ================================================ FILE: .dockerignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest # Note: We keep our custom spec file in version control # *.spec # PyInstaller build directories build/ dist/ # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control # poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be added to the global gitignore or merged into this project gitignore. For a PyCharm # project, it is recommended to ignore the entire .idea directory. .idea/ # VS Code .vscode/ # macOS .DS_Store .AppleDouble .LSOverride # Windows Thumbs.db ehthumbs.db Desktop.ini $RECYCLE.BIN/ # Linux *~ # Temporary files *.tmp *.temp *.swp *.swo # UV specific .uv/ # Project specific *.log .coverage .pytest_cache/ workspace/ .client .docker .git .git/** # VS Code: Ignore all but certain files that specify repo-specific settings. 
# https://stackoverflow.com/questions/32964920/should-i-commit-the-vscode-folder-to-source-control .vscode/**/* !.vscode/extensions.json !.vscode/tasks.json # VS Code extensions/forks: .cursorignore .rooignore .clineignore .windsurfignore .cursorrules .roorules .clinerules .windsurfrules .cursor/rules .roo/rules .cline/rules .windsurf/rules .repomix repomix-output.txt # misc .DS_Store .env.local .env.development.local .env.test.local .env.production.local npm-debug.log* yarn-debug.log* yarn-error.log* logs # agent .envrc cache .jinja_cache/ .conversations* workspace/ # Build optimization: exclude files not needed for building agent-server tests/ *.log .github/ scripts/ examples/ .ruff_cache/ .uv-cache/ Makefile docs/ *.md !README.md .pre-commit-config.yaml .python-version ================================================ FILE: .github/ISSUE_TEMPLATE/bug_template.yml ================================================ --- name: Bug description: Report a problem with OpenHands SDK title: '[Bug]: ' labels: [bug] body: - type: markdown attributes: value: | ## Thank you for reporting a bug! 🐛 **Please fill out all required fields.** Issues missing critical information (version, installation method, reproduction steps, etc.) will be delayed or closed until complete details are provided. Clear, detailed reports help us resolve issues faster. - type: checkboxes attributes: label: Is there an existing issue for the same bug? description: Please search existing issues before creating a new one. If found, react or comment to the duplicate issue instead of making a new one. options: - label: I have searched existing issues and this is not a duplicate. required: true - type: textarea id: bug-description attributes: label: Bug Description description: Clearly describe what went wrong. Be specific and concise. placeholder: Example - When I use the SDK to create an agent with custom tools, the agent fails to register the tools with a TypeError. 
validations: required: true - type: textarea id: expected-behavior attributes: label: Expected Behavior description: What did you expect to happen? placeholder: Example - The agent should successfully register custom tools and make them available for use. validations: required: false - type: textarea id: actual-behavior attributes: label: Actual Behavior description: What actually happened? placeholder: "Example - TypeError: 'NoneType' object is not iterable when calling agent.register_tool()" validations: required: false - type: textarea id: reproduction-steps attributes: label: Steps to Reproduce description: Provide clear, step-by-step instructions to reproduce the bug. placeholder: | 1. Install openhands-sdk using pip 2. Import and create an agent instance 3. Define a custom tool function 4. Call agent.register_tool(custom_tool) 5. Error appears validations: required: false - type: input id: installation attributes: label: Installation Method description: How did you install the OpenHands SDK? placeholder: ex. pip install openhands-sdk, uv pip install openhands-sdk, pip install -e ., etc. - type: input id: installation-other attributes: label: If you selected "Other", please specify description: Describe your installation method placeholder: ex. Poetry, conda, custom setup, etc. - type: input id: sdk-version attributes: label: SDK Version description: What version are you using? Check with `pip show openhands-sdk` or similar for other packages. placeholder: ex. 0.1.0, 0.2.0, main branch, commit hash, etc. validations: required: false - type: checkboxes id: version-confirmation attributes: label: Version Confirmation description: Bugs on older versions may already be fixed. Please upgrade before submitting. options: - label: I have confirmed this bug exists on the LATEST version of OpenHands SDK required: false - type: input id: python-version attributes: label: Python Version description: Which Python version are you using? placeholder: ex. 
3.10.12, 3.11.5, 3.12.0 validations: required: false - type: input id: model-name attributes: label: Model Name (if applicable) description: Which model(s) are you using? placeholder: ex. gpt-4o, claude-3-5-sonnet-20241022, openrouter/deepseek-r1, etc. validations: required: false - type: dropdown id: os attributes: label: Operating System options: - MacOS - Linux - WSL on Windows - Windows - Other validations: required: false - type: textarea id: logs attributes: label: Logs and Error Messages description: | **Paste relevant logs, error messages, or stack traces.** Use code blocks (```) for formatting. Include full stack traces when available. placeholder: | ``` Paste error logs here ``` - type: textarea id: code-sample attributes: label: Minimal Code Sample description: | If possible, provide a minimal code sample that reproduces the issue. placeholder: | ```python from openhands.sdk import Agent # Your minimal reproducible code here ``` - type: textarea id: additional-context attributes: label: Screenshots and Additional Context description: | Add screenshots, environment details, dependency versions, or other context that helps explain the issue. placeholder: Drag and drop screenshots here, paste links, or add additional context. - type: markdown attributes: value: | --- **Note:** Please help us help you! Well-documented bugs are easier to reproduce and fix. Thank you for your understanding! ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ --- name: Feature Request or Enhancement description: Suggest a new feature or improvement for OpenHands SDK title: '[Feature]: ' labels: [enhancement] body: - type: markdown attributes: value: | ## Thank you for suggesting a feature! 💡 We encourage you to open the discussion on the feature you need. You are always welcome to implement it, if you wish. 
- type: checkboxes attributes: label: Is there an existing feature request for this? description: Please search existing issues and feature requests before creating a new one. If found, react or comment to the duplicate issue instead of making a new one. options: - label: I have searched existing issues and feature requests, and this is not a duplicate. required: true - type: textarea id: problem-statement attributes: label: Problem or Use Case description: What problem are you trying to solve? What use case would this feature enable? placeholder: | Example - As a developer building agents, I need to persist agent state between sessions. Currently, there's no built-in mechanism for saving and loading agent memory, which means agents lose context when the process restarts. validations: required: true - type: textarea id: proposed-solution attributes: label: Proposed Solution description: Describe your ideal solution. What should this feature do? How should it work? placeholder: | Example - Add a StateManager class that allows saving and loading agent state to/from disk or database. Provide methods like save_state(), load_state(), and clear_state(). Support multiple backend options (JSON files, SQLite, Redis, etc.). validations: required: true - type: textarea id: alternatives attributes: label: Alternatives Considered description: Have you considered any alternative solutions or workarounds? What are their limitations? placeholder: Example - I tried manually serializing agent state using pickle, but it's not portable across SDK versions and doesn't handle complex tool state properly. - type: dropdown id: priority attributes: label: Priority / Severity description: How important is this feature to your workflow? 
options: - Critical - Blocking my work, no workaround available - High - Significant impact on productivity - Medium - Would improve experience - Low - Nice to have default: 2 validations: required: true - type: dropdown id: scope attributes: label: Estimated Scope description: To the best of your knowledge, how complex do you think this feature would be to implement? options: - Small - API addition, config option, or minor change - Medium - New feature with moderate complexity - Large - Significant feature requiring architecture changes - Unknown - Not sure about the technical complexity default: 3 - type: checkboxes id: feature-area attributes: label: Feature Area description: Which part of OpenHands SDK does this feature relate to? If you select "Other", please specify the area in the Additional Context section below. options: - label: Agent API / Core functionality - label: Tools / Tool system - label: Skills / Plugins - label: Agent Server - label: Workspace management - label: Configuration / Settings - label: Examples / Templates - label: Documentation - label: Testing / Development tools - label: Performance / Optimization - label: Integrations (GitHub, APIs, etc.) - label: Other - type: textarea id: technical-details attributes: label: Technical Implementation Ideas (Optional) description: If you have technical expertise, share implementation ideas, API suggestions, or relevant technical details. placeholder: | Example - Could implement StateManager as an abstract base class with concrete implementations for different backends. Add state_manager parameter to Agent constructor. Use JSON serialization for simple state, MessagePack for better performance. - type: textarea id: additional-context attributes: label: Additional Context description: Add any other context, code examples, API mockups, or references that help illustrate this feature request. 
placeholder: | Example code or API design: ```python from openhands.sdk import Agent, StateManager agent = Agent(state_manager=StateManager('file://agent_state.json')) agent.save_state() ``` ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ - [ ] A human has tested these changes. --- ## Why ## Summary - ## Issue Number ## How to Test ## Video/Screenshots ## Type - [ ] Bug fix - [ ] Feature - [ ] Refactor - [ ] Breaking change - [ ] Docs / chore ## Notes ================================================ FILE: .github/dependabot.yml ================================================ --- # Dependabot configuration for automated dependency updates # See: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file # # Note: Python (pip) ecosystem is not configured here because Dependabot does not # fully support uv workspaces yet. See issue #2510 for tracking. version: 2 updates: # GitHub Actions - package-ecosystem: github-actions directory: / schedule: interval: weekly commit-message: prefix: chore(deps) ================================================ FILE: .github/prompts/update-documentation.md ================================================ # Documentation Update Prompt You are a world-class documentation writer tasked with keeping the OpenHands Agent SDK documentation accurate and up-to-date. Your goal is to ensure documentation reflects the current codebase and provides clear, minimal, and actionable guidance. ## Core Objectives 1. **Accuracy**: Ensure all documentation matches the current codebase 2. **Completeness**: Include all available tools and core components 3. **Clarity**: Keep examples simple, working, and easy to understand 4. **Navigation**: Provide source code links for all definitions ## Tasks to Perform ### 1. 
Codebase Analysis - Scan `examples/` for available examples - Scan `openhands-tools/` for all available runtime tools - Check `openhands-sdk/openhands/tool/builtins/` for built-in tools - Identify any new tools or removed tools since last update ### 2. Documentation Review Review these key files for accuracy: - `docs/architecture/overview.md` - High-level component interactions and design principles - `docs/architecture/tool.md` - Tool system, inheritance, and MCP integration - `docs/architecture/agent.md` - Agent architecture and execution flow - `docs/architecture/llm.md` - LLM integration and capabilities - `docs/architecture/conversation.md` - Conversation interface and persistence - `docs/getting-started.mdx` - Make sure we have descriptions of all examples listed out in `examples/` - `docs/index.md` - Overview and navigation - `README.md` - Root project documentation ### 3. Content Updates Required #### Architecture Diagrams - Keep mermaid diagrams SIMPLE and READABLE across all docs/architecture/ files - Focus on core components and relationships, not every possible class - Include all current runtime tools: TerminalTool, FileEditorTool, TaskTrackerTool, etc. - Verify component interactions and inheritance reflect actual codebase structure #### Tool Documentation For each tool, ensure: - Accurate usage examples with `.create()` method - Working code snippets (test them!) - Source code links to GitHub - Clear descriptions of functionality #### Core Framework Classes Verify documentation across docs/architecture/ files for: - `Tool`, `ActionBase`, `ObservationBase`, `ToolExecutor` (docs/architecture/tool.md) - `Agent`, `AgentBase`, system prompts (docs/architecture/agent.md) - `LLM`, message types, provider support (docs/architecture/llm.md) - `Conversation`, `ConversationState`, event system (docs/architecture/conversation.md) - All built-in tools: `FinishTool`, `ThinkTool` - All runtime tools: `TerminalTool`, `FileEditorTool`, `TaskTrackerTool` ### 4. 
Verification Steps - Test all documented code examples to ensure they work - Verify all GitHub source links are correct and accessible - Check that simplified and advanced usage patterns are accurate - Ensure cross-references between files are consistent ### 5. Documentation Standards - **Style**: Direct, lean, technical writing - **Structure**: Clear sections answering specific user questions - **Examples**: Show working code rather than vague descriptions - **Links**: Include GitHub source links for all classes and tools - **Diagrams**: Simple, focused mermaid charts ## Expected Deliverables 1. Updated documentation files with current tool listings 2. Verified working code examples 3. Simplified and accurate architecture diagrams 4. Complete source code links for all definitions 5. Consistent cross-references across all documentation files ## Quality Checklist - [ ] All runtime tools are documented with working examples - [ ] All built-in tools are listed and linked - [ ] Architecture diagrams are simple and current - [ ] All code examples have been tested and work - [ ] Source code links point to correct GitHub files - [ ] Documentation follows minimal, clear writing style - [ ] Cross-references between files are consistent ## Commit Message Format If you think changes are required, please create a pull request. ``` Update documentation to reflect current codebase - [Specific changes made] - [Tools added/removed/updated] - [Diagrams simplified/corrected] - [Examples verified/fixed] Co-authored-by: openhands <openhands@all-hands.dev> ``` Focus on making the documentation immediately useful for developers who need to understand and use the OpenHands Tools System. ================================================ FILE: .github/run-eval/ADDINGMODEL.md ================================================ # Adding Models to resolve_model_config.py ## Overview This file (`resolve_model_config.py`) defines models available for evaluation.
Models must be added here before they can be used in integration tests or evaluations. ## Critical Rules **ONLY ADD NEW CONTENT - DO NOT MODIFY EXISTING CODE** ### What NOT to Do 1. **Never modify existing model entries** - they are production code, already working 2. **Never modify existing tests** - especially test assertions, mock configs, or expected values 3. **Never reformat existing code** - preserve exact spacing, quotes, commas, formatting 4. **Never reorder models or imports** - dictionary and import order must be preserved 5. **Never "fix" existing code** - if it's in the file and tests pass, it works 6. **Never change test assertions** - even if they "look wrong" to you 7. **Never replace real model tests with mocked tests** - weakens validation 8. **Never fix import names** - if `test_model` exists, don't change it to `check_model` ### What These Rules Prevent **Example violations** (all found in real PRs): - Changing `assert result[0]["id"] == "claude-sonnet-4-5-20250929"` to `"gpt-4"` ❌ - Replacing real model config tests with mocked/custom model tests ❌ - "Fixing" `from resolve_model_config import test_model` to `check_model` ❌ - Adding "Fixed incorrect assertions" without explaining what was incorrect ❌ - Claiming to "fix test issues" when tests were already passing ❌ ### What TO Do **When adding a model**: - Add ONE new entry to the MODELS dictionary - Add ONE new test function (follow existing pattern exactly) - Add to feature lists in model_features.py ONLY if needed for your model - Do not touch any other files, tests, imports, or configurations - Test the PR branch with the integration test action. - Add a link to the integrations test to the PR. - If you think something is broken, it's probably not - add a comment to the PR. ## Files to Modify 1. **Always required**: - `.github/run-eval/resolve_model_config.py` - Add model configuration - `tests/github_workflows/test_resolve_model_config.py` - Add test 2. 
**Usually required** (if model has special characteristics): - `openhands-sdk/openhands/sdk/llm/utils/model_features.py` - Add to feature categories 3. **Sometimes required**: - `openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py` - GPT models only (variant detection) - `openhands-sdk/openhands/sdk/llm/utils/verified_models.py` - Production-ready models > ⚠️ **When editing `verified_models.py`**: If you add a model to `VERIFIED_OPENHANDS_MODELS`, > you **must also** add it to its provider-specific list (e.g. `VERIFIED_ANTHROPIC_MODELS`, > `VERIFIED_GEMINI_MODELS`, `VERIFIED_MOONSHOT_MODELS`, etc.). > If no list exists for the provider yet, create one and add it to the `VERIFIED_MODELS` dict. > This ensures the model appears under its actual provider in the UI, not just under "openhands". ## Step 1: Add to resolve_model_config.py Add entry to `MODELS` dictionary: ```python "model-id": { "id": "model-id", # Must match dictionary key "display_name": "Human Readable Name", "llm_config": { "model": "litellm_proxy/provider/model-name", "temperature": 0.0, # See temperature guide below }, }, ``` ### Temperature Configuration | Value | When to Use | Provider Requirements | |-------|-------------|----------------------| | `0.0` | Standard deterministic models | Most providers | | `1.0` | Reasoning models | Kimi K2, MiniMax M2.5 | | `None` | Use provider default | When unsure | ### Special Parameters Add only if needed: - **`disable_vision: True`** - Model doesn't support vision despite LiteLLM reporting it does (GLM-4.7, GLM-5) - **`reasoning_effort: "high"`** - For OpenAI reasoning models that support this parameter - **`max_tokens: `** - To prevent hangs or control output length - **`top_p: `** - Nucleus sampling (cannot be used with `temperature` for Claude models) - **`litellm_extra_body: {...}`** - Provider-specific parameters (e.g., `{"enable_thinking": True}`) ### Critical Rules 1. Model ID must match dictionary key 2. 
Model path must start with `litellm_proxy/` 3. **Claude models**: Cannot use both `temperature` and `top_p` - choose one or omit both 4. Parameters like `disable_vision` must be in `SDK_ONLY_PARAMS` constant (they're filtered before sending to LiteLLM) ## Step 2: Update model_features.py (if applicable) Check provider documentation to determine which feature categories apply: ### REASONING_EFFORT_MODELS Models that support `reasoning_effort` parameter: - OpenAI: o1, o3, o4, GPT-5 series - Anthropic: Claude Opus 4.5+, Claude Sonnet 4.6 - Google: Gemini 2.5+, Gemini 3.x series - AWS: Nova 2 Lite ```python REASONING_EFFORT_MODELS: list[str] = [ "your-model-identifier", # Add here ] ``` **Effect**: Automatically strips `temperature` and `top_p` parameters to avoid API conflicts. ### EXTENDED_THINKING_MODELS Models with extended thinking capabilities: - Anthropic: Claude Sonnet 4.5+, Claude Haiku 4.5 ```python EXTENDED_THINKING_MODELS: list[str] = [ "your-model-identifier", # Add here ] ``` **Effect**: Automatically strips `temperature` and `top_p` parameters. 
### PROMPT_CACHE_MODELS Models supporting prompt caching: - Anthropic: Claude 3.5+, Claude 4+ series ```python PROMPT_CACHE_MODELS: list[str] = [ "your-model-identifier", # Add here ] ``` ### SUPPORTS_STOP_WORDS_FALSE_MODELS Models that **do not** support stop words: - OpenAI: o1, o3 series - xAI: Grok-4, Grok-code-fast-1 - DeepSeek: R1 family ```python SUPPORTS_STOP_WORDS_FALSE_MODELS: list[str] = [ "your-model-identifier", # Add here ] ``` ### FORCE_STRING_SERIALIZER_MODELS Models requiring string format for tool messages (not structured content): - DeepSeek models - GLM models - Groq: Kimi K2-Instruct - OpenRouter: MiniMax Use pattern matching: ```python FORCE_STRING_SERIALIZER_MODELS: list[str] = [ "deepseek", # Matches any model with "deepseek" in name "groq/kimi-k2-instruct", # Provider-prefixed ] ``` ### Other Categories - **PROMPT_CACHE_RETENTION_MODELS**: GPT-5 family, GPT-4.1 - **RESPONSES_API_MODELS**: GPT-5 family, codex-mini-latest - **SEND_REASONING_CONTENT_MODELS**: Kimi K2 Thinking/K2.5, MiniMax-M2, DeepSeek Reasoner See `model_features.py` for complete lists and additional documentation. ## Step 3: Add Test **File**: `tests/github_workflows/test_resolve_model_config.py` **Important**: - Python function names cannot contain hyphens. Convert model ID hyphens to underscores. 
- **Do not modify any existing test functions** - only add your new one at the end of the file - **Do not change existing imports** - use what's already there - **Do not fix "incorrect" assertions** in other tests - they are correct **Test template** (copy and modify for your model): ```python def test_your_model_id_config(): # Replace hyphens with underscores in function name """Test that your-model-id has correct configuration.""" model = MODELS["your-model-id"] # Dictionary key keeps hyphens assert model["id"] == "your-model-id" assert model["display_name"] == "Your Model Display Name" assert model["llm_config"]["model"] == "litellm_proxy/provider/model-name" # Only add assertions for parameters YOU added in resolve_model_config.py # assert model["llm_config"]["temperature"] == 0.0 # assert model["llm_config"]["disable_vision"] is True ``` **What NOT to do in tests**: - Don't change assertions in other test functions (even if model names "look wrong") - Don't replace real model tests with mocked tests - Don't change `test_model` to `check_model` in imports - Don't modify mock_models dictionaries in other tests - Don't add "fixes" to existing tests - they work as-is ## Step 4: Update GPT Variant Detection (GPT models only) **File**: `openhands-sdk/openhands/sdk/llm/utils/model_prompt_spec.py` Required only if this is a GPT model needing specific prompt template. **Order matters**: More specific patterns must come before general patterns. 
```python _MODEL_VARIANT_PATTERNS: dict[str, tuple[tuple[str, tuple[str, ...]], ...]] = { "openai_gpt": ( ( "gpt-5-codex", # Specific variant first ("gpt-5-codex", "gpt-5.1-codex", "gpt-5.2-codex", "gpt-5.3-codex"), ), ("gpt-5", ("gpt-5", "gpt-5.1", "gpt-5.2")), # General variant last ), } ``` ## Step 5: Run Tests Locally ```bash # Pre-commit checks pre-commit run --all-files # Unit tests pytest tests/github_workflows/test_resolve_model_config.py::test_your_model_id_config -v # Manual verification cd .github/run-eval MODEL_IDS="your-model-id" GITHUB_OUTPUT=/tmp/output.txt python resolve_model_config.py ``` ## Step 6: Create Draft PR Push your branch and create a draft PR. Note the PR number returned - you'll need it for the integration tests. ## Step 7: Run Integration Tests Trigger integration tests on your PR branch: ```bash gh workflow run integration-runner.yml \ -f model_ids=your-model-id \ -f reason="Testing new model from PR #<pr-number>" \ -f issue_number=<pr-number> \ --ref your-branch-name ``` Results will be posted back to the PR as a comment. ### Expected Results - Success rate: 100% (or 87.5% if vision test skipped) - Duration: 5-10 minutes per model - Tests: 8 total (basic commands, file ops, code editing, reasoning, errors, tools, context, vision) ## Step 8: Fix Issues and Rerun (if needed) If tests fail, see [Common Issues](#common-issues) below. After fixing: 1. Push the fix: `git add . && git commit && git push` 2. Rerun integration tests with the same command from Step 7 (using the same PR number) ## Step 9: Mark PR Ready When tests pass, mark the PR as ready for review: ```bash gh pr ready ``` ### Required in PR Description ```markdown ## Summary Adds the `model-id` model to resolve_model_config.py.
## Changes - Added model-id to MODELS dictionary - Added test_model_id_config() test function - [Only if applicable] Added to [feature category] in model_features.py ## Configuration - Model ID: model-id - Provider: Provider Name - Temperature: [value] - [reasoning for choice] - [List any special parameters and why needed] ## Integration Test Results ✅ Integration tests passed: [PASTE GITHUB ACTIONS RUN URL] [Summary table showing test results] Fixes #[issue-number] ``` ### What NOT to Include in PR Description **Do not claim to have "fixed" things unless they were actually broken**: - ❌ "Fixed test_model import issue" (if tests were passing, there was no issue) - ❌ "Fixed incorrect assertions in existing tests" (they were correct) - ❌ "Improved test coverage" (unless you actually added new test cases) - ❌ "Cleaned up code" (you shouldn't be cleaning up anything) - ❌ "Updated test approach" (you shouldn't be changing testing approach) **Only describe what you actually added**: - ✅ "Added gpt-5.3-codex model configuration" - ✅ "Added test for gpt-5.3-codex" - ✅ "Added gpt-5.3-codex to REASONING_EFFORT_MODELS" ## Common Issues ### Integration Tests Hang (6-8+ hours) **Causes**: - Missing `max_tokens` parameter - Claude models with both `temperature` and `top_p` set - Model not in REASONING_EFFORT_MODELS or EXTENDED_THINKING_MODELS **Solutions**: Add `max_tokens`, remove parameter conflicts, add to appropriate feature category. 
**Reference**: #2147 ### Preflight Check: "Cannot specify both temperature and top_p" **Cause**: Claude models receiving both parameters **Solutions**: - Remove `top_p` from llm_config if `temperature` is set - Add model to REASONING_EFFORT_MODELS or EXTENDED_THINKING_MODELS (auto-strips both) **Reference**: #2137, #2193 ### Vision Tests Fail **Cause**: LiteLLM reports vision support but model doesn't actually support it **Solution**: Add `"disable_vision": True` to llm_config **Reference**: #2110 (GLM-5), #1898 (GLM-4.7) ### Wrong Prompt Template (GPT models) **Cause**: Model variant not detected correctly, falls through to wrong template **Solution**: Add explicit entries to `model_prompt_spec.py` with correct pattern order **Reference**: #2233 (GPT-5.2-codex, GPT-5.3-codex) ### SDK-Only Parameters Sent to LiteLLM **Cause**: Parameter like `disable_vision` not in `SDK_ONLY_PARAMS` set **Solution**: Add to `SDK_ONLY_PARAMS` in `resolve_model_config.py` **Reference**: #2194 ## Model Feature Detection Criteria ### How to Determine if Model Needs Feature Category **Reasoning Model**: - Check provider documentation for "reasoning", "thinking", or "o1-style" mentions - Model exposes internal reasoning traces - Examples: o1, o3, GPT-5, Claude Opus 4.5+, Gemini 3+ **Extended Thinking**: - Check if model is Claude Sonnet 4.5+ or Claude Haiku 4.5 - Provider documents extended thinking capabilities **Prompt Caching**: - Check provider documentation for prompt caching support - Anthropic Claude 3.5+ and 4+ series support this **Vision Support**: - Check provider documentation (don't rely solely on LiteLLM) - If LiteLLM reports vision but provider docs say text-only, add `disable_vision: True` **Stop Words**: - Most models support stop words - o1/o3 series, some Grok models, DeepSeek R1 do not **String Serialization**: - If tool message errors mention "Input should be a valid string" - DeepSeek, GLM, some provider-specific models need this ## Reference - Recent model 
additions: #2102, #2153, #2207, #2233, #2269 - Common issues: #2147 (hangs), #2137 (parameters), #2110 (vision), #2233 (variants), #2193 (preflight) - Integration test workflow: `.github/workflows/integration-runner.yml` - Integration tests can be triggered via: `gh workflow run integration-runner.yml --ref <branch-name>` ================================================ FILE: .github/run-eval/AGENTS.md ================================================ # Model Configuration for OpenHands SDK See the [project root AGENTS.md](../../AGENTS.md) for repository-wide policies and workflows. This directory contains model configuration and evaluation setup for the OpenHands SDK. ## Key Files - **`resolve_model_config.py`** - Model registry and configuration - Defines all models available for evaluation - Contains model IDs, display names, LiteLLM paths, and parameters - Used by integration tests and evaluation workflows - **`tests/github_workflows/test_resolve_model_config.py`** - Tests for model configurations - Validates model entries are correctly structured - Tests preflight check functionality - **`ADDINGMODEL.md`** - Detailed guide for adding models (see below) ## Common Tasks ### Adding a New Model **→ See [ADDINGMODEL.md](./ADDINGMODEL.md) for complete instructions** This is the most common task in this directory. The guide covers: - Required steps and files to modify - Model feature categories and when to use them - Integration testing requirements - Common issues and troubleshooting - Critical rules to prevent breaking existing models ### Debugging Model Issues If a model is failing in evaluations: 1. Check the model configuration in `resolve_model_config.py` 2. Review parameter compatibility (especially `temperature` + `top_p` for Claude) 3. Check if model is in the correct feature categories in `openhands-sdk/openhands/sdk/llm/utils/model_features.py` 4.
Run preflight check: `MODEL_IDS="model-id" python resolve_model_config.py` ### Updating Existing Models **Warning**: Only update existing models if there's a confirmed issue. Working configurations should not be changed. If you must update: 1. Document why the change is needed (link to issue/PR showing the problem) 2. Test thoroughly before and after the change 3. Run integration tests to verify no regressions ## Directory Purpose This directory bridges model definitions with the evaluation system: - Models defined here are available for integration tests - Configuration includes LiteLLM routing and SDK-specific parameters - Preflight checks validate model accessibility before expensive evaluation runs - Tests ensure all models are correctly structured and resolvable ================================================ FILE: .github/run-eval/resolve_model_config.py ================================================ #!/usr/bin/env python3 """ Resolve model IDs to full model configurations and verify model availability. 
def _sigterm_handler(signum: int, _frame: object) -> None:
    """Report a fatal signal on stderr and terminate with exit status 1.

    Registered below for SIGTERM (and for SIGALRM when the platform defines
    it) so that a killed or timed-out run leaves a diagnostic message instead
    of dying silently.

    Args:
        signum: Number of the signal that was delivered.
        _frame: Interrupted stack frame passed by the signal machinery; unused.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    try:
        sig_name = signal.Signals(signum).name
    except ValueError:
        # A number outside the Signals enum must not turn the diagnostic
        # into a traceback; fall back to a plain numeric label.
        sig_name = f"signal {signum}"
    print(
        f"\nERROR: Process received {sig_name} during preflight check.\n"
        "This usually means the LiteLLM proxy is unreachable or hanging.\n"
        f"LLM_BASE_URL: {os.environ.get('LLM_BASE_URL', '(not set)')}\n",
        file=sys.stderr,
        flush=True,
    )
    sys.exit(1)


signal.signal(signal.SIGTERM, _sigterm_handler)
# SIGALRM is looked up with getattr because the signal module does not define
# it on every platform; the handler is only installed when it exists.
if sigalrm := getattr(signal, "SIGALRM", None):
    signal.signal(sigalrm, _sigterm_handler)
SDK_ONLY_PARAMS = {"disable_vision"} # Model configurations dictionary MODELS = { "claude-sonnet-4-5-20250929": { "id": "claude-sonnet-4-5-20250929", "display_name": "Claude Sonnet 4.5", "llm_config": { "model": "litellm_proxy/claude-sonnet-4-5-20250929", "temperature": 0.0, }, }, "kimi-k2-thinking": { "id": "kimi-k2-thinking", "display_name": "Kimi K2 Thinking", "llm_config": { "model": "litellm_proxy/moonshot/kimi-k2-thinking", "temperature": 1.0, }, }, # https://www.kimi.com/blog/kimi-k2-5.html "kimi-k2.5": { "id": "kimi-k2.5", "display_name": "Kimi K2.5", "llm_config": { "model": "litellm_proxy/moonshot/kimi-k2.5", "temperature": 1.0, "top_p": 0.95, }, }, # https://www.kimi.com/blog/kimi-k2-6 "kimi-k2.6": { "id": "kimi-k2.6", "display_name": "Kimi K2.6", "llm_config": { "model": "litellm_proxy/moonshot/kimi-k2.6", "temperature": 1.0, }, }, # https://www.alibabacloud.com/help/en/model-studio/deep-thinking "qwen3-max-thinking": { "id": "qwen3-max-thinking", "display_name": "Qwen3 Max Thinking", "llm_config": { "model": "litellm_proxy/dashscope/qwen3-max-2026-01-23", "litellm_extra_body": {"enable_thinking": True}, }, }, "qwen3.5-flash": { "id": "qwen3.5-flash", "display_name": "Qwen3.5 Flash", "llm_config": { "model": "litellm_proxy/dashscope/qwen3.5-flash-2026-02-23", "temperature": 0.0, }, }, "qwen3.6-plus": { "id": "qwen3.6-plus", "display_name": "Qwen3.6 Plus", "llm_config": { "model": "litellm_proxy/dashscope/qwen3.6-plus", "temperature": 0.0, }, }, "claude-4.5-opus": { "id": "claude-4.5-opus", "display_name": "Claude 4.5 Opus", "llm_config": { "model": "litellm_proxy/anthropic/claude-opus-4-5-20251101", "temperature": 0.0, }, }, "claude-4.6-opus": { "id": "claude-4.6-opus", "display_name": "Claude 4.6 Opus", "llm_config": { "model": "litellm_proxy/anthropic/claude-opus-4-6", "temperature": 0.0, }, }, "claude-opus-4-7": { "id": "claude-opus-4-7", "display_name": "Claude Opus 4.7", "llm_config": { "model": "litellm_proxy/anthropic/claude-opus-4-7", }, }, 
"claude-sonnet-4-6": { "id": "claude-sonnet-4-6", "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", "temperature": 0.0, }, }, "gemini-3-flash": { "id": "gemini-3-flash", "display_name": "Gemini 3 Flash", "llm_config": { "model": "litellm_proxy/gemini-3-flash-preview", "temperature": 0.0, }, }, "gemini-3.1-pro": { "id": "gemini-3.1-pro", "display_name": "Gemini 3.1 Pro", "llm_config": { "model": "litellm_proxy/gemini-3.1-pro-preview", "temperature": 0.0, }, }, "gpt-5.2": { "id": "gpt-5.2", "display_name": "GPT-5.2", "llm_config": {"model": "litellm_proxy/openai/gpt-5.2-2025-12-11"}, }, "gpt-5.2-codex": { "id": "gpt-5.2-codex", "display_name": "GPT-5.2 Codex", "llm_config": {"model": "litellm_proxy/gpt-5.2-codex"}, }, "gpt-5-3-codex": { "id": "gpt-5-3-codex", "display_name": "GPT-5.3 Codex", "llm_config": {"model": "litellm_proxy/gpt-5-3-codex"}, }, "gpt-5.2-high-reasoning": { "id": "gpt-5.2-high-reasoning", "display_name": "GPT-5.2 High Reasoning", "llm_config": { "model": "litellm_proxy/openai/gpt-5.2-2025-12-11", "reasoning_effort": "high", }, }, "gpt-5.4": { "id": "gpt-5.4", "display_name": "GPT-5.4", "llm_config": { "model": "litellm_proxy/openai/gpt-5.4", "reasoning_effort": "high", }, }, "gpt-5.5": { "id": "gpt-5.5", "display_name": "GPT-5.5", "llm_config": { "model": "litellm_proxy/openai/gpt-5.5", "reasoning_effort": "high", }, }, "minimax-m2": { "id": "minimax-m2", "display_name": "MiniMax M2", "llm_config": { "model": "litellm_proxy/minimax/minimax-m2", "temperature": 0.0, }, }, "minimax-m2.5": { "id": "minimax-m2.5", "display_name": "MiniMax M2.5", "llm_config": { "model": "litellm_proxy/minimax/MiniMax-M2.5", "temperature": 1.0, "top_p": 0.95, }, }, "minimax-m2.1": { "id": "minimax-m2.1", "display_name": "MiniMax M2.1", "llm_config": { "model": "litellm_proxy/minimax/MiniMax-M2.1", "temperature": 0.0, }, }, "minimax-m2.7": { "id": "minimax-m2.7", "display_name": "MiniMax M2.7", "llm_config": { 
"model": "litellm_proxy/minimax/MiniMax-M2.7", "temperature": 1.0, "top_p": 0.95, }, }, "deepseek-v3.2-reasoner": { "id": "deepseek-v3.2-reasoner", "display_name": "DeepSeek V3.2 Reasoner", "llm_config": {"model": "litellm_proxy/deepseek/deepseek-reasoner"}, }, # https://api-docs.deepseek.com/news/news260424 "deepseek-v4-pro": { "id": "deepseek-v4-pro", "display_name": "DeepSeek V4 Pro", "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v4-pro"}, }, "deepseek-v4-flash": { "id": "deepseek-v4-flash", "display_name": "DeepSeek V4 Flash", "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v4-flash"}, }, "qwen-3-coder": { "id": "qwen-3-coder", "display_name": "Qwen 3 Coder", "llm_config": { "model": "litellm_proxy/fireworks_ai/qwen3-coder-480b-a35b-instruct", "temperature": 0.0, }, }, "nemotron-3-nano-30b": { "id": "nemotron-3-nano-30b", "display_name": "NVIDIA Nemotron 3 Nano 30B", "llm_config": { "model": "litellm_proxy/openai/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", "temperature": 0.0, }, }, "glm-4.7": { "id": "glm-4.7", "display_name": "GLM-4.7", "llm_config": { "model": "litellm_proxy/openrouter/z-ai/glm-4.7", "temperature": 0.0, # OpenRouter glm-4.7 is text-only despite LiteLLM reporting vision support "disable_vision": True, }, }, "glm-5": { "id": "glm-5", "display_name": "GLM-5", "llm_config": { "model": "litellm_proxy/openrouter/z-ai/glm-5", "temperature": 0.0, # OpenRouter glm-5 is text-only despite LiteLLM reporting vision support "disable_vision": True, }, }, "glm-5.1": { "id": "glm-5.1", "display_name": "GLM-5.1", "llm_config": { "model": "litellm_proxy/openrouter/z-ai/glm-5.1", "temperature": 0.0, # OpenRouter glm-5.1 is text-only despite LiteLLM reporting vision support "disable_vision": True, }, }, "qwen3-coder-next": { "id": "qwen3-coder-next", "display_name": "Qwen3 Coder Next", "llm_config": { "model": "litellm_proxy/openrouter/qwen/qwen3-coder-next", "temperature": 0.0, }, }, "qwen3-coder-30b-a3b-instruct": { "id": 
def error_exit(msg: str, exit_code: int = 1) -> None:
    """Write ``ERROR: <msg>`` to stderr and terminate the process.

    Args:
        msg: Human-readable description of the failure.
        exit_code: Process exit status to use.

    Raises:
        SystemExit: Always, carrying ``exit_code``.
    """
    line = f"ERROR: {msg}"
    print(line, file=sys.stderr)
    sys.exit(exit_code)


def get_required_env(key: str) -> str:
    """Return the value of environment variable ``key``.

    An unset or empty variable is reported through ``error_exit``.
    """
    value = os.environ.get(key)
    if value:
        return value
    error_exit(f"{key} not set")
    return value


def find_models_by_id(model_ids: list[str]) -> list[dict]:
    """Look up each requested ID in ``MODELS``, failing fast on the first miss.

    Args:
        model_ids: Model IDs to resolve, in the order they should be returned.

    Returns:
        The matching model dictionaries, in request order.

    Raises:
        SystemExit: If an ID is not a key of ``MODELS``; the error message
            lists the available IDs in sorted order.
    """
    found = []
    for wanted in model_ids:
        if wanted not in MODELS:
            known_ids = ", ".join(sorted(MODELS))
            error_exit(
                f"Model ID '{wanted}' not found. Available models: {known_ids}"
            )
        found.append(MODELS[wanted])
    return found
litellm.exceptions.BadRequestError as e: return False, f"✗ {display_name}: Bad request - {e}" except litellm.exceptions.NotFoundError as e: return False, f"✗ {display_name}: Model not found - {e}" except Exception as e: return False, f"✗ {display_name}: {type(e).__name__} - {e}" # Alias for backward compatibility with tests test_model = check_model def _check_proxy_reachable( base_url: str, api_key: str | None = None, timeout: int = 10 ) -> tuple[bool, str]: """Quick health check: can we reach the proxy at all? Uses /v1/models (standard OpenAI-compatible endpoint) which works with any valid API key. The /health endpoint requires admin-level access on some LiteLLM configurations. """ import urllib.error import urllib.request models_url = f"{base_url.rstrip('/')}/v1/models" try: req = urllib.request.Request(models_url, method="GET") if api_key: req.add_header("Authorization", f"Bearer {api_key}") urllib.request.urlopen(req, timeout=timeout) return True, f"Proxy reachable at {base_url}" except urllib.error.URLError as e: return False, f"Cannot reach proxy at {base_url}: {e.reason}" except Exception as e: return False, f"Cannot reach proxy at {base_url}: {type(e).__name__}: {e}" def run_preflight_check(models: list[dict[str, Any]]) -> bool: """Run preflight LLM check for all models. 
Args: models: List of model configurations to test Returns: True if all models passed, False otherwise """ api_key = os.environ.get("LLM_API_KEY") base_url = os.environ.get("LLM_BASE_URL", "https://llm-proxy.eval.all-hands.dev") skip_preflight = os.environ.get("SKIP_PREFLIGHT", "").lower() == "true" if skip_preflight: print("Preflight check: SKIPPED (SKIP_PREFLIGHT=true)") return True if not api_key: print("Preflight check: SKIPPED (LLM_API_KEY not set)") return True # Quick connectivity check before trying expensive model completions print(f"\nChecking proxy connectivity: {base_url}", flush=True) reachable, msg = _check_proxy_reachable(base_url, api_key=api_key) if not reachable: print(f"✗ {msg}", file=sys.stderr, flush=True) print( "\nThe LiteLLM proxy appears to be down or unreachable.\n" "Set SKIP_PREFLIGHT=true to bypass this check.", file=sys.stderr, flush=True, ) return False print(f"✓ {msg}", flush=True) print(f"\nPreflight LLM check for {len(models)} model(s)...", flush=True) print("-" * 50, flush=True) all_passed = True for model_config in models: display_name = model_config.get("display_name", "unknown") print(f" Checking {display_name}...", end=" ", flush=True) t0 = time.monotonic() success, message = check_model(model_config, api_key, base_url) elapsed = time.monotonic() - t0 print(f"({elapsed:.1f}s)", flush=True) print(f" {message}", flush=True) if not success: all_passed = False print("-" * 50, flush=True) if all_passed: print(f"✓ All {len(models)} model(s) passed preflight check\n", flush=True) else: print("✗ Some models failed preflight check", flush=True) print("Evaluation aborted to avoid wasting compute resources.\n", flush=True) return all_passed def main() -> None: model_ids_str = get_required_env("MODEL_IDS") github_output = get_required_env("GITHUB_OUTPUT") # Parse requested model IDs model_ids = [mid.strip() for mid in model_ids_str.split(",") if mid.strip()] # Resolve model configs resolved = find_models_by_id(model_ids) print(f"Resolved 
# Semantic version pattern: optional 'v' prefix, followed by MAJOR.MINOR.PATCH
# Optionally allows pre-release (-alpha.1, -beta.2, -rc.1) and build metadata
SEMVER_PATTERN = re.compile(
    r"^v?"  # Optional 'v' prefix
    r"(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)"  # MAJOR.MINOR.PATCH
    r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)"  # Pre-release
    r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?"  # More pre-release
    r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"  # Build metadata
)

COMMIT_SHA_PATTERN = re.compile(r"^[0-9a-fA-F]{7,40}$")
BRANCH_EXAMPLES = "'main', 'feature/foo', or 'release/1.2.3'"


def is_semantic_version(ref: str) -> bool:
    """Check if the given reference is a valid semantic version.

    ``fullmatch`` is used rather than ``match`` because the pattern's ``$``
    also matches just before a trailing newline, which would let a value such
    as ``"1.0.0\\n"`` through.
    """
    return SEMVER_PATTERN.fullmatch(ref) is not None


def is_commit_sha(ref: str) -> bool:
    """Check if the given reference is a git commit SHA (7-40 hex digits)."""
    return COMMIT_SHA_PATTERN.fullmatch(ref) is not None


def is_valid_branch_name(ref: str) -> bool:
    """Check if the given reference is a valid git branch name.

    Runs ``git check-ref-format --branch <ref>`` and reports whether it exited
    with status 0. The command's output is captured and discarded.
    """
    completed = subprocess.run(
        ["git", "check-ref-format", "--branch", ref],
        check=False,
        capture_output=True,
        text=True,
    )
    return completed.returncode == 0


def validate_branch_name(branch_name: str, input_name: str) -> tuple[bool, str]:
    """Validate a workflow branch input against git branch naming rules.

    Args:
        branch_name: The branch name supplied by the workflow.
        input_name: Name of the input, used only in the returned message.

    Returns:
        ``(is_valid, message)``.
    """
    if is_valid_branch_name(branch_name):
        return True, f"Valid {input_name}: {branch_name}"
    return False, (
        f"{input_name} '{branch_name}' is not a valid git branch name. "
        f"Common GitHub/GitLab/Bitbucket branch names look like {BRANCH_EXAMPLES}."
    )


def validate_sdk_ref(sdk_ref: str, allow_unreleased: bool) -> tuple[bool, str]:
    """Validate the SDK reference.

    A semantic version is always accepted. A commit SHA or a valid branch name
    is accepted only when ``allow_unreleased`` is true; git is consulted only
    in that case, and only if the ref is not already a SHA.

    Returns:
        ``(is_valid, message)``.
    """
    if is_semantic_version(sdk_ref):
        return True, f"Valid semantic version: {sdk_ref}"

    if allow_unreleased and (is_commit_sha(sdk_ref) or is_valid_branch_name(sdk_ref)):
        return True, f"Valid unreleased git ref: {sdk_ref}"

    if allow_unreleased:
        return False, (
            f"SDK reference '{sdk_ref}' is not a valid semantic version, commit SHA, "
            "or git branch name. Common GitHub/GitLab/Bitbucket branch names look "
            f"like {BRANCH_EXAMPLES}."
        )

    return False, (
        f"SDK reference '{sdk_ref}' is not a valid semantic version. "
        "Expected format: v1.0.0 or 1.0.0 (with optional pre-release like -alpha.1). "
        "To use unreleased branches, check 'Allow unreleased branches'."
    )


def main() -> None:
    """Validate SDK_REF and the optional branch inputs from the environment.

    Reads ``SDK_REF``, ``ALLOW_UNRELEASED_BRANCHES``, ``EVAL_BRANCH`` and
    ``BENCHMARKS_BRANCH``. Each result is printed with a check or cross mark
    (stdout for valid, stderr for invalid). Every result is reported before
    exiting so all problems are visible in a single run.

    Raises:
        SystemExit: With code 1 if ``SDK_REF`` is unset or any validation fails.
    """
    sdk_ref = os.environ.get("SDK_REF", "")
    allow_unreleased_str = os.environ.get("ALLOW_UNRELEASED_BRANCHES", "false")
    eval_branch = os.environ.get("EVAL_BRANCH")
    benchmarks_branch = os.environ.get("BENCHMARKS_BRANCH")

    if not sdk_ref:
        print("ERROR: SDK_REF environment variable is not set", file=sys.stderr)
        sys.exit(1)

    allow_unreleased = allow_unreleased_str.lower() == "true"

    validations = [
        validate_sdk_ref(sdk_ref, allow_unreleased),
    ]
    if eval_branch:
        validations.append(validate_branch_name(eval_branch, "EVAL_BRANCH"))
    if benchmarks_branch:
        validations.append(validate_branch_name(benchmarks_branch, "BENCHMARKS_BRANCH"))

    any_invalid = False
    for is_valid, message in validations:
        stream = sys.stdout if is_valid else sys.stderr
        print(("✓" if is_valid else "✗") + f" {message}", file=stream)
        if not is_valid:
            any_invalid = True

    if any_invalid:
        sys.exit(1)


if __name__ == "__main__":
    main()
# Keep this in sync with REST_ROUTE_DEPRECATION_RE in check_deprecations.py so
# the REST breakage and deprecation checks recognize the same wording.
#
# The two named groups are read by _parse_openapi_deprecation_description as
# match.group("deprecated") and match.group("removed"). Both character classes
# include ".", so a captured version may carry the sentence's trailing period;
# callers strip it.
REST_ROUTE_DEPRECATION_RE = re.compile(
    r"Deprecated since v(?P<deprecated>[0-9A-Za-z.+-]+)\s+"
    r"and scheduled for removal in v(?P<removed>[0-9A-Za-z.+-]+)\.?",
    re.IGNORECASE,
)
"-c", OPENAPI_PROGRAM, str(source_tree)], check=True, capture_output=True, text=True, cwd=source_tree, ) return json.loads(result.stdout) except subprocess.CalledProcessError as exc: output = (exc.stdout or "") + ("\n" + exc.stderr if exc.stderr else "") excerpt = output.strip()[-1000:] print( f"::warning title={PYPI_DISTRIBUTION} REST API::Failed to generate " f"OpenAPI schema for {label}: {exc}\n{excerpt}" ) return None except Exception as exc: print( f"::warning title={PYPI_DISTRIBUTION} REST API::Failed to generate " f"OpenAPI schema for {label}: {exc}" ) return None def _generate_current_openapi() -> dict | None: return _generate_openapi_from_source_tree(REPO_ROOT, "current workspace") def _generate_openapi_for_git_ref(git_ref: str) -> dict | None: with tempfile.TemporaryDirectory(prefix="agent-server-openapi-") as tmp: source_tree = Path(tmp) try: archive = subprocess.run( ["git", "-C", str(REPO_ROOT), "archive", git_ref], check=True, capture_output=True, ) subprocess.run( ["tar", "-x", "-C", str(source_tree)], check=True, input=archive.stdout, capture_output=True, ) except subprocess.CalledProcessError as exc: output = (exc.stdout or b"") + (b"\n" + exc.stderr if exc.stderr else b"") excerpt = output.decode(errors="replace").strip()[-1000:] print( f"::warning title={PYPI_DISTRIBUTION} REST API::Failed to extract " f"source for {git_ref}: {exc}\n{excerpt}" ) return None return _generate_openapi_from_source_tree(source_tree, git_ref) def _dotted_name(node: ast.AST) -> str | None: if isinstance(node, ast.Name): return node.id if isinstance(node, ast.Attribute): prefix = _dotted_name(node.value) if prefix is None: return None return f"{prefix}.{node.attr}" return None def _find_sdk_deprecated_fastapi_routes_in_file( file_path: Path, repo_root: Path ) -> list[str]: tree = ast.parse(file_path.read_text(), filename=str(file_path)) deprecated_names: set[str] = set() deprecation_module_names: set[str] = set() for node in tree.body: if isinstance(node, 
ast.ImportFrom): if node.module == "openhands.sdk.utils.deprecation": for alias in node.names: if alias.name == "deprecated": deprecated_names.add(alias.asname or alias.name) elif node.module == "openhands.sdk.utils": for alias in node.names: if alias.name == "deprecation": deprecation_module_names.add(alias.asname or alias.name) elif isinstance(node, ast.Import): for alias in node.names: if alias.name == "openhands.sdk.utils.deprecation": deprecation_module_names.add(alias.asname or alias.name) errors: list[str] = [] for node in ast.walk(tree): if not isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef): continue has_route_decorator = False uses_sdk_deprecated = False for decorator in node.decorator_list: if not isinstance(decorator, ast.Call): continue dotted_name = _dotted_name(decorator.func) if ( isinstance(decorator.func, ast.Attribute) and decorator.func.attr in ROUTE_DECORATOR_NAMES ): has_route_decorator = True if dotted_name in deprecated_names or ( dotted_name == "openhands.sdk.utils.deprecation.deprecated" ): uses_sdk_deprecated = True continue if ( isinstance(decorator.func, ast.Attribute) and decorator.func.attr == "deprecated" ): base_name = _dotted_name(decorator.func.value) if base_name in deprecation_module_names or ( base_name == "openhands.sdk.utils.deprecation" ): uses_sdk_deprecated = True if has_route_decorator and uses_sdk_deprecated: rel_path = file_path.relative_to(repo_root).as_posix() errors.append( f"{rel_path}:{node.lineno} FastAPI route `{node.name}` uses " "openhands.sdk.utils.deprecation.deprecated; use the route " "decorator's deprecated=True flag instead." 
def _filter_public_rest_openapi(schema: dict) -> dict:
    """Return a shallow copy of ``schema`` limited to public REST paths.

    A path is kept when it starts with ``PUBLIC_REST_PATH_PREFIX`` or equals
    that prefix without its trailing slash. All other top-level keys are
    carried over unchanged; the input mapping is not modified.
    """
    bare_prefix = PUBLIC_REST_PATH_PREFIX.rstrip("/")
    public_paths = {}
    for route, item in schema.get("paths", {}).items():
        if route == bare_prefix or route.startswith(PUBLIC_REST_PATH_PREFIX):
            public_paths[route] = item
    return {**schema, "paths": public_paths}


def _find_deprecation_policy_errors(schema: dict) -> list[str]:
    """List operations whose description announces a deprecation without the flag.

    An operation is reported when its description contains "deprecated since"
    (case-insensitive) but its ``deprecated`` field is not exactly ``True``.
    Only keys in ``HTTP_METHODS`` whose value is a dict are treated as
    operations.
    """
    problems: list[str] = []
    for route, item in schema.get("paths", {}).items():
        if not isinstance(item, dict):
            continue
        for verb, op in item.items():
            if not (verb in HTTP_METHODS and isinstance(op, dict)):
                continue
            announced = "deprecated since" in (op.get("description") or "").lower()
            if announced and op.get("deprecated") is not True:
                problems.append(
                    f"{verb.upper()} {route} documents deprecation in its "
                    "description but is not marked deprecated=true in OpenAPI."
                )
    return problems
""" if not description: return None match = REST_ROUTE_DEPRECATION_RE.search(" ".join(description.split())) if match is None: return None return match.group("deprecated").rstrip("."), match.group("removed").rstrip(".") def _version_ge(current: str, target: str) -> bool: try: return pkg_version.parse(current) >= pkg_version.parse(target) except pkg_version.InvalidVersion as exc: raise SystemExit( f"Invalid semantic version comparison: {current=} {target=}" ) from exc def _get_openapi_operation(schema: dict, path: str, method: str) -> dict | None: path_item = schema.get("paths", {}).get(path) if not isinstance(path_item, dict): return None operation = path_item.get(method.lower()) if not isinstance(operation, dict): return None return operation def _validate_removed_operations( removed_operations: list[dict], prev_schema: dict, current_version: str, ) -> list[str]: """Validate removed operations against the baseline deprecation metadata.""" errors: list[str] = [] for operation in removed_operations: path = str(operation.get("path", "")) method = str(operation.get("method", "")).lower() method_label = method.upper() or "" if not operation.get("deprecated", False): errors.append( f"Removed {method_label} {path} without prior deprecation " "(deprecated=true)." ) continue baseline_operation = _get_openapi_operation(prev_schema, path, method) if baseline_operation is None: errors.append( f"Removed {method_label} {path} was marked deprecated in the " "baseline release, but the previous OpenAPI schema could not be " "inspected for its scheduled removal version." ) continue deprecation_details = _parse_openapi_deprecation_description( baseline_operation.get("description") ) if deprecation_details is None: errors.append( f"Removed {method_label} {path} was marked deprecated in the " "baseline release, but its OpenAPI description does not declare " "a scheduled removal version. REST API removals require 5 minor " "releases of deprecation runway." 
) continue _, removed_in = deprecation_details if not _version_ge(current_version, removed_in): errors.append( f"Removed {method_label} {path} before its scheduled removal " f"version v{removed_in} (current version: v{current_version}). " "REST API removals require 5 minor releases of deprecation " "runway." ) continue print( f"::notice title={PYPI_DISTRIBUTION} REST API::Removed previously-" f"deprecated {method_label} {path} after its scheduled removal " f"version v{removed_in}." ) return errors def _iter_schema_properties(schema: dict): if not isinstance(schema, dict): return properties = schema.get("properties") if isinstance(properties, dict): for property_name, property_schema in properties.items(): if isinstance(property_schema, dict): yield property_name, property_schema for value in schema.values(): if isinstance(value, dict): yield from _iter_schema_properties(value) elif isinstance(value, list): for item in value: if isinstance(item, dict): yield from _iter_schema_properties(item) def _removed_property_name(change: dict) -> str | None: text = str(change.get("text", "")) match = re.search( r"(?:request property|optional property|required property) `([^`]+)`", text, ) if match is None: return None return match.group(1).rstrip("/").rsplit("/", maxsplit=1)[-1] def _validate_removed_schema_properties( removed_properties: list[dict], prev_schema: dict, current_version: str, ) -> list[str]: """Validate removed schema properties against baseline deprecation metadata.""" errors: list[str] = [] baseline_properties: dict[str, list[dict]] = {} for property_name, property_schema in _iter_schema_properties(prev_schema): baseline_properties.setdefault(property_name, []).append(property_schema) for change in removed_properties: property_name = _removed_property_name(change) if property_name is None: errors.append( "Removed schema property could not be identified from oasdiff output: " f"{change.get('text', str(change))}" ) continue deprecated_candidates = [ 
property_schema for property_schema in baseline_properties.get(property_name, []) if property_schema.get("deprecated") is True ] if not deprecated_candidates: errors.append( f"Removed schema property {property_name!r} without prior " "deprecation (deprecated=true)." ) continue removal_targets = [ deprecation_details[1] for property_schema in deprecated_candidates if ( deprecation_details := _parse_openapi_deprecation_description( property_schema.get("description") ) ) is not None ] if not removal_targets: errors.append( f"Removed schema property {property_name!r} was marked deprecated " "in the baseline release, but its OpenAPI description does not " "declare a scheduled removal version. REST API property removals " "require 5 minor releases of deprecation runway." ) continue if not any( _version_ge(current_version, removed_in) for removed_in in removal_targets ): errors.append( f"Removed schema property {property_name!r} before its scheduled " f"removal version(s): {', '.join(f'v{v}' for v in removal_targets)} " f"(current version: v{current_version}). REST API property removals " "require 5 minor releases of deprecation runway." ) continue print( f"::notice title={PYPI_DISTRIBUTION} REST API::Removed previously-" f"deprecated schema property {property_name!r} after its scheduled " "removal version was reached." ) return errors # oasdiff rule IDs for additive oneOf/anyOf expansion in response schemas. # These are flagged as ERR by oasdiff but are expected evolution for extensible # discriminated-union APIs (e.g. the events endpoint). We downgrade them to # informational notices so they don't block CI. _ADDITIVE_RESPONSE_ONEOF_IDS = frozenset( { "response-body-one-of-added", "response-property-one-of-added", # Keep the anyOf variants here too so that if oasdiff ever reports them # as breakages, additive response-union expansion gets the same # downgrade without further script changes. 
def _is_union_property_removal_artifact(change: dict) -> bool:
    """Return True for property removals that are artifacts of union widening.

    When a request or response schema is widened from a concrete object schema
    to an additive oneOf/anyOf union, oasdiff can emit secondary "removed
    property" reports for the original object's fields even though the
    original schema is still present as one union member.

    The match is textual and case-insensitive: the change id must contain both
    "removed" and "property", and the change text must mention either
    "from the response" or "request property".
    """
    ident = str(change.get("id", "")).lower()
    wording = str(change.get("text", "")).lower()
    if "removed" not in ident or "property" not in ident:
        return False
    return "from the response" in wording or "request property" in wording


def _is_union_type_change_artifact(change: dict) -> bool:
    """Return True when the change text reports an object type becoming untyped.

    Checks, case-insensitively, for the exact oasdiff phrase describing a
    change from type ``object`` to an empty type/format.
    """
    wording = str(change.get("text", "")).lower()
    return "type/format changed from `object`/`` to ``/``" in wording
def _normalize_openapi_for_oasdiff(schema: dict) -> dict:
    """Normalize OpenAPI 3.1 schema for oasdiff compatibility.

    oasdiff expects OpenAPI 3.0-style exclusiveMinimum/exclusiveMaximum
    booleans (https://spec.openapis.org/oas/v3.0.3.html#schema-object), while
    OpenAPI 3.1 emits numeric values. Numeric exclusives are converted into
    minimum/maximum plus a boolean exclusive flag so oasdiff can parse the
    schema.

    In the 3.1 form an inclusive bound and a numeric exclusive bound are
    independent constraints that must both hold. When a node carries both,
    the tighter one is kept:

    * exclusive bound at least as tight -> it becomes the minimum/maximum and
      the exclusive flag is set to True;
    * inclusive bound strictly tighter -> it is kept and the exclusive flag is
      set to False, because the exclusive bound is redundant.

    Values that are already booleans are left untouched. The walk visits every
    nested dict and list.

    Mutates the schema in place and returns it for convenience.
    """

    def _is_number(value: object) -> bool:
        # bool is a subclass of int; a boolean here is already the 3.0 flag.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _convert(node: dict, exclusive_key: str, bound_key: str, *, lower: bool) -> None:
        if exclusive_key not in node:
            return
        exclusive = node[exclusive_key]
        if not _is_number(exclusive):
            return

        has_bound = bound_key in node
        inclusive = node.get(bound_key)
        if has_bound and not _is_number(inclusive):
            # Malformed inclusive bound: leave it alone and only flip the flag.
            node[exclusive_key] = True
            return
        if has_bound and (inclusive > exclusive if lower else inclusive < exclusive):
            # The inclusive bound already implies the exclusive one.
            node[exclusive_key] = False
            return

        node[bound_key] = exclusive
        node[exclusive_key] = True

    def _walk(node: object) -> None:
        if isinstance(node, dict):
            _convert(node, "exclusiveMinimum", "minimum", lower=True)
            _convert(node, "exclusiveMaximum", "maximum", lower=False)
            for child in node.values():
                _walk(child)
        elif isinstance(node, list):
            for child in node:
                _walk(child)

    _walk(schema)
    return schema
""" try: result = subprocess.run( [ "oasdiff", "breaking", "-f", "json", "--fail-on", "ERR", str(prev_spec), str(cur_spec), ], capture_output=True, text=True, ) except FileNotFoundError: print( "::warning title=oasdiff not found::" "Please install oasdiff: https://github.com/oasdiff/oasdiff" ) return [], 0 breaking_changes = [] if result.stdout: try: breaking_changes = json.loads(result.stdout) except json.JSONDecodeError: pass return breaking_changes, result.returncode def main() -> int: current_version = _read_version_from_pyproject(AGENT_SERVER_PYPROJECT) baseline_version = _get_baseline_version(PYPI_DISTRIBUTION, current_version) if baseline_version is None: print( f"::warning title={PYPI_DISTRIBUTION} REST API::Unable to find baseline " f"version for {current_version}; skipping breakage checks." ) return 0 baseline_git_ref = f"v{baseline_version}" static_policy_errors = _find_sdk_deprecated_fastapi_routes(REPO_ROOT) for error in static_policy_errors: print(f"::error title={PYPI_DISTRIBUTION} REST API::{error}") current_schema = _generate_current_openapi() if current_schema is None: return 1 current_schema = _filter_public_rest_openapi(current_schema) deprecation_policy_errors = _find_deprecation_policy_errors(current_schema) for error in deprecation_policy_errors: print(f"::error title={PYPI_DISTRIBUTION} REST API::{error}") prev_schema = _generate_openapi_for_git_ref(baseline_git_ref) if prev_schema is None: return 0 if not (static_policy_errors or deprecation_policy_errors) else 1 prev_schema = _filter_public_rest_openapi(prev_schema) prev_schema = _normalize_openapi_for_oasdiff(prev_schema) current_schema = _normalize_openapi_for_oasdiff(current_schema) with tempfile.TemporaryDirectory(prefix="oasdiff-specs-") as tmp: tmp_path = Path(tmp) prev_spec_file = tmp_path / "prev_spec.json" cur_spec_file = tmp_path / "cur_spec.json" prev_spec_file.write_text(json.dumps(prev_schema, indent=2)) cur_spec_file.write_text(json.dumps(current_schema, indent=2)) 
breaking_changes, exit_code = _run_oasdiff_breakage_check( prev_spec_file, cur_spec_file ) if not breaking_changes: if exit_code == 0: print("No breaking changes detected.") else: print( f"oasdiff returned exit code {exit_code} but no breaking changes " "in JSON format. There may be warnings only." ) else: ( removed_operations, removed_schema_properties, additive_response_oneof, other_breaking_changes, ) = _split_breaking_changes(breaking_changes) response_union_artifacts = [ change for change in removed_schema_properties if _is_union_property_removal_artifact(change) ] removed_schema_properties = [ change for change in removed_schema_properties if not _is_union_property_removal_artifact(change) ] union_type_artifacts = [ change for change in other_breaking_changes if _is_union_type_change_artifact(change) ] other_breaking_changes = [ change for change in other_breaking_changes if not _is_union_type_change_artifact(change) ] removal_errors = _validate_removed_operations( removed_operations, prev_schema, current_version, ) property_removal_errors = _validate_removed_schema_properties( removed_schema_properties, prev_schema, current_version, ) for error in removal_errors + property_removal_errors: print(f"::error title={PYPI_DISTRIBUTION} REST API::{error}") if additive_response_oneof: print( f"\n::notice title={PYPI_DISTRIBUTION} REST API::" "Additive oneOf/anyOf expansion detected in response schemas. " "This is expected for extensible discriminated-union APIs and " "does not break backward compatibility." 
) for item in additive_response_oneof: print(f" - {item.get('text', str(item))}") if response_union_artifacts: print( " - ignored " f"{len(response_union_artifacts)} request/response-property " "removal artifact(s) caused by union widening" ) if union_type_artifacts: print( " - ignored " f"{len(union_type_artifacts)} request/response type-change " "artifact(s) caused by union widening" ) if other_breaking_changes: print( "::error " f"title={PYPI_DISTRIBUTION} REST API::Detected breaking REST API " "changes other than removing previously-deprecated operations/" "properties or additive response oneOf expansions. " "REST contract changes must preserve compatibility for 5 minor " "releases; keep the old contract available until its scheduled " "removal version." ) elif ( response_union_artifacts or union_type_artifacts ) and not additive_response_oneof: print( f"\n::notice title={PYPI_DISTRIBUTION} REST API::" f"Ignored {len(response_union_artifacts)} property-removal and " f"{len(union_type_artifacts)} type-change artifact(s) reported " "while widening schemas." ) print("\nBreaking REST API changes detected compared to baseline release:") for text in breaking_changes: print(f"- {text.get('text', str(text))}") if not (removal_errors or property_removal_errors or other_breaking_changes): print( "Breaking changes are limited to previously-deprecated operations " "or properties whose scheduled removal versions have been reached, " "and/or additive response oneOf expansions." ) else: return 1 return 1 if (static_policy_errors or deprecation_policy_errors) else 0 if __name__ == "__main__": raise SystemExit(main()) ================================================ FILE: .github/scripts/check_deprecations.py ================================================ #!/usr/bin/env python3 """Static analysis for deprecation deadlines. This script scans Python deprecation metadata (`deprecated`, `warn_deprecated`, `warn_cleanup`) and agent-server REST routes marked `deprecated=True`. 
# Matches the docstring note required on deprecated REST routes, e.g.
# "Deprecated since v1.2.0 and scheduled for removal in v1.7.0."
# The named groups "deprecated" and "removed" are read by
# _parse_rest_route_deprecation_docstring. The version character class
# contains ".", so a sentence-ending period can be captured as part of the
# version and has to be stripped by the caller.
REST_ROUTE_DEPRECATION_RE = re.compile(
    r"Deprecated since v(?P<deprecated>[0-9A-Za-z.+-]+)\s+"
    r"and scheduled for removal in v(?P<removed>[0-9A-Za-z.+-]+)\.?",
    re.IGNORECASE,
)
= ( PackageConfig( name="openhands-sdk", pyproject=REPO_ROOT / "openhands-sdk" / "pyproject.toml", source_roots=(REPO_ROOT / "openhands-sdk" / "openhands" / "sdk",), ), PackageConfig( name="openhands-tools", pyproject=REPO_ROOT / "openhands-tools" / "pyproject.toml", source_roots=(REPO_ROOT / "openhands-tools" / "openhands" / "tools",), ), PackageConfig( name="openhands-workspace", pyproject=REPO_ROOT / "openhands-workspace" / "pyproject.toml", source_roots=(REPO_ROOT / "openhands-workspace" / "openhands" / "workspace",), ), PackageConfig( name="openhands-agent-server", pyproject=REPO_ROOT / "openhands-agent-server" / "pyproject.toml", source_roots=( REPO_ROOT / "openhands-agent-server" / "openhands" / "agent_server", ), ), ) @dataclass(slots=True) class DeprecationRecord: identifier: str removed_in: str | date | None deprecated_in: str | None path: Path line: int kind: Literal["decorator", "warn_call", "cleanup_call", "rest_route"] package: str def _load_current_version(pyproject: Path) -> str: data = tomllib.loads(pyproject.read_text()) try: return str(data["project"]["version"]) except KeyError as exc: # pragma: no cover - configuration error raise SystemExit( f"Unable to determine project version from {pyproject}" ) from exc def _iter_python_files(root: Path) -> Iterator[Path]: for path in root.rglob("*.py"): if path.name == "__init__.py" and path.parent == root: continue yield path def _parse_removed_value( node: ast.AST | None, *, path: Path, line: int, ) -> str | date | None: if node is None: return None expression = ast.unparse(node) if isinstance(node, ast.Constant): if isinstance(node.value, str): return node.value if node.value is None: return None raise SystemExit( f"Unsupported removed_in literal at {path}:{line}: {expression}" ) if isinstance(node, ast.Call): func = node.func if isinstance(func, ast.Name) and func.id == "date": try: args = [_safe_int_literal(arg) for arg in node.args] kwargs = { kw.arg: _safe_int_literal(kw.value) for kw in 
def _safe_int_literal(node: ast.AST) -> int:
    """Return the value of an integer literal node.

    Only a plain ``int`` constant is accepted. ``bool`` is rejected
    explicitly: it is a subclass of ``int``, so ``True``/``False`` would
    otherwise pass the check and be used as 1/0.

    Args:
        node: AST node expected to be an integer constant.

    Returns:
        The integer held by the constant.

    Raises:
        ValueError: If *node* is anything other than an integer literal.
    """
    value = node.value if isinstance(node, ast.Constant) else None
    if isinstance(value, bool) or not isinstance(value, int):
        raise ValueError(
            f"Unsupported expression inside literal evaluation: {ast.unparse(node)}"
        )
    return value
def _parse_rest_route_deprecation_docstring(
    docstring: str | None,
    *,
    path: Path,
    line: int,
    route_identifiers: Sequence[str],
) -> tuple[str, str]:
    """Extract the deprecation and removal versions from a route docstring.

    Args:
        docstring: Docstring of the route handler, or None when it has none.
        path: File containing the handler; used only in the error message.
        line: Line of the handler; used only in the error message.
        route_identifiers: Labels of the routes declared on the handler; used
            only in the error message.

    Returns:
        ``(deprecated_in, removed_in)`` with trailing periods stripped.

    Raises:
        SystemExit: If the docstring is missing or empty, or does not contain
            a note matching ``REST_ROUTE_DEPRECATION_RE``.
    """
    # Whitespace is collapsed first so a note wrapped over several lines
    # still matches; a missing docstring is handled like a failed match.
    found = (
        REST_ROUTE_DEPRECATION_RE.search(" ".join(docstring.split()))
        if docstring
        else None
    )
    if found is None:
        raise SystemExit(
            "Deprecated REST route(s) "
            f"{', '.join(route_identifiers)} at {path}:{line} must include a "
            "docstring note like 'Deprecated since vX.Y.Z and scheduled for "
            "removal in vA.B.C.'"
        )

    deprecated_in = found.group("deprecated").rstrip(".")
    removed_in = found.group("removed").rstrip(".")
    return deprecated_in, removed_in
def _gather_decorators(
    tree: ast.AST, path: Path, *, package: str
) -> Iterator[DeprecationRecord]:
    """Yield a record for every ``deprecated(...)`` decorator call in *tree*.

    Functions, async functions and classes are inspected. A decorator counts
    when it is a call whose callee is a bare name or an attribute access
    ending in ``deprecated``; its ``removed_in`` and ``deprecated_in`` keyword
    arguments are parsed into the record.

    Args:
        tree: Parsed module to scan.
        path: Source file of *tree*, stored on each record.
        package: Package name stored on each record.
    """
    decorated_kinds = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    for definition in ast.walk(tree):
        if not isinstance(definition, decorated_kinds):
            continue

        for decorator in definition.decorator_list:
            # Bare decorators (no call) carry no keyword metadata.
            if not isinstance(decorator, ast.Call):
                continue

            callee = decorator.func
            if isinstance(callee, ast.Name):
                callee_name = callee.id
            elif isinstance(callee, ast.Attribute):
                callee_name = callee.attr
            else:
                continue
            if callee_name != "deprecated":
                continue

            where = definition.lineno
            yield DeprecationRecord(
                identifier=_build_identifier(definition),
                removed_in=_parse_removed_value(
                    _extract_kw(decorator, "removed_in"), path=path, line=where
                ),
                deprecated_in=_parse_deprecated_value(
                    _extract_kw(decorator, "deprecated_in"), path=path, line=where
                ),
                path=path,
                line=where,
                kind="decorator",
                package=package,
            )
def _build_identifier(node: ast.AST) -> str:
    """Return the display name used for a deprecated definition.

    Classes are identified by their own name. Functions are identified by
    their name, prefixed with ``ClassName.`` when the function is decorated
    and its ``parent`` attribute is a class definition. The ``parent``
    attribute is read with ``getattr`` and is optional. Any other node type
    yields an empty string.
    """
    if isinstance(node, ast.ClassDef):
        return node.name
    if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
        return ""

    # The owning class is only looked up for decorated functions.
    owner = getattr(node, "parent", None) if node.decorator_list else None
    if isinstance(owner, ast.ClassDef):
        return f"{owner.name}.{node.name}"
    return node.name
def _should_fail(current_version: str, record: DeprecationRecord) -> bool:
    """Decide whether *record* has reached its removal deadline.

    Args:
        current_version: Version of the package the record belongs to.
        record: Deprecation metadata; ``removed_in`` drives the decision.

    Returns:
        False when no deadline is set. For a ``date`` deadline, whether today
        is on or after it. Otherwise the deadline is converted to ``str`` and
        compared with ``_version_ge(current_version, ...)``.

    Raises:
        SystemExit: Propagated unchanged from the version comparison, or
            raised here when that comparison fails with any other exception.
    """
    deadline = record.removed_in
    if deadline is None:
        return False
    if isinstance(deadline, date):
        return date.today() >= deadline

    # SystemExit does not derive from Exception, so it passes straight
    # through this handler to the caller.
    try:
        return _version_ge(current_version, str(deadline))
    except Exception as exc:  # pragma: no cover - unexpected literal type
        raise SystemExit(
            f"Unsupported removed_in expression in {record.path}:{record.line}:"
            f" {deadline!r}"
        ) from exc
_collect_records(files, package=package.name) if package.name == "openhands-agent-server": records.extend(_collect_rest_route_records(files, package=package.name)) overdue.extend(r for r in records if _should_fail(current_version, r)) total_records += len(records) package_summaries.append((package.name, current_version, len(records))) if overdue: deprecated_items = [r for r in overdue if r.kind != "cleanup_call"] cleanup_items = [r for r in overdue if r.kind == "cleanup_call"] if deprecated_items: print( "The following deprecated features have passed their removal " "deadline:\n" ) for record in deprecated_items: print(_format_record(record)) print() if cleanup_items: print("The following workarounds have passed their cleanup deadline:\n") for record in cleanup_items: print(_format_record(record)) print() if deprecated_items: print( "Update or remove the listed features before publishing a version that " "meets or exceeds their removal deadline." ) if cleanup_items: print( "Remove the listed workarounds before publishing a version that " "meets or exceeds their cleanup deadline." ) return 1 for package_name, version, count in package_summaries: print( f"{package_name}: checked {count} deprecation metadata entries against " f"version {version}." ) print( f"Checked {total_records} deprecation metadata entries across " f"{len(package_summaries)} package(s)." ) return 0 if __name__ == "__main__": # pragma: no cover - manual invocation sys.exit(main(sys.argv[1:])) ================================================ FILE: .github/scripts/check_docstrings.py ================================================ #!/usr/bin/env python3 """Validate docstrings conform to MDX-compatible formatting guidelines. This script checks that docstrings in the SDK use patterns that render correctly in Mintlify MDX documentation. It validates: 1. No REPL-style examples (>>>) - should use fenced code blocks instead 2. 
def check_repl_examples(
    docstring: str, name: str, lineno: int, file: Path
) -> list[Violation]:
    """Flag a docstring that contains a REPL-style (>>>) example.

    Fenced code blocks should be used instead for better MDX rendering.
    At most one violation is returned per docstring, for the first line
    whose stripped text starts with ``>>>``. Its reported line is *lineno*
    plus the zero-based index of that line within the docstring.
    """
    first_prompt = next(
        (
            offset
            for offset, text in enumerate(docstring.split("\n"))
            if text.lstrip().startswith(">>>")
        ),
        None,
    )
    if first_prompt is None:
        return []

    return [
        Violation(
            file=file,
            line=lineno + first_prompt,
            name=name,
            rule="no-repl-examples",
            message=(
                "Use fenced code blocks (```python) instead of >>> REPL style. "
                "REPL examples don't render well in MDX documentation."
            ),
        )
    ]
""" violations = [] lines = docstring.split("\n") in_code_block = False for i, line in enumerate(lines): stripped = line.strip() # Track code block state if stripped.startswith("```"): in_code_block = not in_code_block continue # Skip if inside a code block if in_code_block: continue # Check for shell-style comments that look like config # Pattern: line starts with # and previous line has = (config pattern) if stripped.startswith("#") and not stripped.startswith("# "): # This is likely a shell comment without space (less common in prose) continue # Check for unfenced config: KEY=VALUE followed by # comment if i > 0: prev_line = lines[i - 1].strip() if i > 0 else "" # If previous line looks like config (VAR=value) and this is a # comment if "=" in prev_line and prev_line.split("=")[0].isupper(): if stripped.startswith("# "): violations.append( Violation( file=file, line=lineno + i, name=name, rule="fenced-shell-config", message=( "Shell/config examples with # comments should be " "in ```bash code blocks. Otherwise # becomes a " "markdown header." ), ) ) # Only report once per docstring break return violations def check_docstring( docstring: str, name: str, lineno: int, file: Path ) -> list[Violation]: """Run all checks on a docstring.""" if not docstring: return [] violations = [] violations.extend(check_repl_examples(docstring, name, lineno, file)) violations.extend(check_unfenced_shell_config(docstring, name, lineno, file)) return violations def get_docstrings_from_file(file: Path) -> list[tuple[str, str, int]]: """Extract all docstrings from a Python file. Returns list of (name, docstring, lineno) tuples. 
""" try: source = file.read_text() tree = ast.parse(source) except (SyntaxError, UnicodeDecodeError) as e: print(f"Warning: Could not parse {file}: {e}", file=sys.stderr) return [] docstrings = [] for node in ast.walk(tree): name = None lineno = 0 docstring = None if isinstance(node, ast.Module): docstring = ast.get_docstring(node) name = file.stem lineno = 1 elif isinstance(node, ast.ClassDef): docstring = ast.get_docstring(node) name = node.name lineno = node.lineno elif isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef): docstring = ast.get_docstring(node) name = node.name lineno = node.lineno if docstring and name: docstrings.append((name, docstring, lineno)) return docstrings def is_strict_file(file: Path, repo_root: Path) -> bool: """Check if a file is in the strict check list.""" try: rel_path = file.relative_to(repo_root / "openhands-sdk/openhands/sdk") return any(str(rel_path) == strict for strict in STRICT_CHECK_FILES) except ValueError: return False def check_file(file: Path, repo_root: Path) -> list[Violation]: """Check all docstrings in a file.""" violations = [] is_strict = is_strict_file(file, repo_root) for name, docstring, lineno in get_docstrings_from_file(file): file_violations = check_docstring(docstring, name, lineno, file) for v in file_violations: v.is_strict = is_strict violations.extend(file_violations) return violations def main() -> int: """Run docstring checks on all SDK files.""" repo_root = Path(__file__).parent.parent.parent all_violations: list[Violation] = [] files_checked = 0 for sdk_path in SDK_PATHS: path = repo_root / sdk_path if not path.exists(): print(f"Warning: Path not found: {path}", file=sys.stderr) continue for py_file in path.rglob("*.py"): if should_skip(py_file): continue files_checked += 1 violations = check_file(py_file, repo_root) all_violations.extend(violations) # Separate strict violations (errors) from warnings strict_violations = [v for v in all_violations if v.is_strict] warning_violations = [v for v in 
all_violations if not v.is_strict] # Report warnings (non-strict files) if warning_violations: count = len(warning_violations) print(f"\n⚠️ Found {count} docstring warning(s) in non-core files:\n") by_file: dict[Path, list[Violation]] = {} for v in warning_violations: by_file.setdefault(v.file, []).append(v) for file, violations in sorted(by_file.items()): rel_path = file.relative_to(repo_root) print(f"📄 {rel_path}") for v in violations: print(f" Line {v.line}: {v.name} ({v.rule})") print() # Report errors (strict files) if strict_violations: count = len(strict_violations) print(f"\n❌ Found {count} docstring error(s) in core API files:\n") by_file: dict[Path, list[Violation]] = {} for v in strict_violations: by_file.setdefault(v.file, []).append(v) for file, violations in sorted(by_file.items()): rel_path = file.relative_to(repo_root) print(f"📄 {rel_path}") for v in violations: print(f" Line {v.line}: {v.name}") print(f" Rule: {v.rule}") print(f" {v.message}") print() print("=" * 60) print("To fix these issues:") print(" 1. Replace >>> examples with ```python code blocks") print(" 2. Wrap shell/config examples in ```bash code blocks") print("=" * 60) return 1 if warning_violations: count = len(warning_violations) print(f"✅ Core API files pass. {count} warnings in other files.") else: print(f"✅ All {files_checked} files pass docstring checks") return 0 if __name__ == "__main__": sys.exit(main()) ================================================ FILE: .github/scripts/check_documented_examples.py ================================================ #!/usr/bin/env python3 """ Check if all examples in agent-sdk are documented in the docs repository. This script: 1. Scans the docs repository for references to example files 2. Lists all example Python files in the agent-sdk repository 3. Compares the two sets to find undocumented examples 4. 
def find_documented_examples(docs_path: Path) -> set[str]:
    """
    Find all example file references in the docs repository.

    Searches every ``.md`` and ``.mdx`` file under *docs_path* for patterns
    like:
    - examples/01_standalone_sdk/02_custom_tools.py
    - examples/02_remote_agent_server/06_custom_tool/custom_tools/log_data.py

    A file that cannot be read or decoded as UTF-8 is reported with a warning
    and skipped.

    Returns:
        Set of example file paths as written in the docs (relative to
        agent-sdk root)
    """
    # Example file references with arbitrary nesting depth.
    # Matches: examples/<dir>/.../<file>.py
    example_ref = re.compile(r"examples/(?:[-\w]+/)+[-\w]+\.py")

    documented_examples: set[str] = set()
    for root, _, files in os.walk(docs_path):
        for file in files:
            if not file.endswith((".mdx", ".md")):
                continue

            file_path = Path(root) / file
            try:
                content = file_path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                print(f"Warning: Error reading {file_path}: {e}")
                continue

            documented_examples.update(example_ref.findall(content))

    return documented_examples
def resolve_paths() -> tuple[Path, Path]:
    """
    Determine agent-sdk root and docs path.

    The agent-sdk root is three directory levels above this script. The docs
    path is the first existing candidate, tried in this order:
      1) DOCS_PATH (env override)
      2) $GITHUB_WORKSPACE/docs
      3) agent_sdk_root/'docs'
      4) agent_sdk_root.parent/'docs'

    The candidates and the selected path are printed. If no candidate exists,
    a hint is printed and the process exits with status 1.

    Returns:
        Tuple of (agent_sdk_root, docs_path)
    """
    # agent-sdk repo root (script is at agent-sdk/.github/scripts/...)
    agent_sdk_root = Path(__file__).resolve().parent.parent.parent

    search_order: list[Path] = []
    # 1) Explicit env override
    if docs_override := os.environ.get("DOCS_PATH"):
        search_order.append(Path(docs_override).expanduser().resolve())
    # 2) Standard GitHub workspace sibling
    if workspace := os.environ.get("GITHUB_WORKSPACE"):
        search_order.append(Path(workspace).resolve() / "docs")
    # 3) Inside the agent-sdk repo root, 4) next to the agent-sdk repo root
    search_order += [agent_sdk_root / "docs", agent_sdk_root.parent / "docs"]

    print(f"🔍 Agent SDK root: {agent_sdk_root}")
    print("🔎 Trying docs paths (in order):")
    for candidate in search_order:
        print(f" - {candidate}")

    chosen = next((c for c in search_order if c.exists()), None)
    if chosen is None:
        # If none exist, fail with a helpful message
        print("❌ Docs path not found in any of the expected locations.")
        print(" Set DOCS_PATH, or checkout the repo to one of the tried paths above.")
        sys.exit(1)

    print(f"📁 Using docs path: {chosen}")
    return agent_sdk_root, chosen
def find_duplicate_numbers(examples_dir: Path) -> dict[str, list[str]]:
    """
    Find duplicate example numbers within each subdirectory.

    Entries are grouped by the *integer value* of their leading numeric prefix,
    so ``4_foo.py`` and ``04_bar.py`` count as the same example number.

    Args:
        examples_dir: Directory whose immediate subdirectories are scanned.

    Returns:
        Dictionary mapping subdirectory paths (relative to the parent of
        ``examples_dir``) to lists of duplicate entries. Only includes
        subdirectories that have duplicates. Within a subdirectory, entries are
        ordered by example number and then by name.
    """
    duplicates: dict[str, list[str]] = {}

    # Pattern to extract leading number from filename/dirname
    # e.g., "04" from "04_foo.py"
    number_pattern = re.compile(r"^(\d+)_")

    for subdir in sorted(examples_dir.iterdir()):
        # Only visible directories hold numbered examples.
        if not subdir.is_dir() or subdir.name.startswith("."):
            continue

        # Group entries by their numeric prefix. Keying on int() makes
        # differently padded prefixes ("4" vs "04") collide as they should.
        number_to_entries: dict[int, list[str]] = defaultdict(list)
        for entry in subdir.iterdir():
            # Skip hidden files/directories
            if entry.name.startswith("."):
                continue
            match = number_pattern.match(entry.name)
            if match:
                number_to_entries[int(match.group(1))].append(entry.name)

        # Flatten every group that has more than one entry.
        subdir_duplicates = [
            name
            for _, entries in sorted(number_to_entries.items())
            if len(entries) > 1
            for name in sorted(entries)
        ]

        if subdir_duplicates:
            relative_subdir = str(subdir.relative_to(examples_dir.parent))
            duplicates[relative_subdir] = subdir_duplicates

    return duplicates
.github/scripts/check_sdk_api_breakage.py ================================================ #!/usr/bin/env python3 """API breakage detection for published OpenHands packages using Griffe. This script compares current workspace packages against the most recent PyPI release (or the matching release if the current version is already published) to detect breaking changes in the public API. It focuses on the curated public surface: - symbols exported via ``__all__`` - public members removed from classes exported via ``__all__`` It enforces two policies: 1. **Deprecation runway before removal** – any removed export or removed public class member must have been marked deprecated in the *previous* release using the canonical deprecation helpers (``@deprecated`` decorator or ``warn_deprecated()`` call from ``openhands.sdk.utils.deprecation``), and the baseline deprecation metadata must show that the current version has reached a scheduled removal target at least **5 minor releases** after ``deprecated_in``. For members, the recommended ``warn_deprecated`` feature name is qualified (e.g. ``"LLM.some_method"``). 2. **MINOR version bump** – any breaking change (removal or structural) requires at least a MINOR version bump according to SemVer. Complementary to the deprecation mechanism: - Deprecation (``check_deprecations.py``): enforces cleanup deadlines - This script: prevents unannounced removals and enforces SemVer bumps """ from __future__ import annotations import ast import json import os import subprocess import sys import tomllib import urllib.request from collections.abc import Iterable from dataclasses import dataclass, field from pathlib import Path from packaging import version as pkg_version from packaging.requirements import Requirement @dataclass(frozen=True) class PackageConfig: """Configuration for a single published package.""" package: str # dotted module path, e.g. "openhands.sdk" distribution: str # PyPI distribution name, e.g. 
"openhands-sdk" source_dir: str # repo-relative directory, e.g. "openhands-sdk" @dataclass(frozen=True, slots=True) class DeprecationMetadata: deprecated_in: str | None = None removed_in: str | None = None @dataclass(frozen=True, slots=True) class DeprecatedSymbols: """Deprecated SDK symbols detected in a source tree. ``top_level`` tracks module-level symbols (exports) like ``LLM``. ``qualified`` tracks class members like ``LLM.some_method``. ``metadata`` stores the parsed deprecation schedule for each feature. """ top_level: set[str] = frozenset() # type: ignore[assignment] qualified: set[str] = frozenset() # type: ignore[assignment] metadata: dict[str, DeprecationMetadata] = field(default_factory=dict) DEPRECATION_RUNWAY_MINOR_RELEASES = 5 PACKAGES: tuple[PackageConfig, ...] = ( PackageConfig( package="openhands.sdk", distribution="openhands-sdk", source_dir="openhands-sdk", ), PackageConfig( package="openhands.workspace", distribution="openhands-workspace", source_dir="openhands-workspace", ), PackageConfig( package="openhands.tools", distribution="openhands-tools", source_dir="openhands-tools", ), ) ACP_DEPENDENCY = "agent-client-protocol" ACP_SKIP_ENV = "ACP_VERSION_CHECK_SKIP" ACP_SKIP_TOKEN = "skip-acp-check" ACP_BASE_REF_ENV = "ACP_VERSION_CHECK_BASE_REF" def read_version_from_pyproject(path: str) -> str: """Read the version string from a pyproject.toml file.""" with open(path, "rb") as f: data = tomllib.load(f) proj = data.get("project", {}) v = proj.get("version") if not v: raise SystemExit(f"Could not read version from {path}") return str(v) def _read_pyproject(path: str) -> dict: with open(path, "rb") as f: return tomllib.load(f) def _bool_env(name: str) -> bool: value = os.environ.get(name, "").strip().lower() return value in {"1", "true", "yes", "on"} def _get_dependency_spec(project_data: dict, dependency: str) -> str | None: deps = project_data.get("project", {}).get("dependencies", []) for dep in deps: if dep.startswith(dependency): return dep 
return None def _min_version_from_requirement(req_str: str) -> pkg_version.Version | None: try: req = Requirement(req_str) except Exception as exc: print( f"::warning title=ACP version::Unable to parse requirement " f"'{req_str}': {exc}" ) return None lower_bounds: list[pkg_version.Version] = [] for spec in req.specifier: if spec.operator in {">=", ">", "==", "~="}: try: lower_bounds.append(_parse_version(spec.version)) except Exception as exc: print( f"::warning title=ACP version::Unable to parse version " f"'{spec.version}' from '{req_str}': {exc}" ) if not lower_bounds: return None return max(lower_bounds) def _git_show_file(ref: str, rel_path: str) -> str | None: for candidate in (f"origin/{ref}", ref): result = subprocess.run( ["git", "show", f"{candidate}:{rel_path}"], check=False, capture_output=True, text=True, ) if result.returncode == 0: return result.stdout return None def _load_base_pyproject(base_ref: str) -> dict | None: rel_path = "openhands-sdk/pyproject.toml" content = _git_show_file(base_ref, rel_path) if content is None: print( f"::warning title=ACP version::Unable to read {rel_path} from " f"{base_ref}; skipping ACP version check" ) return None try: return tomllib.loads(content) except tomllib.TOMLDecodeError as exc: print( f"::warning title=ACP version::Failed to parse {rel_path} from " f"{base_ref}: {exc}" ) return None def _check_acp_version_bump(repo_root: str) -> int: if _bool_env(ACP_SKIP_ENV): print( f"::notice title=ACP version::Skipping ACP version check because " f"{ACP_SKIP_ENV} is set (token: [{ACP_SKIP_TOKEN}])." 
) return 0 base_ref = os.environ.get(ACP_BASE_REF_ENV) or os.environ.get("GITHUB_BASE_REF") if not base_ref: print( "::warning title=ACP version::No base ref found; skipping ACP version check" ) return 0 base_data = _load_base_pyproject(base_ref) if base_data is None: return 0 current_data = _read_pyproject( os.path.join(repo_root, "openhands-sdk", "pyproject.toml") ) old_req = _get_dependency_spec(base_data, ACP_DEPENDENCY) new_req = _get_dependency_spec(current_data, ACP_DEPENDENCY) if not old_req or not new_req: print( f"::warning title=ACP version::Unable to locate {ACP_DEPENDENCY} " "dependency in pyproject.toml; skipping ACP version check" ) return 0 old_min = _min_version_from_requirement(old_req) new_min = _min_version_from_requirement(new_req) if old_min is None or new_min is None: print( f"::warning title=ACP version::Unable to parse {ACP_DEPENDENCY} " "minimum version; skipping ACP version check" ) return 0 if new_min <= old_min: return 0 if new_min.major != old_min.major or new_min.minor != old_min.minor: print( "::error title=ACP version::Detected " f"{ACP_DEPENDENCY} minor/major version bump " f"({old_req} -> {new_req}). If intentional, add " f"[{ACP_SKIP_TOKEN}] to the PR description to bypass." ) return 1 return 0 def _parse_version(v: str) -> pkg_version.Version: """Parse a version string using packaging.""" return pkg_version.parse(v) def _parse_string_kwarg(call: ast.Call, name: str) -> str | None: for kw in call.keywords: if kw.arg != name: continue value = kw.value if isinstance(value, ast.Constant) and isinstance(value.value, str): return value.value return None return None def _minimum_removed_in(deprecated_in: str) -> str: parsed = _parse_version(deprecated_in) return f"{parsed.major}.{parsed.minor + DEPRECATION_RUNWAY_MINOR_RELEASES}.0" def _deprecation_schedule_errors( *, feature: str, metadata: DeprecationMetadata | None, current_version: str, ) -> list[str]: if metadata is None: return [ f"Removed '{feature}' without prior deprecation. 
def get_pypi_baseline_version(pkg: str, current: str | None) -> str | None:
    """Fetch the baseline release version from PyPI.

    The baseline is the most recent published release to compare against the
    current workspace. If the current version already exists on PyPI, compare
    against that same release. Otherwise, fall back to the newest release older
    than the current version.

    If ``current`` is None, use the latest release.

    Release keys that are not valid versions are ignored instead of aborting
    the check, and "already published" is decided on parsed versions, so
    ``1.2`` matches a published ``1.2.0``.

    Args:
        pkg: Package name on PyPI (e.g., "openhands-sdk")
        current: Current version from the workspace, or None for latest

    Returns:
        Baseline version string, or None if not found or on network error
    """
    req = urllib.request.Request(
        url=f"https://pypi.org/pypi/{pkg}/json",
        headers={"User-Agent": "openhands-sdk-api-check/1.0"},
        method="GET",
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            meta = json.load(r)
    except Exception as e:
        # Best-effort lookup: any network/JSON failure downgrades to a warning.
        print(f"::warning title={pkg} API::Failed to fetch PyPI metadata: {e}")
        return None

    releases = meta.get("releases", {})

    # Pair every parsable release with its parsed form once; skip legacy keys
    # that packaging cannot interpret.
    parsed: list[tuple[pkg_version.Version, str]] = []
    for release in releases:
        try:
            parsed.append((_parse_version(release), release))
        except pkg_version.InvalidVersion:
            continue
    if not parsed:
        return None

    def _by_version(item: tuple[pkg_version.Version, str]) -> pkg_version.Version:
        return item[0]

    if current is None:
        return max(parsed, key=_by_version)[1]

    # Exact spelling first, so the returned string is unchanged when the
    # workspace version is published verbatim.
    if current in releases:
        return current

    cur_parsed = _parse_version(current)
    for release_version, release in parsed:
        if release_version == cur_parsed:
            return release

    older = [item for item in parsed if item[0] < cur_parsed]
    if not older:
        return None
    return max(older, key=_by_version)[1]
def _escape_newlines_in_string_literals(text: str) -> str:
    """Escape literal newlines that appear inside quoted string literals.

    Scans *text* one character at a time, tracking whether the scan is inside
    a single- or double-quoted literal. A raw newline inside a literal becomes
    the two characters ``\\n``; everything else is copied through unchanged,
    including a newline that directly follows a backslash.
    """
    pieces: list[str] = []
    open_quote: str | None = None  # quote character of the literal being scanned
    after_backslash = False

    for char in text:
        if open_quote is None:
            # Outside any literal: copy, and note when a literal starts.
            if char == "'" or char == '"':
                open_quote = char
            pieces.append(char)
        elif after_backslash:
            # Character following a backslash is copied verbatim.
            after_backslash = False
            pieces.append(char)
        elif char == "\\":
            after_backslash = True
            pieces.append(char)
        elif char == open_quote:
            open_quote = None
            pieces.append(char)
        else:
            pieces.append("\\n" if char == "\n" else char)

    return "".join(pieces)
def _member_deprecation_metadata(
    cls_obj: object,
    member_name: str,
    deprecated: DeprecatedSymbols,
) -> DeprecationMetadata | None:
    """Return deprecation metadata for a class member, including parent classes.

    When a member like ``system_message`` is deprecated on a base class
    (``AgentBase``) but removed from a subclass (``Agent``), griffe reports the
    removal against the subclass name. This helper walks the base classes
    transitively (nearest bases first) so that ``Agent.system_message`` reuses
    the deprecation schedule of whichever ancestor declared it.

    Lookup order:
      1. ``<Class>.<member>`` among the qualified deprecated features.
      2. The class itself among the deprecated top-level symbols.
      3. ``<Base>.<member>`` for every ancestor reachable via ``resolved_bases``.

    Returns:
        The recorded metadata (an empty ``DeprecationMetadata`` when the
        feature is marked deprecated without a stored schedule), or ``None``
        when no deprecation marker applies.
    """
    from collections import deque

    def _schedule_for(key: str) -> DeprecationMetadata:
        # A feature may be flagged deprecated without a parsed schedule; report
        # that as empty metadata rather than as "not deprecated".
        found = deprecated.metadata.get(key)
        return DeprecationMetadata() if found is None else found

    cls_name = getattr(cls_obj, "name", "")
    own_feature = f"{cls_name}.{member_name}"
    if own_feature in deprecated.qualified:
        return _schedule_for(own_feature)

    if cls_name in deprecated.top_level:
        return _schedule_for(cls_name)

    # Breadth-first over the ancestry so closer bases win; ``seen`` guards
    # against diamonds and accidental cycles.
    seen: set[int] = {id(cls_obj)}
    pending = deque(getattr(cls_obj, "resolved_bases", []))
    while pending:
        base = pending.popleft()
        if id(base) in seen:
            continue
        seen.add(id(base))

        base_name = getattr(base, "name", None)
        if base_name is not None:
            inherited_feature = f"{base_name}.{member_name}"
            if inherited_feature in deprecated.qualified:
                return _schedule_for(inherited_feature)

        pending.extend(getattr(base, "resolved_bases", []))

    return None
Returns: (breakages, removal_policy_errors) """ import griffe from griffe import Alias, AliasResolutionError, BreakageKind, ExplanationStyle, Kind breakages: list[object] = [] removal_policy_errors = 0 for old, new in objs: try: for br in griffe.find_breaking_changes(old, new): obj = getattr(br, "obj", None) if not getattr(obj, "is_public", True): continue # Skip ATTRIBUTE_CHANGED_VALUE when it's just Field metadata changes # (description, title, examples, etc.) - these don't affect runtime if br.kind == BreakageKind.ATTRIBUTE_CHANGED_VALUE: old_value = getattr(br, "old_value", None) new_value = getattr(br, "new_value", None) if _is_field_metadata_only_change(old_value, new_value): print( f"::notice title={title}::Ignoring Field metadata-only " f"change (non-breaking): {obj.name if obj else 'unknown'}" ) continue print(br.explain(style=ExplanationStyle.GITHUB)) breakages.append(br) if br.kind != BreakageKind.OBJECT_REMOVED: continue parent = getattr(obj, "parent", None) if getattr(parent, "kind", None) != Kind.CLASS: continue feature = f"{parent.name}.{obj.name}" errors = _deprecation_schedule_errors( feature=feature, metadata=_member_deprecation_metadata(parent, obj.name, deprecated), current_version=current_version, ) if not errors: continue for error in errors: print(f"::error title={title}::{error}") removal_policy_errors += len(errors) except AliasResolutionError as e: if isinstance(old, Alias) or isinstance(new, Alias): old_target = old.target_path if isinstance(old, Alias) else None new_target = new.target_path if isinstance(new, Alias) else None if old_target != new_target: name = getattr(old, "name", None) or getattr( new, "name", "" ) print( f"::warning title={title}::Alias target changed for '{name}': " f"{old_target!r} -> {new_target!r}" ) breakages.append( { "kind": "ALIAS_TARGET_CHANGED", "name": name, "old": old_target, "new": new_target, } ) else: print( f"::notice title={title}::Skipping symbol comparison due to " f"unresolved alias: {e}" ) except 
Exception as e: print(f"::warning title={title}::Failed to compute breakages: {e}") return breakages, removal_policy_errors def _extract_exported_names(module) -> set[str]: """Extract names exported from a module via ``__all__``. This check is explicitly meant to track the curated public surface. The SDK is expected to define ``__all__`` in ``openhands.sdk``; if it's missing or we can't statically interpret it, we fail fast rather than silently widening the surface area (which would make the check noisy and brittle). """ try: all_var = module["__all__"] except Exception as e: raise ValueError("Expected __all__ to be defined on the public module") from e val = getattr(all_var, "value", None) elts = getattr(val, "elements", None) if not elts: raise ValueError("Unable to statically evaluate __all__") names: set[str] = set() for el in elts: # Griffe represents string literals in __all__ in different ways depending # on how the module is loaded / griffe version: # - sometimes as plain Python strings (including quotes, e.g. "'LLM'") # - sometimes as expression nodes with a `.value` attribute # # We intentionally only support the "static __all__ of string literals" # case; we just normalize the representation. if isinstance(el, str): names.add(el.strip("\"'")) continue s = getattr(el, "value", None) if isinstance(s, str): names.add(s) if not names: raise ValueError("__all__ resolved to an empty set") return names def _check_version_bump(prev: str, new_version: str, total_breaks: int) -> int: """Check if version bump policy is satisfied for breaking changes. Policy: Breaking changes require at least a MINOR version bump. 
def _resolve_griffe_object(
    root: object,
    dotted: str,
    root_package: str = "",
) -> object:
    """Resolve a dotted path to a griffe object.

    Resolution is attempted in this order:
      1. ``root`` itself, when its ``path`` equals ``dotted``.
      2. A single subscript lookup with ``root``'s own path prefix removed.
      3. Segment-by-segment traversal, after removing ``root_package`` as a
         prefix (a warning is printed before falling back to this step).

    Raises:
        KeyError: if a segment cannot be resolved during manual traversal.
    """
    own_path = getattr(root, "path", None)
    if own_path == dotted:
        return root

    if isinstance(own_path, str):
        # Lookups on root are relative to root's own path.
        dotted = dotted.removeprefix(f"{own_path}.")

    try:
        return root[dotted]
    except (KeyError, TypeError) as e:
        print(
            f"::warning title=SDK API::Unable to resolve {dotted} via "
            f"direct lookup; falling back to manual traversal: {e}"
        )

    relative = dotted.removeprefix(f"{root_package}.") if root_package else dotted

    current = root
    for segment in relative.split("."):
        try:
            current = current[segment]
        except (KeyError, TypeError) as e:
            raise KeyError(f"Unable to resolve {dotted}: failed at {segment}") from e
    return current
def _find_deprecated_symbols(source_root: Path) -> DeprecatedSymbols:
    """Scan source files for symbols marked with the SDK deprecation helpers.

    Detects two forms:
    - ``@deprecated(...)`` decorator on a class/function/method
    - ``warn_deprecated('SomeFeature', ...)`` call

    Every ``*.py`` file under ``source_root`` is parsed from its raw bytes, so
    the interpreter's own source-encoding rules apply instead of the platform
    locale. Files that cannot be parsed are skipped with a warning.

    Args:
        source_root: Directory tree to scan recursively.

    Returns:
        DeprecatedSymbols(top_level=..., qualified=..., metadata=...)
    """

    def _deprecated_metadata(call: ast.Call) -> DeprecationMetadata:
        # Pull the schedule out of the helper call's string keyword arguments.
        return DeprecationMetadata(
            deprecated_in=_parse_string_kwarg(call, "deprecated_in"),
            removed_in=_parse_string_kwarg(call, "removed_in"),
        )

    def _is_deprecated_decorator(deco: ast.AST) -> ast.Call | None:
        # Accept both ``@deprecated(...)`` and ``@module.deprecated(...)``;
        # a bare ``@deprecated`` without a call carries no metadata.
        if not isinstance(deco, ast.Call):
            return None
        target = deco.func
        if isinstance(target, ast.Name) and target.id == "deprecated":
            return deco
        if isinstance(target, ast.Attribute) and target.attr == "deprecated":
            return deco
        return None

    class _Visitor(ast.NodeVisitor):
        def __init__(self) -> None:
            # Names of the classes enclosing the node currently being visited.
            self.class_stack: list[str] = []
            self.top_level: set[str] = set()
            self.qualified: set[str] = set()
            self.metadata: dict[str, DeprecationMetadata] = {}

        def visit_ClassDef(self, node: ast.ClassDef) -> None:  # noqa: N802
            for deco in node.decorator_list:
                deprecated_call = _is_deprecated_decorator(deco)
                if deprecated_call is None:
                    continue
                metadata = _deprecated_metadata(deprecated_call)
                self.top_level.add(node.name)
                self.qualified.add(node.name)
                self.metadata[node.name] = metadata
                break  # the first deprecation decorator decides

            self.class_stack.append(node.name)
            self.generic_visit(node)
            self.class_stack.pop()

        def _visit_function_like(
            self,
            node: ast.FunctionDef | ast.AsyncFunctionDef,
        ) -> None:
            for deco in node.decorator_list:
                deprecated_call = _is_deprecated_decorator(deco)
                if deprecated_call is None:
                    continue
                metadata = _deprecated_metadata(deprecated_call)
                if self.class_stack:
                    # Method: record under its class-qualified name only.
                    feature = ".".join([*self.class_stack, node.name])
                    self.qualified.add(feature)
                    self.metadata[feature] = metadata
                else:
                    self.top_level.add(node.name)
                    self.qualified.add(node.name)
                    self.metadata[node.name] = metadata
                break  # the first deprecation decorator decides

            self.generic_visit(node)

        def visit_FunctionDef(self, node: ast.FunctionDef) -> None:  # noqa: N802
            self._visit_function_like(node)

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:  # noqa: N802
            self._visit_function_like(node)

        def visit_Call(self, node: ast.Call) -> None:  # noqa: N802
            target = node.func
            func_name = None
            if isinstance(target, ast.Name):
                func_name = target.id
            elif isinstance(target, ast.Attribute):
                func_name = target.attr

            if func_name == "warn_deprecated" and node.args:
                feature = _extract_string_literal(node.args[0])
                if feature is not None:
                    metadata = _deprecated_metadata(node)
                    self.qualified.add(feature)
                    # "LLM.some_method" also marks "LLM" as a top-level name;
                    # setdefault keeps an earlier schedule for that name.
                    top_level_name = feature.split(".")[0]
                    self.top_level.add(top_level_name)
                    self.metadata[feature] = metadata
                    self.metadata.setdefault(top_level_name, metadata)

            self.generic_visit(node)

    top_level: set[str] = set()
    qualified: set[str] = set()
    metadata: dict[str, DeprecationMetadata] = {}

    for pyfile in source_root.rglob("*.py"):
        try:
            # Parsing bytes lets the compiler honour BOMs / coding cookies and
            # default to UTF-8, independent of the locale encoding.
            tree = ast.parse(pyfile.read_bytes())
        except (SyntaxError, ValueError) as e:
            # ValueError covers sources the compiler rejects outright
            # (e.g. embedded null bytes on some Python versions).
            print(
                f"::warning title=SDK API::Skipping {pyfile}: "
                f"failed to parse ({type(e).__name__}: {e})"
            )
            continue

        visitor = _Visitor()
        visitor.visit(tree)
        top_level |= visitor.top_level
        qualified |= visitor.qualified
        metadata.update(visitor.metadata)

    return DeprecatedSymbols(
        top_level=top_level, qualified=qualified, metadata=metadata
    )
def _compute_breakages(
    old_root,
    new_root,
    cfg: PackageConfig,
    *,
    current_version: str = "9999.0.0",
) -> tuple[int, int]:
    """Detect breaking changes between old and new package versions.

    Two things are compared: the sets of names exported through ``__all__``
    (removed exports), and, for exports present on both sides, the objects
    themselves (member-level breakages).

    Returns:
        ``(total_breaks, removal_policy_errors)`` — *total_breaks* counts all
        structural breakages (for the version-bump policy), while
        *removal_policy_errors* counts public API removals that violate the
        required deprecation runway.

    Raises:
        RuntimeError: if the package's root module cannot be resolved.
    """
    pkg = cfg.package
    title = f"{cfg.distribution} API"

    # Deprecation markers are read from the baseline's source tree.
    baseline_src = _get_source_root(old_root)
    if baseline_src:
        deprecated = _find_deprecated_symbols(baseline_src)
    else:
        deprecated = DeprecatedSymbols()

    try:
        old_mod = _resolve_griffe_object(old_root, pkg, root_package=pkg)
        new_mod = _resolve_griffe_object(new_root, pkg, root_package=pkg)
    except Exception as e:
        raise RuntimeError(f"Failed to resolve root module '{pkg}'") from e

    new_exports = _extract_exported_names(new_mod)
    try:
        old_exports = _extract_exported_names(old_mod)
    except ValueError as e:
        # Without a statically evaluable __all__ on the baseline there is no
        # curated surface to diff against, so skip instead of failing the
        # whole workflow.
        print(
            f"::notice title={title}::Skipping breakage check; baseline release "
            f"has no statically-evaluable {pkg}.__all__: {e}"
        )
        return 0, 0

    # Removed exports: each one is a structural break, and each must satisfy
    # the deprecation runway policy.
    dropped_exports = sorted(old_exports - new_exports)
    export_policy_errors = 0
    for name in dropped_exports:
        schedule = None
        if name in deprecated.top_level:
            schedule = deprecated.metadata.get(name, DeprecationMetadata())
        problems = _deprecation_schedule_errors(
            feature=name,
            metadata=schedule,
            current_version=current_version,
        )
        if problems:
            for problem in problems:
                print(f"::error title={title}::{problem}")
            export_policy_errors += len(problems)
        else:
            print(
                f"::notice title={title}::Removed previously-deprecated symbol "
                f"'{name}' from {pkg}.__all__ after its scheduled removal version"
            )

    # Exports present on both sides are compared object by object.
    shared_pairs: list[tuple[object, object]] = []
    for name in sorted(old_exports & new_exports):
        try:
            shared_pairs.append((old_mod[name], new_mod[name]))
        except Exception as e:
            print(f"::warning title={title}::Unable to resolve symbol {name}: {e}")

    breakages, member_policy_errors = _collect_breakages_pairs(
        shared_pairs,
        deprecated=deprecated,
        current_version=current_version,
        title=title,
    )

    total_breaks = len(dropped_exports) + len(breakages)
    return total_breaks, export_policy_errors + member_policy_errors
def main() -> int:
    """Main entry point for API breakage detection.

    Runs the ACP dependency version check first, then the breakage check for
    every configured package, using the current working directory as the
    repository root.

    Returns:
        0 when every check passed, non-zero otherwise (individual return codes
        are OR-ed together).
    """
    repo_root = os.getcwd()
    exit_code = _check_acp_version_bump(repo_root)

    ensure_griffe()
    import griffe  # imported only after ensure_griffe() confirmed it is installed

    banner = "=" * 60
    for package_cfg in PACKAGES:
        print(f"\n{banner}")
        print(f"Checking {package_cfg.distribution} ({package_cfg.package})")
        print(f"{banner}")
        exit_code |= _check_package(griffe, repo_root, package_cfg)

    return exit_code
"openhands-tools": Path("openhands-tools/pyproject.toml"), "openhands-workspace": Path("openhands-workspace/pyproject.toml"), "openhands-agent-server": Path("openhands-agent-server/pyproject.toml"), } _VERSION_PATTERN = r"\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.]+)?" _RELEASE_TITLE_RE = re.compile(rf"^Release v(?P{_VERSION_PATTERN})$") _RELEASE_BRANCH_RE = re.compile(rf"^rel-(?P{_VERSION_PATTERN})$") @dataclass(frozen=True) class VersionChange: package: str path: Path previous_version: str current_version: str def _read_version_from_pyproject_text(text: str, source: str) -> str: data = tomllib.loads(text) version = data.get("project", {}).get("version") if not isinstance(version, str): raise SystemExit(f"Unable to determine project.version from {source}") return version def _read_current_version(repo_root: Path, pyproject: Path) -> str: return _read_version_from_pyproject_text( (repo_root / pyproject).read_text(), str(pyproject), ) def _read_version_from_git_ref(repo_root: Path, git_ref: str, pyproject: Path) -> str: result = subprocess.run( ["git", "show", f"{git_ref}:{pyproject.as_posix()}"], cwd=repo_root, check=False, capture_output=True, text=True, ) if result.returncode != 0: message = result.stderr.strip() or result.stdout.strip() or "unknown git error" raise SystemExit( f"Unable to read {pyproject} from git ref {git_ref}: {message}" ) return _read_version_from_pyproject_text(result.stdout, f"{git_ref}:{pyproject}") def _base_ref_candidates(base_ref: str) -> list[str]: if base_ref.startswith("origin/"): return [base_ref, base_ref.removeprefix("origin/")] return [f"origin/{base_ref}", base_ref] def find_version_changes(repo_root: Path, base_ref: str) -> list[VersionChange]: changes: list[VersionChange] = [] candidates = _base_ref_candidates(base_ref) for package, pyproject in PACKAGE_PYPROJECTS.items(): current_version = _read_current_version(repo_root, pyproject) previous_error: SystemExit | None = None previous_version: str | None = None for candidate in candidates: 
def get_release_pr_version(
    pr_title: str, pr_head_ref: str
) -> tuple[str | None, list[str]]:
    """Work out which release version a PR announces, if any.

    The title is matched against ``_RELEASE_TITLE_RE`` and the head branch
    against ``_RELEASE_BRANCH_RE``; surrounding whitespace is ignored for both.

    Args:
        pr_title: Pull request title.
        pr_head_ref: Pull request head branch name.

    Returns:
        A ``(version, errors)`` pair. ``version`` comes from the title when
        the title matches, otherwise from the branch, and is ``None`` when
        neither matches. If both match but name different versions,
        ``version`` is ``None`` and ``errors`` contains one message
        describing the disagreement.
    """
    from_title = _RELEASE_TITLE_RE.fullmatch(pr_title.strip())
    from_branch = _RELEASE_BRANCH_RE.fullmatch(pr_head_ref.strip())

    titled = None if from_title is None else from_title.group("version")
    branched = None if from_branch is None else from_branch.group("version")

    # Without a title marker the branch marker (possibly None) is the answer.
    if not titled:
        return branched, []

    # Both markers are present: they have to agree.
    if branched and branched != titled:
        message = (
            "Release PR markers disagree: title requests "
            f"v{titled} but branch is rel-{branched}."
        )
        return None, [message]

    return titled, []
def update_sdk_ref_default(new_version: str, dry_run: bool = False) -> bool:
    """Update the sdk_ref default in run-eval.yml.

    Scans the workflow for the ``sdk_ref:`` input and rewrites the first
    ``default: vX.Y.Z`` line found inside that section so that it points at
    ``new_version``. The indentation, trailing whitespace and line ending of
    the rewritten line are kept exactly as they were.

    Args:
        new_version: The new version (without 'v' prefix, e.g., "1.12.0")
        dry_run: If True, print what would change without modifying the file

    Returns:
        True if successful, False otherwise (the workflow file is missing, or
        no matching default line exists inside the sdk_ref section)
    """
    if not RUN_EVAL_WORKFLOW.exists():
        print(f"❌ File not found: {RUN_EVAL_WORKFLOW}", file=sys.stderr)
        return False

    content = RUN_EVAL_WORKFLOW.read_text(encoding="utf-8")
    lines = content.splitlines(keepends=True)

    in_sdk_ref_section = False
    old_version = None

    for i, line in enumerate(lines):
        stripped = line.strip()

        # Track when we enter the sdk_ref input section
        if stripped == "sdk_ref:":
            in_sdk_ref_section = True
            continue

        if not in_sdk_ref_section:
            continue

        # Any other "key:" header means another input started, so the
        # sdk_ref section is over.
        if stripped.endswith(":") and not stripped.startswith("default"):
            in_sdk_ref_section = False
            continue

        # Match against the line without its terminator. The pattern's
        # trailing ``(\s*)`` group would otherwise swallow the newline, and
        # re-appending it afterwards would write the line ending twice.
        text = line.rstrip("\r\n")
        line_ending = line[len(text):]
        match = SDK_REF_PATTERN.match(text)
        if match is None:
            continue

        old_version = stripped.replace("default: ", "")
        trailing = match.group(3) or ""
        lines[i] = f"{match.group(1)}{new_version}{trailing}{line_ending}"
        break

    if old_version is None:
        print("❌ Could not find sdk_ref default line to update", file=sys.stderr)
        return False

    if dry_run:
        print(f"Would update sdk_ref default: {old_version} → v{new_version}")
        return True

    # Write the updated content
    RUN_EVAL_WORKFLOW.write_text("".join(lines), encoding="utf-8")
    print(f"✅ Updated sdk_ref default: {old_version} → v{new_version}")
    return True
re.compile(r"^\d+\.\d+\.\d+(-[a-zA-Z0-9.]+)?$") if not version_pattern.match(args.version): print( f"❌ Invalid version format: {args.version}. " "Expected: X.Y.Z or X.Y.Z-suffix", file=sys.stderr, ) return 1 success = update_sdk_ref_default(args.version, dry_run=args.dry_run) return 0 if success else 1 if __name__ == "__main__": sys.exit(main()) ================================================ FILE: .github/workflows/README-RELEASE.md ================================================ # Release Automation Workflows This document describes the automated release workflows for the OpenHands Software Agent SDK. ## Overview The release process has been automated with three GitHub Actions workflows: 1. **prepare-release.yml** - Prepares a release PR with version updates 2. **pypi-release.yml** - Automatically publishes packages to PyPI when a release is created 3. **release-binaries.yml** - Builds and smoke-tests multi-arch agent-server binaries on releases and main pushes; release runs also attach binaries to the release ## How to Create a New Release ### Step 1: Trigger the Prepare Release Workflow 1. Go to the [Actions tab](https://github.com/OpenHands/software-agent-sdk/actions) 2. Select **"Prepare Release"** workflow from the left sidebar 3. Click **"Run workflow"** button 4. Enter the version number (e.g., `1.2.3`) - must be in format `X.Y.Z` 5. Click **"Run workflow"** The workflow will automatically: - ✅ Create a new branch named `rel-X.Y.Z` - ✅ Update all package versions using `make set-package-version` - ✅ Commit the changes - ✅ Push the branch - ✅ Create a PR with labels `integration-tests` and `test-examples` ### Step 2: Review the PR The created PR will include a checklist. 
Complete the following: - [ ] Fix any deprecation deadlines if they exist - [ ] Verify integration tests pass (triggered by `integration-tests` label) - [ ] Verify example checks pass (triggered by `test-examples` label) - [ ] Review and approve the PR ### Step 3: Create the GitHub Release 1. Go to [Releases](https://github.com/OpenHands/software-agent-sdk/releases/new) 2. Click **"Draft a new release"** 3. Configure the release: - **Tag**: `vX.Y.Z` (must match the version) - **Branch**: `rel-X.Y.Z` (the branch created by the workflow) - **Previous tag**: Select the previous release version 4. Click **"Generate release notes"** to auto-generate the changelog 5. Review and edit the release notes as needed 6. Click **"Publish release"** ### Step 4: PyPI Publication (Automated) Once the release is published, the **pypi-release.yml** workflow will automatically: - ✅ Build all packages (openhands-sdk, openhands-tools, openhands-workspace, openhands-agent-server) - ✅ Publish them to PyPI You can monitor the progress in the [Actions tab](https://github.com/OpenHands/software-agent-sdk/actions/workflows/pypi-release.yml). ### Step 4b: Release Binaries + Docker Smoke Test (Automated) In parallel with the PyPI workflow, **release-binaries.yml** also fires on `release: published`. It also runs on every push to `main` as ongoing smoke coverage. 
It: - ✅ Builds the agent-server PyInstaller binary on a 4-runner matrix (linux x86_64/arm64, macOS x86_64/arm64) and smoke-tests each - ✅ Generates a combined `SHA256SUMS` and attaches all artifacts to the GitHub release as `agent-server-<version>-<os>-<arch>` on release/manual runs - ✅ Verifies that the multi-arch Docker manifest `ghcr.io/openhands/agent-server:<version>-<variant>` published by `server.yml` covers both `linux/amd64` and `linux/arm64` for every variant (`python`, `java`, `golang`) - ✅ Pulls each variant on each architecture with `--platform=linux/<arch>`, boots the container, and asserts `/health` responds On `push` events, `<version>` is the 7-character commit SHA and binaries remain as workflow artifacts only. On release/manual runs, `<version>` is the release version and the binaries are uploaded to the GitHub release. #### Build time / runner expectations | Stage | Runtime (typical) | Runners | |---|---|---| | Binary builds (4-way matrix, parallel) | ~10–15 min on Linux, ~12–18 min on macOS | `ubuntu-24.04`, `ubuntu-24.04-arm`, `macos-13`, `macos-14` | | `publish-binaries` (download + checksum + upload) | ~1–2 min | `ubuntu-24.04` | | `docker-smoke-test` (6-way matrix, parallel) | Up to 45 min (mostly polling for the docker images) | `ubuntu-24.04` for amd64, `ubuntu-24.04-arm` for arm64 | #### QEMU / buildx requirements The smoke test does **not** require QEMU: each (variant, arch) job runs on a runner whose architecture matches `--platform=linux/<arch>`, so containers run natively. We do still set up Docker Buildx so we can call `docker buildx imagetools inspect` on the multi-arch manifest list. The wait window for the multi-arch manifest is 45 min — long enough to absorb the full `server.yml` matrix runtime (~25–30 min for `build-and-push-image` + `merge-manifests`) when this workflow races the corresponding `server.yml` run for a release tag or main-branch push. If the matching manifest is already in GHCR, the wait step exits immediately.
### Step 5: Version Bump PRs (Automated) After successful PyPI publication, the workflow will automatically create PRs to update SDK versions in downstream repositories: - **[OpenHands](https://github.com/All-Hands-AI/OpenHands)** - Updates `openhands-sdk`, `openhands-tools`, and `openhands-agent-server` versions - **[OpenHands-CLI](https://github.com/All-Hands-AI/openhands-cli)** - Updates `openhands-sdk` and `openhands-tools` versions These PRs will: - Be created automatically with branch name `bump-sdk-X.Y.Z` - Include links back to the SDK release - Need to be reviewed and merged by the respective repository maintainers ### Step 6: Post-Release Tasks - [ ] Merge the release PR to main - [ ] Review and merge the auto-created version bump PRs in OpenHands and OpenHands-CLI - [ ] Run evaluation on OpenHands Index (manual step) - [ ] Announce the release ## Manual PyPI Release (If Needed) If you need to manually trigger the PyPI release workflow: 1. Go to the [Actions tab](https://github.com/OpenHands/software-agent-sdk/actions) 2. Select **"Publish all OpenHands packages (uv)"** workflow 3. Click **"Run workflow"** 4. Select the branch/tag you want to publish from 5. Click **"Run workflow"** ## Workflow Files - `.github/workflows/prepare-release.yml` - Automated release preparation - `.github/workflows/pypi-release.yml` - PyPI package publication - `.github/workflows/release-binaries.yml` - Multi-arch binary publishing and docker manifest smoke test on releases and main pushes ## Troubleshooting ### Version Format Error If you get a version format error, ensure you're using the format `X.Y.Z` (e.g., `1.2.3`), not `vX.Y.Z`. 
### PR Creation Failed If the PR creation fails, check: - The branch doesn't already exist - You have proper permissions - The `GITHUB_TOKEN` has sufficient permissions ### PyPI Publication Failed If PyPI publication fails: - Check that the `PYPI_TOKEN_OPENHANDS` secret is properly configured - Verify the version doesn't already exist on PyPI - Check the workflow logs for specific error messages ### Release Binaries Failed If `release-binaries.yml` fails: - **Binary build failure**: re-run the failed matrix job; PyInstaller flakes are rare but possible. If it persists, the issue is likely in `agent-server.spec`. - **`docker-smoke-test` timed out waiting for the manifest**: `server.yml` did not publish multi-arch images for the matching release tag or commit SHA. Check that workflow's corresponding run and re-trigger if needed. - **`/health` never responded**: open the failing job; the cleanup trap dumps the last 100 lines of `docker logs` for the container. - Release/manual runs can be re-run against an existing tag via `workflow_dispatch` with the `release_tag` input (e.g. `v1.20.1`); `gh release upload --clobber` makes this safe. ## Previous Manual Process For reference, the previous manual release checklist was: - [ ] Checkout SDK repo, use `make set-package-version version=x.x.x` to set the version - [ ] Push to a branch like `rel-x.x.x` and start a PR - [ ] Fix any "deprecation deadlines" if they exist - [ ] Tag "integration-tests" and make sure integration test all pass - [ ] Tag "test-examples" and make sure example checks all pass - [ ] Draft a new release - [ ] Use workflow to publish to PyPI on tag `v1.X.X` - [ ] Evaluation on OpenHands Index Most of these steps are now automated! 
================================================ FILE: .github/workflows/agent-server-rest-api-breakage.yml ================================================ --- name: REST API breakage checks on: push: branches: [main] pull_request: branches: [main] jobs: agent-server-rest-api: name: REST API (OpenAPI) runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - name: Checkout uses: actions/checkout@v6 with: fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true - name: Install workspace deps (dev) run: uv sync --frozen --group dev - name: Install oasdiff run: | curl -L https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh -s -- -b /usr/local/bin oasdiff --version - name: Run agent server REST API breakage check id: api_breakage # Let this step fail so CI is visibly red on breakage. # Later reporting steps still run because they use if: always(). run: | uv run --with packaging python .github/scripts/check_agent_server_rest_api_breakage.py 2>&1 | tee api-breakage.log exit_code=${PIPESTATUS[0]} echo "exit_code=${exit_code}" >> "$GITHUB_OUTPUT" exit "${exit_code}" - name: Write REST API breakage summary if: ${{ always() }} env: EXIT_CODE: ${{ steps.api_breakage.outputs.exit_code }} IS_FORK: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository }} LOG_PATH: api-breakage.log RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | python3 <<'PY' >> "$GITHUB_STEP_SUMMARY" import os from pathlib import Path exit_code = int(os.environ.get('EXIT_CODE', '0') or '0') is_fork = os.environ.get('IS_FORK', 'false') == 'true' run_url = os.environ['RUN_URL'] status = '✅ **PASSED**' if exit_code == 0 else '❌ **FAILED**' print(f'## REST API breakage checks (OpenAPI) — {status}') print() print(f"**Result:** {status}") if exit_code != 0: print() print('> ⚠️ Breaking REST API changes or policy violations detected.') 
print() if is_fork: print( '_Fork PR detected: sticky PR comment was skipped because ' 'the GitHub token is read-only for `pull_request` workflows ' 'from forks._' ) print() if exit_code != 0: try: log = Path(os.environ['LOG_PATH']).read_text() except Exception as exc: log = f'Unable to read log file: {exc}' excerpt = log[:1000].replace('```', '``\\`') print('
Log excerpt (first 1000 characters)') print() print('```text') print(excerpt) print('```') print() print('
') print() print(f'[Action log]({run_url})') PY - name: Post REST API breakage report to PR if: ${{ always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository }} uses: actions/github-script@v9 env: EXIT_CODE: ${{ steps.api_breakage.outputs.exit_code }} LOG_PATH: api-breakage.log with: script: | const fs = require('fs'); const marker = ''; const exitCode = Number(process.env.EXIT_CODE || '0'); const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; const status = exitCode === 0 ? '✅ **PASSED**' : '❌ **FAILED**'; let body = `${marker}\n## REST API breakage checks (OpenAPI) — ${status}\n\n**Result:** ${status}\n`; if (exitCode !== 0) { body += `\n> ⚠️ Breaking REST API changes or policy violations detected.\n`; let log = ''; try { log = fs.readFileSync(process.env.LOG_PATH, 'utf8'); } catch (e) { log = `Unable to read log file: ${e}`; } const excerpt = log.slice(0, 1000).replace(/```/g, '``\\`'); body += `\n
Log excerpt (first 1000 characters)\n\n\`\`\`text\n${excerpt}\n\`\`\`\n\n
\n`; } body += `\n[Action log](${runUrl})\n`; const { owner, repo } = context.repo; const issue_number = context.issue.number; const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number, per_page: 100, }); const existing = comments.find((c) => c.body && c.body.includes(marker)); if (existing) { await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body, }); } else { await github.rest.issues.createComment({ owner, repo, issue_number, body, }); } ================================================ FILE: .github/workflows/api-breakage.yml ================================================ --- name: Python API breakage checks on: push: branches: [main] pull_request: branches: [main] jobs: sdk-api: name: Python API runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - name: Checkout uses: actions/checkout@v6 with: fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true - name: Install workspace deps (dev) run: uv sync --frozen --group dev - name: Run Python API breakage check id: api_breakage # Let this step fail so CI is visibly red on breakage. # Later reporting steps still run because they use if: always(). 
env: ACP_VERSION_CHECK_BASE_REF: ${{ github.event_name == 'pull_request' && github.base_ref || github.event.before }} ACP_VERSION_CHECK_SKIP: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.body || '', 'skip-acp-check') }} run: | uv run python .github/scripts/check_sdk_api_breakage.py 2>&1 | tee api-breakage.log exit_code=${PIPESTATUS[0]} echo "exit_code=${exit_code}" >> "$GITHUB_OUTPUT" exit "${exit_code}" - name: Write API breakage summary if: ${{ always() }} env: EXIT_CODE: ${{ steps.api_breakage.outputs.exit_code }} IS_FORK: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository }} LOG_PATH: api-breakage.log RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | python3 <<'PY' >> "$GITHUB_STEP_SUMMARY" import os from pathlib import Path exit_code = int(os.environ.get('EXIT_CODE', '0') or '0') is_fork = os.environ.get('IS_FORK', 'false') == 'true' run_url = os.environ['RUN_URL'] status = '✅ **PASSED**' if exit_code == 0 else '❌ **FAILED**' print(f'## Python API breakage checks — {status}') print() print(f"**Result:** {status}") if exit_code != 0: print() print('> ⚠️ Breaking API changes or policy violations detected.') print() if is_fork: print( '_Fork PR detected: sticky PR comment was skipped because ' 'the GitHub token is read-only for `pull_request` workflows ' 'from forks._' ) print() if exit_code != 0: try: log = Path(os.environ['LOG_PATH']).read_text() except Exception as exc: log = f'Unable to read log file: {exc}' excerpt = log[:1000].replace('```', '``\\`') print('
Log excerpt (first 1000 characters)') print() print('```text') print(excerpt) print('```') print() print('
') print() print(f'[Action log]({run_url})') PY - name: Post API breakage report to PR if: ${{ always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository }} uses: actions/github-script@v9 env: EXIT_CODE: ${{ steps.api_breakage.outputs.exit_code }} LOG_PATH: api-breakage.log with: script: | const fs = require('fs'); const marker = ''; const exitCode = Number(process.env.EXIT_CODE || '0'); const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; const status = exitCode === 0 ? '✅ **PASSED**' : '❌ **FAILED**'; let body = `${marker}\n## Python API breakage checks — ${status}\n\n**Result:** ${status}\n`; if (exitCode !== 0) { body += `\n> ⚠️ Breaking API changes or policy violations detected.\n`; let log = ''; try { log = fs.readFileSync(process.env.LOG_PATH, 'utf8'); } catch (e) { log = `Unable to read log file: ${e}`; } const excerpt = log.slice(0, 1000).replace(/```/g, '``\\`'); body += `\n
Log excerpt (first 1000 characters)\n\n\`\`\`text\n${excerpt}\n\`\`\`\n\n
\n`; } body += `\n[Action log](${runUrl})\n`; const { owner, repo } = context.repo; const issue_number = context.issue.number; const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number, per_page: 100, }); const existing = comments.find((c) => c.body && c.body.includes(marker)); if (existing) { await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body, }); } else { await github.rest.issues.createComment({ owner, repo, issue_number, body, }); } ================================================ FILE: .github/workflows/api-compliance-runner.yml ================================================ --- name: API Compliance Tests on: pull_request: types: [labeled] workflow_dispatch: inputs: reason: description: Reason for running compliance tests required: true patterns: description: Comma-separated patterns to test (empty = all) required: false models: description: Comma-separated model IDs (empty = all defaults) required: false env: # Default models to test (matches DEFAULT_MODELS in run_compliance.py) DEFAULT_MODELS: claude-sonnet-4-5,gpt-5.2,gemini-3.1-pro jobs: run-compliance-tests: # Only run on api-compliance-test label or workflow_dispatch if: | github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'api-compliance-test') runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - name: Checkout repository uses: actions/checkout@v6 with: repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} ref: ${{ github.event.pull_request.head.sha || github.ref }} persist-credentials: false - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Install dependencies run: uv sync --dev - name: Determine test parameters id: params run: | # Use input values or defaults if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then PATTERNS="${{ github.event.inputs.patterns }}" 
MODELS="${{ github.event.inputs.models }}" else PATTERNS="" MODELS="" fi # Build command args ARGS="" if [ -n "$PATTERNS" ]; then ARGS="$ARGS --patterns $PATTERNS" fi if [ -n "$MODELS" ]; then ARGS="$ARGS --models $MODELS" else ARGS="$ARGS --models $DEFAULT_MODELS" fi echo "args=$ARGS" >> $GITHUB_OUTPUT - name: Run API compliance tests id: compliance env: LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }} LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev GITHUB_RUN_ID: ${{ github.run_id }} run: | uv run python tests/integration/api_compliance/run_compliance.py \ ${{ steps.params.outputs.args }} \ --output-dir compliance-results/ continue-on-error: true # Tests may "fail" but that's expected - name: Upload results uses: actions/upload-artifact@v7 with: name: compliance-results path: compliance-results/ retention-days: 30 - name: Post results to PR if: github.event_name == 'pull_request' uses: actions/github-script@v9 with: script: | const fs = require('fs'); const path = require('path'); // Find the report directory const resultsDir = 'compliance-results'; const dirs = fs.readdirSync(resultsDir); if (dirs.length === 0) { console.log('No results found'); return; } const latestDir = path.join(resultsDir, dirs[0]); const reportPath = path.join(latestDir, 'compliance_report.md'); if (!fs.existsSync(reportPath)) { console.log('Report not found at', reportPath); return; } let report = fs.readFileSync(reportPath, 'utf8'); // Truncate if too long if (report.length > 60000) { report = report.substring(0, 60000) + '\n\n... (truncated)'; } await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.payload.pull_request.number, body: report }); ================================================ FILE: .github/workflows/assign-reviews.yml ================================================ --- # To set this up: # 1. Change the name below to something relevant to your task # 2. Modify the "env" section below with your prompt # 3. 
Add your LLM_API_KEY to the repository secrets # 4. Commit this file to your repository # 5. Trigger the workflow manually or set up a schedule name: Assign Reviews on: # Manual trigger workflow_dispatch: # Scheduled trigger (disabled by default, uncomment and customize as needed) schedule: # Run at 12 PM UTC every day - cron: 0 12 * * * permissions: contents: write pull-requests: write issues: write jobs: run-task: # Only run scheduled jobs in the main repository, not in forks if: github.repository == 'OpenHands/software-agent-sdk' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-24.04 env: # Configuration (modify these values as needed) AGENT_SCRIPT_URL: https://raw.githubusercontent.com/OpenHands/agent-sdk/main/examples/03_github_workflows/01_basic_action/agent_script.py # Provide either PROMPT_LOCATION (URL/file) OR PROMPT_STRING (direct text), not both # Option 1: Use a URL or file path for the prompt PROMPT_LOCATION: '' # PROMPT_LOCATION: 'https://example.com/prompts/maintenance.txt' # Option 2: Use direct text for the prompt PROMPT_STRING: > Use GITHUB_TOKEN and the github API to organize open pull requests and issues in the repo. Read the sections below in order, and perform each in order. Do NOT take action on the same issue or PR twice. # Issues with needs-info - Check for OP Response Find all open issues that have the "needs-info" label. For each issue: 1. Identify the original poster (issue author) 2. Check if there are any comments from the original poster AFTER the "needs-info" label was added 3. To determine when the label was added, use: GET /repos/{owner}/{repo}/issues/{issue_number}/timeline and look for "labeled" events with the label "needs-info" 4. If the original poster has commented after the label was added: - Remove the "needs-info" label - Add the "needs-triage" label # Issues with needs-triage Find all open issues that have the "needs-triage" label. For each issue that has been in this state for more than 2 days: 1. 
First, check if the issue has already been triaged by verifying it does NOT have: - The "enhancement" label - Any "priority" label (priority:low, priority:medium, priority:high, etc.) 2. If the issue has already been triaged (has enhancement or priority label), remove the "needs-triage" label 3. For issues that have NOT been triaged yet: - Read the issue description and comments - Check if it is a bug report, feature request, or question and add the appropriate label - If it is a bug report and it does not have a priority label * Read the MAINTAINERS file in the repository root to get the list of maintainers * Extract all usernames from lines starting with "- @" and join them with spaces, each prefixed with @ (e.g., if the file contains "- @user1" and "- @user2", format as "@user1 @user2") * Tag ALL maintainers with: "[Automatic Post]: This issue has been waiting for triage. , could you please take a look and add the appropriate priority label when you have a chance?" (Replace with the formatted list from the previous step) # Need Reviewer Action Find all open PRs where: 1. The PR is waiting for review (there are no open review comments or change requests) 2. The PR is in a "clean" state (CI passing, no merge conflicts) 3. The PR is not marked as draft (draft: false) 4. The PR has had no activity (comments, commits, reviews) for more than 3 days. In this case, send a message to the reviewers: [Automatic Post]: This PR seems to be currently waiting for review. {reviewer_names}, could you please take a look when you have a chance? # Need Author Action Find all open PRs where the most recent change or comment was made on the pull request more than 5 days ago (use 14 days if the PR is marked as draft). And send a message to the author: [Automatic Post]: It has been a while since there was any activity on this PR. {author}, are you still working on it? If so, please go ahead, if not then please request review, close it, or request that someone else follow up. 
# Need Reviewers Find all open pull requests that TRULY have NO reviewers assigned. To do this correctly: 1. Use the GitHub API to fetch PR details: GET /repos/{owner}/{repo}/pulls/{pull_number} 2. Check the "requested_reviewers" and "requested_teams" arrays 3. ALSO check for submitted reviews: GET /repos/{owner}/{repo}/pulls/{pull_number}/reviews 4. A PR needs reviewers ONLY if ALL of these are true: - The "requested_reviewers" array is empty (no pending review requests) - The "requested_teams" array is empty (no pending team review requests) - The reviews array is empty (no reviews have been submitted yet) 5. IMPORTANT: If ANY of these has entries, SKIP this PR - it already has or had reviewers! Example API responses showing a PR that DOES NOT need reviewers (skip this): Case 1 - Has requested reviewers: GET /pulls/{number}: {"requested_reviewers": [{"login": "someuser"}], "requested_teams": []} Case 2 - Has submitted reviews (even if requested_reviewers is empty): GET /pulls/{number}: {"requested_reviewers": [], "requested_teams": []} GET /pulls/{number}/reviews: [{"user": {"login": "someuser"}, "state": "COMMENTED"}] Example API response showing a PR that DOES need reviewers (process this): GET /pulls/{number}: {"requested_reviewers": [], "requested_teams": []} GET /pulls/{number}/reviews: [] Additional criteria for PRs that need reviewers: 1. Are not marked as draft (draft: false) 2. Were created more than 1 day ago 3. CI is passing and there are no merge conflicts For each PR that truly has NO reviewers: 1) Read git blame for changed files to identify recent, active contributors. 2) From those blame-derived candidates, ONLY consider maintainers who are repository collaborators with write access or higher. Verify that with the GitHub API before requesting review: - Preferred: GET /repos/{owner}/{repo}/collaborators (no permission filter). Filter client-side using either: role_name in ["write", "maintain", "admin"] OR permissions.push || permissions.admin. 
Note: paginate if > 30 collaborators. - Alternative: GET /repos/{owner}/{repo}/collaborators/{username}/permission and accept if permission in {push, maintain, admin}. 3) If one or more blame-derived maintainers qualify, request review from exactly one of them. Prefer the maintainer with the lowest current review load. Add this message: [Automatic Post]: I have assigned {reviewer} as a reviewer based on git blame information. Thanks in advance for the help! 4) If no blame-derived maintainer qualifies, read the MAINTAINERS file in the repository root. Parse usernames from lines starting with "- @username" and treat that file as the canonical list of active maintainers. 5) From that MAINTAINERS list, keep only users who still have write access or higher via the GitHub API, exclude the PR author, and request review from exactly one of them, again preferring the maintainer with the lowest current review load. Add this message: [Automatic Post]: I have assigned {reviewer} as a reviewer based on the repository MAINTAINERS file. Thanks in advance for the help! 6) If neither path yields a qualified maintainer, do not request review from anyone and do not fall back to a broader collaborator pool. 
LLM_MODEL: litellm_proxy/claude-sonnet-4-5-20250929 LLM_BASE_URL: https://llm-proxy.app.all-hands.dev steps: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true - name: Install OpenHands dependencies run: | # Install OpenHands SDK and tools from git repository uv pip install --system "openhands-sdk @ git+https://github.com/OpenHands/agent-sdk.git@main#subdirectory=openhands-sdk" uv pip install --system "openhands-tools @ git+https://github.com/OpenHands/agent-sdk.git@main#subdirectory=openhands-tools" - name: Check required configuration env: LLM_API_KEY: ${{ secrets.LLM_API_KEY }} run: | if [ -z "$LLM_API_KEY" ]; then echo "Error: LLM_API_KEY secret is not set." exit 1 fi # Check that exactly one of PROMPT_LOCATION or PROMPT_STRING is set if [ -n "$PROMPT_LOCATION" ] && [ -n "$PROMPT_STRING" ]; then echo "Error: Both PROMPT_LOCATION and PROMPT_STRING are set." echo "Please provide only one in the env section of the workflow file." exit 1 fi if [ -z "$PROMPT_LOCATION" ] && [ -z "$PROMPT_STRING" ]; then echo "Error: Neither PROMPT_LOCATION nor PROMPT_STRING is set." echo "Please set one in the env section of the workflow file." exit 1 fi if [ -n "$PROMPT_LOCATION" ]; then echo "Prompt location: $PROMPT_LOCATION" else echo "Using inline PROMPT_STRING (${#PROMPT_STRING} characters)" fi echo "LLM model: $LLM_MODEL" if [ -n "$LLM_BASE_URL" ]; then echo "LLM base URL: $LLM_BASE_URL" fi - name: Run task env: LLM_API_KEY: ${{ secrets.LLM_API_KEY }} GITHUB_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} PYTHONPATH: '' run: | echo "Running agent script: $AGENT_SCRIPT_URL" # Download script if it's a URL if [[ "$AGENT_SCRIPT_URL" =~ ^https?:// ]]; then echo "Downloading agent script from URL..." 
curl -sSL "$AGENT_SCRIPT_URL" -o /tmp/agent_script.py AGENT_SCRIPT_PATH="/tmp/agent_script.py" else AGENT_SCRIPT_PATH="$AGENT_SCRIPT_URL" fi # Run with appropriate prompt argument if [ -n "$PROMPT_LOCATION" ]; then echo "Using prompt from: $PROMPT_LOCATION" uv run python "$AGENT_SCRIPT_PATH" "$PROMPT_LOCATION" else echo "Using PROMPT_STRING (${#PROMPT_STRING} characters)" uv run python "$AGENT_SCRIPT_PATH" fi - name: Upload logs as artifact uses: actions/upload-artifact@v7 if: always() with: name: openhands-task-logs path: | *.log output/ retention-days: 7 ================================================ FILE: .github/workflows/auto-label-issues.yml ================================================ --- name: Auto-label New Issues on: issues: types: [opened] permissions: issues: write jobs: add-triage-label: runs-on: ubuntu-latest steps: - name: Add needs-triage label uses: actions/github-script@v9 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | // Get the issue details const issue = context.payload.issue; const labels = issue.labels.map(label => label.name); // Check if issue has already been triaged const hasEnhancement = labels.includes('enhancement'); const hasPriority = labels.some(label => label.startsWith('priority')); // Only add needs-triage if not already triaged if (!hasEnhancement && !hasPriority) { await github.rest.issues.addLabels({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, labels: ['needs-triage'] }); } ================================================ FILE: .github/workflows/cancel-eval.yml ================================================ --- name: Cancel Eval run-name: Cancel Eval (${{ inputs.run_id }}) on: workflow_dispatch: inputs: run_id: description: Workflow run ID to cancel required: true type: string reason: description: Reason for cancellation required: false type: string env: EVAL_REPO: OpenHands/evaluation EVAL_WORKFLOW: kill-eval-job.yml permissions: contents: read jobs: cancel-eval: 
runs-on: ubuntu-latest steps: - name: Cancel evaluation job env: DISPATCH_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_EVAL_DISPATCH }} RUN_ID: ${{ github.event.inputs.run_id }} REASON: ${{ github.event.inputs.reason }} run: |- set -euo pipefail if [ -z "$DISPATCH_TOKEN" ]; then echo "Missing dispatch token" >&2 exit 1 fi echo "Canceling evaluation workflow run: $RUN_ID" # Dispatch kill workflow in evaluation repo PAYLOAD=$(jq -n \ --arg ref "main" \ --arg run_id "$RUN_ID" \ --arg reason "$REASON" \ '{ref: $ref, inputs: {run_id: $run_id, reason: $reason}}') RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \ -H "Authorization: token $DISPATCH_TOKEN" \ -H "Accept: application/vnd.github+json" \ -d "$PAYLOAD" \ "https://api.github.com/repos/${EVAL_REPO}/actions/workflows/${EVAL_WORKFLOW}/dispatches") if [ "$RESPONSE" != "204" ]; then echo "Dispatch failed (status $RESPONSE):" >&2 cat /tmp/dispatch.out >&2 exit 1 fi echo "Cancellation dispatched successfully for run: $RUN_ID" ================================================ FILE: .github/workflows/check-docstrings.yml ================================================ --- # .github/workflows/check-docstrings.yml name: Check Docstrings on: push: branches: [main] pull_request: branches: ['**'] jobs: check-docstrings: runs-on: ubuntu-24.04 steps: - name: Checkout code uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Check docstring formatting run: python .github/scripts/check_docstrings.py ================================================ FILE: .github/workflows/check-documented-examples.yml ================================================ --- name: '[Optional] Docs example' on: pull_request: branches: - '**' paths: - examples/**/*.py - '!examples/03_github_workflows/**' - '!examples/04_llm_specific_tools/**' - .github/workflows/check-documented-examples.yml - .github/scripts/check_documented_examples.py workflow_dispatch: permissions: 
contents: read pull-requests: read jobs: check-examples: runs-on: ubuntu-latest steps: - name: Checkout agent-sdk repository uses: actions/checkout@v6 with: fetch-depth: 0 - name: Checkout docs repository (try feature branch) uses: actions/checkout@v6 continue-on-error: true id: checkout-feature with: repository: OpenHands/docs path: docs fetch-depth: 0 ref: ${{ github.head_ref || github.ref_name }} - name: Checkout docs repository (fallback to main) if: steps.checkout-feature.outcome == 'failure' uses: actions/checkout@v6 with: repository: OpenHands/docs path: docs fetch-depth: 0 ref: main - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Check documented examples env: DOCS_PATH: ${{ github.workspace }}/docs shell: bash run: | set -euo pipefail python .github/scripts/check_documented_examples.py ================================================ FILE: .github/workflows/check-duplicate-examples.yml ================================================ --- name: Check duplicate example numbers on: pull_request: branches: - '**' paths: - examples/** - .github/workflows/check-duplicate-examples.yml - .github/scripts/check_duplicate_example_numbers.py push: branches: - main paths: - examples/** workflow_dispatch: permissions: contents: read jobs: check-duplicates: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Check for duplicate example numbers run: python .github/scripts/check_duplicate_example_numbers.py ================================================ FILE: .github/workflows/condenser-runner.yml ================================================ --- name: Run Condenser Tests on: # Use pull_request_target to access secrets even on fork PRs # This is safe because we only run when the 'condenser-test' label is added by a maintainer pull_request_target: types: - labeled workflow_dispatch: inputs: reason: 
description: Reason for manual trigger required: true default: '' env: N_PROCESSES: 2 # Fewer parallel processes for condenser tests (only 2 LLMs) jobs: post-initial-comment: if: > github.event_name == 'pull_request_target' && github.event.label.name == 'condenser-test' runs-on: ubuntu-latest permissions: pull-requests: write steps: - name: Comment on PR uses: KeisukeYamashita/create-comment@v1 with: unique: false comment: | Hi! I started running the condenser tests on your PR. You will receive a comment with the results shortly. Note: These are non-blocking tests that validate condenser functionality across different LLMs. run-condenser-tests: # Security: Only run when condenser-test label is present or via workflow_dispatch # This prevents automatic execution on fork PRs without maintainer approval if: | always() && ( ( github.event_name == 'pull_request_target' && github.event.label.name == 'condenser-test' ) || github.event_name == 'workflow_dispatch' ) runs-on: ubuntu-22.04 permissions: contents: read id-token: write pull-requests: write strategy: matrix: python-version: ['3.13'] job-config: # Only run against 2 LLMs for condenser tests: # - Claude Opus 4.5 (primary - supports thinking blocks) # - GPT-5.1 Codex Max (secondary - cross-LLM validation) - name: Claude Opus 4.5 run-suffix: opus_condenser_run llm-config: model: litellm_proxy/anthropic/claude-opus-4-5-20251101 extended_thinking: true - name: GPT-5.1 Codex Max run-suffix: gpt51_condenser_run llm-config: model: litellm_proxy/gpt-5.1-codex-max steps: - name: Checkout repository uses: actions/checkout@v6 with: # For pull_request_target: checkout fork PR code (requires explicit repository) # For other events: fallback to current repository and ref repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} ref: ${{ github.event.pull_request.head.sha || github.ref }} # Security: Don't persist credentials to prevent untrusted PR code from using them persist-credentials: false - 
name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: ${{ matrix.python-version }} - name: Install Python dependencies using uv run: | uv sync --dev uv pip install pytest - name: Run condenser test evaluation for ${{ matrix.job-config.name }} env: LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_BASE_URL: https://llm-proxy.app.all-hands.dev run: | set -eo pipefail AGENT_SDK_VERSION=$(git rev-parse --short HEAD) EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}" echo "Running condenser tests only (c*.py pattern)" uv run python tests/integration/run_infer.py \ --llm-config "$LLM_CONFIG" \ --num-workers $N_PROCESSES \ --eval-note "$EVAL_NOTE" \ --test-type condenser # get condenser tests JSON results RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1) echo "RESULTS_FILE: $RESULTS_FILE" if [ -f "$RESULTS_FILE" ]; then echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV else echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV fi - name: Wait a little bit run: sleep 10 - name: Create archive of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') cd tests/integration/outputs # Change to the outputs directory tar -czvf ../../../condenser_tests_${{ matrix.job-config.run-suffix }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config.run-suffix }}* # Include result directories for this model - name: Upload evaluation results as artifact uses: actions/upload-artifact@v7 id: upload_results_artifact with: name: condenser-test-outputs-${{ matrix.job-config.run-suffix }}-${{ github.run_id }}-${{ github.run_attempt }} path: condenser_tests_${{ matrix.job-config.run-suffix }}_*.tar.gz - name: Save test results for consolidation run: | # Copy the structured JSON results file for consolidation mkdir -p test_results_summary if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then # Copy the JSON 
results file directly cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config.run-suffix }}_results.json" echo "✓ Copied JSON results file for consolidation" else echo "✗ No JSON results file found" exit 1 fi - name: Upload test results summary uses: actions/upload-artifact@v7 with: name: test-results-${{ matrix.job-config.run-suffix }} path: test_results_summary/${{ matrix.job-config.run-suffix }}_results.json consolidate-results: needs: run-condenser-tests if: | always() && ( ( github.event_name == 'pull_request_target' && github.event.label.name == 'condenser-test' ) || github.event_name == 'workflow_dispatch' ) runs-on: ubuntu-24.04 permissions: contents: read pull-requests: write steps: - name: Checkout repository uses: actions/checkout@v6 with: # When using pull_request_target, explicitly checkout the PR branch # This ensures we use the scripts from the actual PR code ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Install Python dependencies using uv run: | uv sync --dev - name: Download all test results uses: actions/download-artifact@v8 with: pattern: test-results-* merge-multiple: true path: all_results - name: Download all condenser test artifacts uses: actions/download-artifact@v8 with: pattern: condenser-test-outputs-* path: artifacts - name: Consolidate test results env: EVENT_NAME: ${{ github.event_name }} PR_NUMBER: ${{ github.event.pull_request.number }} MANUAL_REASON: ${{ github.event.inputs.reason }} COMMIT_SHA: ${{ github.sha }} PYTHONPATH: ${{ github.workspace }} GITHUB_SERVER_URL: ${{ github.server_url }} GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_RUN_ID: ${{ github.run_id }} run: | uv run python tests/integration/utils/consolidate_json_results.py \ --results-dir all_results \ --artifacts-dir artifacts \ --output-file consolidated_results.json echo "Consolidated results generated successfully" uv run 
python tests/integration/utils/generate_markdown_report.py \ --input-file consolidated_results.json \ --output-file consolidated_report.md - name: Upload consolidated report uses: actions/upload-artifact@v7 with: name: consolidated-condenser-report path: consolidated_report.md - name: Create consolidated PR comment if: github.event_name == 'pull_request_target' run: | # Add header to clarify these are non-blocking tests echo "## Condenser Test Results (Non-Blocking)" > final_report.md echo "" >> final_report.md echo "> These tests validate condenser functionality and do not block PR merges." >> final_report.md echo "" >> final_report.md cat consolidated_report.md >> final_report.md # Sanitize @OpenHands mentions to prevent self-mention loops COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < final_report.md) # Use GitHub CLI to create comment with explicit PR number echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file - env: GH_TOKEN: ${{ github.token }} ================================================ FILE: .github/workflows/create-release.yml ================================================ --- name: Create GitHub Release # Automatically create a GitHub release when a release PR is merged into main. # This bridges the gap between merging the release PR and the pypi-release # workflow (which triggers on release published). on: pull_request: types: [closed] branches: [main] jobs: create-release: # Only run when a release PR is merged (not just closed) if: > github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, 'rel-') runs-on: ubuntu-24.04 permissions: actions: write contents: write steps: - name: Extract version from branch name id: version run: | BRANCH="${{ github.event.pull_request.head.ref }}" VERSION="${BRANCH#rel-}" if ! 
[[ "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "❌ Could not extract valid version from branch: $BRANCH" exit 1 fi echo "version=$VERSION" >> "$GITHUB_OUTPUT" echo "📦 Version: $VERSION" - name: Check release does not already exist id: check env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.version.outputs.version }} run: | if gh release view "v${VERSION}" --repo "${{ github.repository }}" > /dev/null 2>&1; then echo "⚠️ Release v${VERSION} already exists, skipping" echo "exists=true" >> "$GITHUB_OUTPUT" else echo "exists=false" >> "$GITHUB_OUTPUT" fi - name: Find previous release tag if: steps.check.outputs.exists == 'false' id: prev_tag env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | # '// empty' keeps PREV_TAG blank (never the literal string "null") when no earlier release exists PREV_TAG=$(gh release list --repo "${{ github.repository }}" \ --exclude-drafts --exclude-pre-releases --limit 1 \ --json tagName --jq '.[0].tagName // empty') echo "prev_tag=${PREV_TAG}" >> "$GITHUB_OUTPUT" echo "📌 Previous release tag: ${PREV_TAG:-<none>}" - name: Create GitHub Release if: steps.check.outputs.exists == 'false' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.version.outputs.version }} PREV_TAG: ${{ steps.prev_tag.outputs.prev_tag }} run: | NOTES_FLAG=() if [ -n "$PREV_TAG" ]; then NOTES_FLAG=(--notes-start-tag "$PREV_TAG") fi gh release create "v${VERSION}" \ --repo "${{ github.repository }}" \ --target "${{ github.event.pull_request.merge_commit_sha }}" \ --title "v${VERSION}" \ --generate-notes \ "${NOTES_FLAG[@]}" echo "✅ Release v${VERSION} created!" echo "🔗 https://github.com/${{ github.repository }}/releases/tag/v${VERSION}" - name: Dispatch PyPI release workflow if: steps.check.outputs.exists == 'false' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.version.outputs.version }} run: | gh workflow run pypi-release.yml \ --repo "${{ github.repository }}" \ --ref "v${VERSION}" echo "🚀 Dispatched pypi-release.yml for v${VERSION}" - name: Dispatch Agent Server image build # server.yml builds versioned Docker images (e.g. 
1.21.0-python) when # triggered on a tag ref. Tags created by GITHUB_TOKEN don't trigger # workflow runs automatically, so we dispatch it explicitly here. if: steps.check.outputs.exists == 'false' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.version.outputs.version }} run: | gh workflow run server.yml \ --repo "${{ github.repository }}" \ --ref "v${VERSION}" echo "🐳 Dispatched server.yml image build for v${VERSION}" - name: Dispatch release binaries workflow # Same GITHUB_TOKEN limitation applies to release-binaries.yml # which triggers on release:published events. if: steps.check.outputs.exists == 'false' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} VERSION: ${{ steps.version.outputs.version }} run: | gh workflow run release-binaries.yml \ --repo "${{ github.repository }}" \ --ref "v${VERSION}" \ -f release_tag="v${VERSION}" echo "📦 Dispatched release-binaries.yml for v${VERSION}" - name: Summary env: VERSION: ${{ steps.version.outputs.version }} run: | echo "## ✅ Release v${VERSION} Created" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo "- **Tag**: v${VERSION}" >> "$GITHUB_STEP_SUMMARY" echo "- **Release**: https://github.com/${{ github.repository }}/releases/tag/v${VERSION}" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" echo "The \`pypi-release.yml\` workflow was dispatched to publish packages to PyPI." >> "$GITHUB_STEP_SUMMARY" echo "The \`server.yml\` workflow was dispatched to build versioned Docker images." >> "$GITHUB_STEP_SUMMARY" echo "The \`release-binaries.yml\` workflow was dispatched to build and attach release binaries." 
>> "$GITHUB_STEP_SUMMARY" ================================================ FILE: .github/workflows/deploy-docs.yml ================================================ --- name: Dispatch to docs repo on: push: branches: - main paths: - openhands-agent-server/** workflow_dispatch: jobs: dispatch: runs-on: ubuntu-24.04 permissions: contents: write steps: - name: Trigger docs repo sync uses: peter-evans/repository-dispatch@v4 with: token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} repository: OpenHands/docs event-type: update client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' ================================================ FILE: .github/workflows/deprecation-check.yml ================================================ --- name: Deprecation deadlines on: push: branches: [main] pull_request: branches: ['**'] jobs: check: runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@v6 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Verify deprecation removals run: uv run --with packaging python .github/scripts/check_deprecations.py ================================================ FILE: .github/workflows/integration-runner.yml ================================================ --- name: Run Integration Tests run-name: >- Run Integration Tests ${{ inputs.reason || github.event.label.name || 'scheduled' }} on: # Use pull_request_target to access secrets even on fork PRs # This is safe because we only run when the 'integration-test' label is added by a maintainer pull_request_target: types: - labeled workflow_dispatch: inputs: reason: description: Reason for manual trigger required: true default: '' test_type: description: Select which tests to run (all, integration, behavior) required: false default: all model_ids: description: >- Comma-separated model IDs to test (from resolve_model_config.py). Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set. 
required: false default: '' type: string issue_number: description: Issue or PR number to post results to (optional) required: false default: '' type: string tool_preset: description: >- Tool preset for file editing (default, gemini, gpt5, planning). 'default' uses FileEditorTool, 'gemini' uses read_file/write_file/edit/list_directory, 'gpt5' uses apply_patch tool. required: false default: default type: choice options: - default - gemini - gpt5 - planning schedule: - cron: 30 22 * * * # Runs at 10:30pm UTC every day env: N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py) DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v4-flash,kimi-k2.6,gemini-3.1-pro jobs: setup-matrix: runs-on: ubuntu-latest outputs: matrix: ${{ steps.resolve-models.outputs.matrix }} issue_number: ${{ steps.resolve-issue.outputs.issue_number }} steps: - name: Checkout repository uses: actions/checkout@v6 with: repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} ref: ${{ github.event.pull_request.head.sha || github.ref }} persist-credentials: false - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Resolve model configurations id: resolve-models env: MODEL_IDS_INPUT: ${{ github.event.inputs.model_ids || '' }} DEFAULT_MODEL_IDS: ${{ env.DEFAULT_MODEL_IDS }} run: | # Use input model_ids if provided, otherwise use defaults if [ -z "$MODEL_IDS_INPUT" ]; then MODEL_IDS="$DEFAULT_MODEL_IDS" echo "No model_ids specified, using defaults: $MODEL_IDS" else MODEL_IDS="$MODEL_IDS_INPUT" echo "Using specified model_ids: $MODEL_IDS" fi # Resolve model configs using resolve_model_config.py # Transform output to matrix format for integration tests # MODEL_IDS is passed via the environment and the heredoc delimiter is quoted, so the value is read as data and never interpolated into the Python source MATRIX=$(MODEL_IDS="$MODEL_IDS" python3 << 'EOF' import json import os import sys sys.path.insert(0, '.github/run-eval') from resolve_model_config import MODELS model_ids = os.environ["MODEL_IDS"].split(",") 
model_ids = [m.strip() for m in model_ids if m.strip()] matrix = [] for model_id in model_ids: if model_id not in MODELS: available = ", ".join(sorted(MODELS.keys())) print(f"Error: Model ID '{model_id}' not found. Available: {available}", file=sys.stderr) sys.exit(1) model = MODELS[model_id] # Create run-suffix from model id (replace special chars with underscore) run_suffix = model_id.replace("-", "_").replace(".", "_") + "_run" matrix.append({ "id": model_id, "name": model["display_name"], "run-suffix": run_suffix, "llm-config": model["llm_config"] }) print(json.dumps(matrix)) EOF ) if [ $? -ne 0 ]; then echo "Failed to resolve model configurations" >&2 exit 1 fi echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT" echo "Resolved models: $(echo "$MATRIX" | jq -r '.[].name' | paste -sd', ' -)" - name: Resolve issue number id: resolve-issue env: ISSUE_NUMBER_INPUT: ${{ github.event.inputs.issue_number || '' }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | # Priority: explicit input > PR number from label trigger if [ -n "$ISSUE_NUMBER_INPUT" ]; then echo "issue_number=$ISSUE_NUMBER_INPUT" >> "$GITHUB_OUTPUT" elif [ -n "$PR_NUMBER" ]; then echo "issue_number=$PR_NUMBER" >> "$GITHUB_OUTPUT" else echo "issue_number=" >> "$GITHUB_OUTPUT" fi # Post initial comment for label triggers (no dependencies - runs immediately) post-label-comment: if: > github.event_name == 'pull_request_target' && ( github.event.label.name == 'integration-test' || github.event.label.name == 'behavior-test' ) runs-on: ubuntu-latest permissions: pull-requests: write steps: - name: Comment on PR (integration tests via label) if: github.event.label.name == 'integration-test' uses: KeisukeYamashita/create-comment@v1 with: unique: false comment: | Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly. 
- name: Comment on PR (behavior tests via label) if: github.event.label.name == 'behavior-test' uses: KeisukeYamashita/create-comment@v1 with: unique: false comment: | Hi! I started running the behavior tests on your PR. You will receive a comment with the results shortly. # Post initial comment for workflow_dispatch (depends on setup-matrix for issue_number resolution) post-dispatch-comment: needs: setup-matrix if: github.event_name == 'workflow_dispatch' && github.event.inputs.issue_number != '' runs-on: ubuntu-latest permissions: issues: write steps: - name: Comment on issue/PR (workflow_dispatch) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ISSUE_NUMBER: ${{ github.event.inputs.issue_number }} MODEL_IDS: ${{ github.event.inputs.model_ids || 'all models' }} TEST_TYPE: ${{ github.event.inputs.test_type || 'all' }} REASON: ${{ github.event.inputs.reason }} run: | # Sanitize @OpenHands mentions to prevent self-mention loops SANITIZED_REASON=$(echo "$REASON" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g') SANITIZED_MODEL_IDS=$(echo "$MODEL_IDS" | sed 's/@OpenHands/@\u200BOpenHands/g; s/@openhands/@\u200Bopenhands/g') COMMENT_BODY=$(cat <> "$GITHUB_ENV" - name: Run integration test evaluation for ${{ matrix.job-config['name'] }} env: LLM_CONFIG: ${{ toJson(matrix.job-config['llm-config']) }} LLM_API_KEY: ${{ secrets.LLM_API_KEY_EVAL }} LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev TOOL_PRESET: ${{ github.event.inputs.tool_preset || 'default' }} run: | set -eo pipefail AGENT_SDK_VERSION=$(git rev-parse --short HEAD) EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config['run-suffix'] }}" echo "Invoking test runner with TEST_TYPE_ARGS='$TEST_TYPE_ARGS' TOOL_PRESET='$TOOL_PRESET'" uv run python tests/integration/run_infer.py \ --llm-config "$LLM_CONFIG" \ --num-workers $N_PROCESSES \ --eval-note "$EVAL_NOTE" \ --tool-preset "$TOOL_PRESET" \ $TEST_TYPE_ARGS # get integration tests JSON results RESULTS_FILE=$(find 
tests/integration/outputs/*${{ matrix.job-config['run-suffix'] }}* -name "results.json" -type f | head -n 1) echo "RESULTS_FILE: $RESULTS_FILE" if [ -f "$RESULTS_FILE" ]; then echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV else echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV fi - name: Wait a little bit run: sleep 10 - name: Create archive of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') cd tests/integration/outputs # Change to the outputs directory tar -czvf ../../../integration_tests_${{ matrix.job-config['run-suffix'] }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config['run-suffix'] }}* # Include result directories for this model - name: Upload evaluation results as artifact uses: actions/upload-artifact@v7 id: upload_results_artifact with: name: integration-test-outputs-${{ matrix.job-config['run-suffix'] }}-${{ github.run_id }}-${{ github.run_attempt }} path: integration_tests_${{ matrix.job-config['run-suffix'] }}_*.tar.gz - name: Save test results for consolidation run: | # Copy the structured JSON results file for consolidation mkdir -p test_results_summary if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then # Copy the JSON results file directly cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json" echo "✓ Copied JSON results file for consolidation" else echo "✗ No JSON results file found" exit 1 fi - name: Upload test results summary uses: actions/upload-artifact@v7 with: name: test-results-${{ matrix.job-config['run-suffix'] }} path: test_results_summary/${{ matrix.job-config['run-suffix'] }}_results.json consolidate-results: needs: [setup-matrix, run-integration-tests] if: | always() && ( ( github.event_name == 'pull_request_target' && ( github.event.label.name == 'integration-test' || github.event.label.name == 'behavior-test' ) ) || github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 
'OpenHands/software-agent-sdk') ) runs-on: ubuntu-24.04 permissions: contents: read pull-requests: write issues: write steps: - name: Checkout repository uses: actions/checkout@v6 with: # When using pull_request_target, explicitly checkout the PR branch # This ensures we use the scripts from the actual PR code ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Install Python dependencies using uv run: | uv sync --dev - name: Download all test results uses: actions/download-artifact@v8 with: pattern: test-results-* merge-multiple: true path: all_results - name: Download all integration test artifacts uses: actions/download-artifact@v8 with: pattern: integration-test-outputs-* path: artifacts - name: Consolidate test results env: EVENT_NAME: ${{ github.event_name }} PR_NUMBER: ${{ github.event.pull_request.number }} MANUAL_REASON: ${{ github.event.inputs.reason }} COMMIT_SHA: ${{ github.sha }} PYTHONPATH: ${{ github.workspace }} GITHUB_SERVER_URL: ${{ github.server_url }} GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_RUN_ID: ${{ github.run_id }} run: | uv run python tests/integration/utils/consolidate_json_results.py \ --results-dir all_results \ --artifacts-dir artifacts \ --output-file consolidated_results.json echo "Consolidated results generated successfully" uv run python tests/integration/utils/generate_markdown_report.py \ --input-file consolidated_results.json \ --output-file consolidated_report.md - name: Upload consolidated report uses: actions/upload-artifact@v7 with: name: consolidated-report path: consolidated_report.md - name: Create consolidated PR comment if: github.event_name == 'pull_request_target' run: | # Sanitize @OpenHands mentions to prevent self-mention loops COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), 
end='')" < consolidated_report.md) # Use GitHub CLI to create comment with explicit PR number echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file - env: GH_TOKEN: ${{ github.token }} - name: Comment on specified issue/PR (workflow_dispatch) if: github.event_name == 'workflow_dispatch' && needs.setup-matrix.outputs.issue_number != '' env: GH_TOKEN: ${{ github.token }} ISSUE_NUMBER: ${{ needs.setup-matrix.outputs.issue_number }} run: | # Sanitize @OpenHands mentions to prevent self-mention loops COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md) # Use GitHub CLI to create comment on the specified issue/PR echo "$COMMENT_BODY" | gh issue comment "$ISSUE_NUMBER" --body-file - - name: Read consolidated report for tracker issue if: github.event_name == 'schedule' id: read_report run: | # Read and sanitize the report, then set as output REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < consolidated_report.md) echo "report<<EOF" >> $GITHUB_OUTPUT echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT - name: Comment with results on tracker issue if: github.event_name == 'schedule' uses: KeisukeYamashita/create-comment@v1 with: number: 2078 unique: false comment: | **Trigger:** Nightly Scheduled Run **Commit:** ${{ github.sha }} ${{ steps.read_report.outputs.report }} ================================================ FILE: .github/workflows/issue-duplicate-checker.yml ================================================ --- name: Issue Duplicate Check via OpenHands Cloud on: issues: types: [opened] schedule: - cron: 0 9 * * * workflow_dispatch: inputs: mode: description: Which workflow path to run required: true type: choice options: - smoke-clone - issue-check
- auto-close default: smoke-clone issue_number: description: Existing issue number to analyze when mode is issue-check required: false type: number close_after_days: description: Days to wait before auto-closing duplicate candidates in auto-close mode required: false type: number default: 3 permissions: contents: read issues: write jobs: smoke-clone: if: github.event_name == 'workflow_dispatch' && inputs.mode == 'smoke-clone' runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 - name: Clone software-agent-sdk run: | git clone --depth 1 "https://github.com/${{ github.repository }}.git" /tmp/software-agent-sdk echo "software-agent-sdk HEAD: $(git -C /tmp/software-agent-sdk rev-parse --short HEAD)" - name: Summarize smoke test run: | { echo "## Smoke clone completed" echo echo "- software-agent-sdk cloned to /tmp/software-agent-sdk" } >> "$GITHUB_STEP_SUMMARY" issue-duplicate-check: if: | github.event_name == 'issues' || (github.event_name == 'workflow_dispatch' && inputs.mode == 'issue-check' && inputs.issue_number != null) runs-on: ubuntu-latest timeout-minutes: 35 concurrency: group: issue-duplicate-check-${{ github.repository }}-${{ github.event.issue.number || inputs.issue_number }} cancel-in-progress: false steps: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Validate duplicate check inputs env: OPENHANDS_API_KEY: ${{ secrets.OPENHANDS_API_KEY }} ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }} run: | if [ -z "$OPENHANDS_API_KEY" ]; then echo "Error: OPENHANDS_API_KEY secret is required" exit 1 fi if [ -z "$ISSUE_NUMBER" ]; then echo "Error: ISSUE_NUMBER is required" exit 1 fi - name: Run OpenHands duplicate check conversation id: run_check env: OPENHANDS_API_KEY: ${{ secrets.OPENHANDS_API_KEY }} GITHUB_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC || github.token }} ISSUE_NUMBER: ${{ github.event.issue.number 
|| inputs.issue_number }} OUTPUT_PATH: ${{ runner.temp }}/issue-duplicate-check-result.json run: | python scripts/issue_duplicate_check_openhands.py \ --repository "${{ github.repository }}" \ --issue-number "$ISSUE_NUMBER" \ --output "$OUTPUT_PATH" test -f "$OUTPUT_PATH" || { echo "Error: Output file not created" exit 1 } echo "result_path=$OUTPUT_PATH" >> "$GITHUB_OUTPUT" - name: Parse duplicate check result id: parsed_result env: RESULT_PATH: ${{ steps.run_check.outputs.result_path }} run: | python - <<'PY' import json import os import sys from pathlib import Path try: result = json.loads(Path(os.environ['RESULT_PATH']).read_text()) except (FileNotFoundError, json.JSONDecodeError) as exc: print( f"Error: Failed to read duplicate check result: {exc}", file=sys.stderr, ) raise SystemExit(1) from exc output_path = Path(os.environ['GITHUB_OUTPUT']) summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) def write_multiline(name: str, value: str) -> None: delimiter = f"EOF_{os.urandom(8).hex()}" with output_path.open('a', encoding='utf-8') as fh: fh.write(f"{name}<<{delimiter}\n{value}\n{delimiter}\n") canonical_issue_number = result.get('canonical_issue_number') with output_path.open('a', encoding='utf-8') as fh: fh.write(f"should_comment={'true' if result.get('should_comment') else 'false'}\n") fh.write(f"is_duplicate={'true' if result.get('is_duplicate') else 'false'}\n") fh.write( f"auto_close_candidate={'true' if result.get('auto_close_candidate') else 'false'}\n" ) fh.write(f"confidence={result.get('confidence', '')}\n") fh.write(f"classification={result.get('classification', '')}\n") fh.write( f"canonical_issue_number={canonical_issue_number if canonical_issue_number is not None else ''}\n" ) fh.write(f"conversation_url={result.get('conversation_url', '')}\n") fh.write(f"app_conversation_id={result.get('app_conversation_id', '')}\n") write_multiline('summary', str(result.get('summary', '')).strip()) write_multiline( 'candidate_issues_json', 
json.dumps(result.get('candidate_issues', []), ensure_ascii=False), ) candidate_lines = [] for candidate in result.get('candidate_issues', []): candidate_lines.append( f"- #{candidate.get('number')}: {candidate.get('title')} ({candidate.get('url')}) — {candidate.get('similarity_reason', '')}" ) summary_path.write_text( "\n".join( [ "## Duplicate check result", "", f"- Repository: {result.get('repository')}", f"- Issue: #{result.get('issue_number')}", f"- Should comment: {result.get('should_comment')}", f"- Exact duplicate: {result.get('is_duplicate')}", f"- Auto-close candidate: {result.get('auto_close_candidate')}", f"- Classification: {result.get('classification')}", f"- Confidence: {result.get('confidence')}", f"- Canonical issue: {canonical_issue_number}", f"- Conversation: {result.get('conversation_url')}", "", "### Summary", result.get('summary', ''), "", "### Candidate issues", *(candidate_lines or ["- None"]), ] ) + "\n", encoding='utf-8', ) PY - name: Post duplicate overlap notice if: steps.parsed_result.outputs.should_comment == 'true' uses: actions/github-script@v9 env: ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }} SUMMARY: ${{ steps.parsed_result.outputs.summary }} CANDIDATE_ISSUES_JSON: ${{ steps.parsed_result.outputs.candidate_issues_json }} CLASSIFICATION: ${{ steps.parsed_result.outputs.classification }} AUTO_CLOSE_CANDIDATE: ${{ steps.parsed_result.outputs.auto_close_candidate }} CANONICAL_ISSUE_NUMBER: ${{ steps.parsed_result.outputs.canonical_issue_number }} CLOSE_AFTER_DAYS: ${{ inputs.close_after_days || '3' }} with: github-token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC || github.token }} script: | const issueNumber = Number(process.env.ISSUE_NUMBER); const summary = (process.env.SUMMARY || '').trim(); const classification = process.env.CLASSIFICATION || 'no-match'; const autoClose = process.env.AUTO_CLOSE_CANDIDATE === 'true'; const closeAfterDays = process.env.CLOSE_AFTER_DAYS || '3'; let candidates = []; try { 
candidates = JSON.parse(process.env.CANDIDATE_ISSUES_JSON || '[]'); } catch (error) { core.setFailed(`Invalid candidate JSON: ${error.message}`); return; } if (!Array.isArray(candidates)) { core.setFailed('CANDIDATE_ISSUES_JSON is not an array'); return; } if (candidates.length === 0) { core.setFailed(`No candidate issues were returned for issue #${issueNumber}.`); return; } const canonicalIssueRaw = process.env.CANONICAL_ISSUE_NUMBER || candidates[0].number; const canonicalIssueNumber = canonicalIssueRaw ? Number(canonicalIssueRaw) : Number.NaN; const candidateLabel = 'duplicate-candidate'; function parseDuplicateCheckMarker(body) { if (!body) { return null; } const match = body.match(/<!-- openhands-duplicate-check canonical=(\d+) auto-close=(true|false) -->/); if (!match) { return null; } return { canonicalIssueNumber: Number(match[1]), autoClose: match[2] === 'true', }; } async function ensureCanonicalIssueIsOpenIssue() { let canonicalIssue; try { ({ data: canonicalIssue } = await github.rest.issues.get({ owner: context.repo.owner, repo: context.repo.repo, issue_number: canonicalIssueNumber, })); } catch (error) { if (error.status === 404) { core.setFailed(`Canonical issue #${canonicalIssueNumber} does not exist.`); return false; } throw error; } if (canonicalIssue.pull_request) { core.setFailed(`Canonical issue #${canonicalIssueNumber} is a pull request, not an issue.`); return false; } if (canonicalIssue.state !== 'open' || canonicalIssue.locked) { core.setFailed(`Canonical issue #${canonicalIssueNumber} must be an open, unlocked issue.`); return false; } return true; } async function ensureCandidateLabelOnIssue() { try { await github.rest.issues.getLabel({ owner: context.repo.owner, repo: context.repo.repo, name: candidateLabel, }); } catch (error) { if (error.status !== 404) { throw error; } await github.rest.issues.createLabel({ owner: context.repo.owner, repo: context.repo.repo, name: candidateLabel, color: 'C5DEF5', description: 'Potential duplicate awaiting auto-close or maintainer review', }); } const { data: issue } = await
github.rest.issues.get({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, }); const labelNames = (issue.labels || []).map((label) => ( typeof label === 'string' ? label : label.name )); if (!labelNames.includes(candidateLabel)) { await github.rest.issues.addLabels({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, labels: [candidateLabel], }); } } async function removeCandidateLabelFromIssue() { try { await github.rest.issues.removeLabel({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, name: candidateLabel, }); } catch (error) { if (error.status !== 404) { throw error; } } } if (!Number.isInteger(canonicalIssueNumber) || canonicalIssueNumber <= 0) { core.setFailed(`No canonical issue number was returned for issue #${issueNumber}.`); return; } if (!(await ensureCanonicalIssueIsOpenIssue())) { return; } const marker = `<!-- openhands-duplicate-check canonical=${canonicalIssueNumber} auto-close=${autoClose ? 'true' : 'false'} -->`; const header = candidates.length === 1 ? 'Found 1 possible duplicate issue:' : `Found ${candidates.length} possible duplicate issues:`; const candidateLines = candidates.map((candidate, index) => ( `${index + 1}. [#${candidate.number}](${candidate.url}) — ${candidate.title}` )); const sections = []; if (summary) { sections.push(summary, ''); } sections.push(header, '', ...candidateLines); if (classification === 'overlapping-scope') { sections.push( '', 'These may not be exact duplicates, but the scope appears to overlap enough that keeping discussion in one place may be more useful.'
); } if (autoClose) { sections.push( '', `This issue will be automatically closed as a duplicate in ${closeAfterDays} days.`, '', '- If your issue is a duplicate, please close it and 👍 the existing issue instead', '- To prevent auto-closure, add a comment or 👎 this comment' ); } sections.push( '', marker, '_This comment was created by an AI assistant (OpenHands) on behalf of the repository maintainer._' ); const body = sections.join('\n').trim(); const MAX_COMMENT_PAGES = 50; let allComments = []; let page = 1; while (page <= MAX_COMMENT_PAGES) { const { data: comments } = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, per_page: 100, page, }); if (!comments || comments.length === 0) { break; } allComments = allComments.concat(comments); if (comments.length < 100) { break; } page += 1; } if (page > MAX_COMMENT_PAGES) { core.setFailed( `Stopped loading comments for issue #${issueNumber} after ${MAX_COMMENT_PAGES} pages.` ); return; } const existing = allComments.find((comment) => comment.body && comment.body.includes(''; const body = `${marker} ✅ **PR Artifacts Cleaned Up** The \`.pr/\` directory has been automatically removed. 
`; const { data: comments } = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, }); const existing = comments.find(c => c.body.includes(marker)); if (existing) { await github.rest.issues.updateComment({ owner: context.repo.owner, repo: context.repo.repo, comment_id: existing.id, body: body, }); } # Warn if .pr/ directory exists (will be auto-removed on approval) check-pr-artifacts: if: github.event_name == 'pull_request' runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - uses: actions/checkout@v6 - name: Check for .pr/ directory id: check run: | if [ -d ".pr" ]; then echo "exists=true" >> $GITHUB_OUTPUT echo "::warning::.pr/ directory exists and will be automatically removed when the PR is approved. For fork PRs, manual removal is required before merging." else echo "exists=false" >> $GITHUB_OUTPUT fi - name: Post or update PR comment if: steps.check.outputs.exists == 'true' uses: actions/github-script@v9 with: script: | const marker = '<!-- pr-artifacts-notice -->'; const body = `${marker} 📁 **PR Artifacts Notice** This PR contains a \`.pr/\` directory with PR-specific documents. This directory will be **automatically removed** when the PR is approved. > For fork PRs: Manual removal is required before merging. `; const { data: comments } = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, }); const existing = comments.find(c => c.body.includes(marker)); if (!existing) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: body, }); } ================================================ FILE: .github/workflows/pr-review-by-openhands.yml ================================================ --- name: PR Review by OpenHands on: # Use pull_request for same-repo PRs so workflow changes can self-verify in PRs.
pull_request: types: [opened, ready_for_review, labeled, review_requested] # Use pull_request_target for fork PRs. # The bot token used here is intentionally scoped to PR review operations, # so the remaining blast radius is bounded even though PR content is untrusted. pull_request_target: types: [opened, ready_for_review, labeled, review_requested] permissions: contents: read pull-requests: write issues: write jobs: pr-review: # Run on same-repo PRs via pull_request and on fork PRs via pull_request_target. # Trigger when one of the following conditions is met: # 1. A new non-draft PR is opened by a non-first-time contributor, OR # 2. A draft PR is converted to ready for review by a non-first-time contributor, OR # 3. The 'review-this' label is added, OR # 4. openhands-agent or all-hands-bot is requested as a reviewer # Note: FIRST_TIME_CONTRIBUTOR and NONE PRs require manual trigger via label/reviewer request. if: | ( ( github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository ) || ( github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository ) ) && ( (github.event.action == 'opened' && github.event.pull_request.draft == false && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') || (github.event.action == 'ready_for_review' && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') || (github.event.action == 'labeled' && github.event.label.name == 'review-this') || ( github.event.action == 'review_requested' && ( github.event.requested_reviewer.login == 'openhands-agent' || github.event.requested_reviewer.login == 'all-hands-bot' ) ) ) concurrency: group: pr-review-${{ github.event.pull_request.number }} cancel-in-progress: true runs-on: ubuntu-24.04 steps: - name: Run PR Review uses: 
OpenHands/extensions/plugins/pr-review@main with: llm-model: litellm_proxy/claude-sonnet-4-5-20250929 llm-base-url: https://llm-proxy.app.all-hands.dev # Enable experimental sub-agent delegation for file-level reviews use-sub-agents: 'true' llm-api-key: ${{ secrets.LLM_API_KEY }} github-token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC || github.token }} lmnr-api-key: ${{ secrets.LMNR_SKILLS_API_KEY }} ================================================ FILE: .github/workflows/pr-review-evaluation.yml ================================================ --- name: PR Review Evaluation # This workflow evaluates how well PR review comments were addressed. # It runs when a PR is closed to assess review effectiveness. # # Security note: pull_request_target is safe here because: # 1. Only triggers on PR close (not on code changes) # 2. Does not checkout PR code - only downloads artifacts from trusted workflow runs # 3. Runs evaluation scripts from the extensions repo, not from the PR on: pull_request_target: types: [closed] permissions: contents: read pull-requests: read jobs: evaluate: runs-on: ubuntu-24.04 env: PR_NUMBER: ${{ github.event.pull_request.number }} REPO_NAME: ${{ github.repository }} PR_MERGED: ${{ github.event.pull_request.merged }} steps: - name: Download review trace artifact id: download-trace uses: dawidd6/action-download-artifact@v21 continue-on-error: true with: workflow: pr-review-by-openhands.yml name: pr-review-trace-${{ github.event.pull_request.number }} path: trace-info search_artifacts: true if_no_artifact_found: warn - name: Check if trace file exists id: check-trace run: | if [ -f "trace-info/laminar_trace_info.json" ]; then echo "trace_exists=true" >> $GITHUB_OUTPUT echo "Found trace file for PR #$PR_NUMBER" else echo "trace_exists=false" >> $GITHUB_OUTPUT echo "No trace file found for PR #$PR_NUMBER - skipping evaluation" fi # Always checkout main branch for security - cannot test script changes in PRs - name: Checkout extensions repository if: 
steps.check-trace.outputs.trace_exists == 'true' uses: actions/checkout@v6 with: repository: OpenHands/extensions path: extensions - name: Set up Python if: steps.check-trace.outputs.trace_exists == 'true' uses: actions/setup-python@v6 with: python-version: '3.12' - name: Install dependencies if: steps.check-trace.outputs.trace_exists == 'true' run: pip install lmnr - name: Run evaluation if: steps.check-trace.outputs.trace_exists == 'true' env: # Script expects LMNR_PROJECT_API_KEY; org secret is named LMNR_SKILLS_API_KEY LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_SKILLS_API_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python extensions/plugins/pr-review/scripts/evaluate_review.py \ --trace-file trace-info/laminar_trace_info.json - name: Upload evaluation logs uses: actions/upload-artifact@v7 if: always() && steps.check-trace.outputs.trace_exists == 'true' with: name: pr-review-evaluation-${{ github.event.pull_request.number }} path: '*.log' retention-days: 30 ================================================ FILE: .github/workflows/precommit.yml ================================================ --- # .github/workflows/precommit.yml name: Pre-commit checks on: push: branches: [main] pull_request: branches: ['**'] jobs: pre-commit: runs-on: ubuntu-24.04 steps: - name: Checkout code uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Install uv uses: astral-sh/setup-uv@v7 - name: Install dependencies run: uv sync --frozen --group dev - name: Run pre-commit (all files) run: uv run pre-commit run --all-files --show-diff-on-failure ================================================ FILE: .github/workflows/prepare-release.yml ================================================ --- name: Prepare Release on: workflow_dispatch: inputs: version: description: Release version (e.g., 1.2.3) required: true type: string jobs: prepare-release: runs-on: ubuntu-24.04 steps: - name: Validate version format run: | if ! 
[[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "❌ Invalid version format. Expected: X.Y.Z (e.g., 1.2.3)" exit 1 fi echo "✅ Version format is valid: ${{ inputs.version }}" - name: Checkout repository uses: actions/checkout@v6 with: token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Configure Git run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - name: Create release branch run: | BRANCH_NAME="rel-${{ inputs.version }}" echo "Creating branch: $BRANCH_NAME" git checkout -b "$BRANCH_NAME" echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV - name: Set package version run: | echo "🔧 Setting version to ${{ inputs.version }}" make set-package-version version=${{ inputs.version }} - name: Update sdk_ref default in run-eval workflow run: python3 .github/scripts/update_sdk_ref_default.py "${{ inputs.version }}" - name: Commit version changes run: | git add . if git diff --staged --quiet; then echo "No changes to commit" else git commit -m "Release v${{ inputs.version }}" -m "Co-authored-by: openhands <openhands@all-hands.dev>" echo "✅ Changes committed" fi - name: Push release branch run: | git push -u origin "${{ env.BRANCH_NAME }}" echo "✅ Branch pushed: ${{ env.BRANCH_NAME }}" - name: Create Pull Request env: GH_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} run: | cat > pr_body.txt << 'EOF' ## Release v${{ inputs.version }} This PR prepares the release for version **${{ inputs.version }}**.
### Release Checklist - [x] Version set to ${{ inputs.version }} - [ ] Fix any deprecation deadlines if they exist - [ ] Integration tests pass (tagged with `integration-test`) - [ ] Behavior tests pass (tagged with `behavior-test`) - [ ] Example tests pass (tagged with `test-examples`) - [ ] Evaluation on OpenHands Index ### What happens on merge When this PR is merged, the `create-release.yml` workflow will automatically: 1. Create a GitHub release with tag `v${{ inputs.version }}` and auto-generated notes 2. Trigger `pypi-release.yml` to publish all packages to PyPI 3. Trigger `version-bump-prs.yml` to create downstream version bump PRs EOF gh pr create \ --title "Release v${{ inputs.version }}" \ --body-file pr_body.txt \ --base main \ --head "${{ env.BRANCH_NAME }}" \ --label "integration-test" \ --label "behavior-test" \ --label "test-examples" rm pr_body.txt echo "✅ Pull request created successfully!" # Get PR URL and display it PR_URL=$(gh pr view "${{ env.BRANCH_NAME }}" --json url --jq '.url') echo "🔗 PR URL: $PR_URL" echo "PR_URL=$PR_URL" >> $GITHUB_ENV - name: Summary run: | echo "## ✅ Release Preparation Complete!" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "- **Version**: ${{ inputs.version }}" >> $GITHUB_STEP_SUMMARY echo "- **Branch**: ${{ env.BRANCH_NAME }}" >> $GITHUB_STEP_SUMMARY echo "- **PR URL**: ${{ env.PR_URL }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Next Steps:" >> $GITHUB_STEP_SUMMARY echo "1. Review the PR and address any deprecation deadlines" >> $GITHUB_STEP_SUMMARY echo "2. Wait for integration, behavior, and example tests to pass" >> $GITHUB_STEP_SUMMARY echo "3. 
Merge the PR — a GitHub release and PyPI publish will happen automatically" >> $GITHUB_STEP_SUMMARY ================================================ FILE: .github/workflows/pypi-release.yml ================================================ --- name: Publish all OpenHands packages (uv) on: # Run manually workflow_dispatch: # Run automatically when a release is published release: types: [published] jobs: publish: # Skip PyPI publishing for pre-releases (e.g., release candidates). # Pre-releases can still be created on GitHub for testing without # pushing packages to PyPI. Manual workflow_dispatch always runs. if: > github.event_name == 'workflow_dispatch' || !github.event.release.prerelease runs-on: ubuntu-24.04 permissions: actions: write contents: read outputs: version: ${{ steps.extract_version.outputs.version }} steps: - name: Checkout uses: actions/checkout@v6 - name: Extract version from release tag id: extract_version run: | # Get version from release tag (e.g., v1.2.3 -> 1.2.3) if [[ "${{ github.event_name }}" == "release" ]]; then VERSION="${{ github.event.release.tag_name }}" VERSION="${VERSION#v}" # Remove 'v' prefix if present else # For manual dispatch, extract from pyproject.toml VERSION=$(grep -m1 '^version = ' openhands-sdk/pyproject.toml | cut -d'"' -f2) fi echo "version=$VERSION" >> $GITHUB_OUTPUT echo "📦 Version: $VERSION" - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Build and publish all packages env: UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN_OPENHANDS }} run: | set -euo pipefail if [ -z "${UV_PUBLISH_TOKEN:-}" ]; then echo "❌ Missing secret PYPI_TOKEN_OPENHANDS" exit 1 fi PACKAGES=( openhands-sdk openhands-tools openhands-workspace openhands-agent-server ) echo "🚀 Building and publishing all packages..." 
for PKG in "${PACKAGES[@]}"; do echo "===== $PKG =====" uv build --package "$PKG" done # Use --check-url to skip files that already exist on PyPI # This allows re-running the workflow after partial failures uv publish --token "$UV_PUBLISH_TOKEN" --check-url https://pypi.org/simple/ echo "✅ All packages built and published successfully!" echo "" echo "📋 Note: Version bump PRs will be created by the 'Create Version Bump PRs' workflow" echo " which is dispatched after this publish succeeds." - name: Dispatch version bump workflow env: GH_TOKEN: ${{ github.token }} VERSION: ${{ steps.extract_version.outputs.version }} run: | gh workflow run version-bump-prs.yml \ --repo "${{ github.repository }}" \ -f "version=${VERSION}" echo "🚀 Dispatched version-bump-prs.yml for v${VERSION}" ================================================ FILE: .github/workflows/qa-changes-by-openhands.yml ================================================ --- # Automated QA validation of PR changes using OpenHands. # # Unlike pr-review (which reads diffs and posts code-review comments), # this workflow actually runs the code — setting up the environment, # executing tests, exercising changed behavior, and posting a structured # QA report as a PR comment. name: QA Changes by OpenHands on: pull_request: types: [opened, ready_for_review, labeled, review_requested] permissions: contents: read pull-requests: write issues: write jobs: qa-changes: # Only run for same-repo PRs (secrets aren't available for forks). # Trigger conditions mirror pr-review, but use the 'qa-this' label # and openhands-agent reviewer request. 
if: | github.event.pull_request.head.repo.full_name == github.repository && ( (github.event.action == 'opened' && github.event.pull_request.draft == false && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') || (github.event.action == 'ready_for_review' && github.event.pull_request.author_association != 'FIRST_TIME_CONTRIBUTOR' && github.event.pull_request.author_association != 'NONE') || github.event.label.name == 'qa-this' || github.event.requested_reviewer.login == 'openhands-agent' || github.event.requested_reviewer.login == 'all-hands-bot' ) concurrency: group: qa-changes-${{ github.event.pull_request.number }} cancel-in-progress: true runs-on: ubuntu-24.04 timeout-minutes: 30 steps: - name: Run QA Changes uses: OpenHands/extensions/plugins/qa-changes@main with: llm-model: litellm_proxy/claude-sonnet-4-5-20250929 llm-base-url: https://llm-proxy.app.all-hands.dev max-budget: '10.0' timeout-minutes: '30' max-iterations: '500' llm-api-key: ${{ secrets.LLM_API_KEY }} github-token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} lmnr-api-key: ${{ secrets.LMNR_SKILLS_API_KEY }} ================================================ FILE: .github/workflows/qa-changes-evaluation.yml ================================================ --- name: QA Changes Evaluation # This workflow evaluates how well QA validation performed. # It runs when a PR is closed to assess QA effectiveness. # # Security note: pull_request_target is safe here because this workflow # never checks out or executes PR code. It only: # 1. Downloads artifacts produced by a trusted workflow run # 2. 
Runs evaluation scripts from the extensions repo (main/pinned branch) on: pull_request_target: types: [closed] permissions: contents: read pull-requests: read jobs: evaluate: runs-on: ubuntu-24.04 env: PR_NUMBER: ${{ github.event.pull_request.number }} REPO_NAME: ${{ github.repository }} PR_MERGED: ${{ github.event.pull_request.merged }} steps: - name: Download QA trace artifact id: download-trace uses: dawidd6/action-download-artifact@v21 continue-on-error: true with: workflow: qa-changes-by-openhands.yml name: qa-changes-trace-${{ github.event.pull_request.number }} path: trace-info search_artifacts: true if_no_artifact_found: warn - name: Check if trace file exists id: check-trace run: | if [ -f "trace-info/laminar_trace_info.json" ]; then echo "trace_exists=true" >> $GITHUB_OUTPUT echo "Found trace file for PR #$PR_NUMBER" else echo "trace_exists=false" >> $GITHUB_OUTPUT echo "No trace file found for PR #$PR_NUMBER - skipping evaluation" fi - name: Checkout extensions repository if: steps.check-trace.outputs.trace_exists == 'true' uses: actions/checkout@v6 with: repository: OpenHands/extensions path: extensions - name: Set up Python if: steps.check-trace.outputs.trace_exists == 'true' uses: actions/setup-python@v6 with: python-version: '3.12' - name: Install dependencies if: steps.check-trace.outputs.trace_exists == 'true' run: pip install lmnr - name: Run evaluation if: steps.check-trace.outputs.trace_exists == 'true' env: # Script expects LMNR_PROJECT_API_KEY; org secret is named LMNR_SKILLS_API_KEY LMNR_PROJECT_API_KEY: ${{ secrets.LMNR_SKILLS_API_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python extensions/plugins/qa-changes/scripts/evaluate_qa_changes.py \ --trace-file trace-info/laminar_trace_info.json - name: Upload evaluation logs uses: actions/upload-artifact@v7 if: always() && steps.check-trace.outputs.trace_exists == 'true' with: name: qa-changes-evaluation-${{ github.event.pull_request.number }} path: '*.log' retention-days: 30 
================================================ FILE: .github/workflows/release-binaries.yml ================================================ --- name: Publish agent-server release artifacts # On release published or push to main: # 1. Build the agent-server PyInstaller binary on Linux + macOS for both # x86_64 and arm64, smoke-test it, and upload workflow artifacts. # 2. On release events/manual runs, attach those binaries plus a combined # SHA256SUMS file to the GitHub release. # 3. Smoke-test the multi-arch Docker images pushed by `server.yml`, # verifying that every published variant has a manifest covering both # linux/amd64 and linux/arm64 and that the container actually boots # and answers /health on each architecture. on: push: branches: [main] release: types: [published] workflow_dispatch: inputs: release_tag: description: Existing release tag (e.g. v1.20.1) required: true type: string permissions: contents: write packages: read jobs: resolve-tag: name: Resolve artifact and image tag runs-on: ubuntu-24.04 outputs: tag: ${{ steps.resolve.outputs.tag }} version: ${{ steps.resolve.outputs.version }} image_tag: ${{ steps.resolve.outputs.image_tag }} steps: - id: resolve shell: bash run: | set -euo pipefail if [[ "${{ github.event_name }}" == "release" ]]; then TAG="${{ github.event.release.tag_name }}" VERSION="${TAG#v}" elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then TAG="${{ inputs.release_tag }}" VERSION="${TAG#v}" elif [[ "${{ github.event_name }}" == "push" ]]; then TAG="" VERSION="${GITHUB_SHA::7}" else echo "ERROR: unsupported event '${{ github.event_name }}'" exit 1 fi if [[ -n "$TAG" ]] && ! 
[[ "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+([a-zA-Z0-9.+-]*)?$ ]]; then echo "ERROR: unexpected version '$VERSION' (from tag '$TAG')" exit 1 fi echo "tag=$TAG" >> "$GITHUB_OUTPUT" echo "version=$VERSION" >> "$GITHUB_OUTPUT" echo "image_tag=$VERSION" >> "$GITHUB_OUTPUT" echo "📦 Tag: ${TAG:-} Image tag: $VERSION" build-binary: name: Build (${{ matrix.os_label }}-${{ matrix.arch }}) needs: resolve-tag runs-on: ${{ matrix.runner }} strategy: fail-fast: false matrix: include: - runner: ubuntu-24.04 os_label: linux arch: x86_64 - runner: ubuntu-24.04-arm os_label: linux arch: arm64 - runner: macos-13 os_label: macos arch: x86_64 - runner: macos-14 os_label: macos arch: arm64 - runner: windows-2022 os_label: windows arch: x86_64 steps: - name: Checkout uses: actions/checkout@v6 - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.13' - name: Install dependencies run: uv sync --dev - name: Build binary (Unix) if: runner.os != 'Windows' run: make build-server - name: Build binary (Windows) if: runner.os == 'Windows' shell: bash run: uv run pyinstaller openhands-agent-server/openhands/agent_server/agent-server.spec - name: Smoke-test binary shell: bash run: | set -euo pipefail if [[ "${RUNNER_OS:-}" == "Windows" ]]; then BIN=./dist/openhands-agent-server.exe else BIN=./dist/openhands-agent-server fi "$BIN" --help echo "Testing server startup and template loading..." "$BIN" --port 8002 > server_test.log 2>&1 & SERVER_PID=$! cleanup() { kill "$SERVER_PID" 2>/dev/null || true wait "$SERVER_PID" 2>/dev/null || true if [ -f server_test.log ]; then echo "----- server_test.log (tail) -----" tail -100 server_test.log || true rm -f server_test.log fi } trap cleanup EXIT # Poll /health for up to 90s; fail if it never comes up. for i in $(seq 1 30); do if grep -q "system_prompt.j2.*not found" server_test.log 2>/dev/null; then echo "ERROR: Template files not found in binary!" exit 1 fi if ! 
kill -0 "$SERVER_PID" 2>/dev/null; then echo "ERROR: Server process exited before /health responded" exit 1 fi if curl -f -s http://localhost:8002/health >/dev/null 2>&1; then echo "✓ /health responded after ${i} attempt(s)" echo "✓ Binary smoke test passed" exit 0 fi sleep 3 done echo "ERROR: /health never responded within 90s" exit 1 - name: Stage release asset shell: bash env: ASSET: agent-server-${{ needs.resolve-tag.outputs.version }}-${{ matrix.os_label }}-${{ matrix.arch }} run: | set -euo pipefail mkdir -p release-assets if [[ "${RUNNER_OS:-}" == "Windows" ]]; then cp dist/openhands-agent-server.exe "release-assets/${ASSET}.exe" else cp dist/openhands-agent-server "release-assets/${ASSET}" fi ls -la release-assets/ - name: Upload binary as workflow artifact uses: actions/upload-artifact@v7 with: name: binary-${{ matrix.os_label }}-${{ matrix.arch }} path: release-assets/agent-server-* retention-days: 7 if-no-files-found: error publish-binaries: name: Publish binaries + SHA256SUMS needs: [resolve-tag, build-binary] if: github.event_name != 'push' runs-on: ubuntu-24.04 steps: - name: Download binary artifacts uses: actions/download-artifact@v8 with: pattern: binary-* merge-multiple: true path: release-assets - name: Generate combined SHA256SUMS shell: bash run: | set -euo pipefail cd release-assets ls -la shasum -a 256 agent-server-* | sort > SHA256SUMS cat SHA256SUMS - name: Attach binaries + SHA256SUMS to release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} TAG: ${{ needs.resolve-tag.outputs.tag }} shell: bash run: | set -euo pipefail cd release-assets gh release upload "$TAG" \ agent-server-* \ SHA256SUMS \ --clobber \ --repo "${{ github.repository }}" docker-smoke-test: name: Docker (${{ matrix.variant }}-${{ matrix.arch }}) needs: resolve-tag runs-on: ${{ matrix.runner }} strategy: fail-fast: false matrix: include: - variant: python arch: amd64 runner: ubuntu-24.04 - variant: python arch: arm64 runner: ubuntu-24.04-arm - variant: java arch: amd64 runner: 
ubuntu-24.04 - variant: java arch: arm64 runner: ubuntu-24.04-arm - variant: golang arch: amd64 runner: ubuntu-24.04 - variant: golang arch: arm64 runner: ubuntu-24.04-arm env: IMAGE: ghcr.io/openhands/agent-server IMAGE_TAG: ${{ needs.resolve-tag.outputs.image_tag }} VARIANT: ${{ matrix.variant }} ARCH: ${{ matrix.arch }} steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 - name: Log in to GHCR uses: docker/login-action@v4 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Wait for multi-arch manifest shell: bash run: | set -euo pipefail TAG_FQN="${IMAGE}:${IMAGE_TAG}-${VARIANT}" DEADLINE=$(( $(date +%s) + 2700 )) # 45 minutes while ! docker buildx imagetools inspect "$TAG_FQN" >/dev/null 2>&1; do if [ "$(date +%s)" -ge "$DEADLINE" ]; then echo "ERROR: timed out waiting for $TAG_FQN" exit 1 fi echo "Waiting for $TAG_FQN ..." sleep 30 done echo "✓ Manifest available: $TAG_FQN" - name: Verify manifest covers linux/amd64 + linux/arm64 shell: bash run: | set -euo pipefail TAG_FQN="${IMAGE}:${IMAGE_TAG}-${VARIANT}" PLATFORMS=$(docker buildx imagetools inspect "$TAG_FQN" --raw \ | jq -r '.manifests[]?.platform | "\(.os)/\(.architecture)"' \ | sort -u) echo "Platforms in $TAG_FQN:" echo "$PLATFORMS" for required in linux/amd64 linux/arm64; do if ! echo "$PLATFORMS" | grep -qx "$required"; then echo "ERROR: $required missing from $TAG_FQN manifest" exit 1 fi done echo "✓ Both linux/amd64 and linux/arm64 are present" - name: Pull and run on linux/${{ matrix.arch }} shell: bash run: | set -euo pipefail TAG_FQN="${IMAGE}:${IMAGE_TAG}-${VARIANT}" CONTAINER="agent-server-smoke-${VARIANT}-${ARCH}" echo "Pulling $TAG_FQN for linux/${ARCH} ..." docker pull --platform="linux/${ARCH}" "$TAG_FQN" echo "Starting container ..." 
docker run --platform="linux/${ARCH}" -d --rm \ --name "$CONTAINER" \ -p 8000:8000 \ "$TAG_FQN" cleanup() { docker logs "$CONTAINER" 2>&1 | tail -100 || true docker rm -f "$CONTAINER" >/dev/null 2>&1 || true } trap cleanup EXIT for i in $(seq 1 40); do if curl -f -s http://localhost:8000/health >/dev/null 2>&1; then echo "✓ /health responded for $TAG_FQN on linux/${ARCH}" exit 0 fi sleep 3 done echo "ERROR: /health never responded for $TAG_FQN on linux/${ARCH}" exit 1 ================================================ FILE: .github/workflows/remove-duplicate-candidate-label.yml ================================================ --- name: Remove duplicate candidate label on activity on: issue_comment: types: [created] permissions: issues: write concurrency: group: remove-duplicate-${{ github.repository }}-${{ github.event.issue.number }} cancel-in-progress: false jobs: remove-duplicate-candidate: if: | github.event.issue.state == 'open' && github.event.issue.pull_request == null && contains(github.event.issue.labels.*.name, 'duplicate-candidate') && github.event.comment.user.type != 'Bot' && !startsWith(github.event.comment.body || '', ' --- **Agent Server images for this PR** • **GHCR package:** ${GHCR_URL} **Variants & Base Images** | Variant | Architectures | Base Image | Docs / Tags | |---|---|---|---| ${VARIANTS_TABLE} **Pull (multi-arch manifest)** \`\`\`bash # Each variant is a multi-arch manifest supporting both amd64 and arm64 docker pull ${IMAGE}:${SHORT_SHA}-python \`\`\` **Run** \`\`\`bash docker run -it --rm \\ -p 8000:8000 \\ --name agent-server-${SHORT_SHA}-python \\ ${IMAGE}:${SHORT_SHA}-python \`\`\` **All tags pushed for this build** \`\`\` ${ALL_TAGS} \`\`\` **About Multi-Architecture Support** - Each variant tag (e.g., \`${SHORT_SHA}-python\`) is a **multi-arch manifest** supporting both **amd64** and **arm64** - Docker automatically pulls the correct architecture for your platform - Individual architecture tags (e.g., \`${SHORT_SHA}-python-amd64\`) 
are also available if needed EOF ) # Set output for the next step { echo 'pr_content<> $GITHUB_OUTPUT - name: Update PR description with docker image details uses: nefrob/pr-description@v1.2.0 with: content: ${{ steps.generate_description.outputs.pr_content }} regex: .*? regexFlags: s token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/stale.yml ================================================ --- # Workflow that marks issues and PRs with no activity for 40 days with "Stale" and closes them after 10 more days of no activity name: Close stale issues # Runs every day at 01:30 on: schedule: - cron: 30 1 * * * permissions: issues: write pull-requests: write jobs: stale: # Only run scheduled jobs in the main repository, not in forks if: github.repository == 'OpenHands/software-agent-sdk' runs-on: ubuntu-22.04 steps: - uses: actions/stale@v10 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: This issue is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days. stale-pr-message: This PR is stale because it has been open for 40 days with no activity. Remove the stale label or leave a comment, otherwise it will be closed in 10 days. days-before-stale: 40 exempt-issue-labels: roadmap,backlog close-issue-message: This issue was automatically closed due to 50 days of inactivity. We do this to help keep the issues somewhat manageable and focus on active issues. close-pr-message: This PR was closed because it had no activity for 50 days. If you feel this was closed in error, and you would like to continue the PR, please resubmit or let us know.
days-before-close: 10 operations-per-run: 150 ================================================ FILE: .github/workflows/tests.yml ================================================ --- name: Run tests on: push: branches: [main] pull_request: branches: ['**'] permissions: contents: write pull-requests: write jobs: test-directory-guard: name: Test directory allowlist runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v6 - name: Verify test directories run: | # Allowed top-level directories under tests/ # Each must have a corresponding CI job or workflow that runs them. # tests.yml: sdk, tools, workspace, agent_server, cross # run-examples.yml: examples # integration-runner.yml: integration # (data-only): fixtures ALLOWED="sdk tools workspace agent_server cross examples integration fixtures" violations="" for entry in tests/*/; do dir_name="$(basename "$entry")" # skip __pycache__ and hidden dirs [[ "$dir_name" == __* || "$dir_name" == .* ]] && continue if ! echo "$ALLOWED" | grep -qw "$dir_name"; then violations="$violations tests/$dir_name/\n" fi done # Also reject top-level test files (they won't be picked up by any job) for f in tests/test_*.py; do [ -f "$f" ] && violations="$violations $f\n" done # Detect test files hiding inside source packages instead of tests/ # Excludes */testing/* dirs (testing utilities, not runnable tests) stray=$(find openhands-sdk openhands-tools openhands-workspace openhands-agent-server \ \( -name 'test_*.py' -o -name '*_test.py' \) \ -not -path '*/testing/*' \ 2>/dev/null || true) for f in $stray; do violations="$violations $f (stray test outside tests/)\n" done if [ -n "$violations" ]; then echo "ERROR: Found test paths outside the allowed directories." echo "The following will NOT be run by any CI job:" echo "" printf "$violations" echo "" echo "Allowed directories: $ALLOWED" echo "Move tests into one of the allowed directories so CI can run them." 
exit 1 fi echo "✓ All test directories are in the allowlist" sdk-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 with: {fetch-depth: 0} - name: Detect sdk changes id: changed uses: tj-actions/changed-files@v47 with: files: | openhands-sdk/** tests/sdk/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Check for openhands.tools imports in sdk tests if: steps.changed.outputs.any_changed == 'true' run: | echo "Checking for openhands.tools imports in tests/sdk..." if grep -r "from openhands\.tools" tests/sdk/ || grep -r "import openhands\.tools" tests/sdk/; then echo "ERROR: Found openhands.tools imports in tests/sdk/" echo "SDK tests should only import from openhands.sdk" echo "Please move tests that use openhands.tools to tests/cross/" exit 1 fi echo "✓ No openhands.tools imports found in tests/sdk/" - name: Run sdk tests with coverage if: steps.changed.outputs.any_changed == 'true' run: | # Clean up any existing coverage file rm -f .coverage # Use pytest-xdist (-n auto) for parallel execution with proper # coverage collection. --forked prevents coverage from child processes. 
CI=true uv run python -m pytest -vvs \ -n auto \ --cov=openhands-sdk \ --cov-report=term-missing \ --cov-fail-under=0 \ --cov-config=pyproject.toml \ tests/sdk # Rename coverage file for upload if [ -f .coverage ]; then mv .coverage coverage-sdk.dat echo "SDK coverage file prepared for upload" fi - name: Upload sdk coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-sdk path: coverage-sdk.dat if-no-files-found: warn tools-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v6 with: {fetch-depth: 0} - name: Detect tools changes id: changed uses: tj-actions/changed-files@v47 with: files: | openhands-tools/** tests/tools/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Run tools tests with coverage if: steps.changed.outputs.any_changed == 'true' run: | # Clean up any existing coverage file rm -f .coverage # Use --forked for tools tests due to terminal test conflicts # when running in parallel (shared /tmp paths, subprocess management) CI=true uv run python -m pytest -vvs \ --forked \ --cov=openhands-tools \ --cov-report=term-missing \ --cov-fail-under=0 \ --cov-config=pyproject.toml \ tests/tools # Rename coverage file for upload if [ -f .coverage ]; then mv .coverage coverage-tools.dat echo "Tools coverage file prepared for upload" fi - name: Upload tools coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-tools path: coverage-tools.dat if-no-files-found: warn windows-tests: runs-on: windows-latest timeout-minutes: 30 env: PYTHONUTF8: '1' steps: - name: Checkout uses: actions/checkout@v6 with: {fetch-depth: 0} - 
name: Detect Windows-relevant changes id: changed uses: tj-actions/changed-files@v47 with: files: | openhands-tools/** tests/tools/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Install Chromium if: steps.changed.outputs.any_changed == 'true' run: uvx playwright install chromium - name: Run Windows test suite if: steps.changed.outputs.any_changed == 'true' run: | if (Test-Path .coverage) { Remove-Item .coverage -Force } $env:CI = 'true' # Keep the initial Windows pass non-blocking on coverage while # OS-specific gaps tracked in #2989 are still open. # Browser/file-editor e2e and terminal shell assumptions remain # tracked in #2986 and #2988. uv run python -m pytest -vvs ` --cov=openhands-tools ` --cov-report=term-missing ` --cov-fail-under=0 ` --cov-config=pyproject.toml ` tests/tools ` --ignore=tests/tools/browser_use/test_browser_executor_e2e.py ` --ignore=tests/tools/file_editor/test_memory_usage.py ` --ignore=tests/tools/terminal/test_conversation_cleanup.py ` --ignore=tests/tools/terminal/test_session_factory.py ` --ignore=tests/tools/terminal/test_shell_path_configuration.py ` --ignore=tests/tools/terminal/test_shutdown_handling.py ` --ignore=tests/tools/terminal/test_terminal_session.py ` --ignore=tests/tools/terminal/test_terminal_tool_auto_detection.py if (Test-Path .coverage) { Move-Item .coverage coverage-windows.dat Write-Host 'Windows coverage file prepared for upload' } - name: Upload Windows coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-windows path: coverage-windows.dat if-no-files-found: warn agent-server-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 with: 
{fetch-depth: 0} - name: Detect Agent Server changes id: changed uses: tj-actions/changed-files@v47 with: files: | openhands-agent-server/** tests/agent_server/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Run Agent Server tests with coverage if: steps.changed.outputs.any_changed == 'true' run: | # Clean up any existing coverage file rm -f .coverage # Use pytest-xdist (-n auto) for parallel execution with proper # coverage collection. --forked prevents coverage from child processes. CI=true uv run python -m pytest -vvs \ -n auto \ --cov=openhands-agent-server \ --cov-report=term-missing \ --cov-fail-under=0 \ --cov-config=pyproject.toml \ tests/agent_server # Rename coverage file for upload if [ -f .coverage ]; then mv .coverage coverage-agent-server.dat echo "Agent Server coverage file prepared for upload" fi - name: Upload Agent Server coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-agent-server path: coverage-agent-server.dat if-no-files-found: warn workspace-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 with: {fetch-depth: 0} - name: Detect workspace changes id: changed uses: tj-actions/changed-files@v47 with: files: | openhands-workspace/** tests/workspace/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Run workspace tests with coverage if: steps.changed.outputs.any_changed == 'true' run: | # Clean up any existing coverage 
file rm -f .coverage CI=true uv run python -m pytest -vvs \ -n auto \ --cov=openhands-workspace \ --cov-report=term-missing \ --cov-fail-under=0 \ --cov-config=pyproject.toml \ tests/workspace # Rename coverage file for upload if [ -f .coverage ]; then mv .coverage coverage-workspace.dat echo "Workspace coverage file prepared for upload" fi - name: Upload workspace coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-workspace path: coverage-workspace.dat if-no-files-found: warn cross-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 steps: - name: Checkout uses: actions/checkout@v6 with: {fetch-depth: 0} - name: Detect cross changes id: changed uses: tj-actions/changed-files@v47 with: files: | tests/** openhands/** pyproject.toml uv.lock .github/workflows/tests.yml - name: Install uv if: steps.changed.outputs.any_changed == 'true' uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps if: steps.changed.outputs.any_changed == 'true' run: uv sync --frozen --group dev - name: Run cross tests with coverage if: steps.changed.outputs.any_changed == 'true' run: | # Clean up any existing coverage file rm -f .coverage CI=true uv run python -m pytest -vvs \ --basetemp="${{ runner.temp }}/pytest" \ -o tmp_path_retention=none \ -o tmp_path_retention_count=0 \ --cov=openhands \ --cov-report=term-missing \ --cov-fail-under=0 \ --cov-config=pyproject.toml \ tests/cross # Rename coverage file for upload if [ -f .coverage ]; then mv .coverage coverage-cross.dat echo "Cross coverage file prepared for upload" fi - name: Upload cross coverage if: steps.changed.outputs.any_changed == 'true' && always() uses: actions/upload-artifact@v7 with: name: coverage-cross path: coverage-cross.dat if-no-files-found: warn coverage-report: runs-on: blacksmith-2vcpu-ubuntu-2404 needs: [sdk-tests, tools-tests, agent-server-tests, workspace-tests, cross-tests] if: always() && github.event_name == 
'pull_request' steps: - name: Checkout uses: actions/checkout@v6 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true python-version: '3.13' - name: Install deps (for coverage CLI) run: uv sync --frozen --group dev - name: Download coverage artifacts uses: actions/download-artifact@v8 with: path: ./cov continue-on-error: true - name: Combine coverage data run: | shopt -s nullglob # For some reason, the GitHub action won't properly upload the original # .coverage* files # Convert uploaded .dat files back to .coverage format for coverage tool for dat_file in cov/**/coverage-*.dat; do if [[ "$dat_file" == *coverage-sdk.dat ]]; then cp "$dat_file" .coverage.sdk elif [[ "$dat_file" == *coverage-tools.dat ]]; then cp "$dat_file" .coverage.tools elif [[ "$dat_file" == *coverage-agent-server.dat ]]; then cp "$dat_file" .coverage.agent-server elif [[ "$dat_file" == *coverage-workspace.dat ]]; then cp "$dat_file" .coverage.workspace elif [[ "$dat_file" == *coverage-cross.dat ]]; then cp "$dat_file" .coverage.cross fi done # Check if we have any coverage files coverage_files=(.coverage.*) if [ ${#coverage_files[@]} -eq 0 ]; then echo "No coverage files found; skipping combined report."
exit 0 fi echo "Found ${#coverage_files[@]} coverage files" uv run coverage combine uv run coverage xml -i -o coverage.xml uv run coverage report -m - name: Pytest coverage PR comment if: always() continue-on-error: true uses: MishaKav/pytest-coverage-comment@v1 with: github-token: ${{ secrets.GITHUB_TOKEN }} pytest-xml-coverage-path: coverage.xml title: Coverage Report create-new-comment: false hide-report: false xml-skip-covered: true report-only-changed-files: true remove-links-to-files: true remove-links-to-lines: true ================================================ FILE: .github/workflows/todo-management.yml ================================================ --- # Automated TODO Management Workflow # # This workflow automatically scans for TODO(openhands) comments and creates # pull requests to implement them using the OpenHands agent. # # Setup: # 1. Add LLM_API_KEY to repository secrets # 2. Ensure GITHUB_TOKEN has appropriate permissions # 3. Make sure GitHub Actions are allowed to create and review PRs # 4. Commit this file to .github/workflows/ in your repository # 5.
Configure the schedule or trigger manually name: Automated TODO Management on: # Manual trigger workflow_dispatch: inputs: max_todos: description: Maximum number of TODOs to process in this run required: false default: '3' type: string todo_identifier: description: TODO identifier to search for (e.g., TODO(openhands)) required: false default: TODO(openhands) type: string # Trigger when 'automatic-todo' label is added to a PR pull_request: types: [labeled] # Scheduled trigger (disabled by default, uncomment and customize as needed) # schedule: # # Run every Monday at 9 AM UTC # - cron: "0 9 * * 1" permissions: contents: write pull-requests: write issues: write jobs: scan-todos: runs-on: ubuntu-24.04 # Only run if triggered manually or if 'automatic-todo' label was added if: > github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'automatic-todo') outputs: todos: ${{ steps.scan.outputs.todos }} todo-count: ${{ steps.scan.outputs.todo-count }} steps: - name: Checkout repository uses: actions/checkout@v6 with: fetch-depth: 0 # Full history for better context - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Copy TODO scanner run: | cp examples/03_github_workflows/03_todo_management/scanner.py /tmp/scanner.py chmod +x /tmp/scanner.py - name: Scan for TODOs id: scan run: | echo "Scanning for TODO comments..." # Run the scanner and capture output TODO_IDENTIFIER="${{ github.event.inputs.todo_identifier || 'TODO(openhands)' }}" python /tmp/scanner.py . 
--identifier "$TODO_IDENTIFIER" > todos.json # Count TODOs TODO_COUNT=$(python -c \ "import json; data=json.load(open('todos.json')); print(len(data))") echo "Found $TODO_COUNT $TODO_IDENTIFIER items" # Limit the number of TODOs to process MAX_TODOS="${{ github.event.inputs.max_todos || '3' }}" if [ "$TODO_COUNT" -gt "$MAX_TODOS" ]; then echo "Limiting to first $MAX_TODOS TODOs" python -c " import json data = json.load(open('todos.json')) limited = data[:$MAX_TODOS] json.dump(limited, open('todos.json', 'w'), indent=2) " TODO_COUNT=$MAX_TODOS fi # Set outputs echo "todos=$(cat todos.json | jq -c .)" >> $GITHUB_OUTPUT echo "todo-count=$TODO_COUNT" >> $GITHUB_OUTPUT # Display found TODOs echo "## 📋 Found TODOs" >> $GITHUB_STEP_SUMMARY if [ "$TODO_COUNT" -eq 0 ]; then echo "No TODO(openhands) comments found." >> $GITHUB_STEP_SUMMARY else echo "Found $TODO_COUNT TODO(openhands) items:" \ >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python -c " import json data = json.load(open('todos.json')) for i, todo in enumerate(data, 1): print(f'{i}. 
**{todo[\"file\"]}:{todo[\"line\"]}** - ' + f'{todo[\"description\"]}') " >> $GITHUB_STEP_SUMMARY fi process-todos: needs: scan-todos if: needs.scan-todos.outputs.todo-count > 0 runs-on: ubuntu-24.04 strategy: matrix: todo: ${{ fromJson(needs.scan-todos.outputs.todos) }} max-parallel: 1 # Process one TODO at a time to avoid conflicts steps: - name: Checkout repository uses: actions/checkout@v6 with: fetch-depth: 0 token: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} - name: Switch to feature branch with TODO management files run: | git checkout openhands/todo-management-example git pull origin openhands/todo-management-example - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.13' - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true - name: Install OpenHands dependencies run: | # Install OpenHands SDK and tools from git repository uv pip install --system "openhands-sdk @ git+https://github.com/OpenHands/agent-sdk.git@main#subdirectory=openhands-sdk" uv pip install --system "openhands-tools @ git+https://github.com/OpenHands/agent-sdk.git@main#subdirectory=openhands-tools" - name: Copy agent files run: | cp examples/03_github_workflows/03_todo_management/agent_script.py agent.py cp examples/03_github_workflows/03_todo_management/prompt.py prompt.py chmod +x agent.py - name: Configure Git run: | git config --global user.name "openhands-bot" git config --global user.email \ "openhands-bot@users.noreply.github.com" - name: Process TODO env: LLM_MODEL: litellm_proxy/claude-sonnet-4-5-20250929 LLM_BASE_URL: https://llm-proxy.app.all-hands.dev LLM_API_KEY: ${{ secrets.LLM_API_KEY }} GITHUB_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} GITHUB_REPOSITORY: ${{ github.repository }} TODO_FILE: ${{ matrix.todo.file }} TODO_LINE: ${{ matrix.todo.line }} TODO_DESCRIPTION: ${{ matrix.todo.description }} PYTHONPATH: '' run: | echo "Processing TODO: $TODO_DESCRIPTION" echo "File: $TODO_FILE:$TODO_LINE" # Create a unique branch 
name for this TODO BRANCH_NAME="todo/$(echo "$TODO_DESCRIPTION" | \ sed 's/[^a-zA-Z0-9]/-/g' | \ sed 's/--*/-/g' | \ sed 's/^-\|-$//g' | \ tr '[:upper:]' '[:lower:]' | \ cut -c1-50)" echo "Branch name: $BRANCH_NAME" # Create and switch to new branch (force create if exists) git checkout -B "$BRANCH_NAME" # Run the agent to process the TODO # Stay in repository directory for git operations # Create JSON payload for the agent TODO_JSON=$(cat <&1 | tee agent_output.log AGENT_EXIT_CODE=$? set -e echo "Agent exit code: $AGENT_EXIT_CODE" echo "Agent output log:" cat agent_output.log # Show files in working directory echo "Files in working directory:" ls -la # If agent failed, show more details if [ $AGENT_EXIT_CODE -ne 0 ]; then echo "Agent failed with exit code $AGENT_EXIT_CODE" echo "Last 50 lines of agent output:" tail -50 agent_output.log exit $AGENT_EXIT_CODE fi # Check if any changes were made cd "$GITHUB_WORKSPACE" if git diff --quiet; then echo "No changes made by agent, skipping PR creation" exit 0 fi # Commit changes git add -A git commit -m "Implement TODO: $TODO_DESCRIPTION Automatically implemented by OpenHands agent. Co-authored-by: openhands " # Push branch git push origin "$BRANCH_NAME" # Create pull request PR_TITLE="Implement TODO: $TODO_DESCRIPTION" PR_BODY="## 🤖 Automated TODO Implementation This PR automatically implements the following TODO: **File:** \`$TODO_FILE:$TODO_LINE\` **Description:** $TODO_DESCRIPTION ### Implementation The OpenHands agent has analyzed the TODO and implemented the requested functionality. 
### Review Notes - Please review the implementation for correctness - Test the changes in your development environment - The original TODO comment will be updated with this PR URL once merged --- *This PR was created automatically by the TODO Management workflow.*" # Create PR using GitHub CLI or API curl -X POST \ -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github.v3+json" \ "https://api.github.com/repos/${{ github.repository }}/pulls" \ -d "{ \"title\": \"$PR_TITLE\", \"body\": \"$PR_BODY\", \"head\": \"$BRANCH_NAME\", \"base\": \"${{ github.ref_name }}\" }" summary: needs: [scan-todos, process-todos] if: always() runs-on: ubuntu-24.04 steps: - name: Generate Summary run: | echo "# 🤖 TODO Management Summary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY TODO_COUNT="${{ needs.scan-todos.outputs.todo-count || '0' }}" echo "**TODOs Found:** $TODO_COUNT" >> $GITHUB_STEP_SUMMARY if [ "$TODO_COUNT" -gt 0 ]; then echo "**Processing Status:** ✅ Completed" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "Check the pull requests created for each TODO" \ "implementation." 
>> $GITHUB_STEP_SUMMARY else echo "**Status:** ℹ️ No TODOs found to process" \ >> $GITHUB_STEP_SUMMARY fi echo "" >> $GITHUB_STEP_SUMMARY echo "---" >> $GITHUB_STEP_SUMMARY echo "*Workflow completed at $(date)*" >> $GITHUB_STEP_SUMMARY ================================================ FILE: .github/workflows/version-bump-guard.yml ================================================ --- name: Version bump guard on: pull_request: branches: [main] jobs: version-bump-guard: name: Check package versions runs-on: ubuntu-latest permissions: contents: read steps: - name: Checkout uses: actions/checkout@v6 with: fetch-depth: 0 - name: Validate package version changes env: VERSION_BUMP_BASE_REF: ${{ github.base_ref }} PR_TITLE: ${{ github.event.pull_request.title }} PR_HEAD_REF: ${{ github.event.pull_request.head.ref }} run: python3 .github/scripts/check_version_bumps.py ================================================ FILE: .github/workflows/version-bump-prs.yml ================================================ --- name: Create Version Bump PRs on: # Dispatched by pypi-release.yml after a successful publish. # Also supports manual reruns for a specific version. 
workflow_dispatch: inputs: version: description: Version to bump to (e.g., 1.11.3) required: true type: string jobs: create-version-bump-prs: runs-on: ubuntu-24.04 env: GH_TOKEN: ${{ secrets.OPENHANDS_BOT_GITHUB_PAT_PUBLIC }} SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} steps: - name: Checkout uses: actions/checkout@v6 - name: Get version from release or input id: get_version run: | VERSION="${{ github.event.inputs.version }}" echo "version=$VERSION" >> $GITHUB_OUTPUT echo "📦 Version: $VERSION" - name: Validate version env: VERSION: ${{ steps.get_version.outputs.version }} run: | if [ -z "$VERSION" ]; then echo "❌ Version is empty" exit 1 fi echo "📦 Creating version bump PRs for version: $VERSION" - name: Install uv uses: astral-sh/setup-uv@v7 with: version: latest python-version: '3.12' enable-cache: false - name: Wait for packages to be available on PyPI env: VERSION: ${{ steps.get_version.outputs.version }} run: | set -euo pipefail PACKAGES=( openhands-sdk openhands-tools openhands-workspace openhands-agent-server ) MAX_ATTEMPTS=60 SLEEP_SECONDS=20 echo "⏳ Waiting for packages to be available on PyPI..." # Use uv pip compile (output discarded) to verify packages are resolvable # via the Simple API (the same index uv add uses). # The JSON API propagates faster than the Simple API, so a curl # check alone is insufficient. # Keep this isolated from the SDK repo's exclude-newer guardrail: # this workflow intentionally consumes just-published packages. for PKG in "${PACKAGES[@]}"; do echo "Checking $PKG==$VERSION..." ATTEMPT=1 while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do if uv pip compile --no-config --no-cache --python-version 3.12 - <<< "$PKG==$VERSION" > /dev/null 2>&1; then echo "✅ $PKG==$VERSION is resolvable on PyPI" break fi echo " Attempt $ATTEMPT/$MAX_ATTEMPTS: $PKG==$VERSION not yet resolvable, waiting ${SLEEP_SECONDS}s..."
sleep $SLEEP_SECONDS ATTEMPT=$((ATTEMPT + 1)) done if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then echo "❌ Timeout waiting for $PKG==$VERSION to be resolvable on PyPI" exit 1 fi done echo "✅ All packages are resolvable on PyPI!" # OpenHands-CLI step runs first since it's simpler and less error-prone - name: Create PR for OpenHands-CLI repo env: VERSION: ${{ steps.get_version.outputs.version }} run: | set -euo pipefail REPO="OpenHands/openhands-cli" BRANCH="bump-sdk-$VERSION" echo "🔄 Creating PR for $REPO..." # Clone the repo git clone "https://x-access-token:${GH_TOKEN}@github.com/${REPO}.git" openhands-cli-repo cd openhands-cli-repo # Configure git git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" # Check if branch already exists on remote if git ls-remote --heads origin "$BRANCH" | grep -q "$BRANCH"; then echo "⚠️ Branch $BRANCH already exists, checking out existing branch" git fetch origin "$BRANCH" git checkout "$BRANCH" else # Create branch git checkout -b "$BRANCH" fi # OpenHands-CLI currently requires Python 3.12, so resolve with that interpreter. # The target repo uses exclude-newer-package to exempt openhands-sdk/tools # from its 7-day freshness guardrail, so no UV_EXCLUDE_NEWER override # is needed — doing so would actually break the per-package exemptions. # We use --no-cache to avoid stale index data from just-published packages. uv add --python 3.12 --no-cache \ "openhands-sdk==$VERSION" \ "openhands-tools==$VERSION" # Check if there are changes if git diff --quiet; then echo "⚠️ No changes detected in $REPO - versions may already be up to date" exit 0 fi # Commit and push git add pyproject.toml uv.lock git commit -m "Bump openhands-sdk, openhands-tools to $VERSION" \ -m "Automated version bump after PyPI release." 
\ -m "Co-authored-by: openhands " git push -u origin "$BRANCH" # Check if PR already exists EXISTING_PR=$(gh pr list --repo "$REPO" --head "$BRANCH" --json number --jq '.[0].number') if [ -n "$EXISTING_PR" ]; then echo "✅ PR #$EXISTING_PR already exists for $REPO" else # Create PR gh pr create \ --repo "$REPO" \ --title "Bump SDK packages to v$VERSION" \ --body "## Automated Version Bump This PR updates the following packages to version **$VERSION**: - \`openhands-sdk\` - \`openhands-tools\` **Triggered by:** Release of [software-agent-sdk v$VERSION](https://github.com/OpenHands/software-agent-sdk/releases/tag/v$VERSION) --- _This PR was automatically created by the version-bump-prs workflow._" \ --base main \ --head "$BRANCH" echo "✅ PR created for $REPO" fi - name: Create PR for OpenHands repo env: VERSION: ${{ steps.get_version.outputs.version }} run: | set -euo pipefail REPO="OpenHands/OpenHands" BRANCH="bump-sdk-$VERSION" echo "🔄 Creating PR for $REPO..." # Clone the repo git clone "https://x-access-token:${GH_TOKEN}@github.com/${REPO}.git" openhands-repo cd openhands-repo # Configure git git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" # Check if branch already exists on remote if git ls-remote --heads origin "$BRANCH" | grep -q "$BRANCH"; then echo "⚠️ Branch $BRANCH already exists, checking out existing branch" git fetch origin "$BRANCH" git checkout "$BRANCH" else # Create branch git checkout -b "$BRANCH" fi # Match the base branch's lockfile generator so reruns can # repair any existing bump branch that used a newer Poetry. POETRY_VERSION=$(git show origin/main:poetry.lock | sed -n -E 's/^# This file is automatically @generated by Poetry ([^ ]+) and should not be changed by hand\.$/\1/p') if [ -z "$POETRY_VERSION" ]; then echo "❌ Could not determine Poetry version from poetry.lock" exit 1 fi echo "📦 Installing Poetry $POETRY_VERSION from poetry.lock..." 
pipx install "poetry==$POETRY_VERSION" poetry --version # 0. Pre-flight check before updating the root pyproject.toml and lockfiles # (versions are pinned with sed and the lockfiles are regenerated in the steps below) # Note: enterprise/pyproject.toml gets these dependencies transitively via openhands-ai echo "📝 Updating root pyproject.toml and poetry.lock..." # Verify enterprise/pyproject.toml does NOT have SDK packages explicitly listed # If they exist there, they will become stale since we only update root pyproject.toml if [ -f "enterprise/pyproject.toml" ]; then echo "🔍 Verifying enterprise/pyproject.toml doesn't have explicit SDK packages..." SDK_PACKAGES=("openhands-sdk" "openhands-tools" "openhands-agent-server") for pkg in "${SDK_PACKAGES[@]}"; do # Match package name as a TOML key (with optional leading whitespace) followed by = # This catches both 'openhands-sdk = "1.2.3"' and 'openhands-sdk="1.2.3"' if grep -qE "^[[:space:]]*${pkg}[[:space:]]*=" enterprise/pyproject.toml; then echo "❌ ERROR: enterprise/pyproject.toml contains explicit reference to '$pkg'" echo " These packages should come transitively via openhands-ai dependency." echo " Please remove '$pkg' from enterprise/pyproject.toml to avoid version drift." exit 1 fi done echo "✅ enterprise/pyproject.toml does not have explicit SDK packages" fi # 1. Update versions in pyproject.toml using sed for exact pinning # Note: We use sed instead of `poetry add --lock` because Poetry normalizes # version constraints (e.g., "==1.13.1" becomes "1.13") which causes # inconsistencies between [tool.poetry.dependencies] and [project].dependencies echo "📝 Updating pyproject.toml with exact version pins..." PYPROJECT_FMT_CONFIG="dev_config/python/.pre-commit-config.yaml" if [ ! -f "$PYPROJECT_FMT_CONFIG" ]; then echo "❌ pyproject-fmt config not found at expected path" exit 1 fi if !
grep -q "args: \\[--keep-full-version\\]" "$PYPROJECT_FMT_CONFIG"; then sed -i '/^[[:space:]]*- id: pyproject-fmt[[:space:]]*$/a\ args: [--keep-full-version]' "$PYPROJECT_FMT_CONFIG" echo "✅ Configured pyproject-fmt to preserve full versions" fi # Update [tool.poetry.dependencies] section # Matches: openhands-sdk = "1.13" or openhands-sdk = "1.13.0" sed -i -E 's/^(openhands-sdk = )"[^"]*"/\1"=='"$VERSION"'"/' pyproject.toml sed -i -E 's/^(openhands-tools = )"[^"]*"/\1"=='"$VERSION"'"/' pyproject.toml sed -i -E 's/^(openhands-agent-server = )"[^"]*"/\1"=='"$VERSION"'"/' pyproject.toml # Update [project].dependencies section (PEP 621 format) # Matches: "openhands-sdk==1.13.1", or "openhands-sdk==1.13", sed -i -E 's/"openhands-sdk==[^"]*"/"openhands-sdk=='"$VERSION"'"/' pyproject.toml sed -i -E 's/"openhands-tools==[^"]*"/"openhands-tools=='"$VERSION"'"/' pyproject.toml sed -i -E 's/"openhands-agent-server==[^"]*"/"openhands-agent-server=='"$VERSION"'"/' pyproject.toml # Update mypy additional_dependencies pins so type-checking uses the same SDK version sed -i -E 's/"openhands-sdk==[^"]*"/"openhands-sdk=='"$VERSION"'"/' "$PYPROJECT_FMT_CONFIG" sed -i -E 's/"openhands-tools==[^"]*"/"openhands-tools=='"$VERSION"'"/' "$PYPROJECT_FMT_CONFIG" echo "✅ Updated pyproject.toml" # 2. Regenerate poetry.lock with the new versions # Note: In Poetry 2.x, the default behavior is to not update packages already # in the lock file (the --no-update flag was removed in Poetry 2.x) echo "📝 Regenerating poetry.lock..." poetry lock # 2b. Regenerate enterprise/poetry.lock so its transitive SDK pins # match the root. enterprise/pyproject.toml depends on the root via # `openhands-ai = { path = "../", develop = true }`, but it keeps its # OWN poetry.lock that pins openhands-sdk/tools/agent-server. Without # this step the enterprise lockfile drifts behind (see PR #14409 that # had to be opened manually after PR #14350 missed it). 
# --no-cache invalidates the stale build of the path-installed # openhands-ai package; without it Poetry leaves the entries pinned # at the previous version. if [ -f "enterprise/poetry.lock" ] && [ -f "enterprise/pyproject.toml" ]; then echo "📝 Regenerating enterprise/poetry.lock..." (cd enterprise && poetry lock --no-cache) echo "✅ Updated enterprise/poetry.lock" fi echo "📝 Regenerating uv.lock..." # --no-config bypasses ~/.config/uv/uv.toml where setup-uv writes its # 7-day freshness guardrail. Unlike --exclude-newer=<timestamp>, it does not # bake a timestamp into uv.lock's [options] section (which would create # noise in every future bump PR). uv lock --no-cache --no-config echo "✅ Updated uv.lock" # 3. Update the version in sandbox_spec_service.py echo "🔧 Updating AGENT_SERVER_IMAGE..." SANDBOX_SPEC_FILE="openhands/app_server/sandbox/sandbox_spec_service.py" if [ -f "$SANDBOX_SPEC_FILE" ]; then # Update the AGENT_SERVER_IMAGE line with the new version tag sed -i "s|AGENT_SERVER_IMAGE = 'ghcr.io/openhands/agent-server:[^']*'|AGENT_SERVER_IMAGE = 'ghcr.io/openhands/agent-server:${VERSION}-python'|" "$SANDBOX_SPEC_FILE" echo "✅ Updated AGENT_SERVER_IMAGE to: ghcr.io/openhands/agent-server:${VERSION}-python" else echo "❌ sandbox_spec_service.py not found at expected path" exit 1 fi # 4. Run pre-commit to fix formatting with the target repo's config. echo "🔧 Running pre-commit to fix formatting..."
pip install pre-commit pre-commit run --files pyproject.toml "$PYPROJECT_FMT_CONFIG" --config ./dev_config/python/.pre-commit-config.yaml || true # Check if there are changes if git diff --quiet; then echo "⚠️ No changes detected in $REPO - versions may already be up to date" exit 0 fi # Commit and push git add pyproject.toml poetry.lock uv.lock "$SANDBOX_SPEC_FILE" "$PYPROJECT_FMT_CONFIG" if [ -f "enterprise/poetry.lock" ]; then git add enterprise/poetry.lock fi git commit -m "Bump openhands-sdk, openhands-tools, openhands-agent-server to $VERSION" \ -m "Automated version bump after PyPI release." \ -m "" \ -m "Changes:" \ -m "- Updated SDK packages to v$VERSION with exact pins in pyproject.toml" \ -m "- Regenerated poetry.lock" \ -m "- Regenerated enterprise/poetry.lock to keep transitive SDK pins aligned" \ -m "- Regenerated uv.lock" \ -m "- Updated AGENT_SERVER_IMAGE to ${VERSION}" \ -m "- Updated mypy additional_dependencies pins in pre-commit config" \ -m "" \ -m "Co-authored-by: openhands " git push -u origin "$BRANCH" # Check if PR already exists EXISTING_PR=$(gh pr list --repo "$REPO" --head "$BRANCH" --json number --jq '.[0].number') if [ -n "$EXISTING_PR" ]; then echo "✅ PR #$EXISTING_PR already exists for $REPO" else # Create PR gh pr create \ --repo "$REPO" \ --title "Bump SDK packages to v$VERSION" \ --body "## Automated Version Bump This PR updates the following packages to version **$VERSION**: - \`openhands-sdk\` - \`openhands-tools\` - \`openhands-agent-server\` ### Changes - Updated SDK packages in \`pyproject.toml\` with exact pins - Regenerated \`poetry.lock\` with the target repo's Poetry version - Regenerated \`enterprise/poetry.lock\` so its transitive SDK pins match the root - Regenerated \`uv.lock\` to match the updated SDK versions - Updated \`AGENT_SERVER_IMAGE\` to \`${VERSION}\` in \`sandbox_spec_service.py\` - Updated mypy \`additional_dependencies\` pins in \`.pre-commit-config.yaml\` **Triggered by:** Release of [software-agent-sdk 
v$VERSION](https://github.com/OpenHands/software-agent-sdk/releases/tag/v$VERSION) --- _This PR was automatically created by the version-bump-prs workflow._" \ --base main \ --head "$BRANCH" echo "✅ PR created for $REPO" fi - name: Summary env: VERSION: ${{ steps.get_version.outputs.version }} run: | echo "## ✅ Version Bump PRs Created" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "PRs have been created to bump SDK packages to version **$VERSION**:" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "- [OpenHands](https://github.com/OpenHands/OpenHands/pulls?q=is%3Apr+bump-sdk-$VERSION)" >> $GITHUB_STEP_SUMMARY echo "- [OpenHands-CLI](https://github.com/OpenHands/openhands-cli/pulls?q=is%3Apr+bump-sdk-$VERSION)" >> $GITHUB_STEP_SUMMARY - name: Notify Slack if: env.SLACK_BOT_TOKEN != '' uses: slackapi/slack-github-action@v3.0.3 with: method: chat.postMessage token: ${{ env.SLACK_BOT_TOKEN }} payload: | channel: C08E1SYKEM9 text: "🚀 *SDK v${{ steps.get_version.outputs.version }} published to PyPI!*\n\nVersion bump PRs created:\n• \n• \n\n" ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST requirements.txt # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control # poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ .env.bak venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ # VS Code: Ignore all but certain files that specify repo-specific settings. # https://stackoverflow.com/questions/32964920/should-i-commit-the-vscode-folder-to-source-control .vscode/**/* !.vscode/extensions.json !.vscode/tasks.json # VS Code extensions/forks: .cursorignore .rooignore .clineignore .windsurfignore .cursorrules .roorules .clinerules .windsurfrules .cursor/rules .roo/rules .cline/rules .windsurf/rules .repomix repomix-output.txt # misc .DS_Store .env.local .env.development.local .env.test.local .env.production.local npm-debug.log* yarn-debug.log* yarn-error.log* logs # agent .envrc cache .jinja_cache/ .conversations* /workspace/ openapi.json .client/ # Local workspace files .beads/*.db .worktrees/ agent-sdk.workspace.code-workspace # Integration test outputs tests/integration/outputs/ tests/integration/api_compliance/outputs/ # Agent-generated temp .agent_tmp/ ================================================ FILE: .openhands/hooks/on_stop.sh ================================================ #!/bin/bash # Stop hook: runs pre-commit, pytest, and checks CI status before allowing agent to finish # # 
This hook runs when the agent attempts to stop/finish. # It can BLOCK the stop by: # - Exiting with code 2 (blocked) # - Outputting JSON: {"decision": "deny", "additionalContext": "feedback message"} # # Environment variables available: # OPENHANDS_PROJECT_DIR - Project directory # OPENHANDS_SESSION_ID - Session ID # GITHUB_TOKEN - GitHub API token (if available) set -o pipefail PROJECT_DIR="${OPENHANDS_PROJECT_DIR:-$(pwd)}" cd "$PROJECT_DIR" || exit 1 # Collect all issues to report back to the agent ISSUES="" BLOCK_STOP=false log_issue() { ISSUES="${ISSUES}${1}\n" BLOCK_STOP=true } >&2 echo "=== Stop Hook ===" >&2 echo "Project directory: $PROJECT_DIR" >&2 echo "" # -------------------------- # Step 1: Run pre-commit on all files # -------------------------- >&2 echo "=== Running pre-commit run --all-files ===" if command -v uv &> /dev/null; then PRECOMMIT_OUTPUT=$(uv run pre-commit run --all-files 2>&1) PRECOMMIT_EXIT=$? else PRECOMMIT_OUTPUT=$(pre-commit run --all-files 2>&1) PRECOMMIT_EXIT=$? fi >&2 echo "$PRECOMMIT_OUTPUT" if [ $PRECOMMIT_EXIT -ne 0 ]; then >&2 echo "⚠️ pre-commit found issues (exit code: $PRECOMMIT_EXIT)" log_issue "## Pre-commit Failed\n\nPre-commit checks failed. Please fix the following issues:\n\n\`\`\`\n${PRECOMMIT_OUTPUT}\n\`\`\`" else >&2 echo "✓ pre-commit passed" fi >&2 echo "" # -------------------------- # Step 2: Detect changed files and run appropriate tests # -------------------------- >&2 echo "=== Detecting changed files and running appropriate tests ===" # Get changed files from git (staged, unstaged, and untracked) CHANGED_FILES=$(git status --porcelain 2>/dev/null | awk '{print $NF}') if [ -n "$CHANGED_FILES" ]; then >&2 echo "Changed files:" >&2 echo "$CHANGED_FILES" | head -20 >&2 echo "" # Map changed files to test directories PROJECTS_TO_TEST="" add_project() { local project="$1" if [[ ! 
"$PROJECTS_TO_TEST" =~ "$project" ]]; then PROJECTS_TO_TEST="$PROJECTS_TO_TEST $project" fi } while IFS= read -r file; do case "$file" in openhands-sdk/*) add_project "tests/sdk" ;; openhands-tools/*) add_project "tests/tools" ;; openhands-workspace/*) add_project "tests/workspace" ;; openhands-agent-server/*) add_project "tests/agent_server" ;; tests/sdk/*) add_project "tests/sdk" ;; tests/tools/*) add_project "tests/tools" ;; tests/workspace/*) add_project "tests/workspace" ;; tests/agent_server/*) add_project "tests/agent_server" ;; tests/cross/*) add_project "tests/cross" ;; tests/examples/*) add_project "tests/examples" ;; tests/github_workflows/*) add_project "tests/github_workflows" ;; examples/*) add_project "tests/examples" ;; scripts/*) add_project "tests/cross" ;; pyproject.toml|uv.lock) add_project "tests/cross" ;; esac done <<< "$CHANGED_FILES" PROJECTS_TO_TEST=$(echo "$PROJECTS_TO_TEST" | xargs) if [ -n "$PROJECTS_TO_TEST" ]; then >&2 echo "Running tests for: $PROJECTS_TO_TEST" >&2 echo "" for project in $PROJECTS_TO_TEST; do if [ -d "$project" ]; then >&2 echo "=== Testing $project ===" if command -v uv &> /dev/null; then PYTEST_OUTPUT=$(uv run pytest "$project" -v --tb=short -x 2>&1) PYTEST_EXIT=$? else PYTEST_OUTPUT=$(pytest "$project" -v --tb=short -x 2>&1) PYTEST_EXIT=$? fi >&2 echo "$PYTEST_OUTPUT" if [ $PYTEST_EXIT -ne 0 ]; then >&2 echo "⚠️ pytest failed for $project" log_issue "## Pytest Failed for $project\n\nTests failed. 
Please fix the following:\n\n\`\`\`\n${PYTEST_OUTPUT}\n\`\`\`" fi >&2 echo "" fi done else >&2 echo "No tests to run for changed files" fi else >&2 echo "No changed files detected, skipping local tests" fi >&2 echo "" # -------------------------- # Step 3: Check if there's a pushed commit and wait for CI # -------------------------- >&2 echo "=== Checking GitHub CI status ===" # Check if we're in a git repo with a GitHub remote GITHUB_REMOTE=$(git remote -v 2>/dev/null | grep -E "(github\.com.*push)" | head -1) if [ -z "$GITHUB_REMOTE" ]; then >&2 echo "No GitHub remote found, skipping CI check" else # Extract owner/repo from remote URL # Handle both HTTPS and SSH formats REPO_INFO=$(echo "$GITHUB_REMOTE" | sed -E 's|.*github\.com[:/]([^/]+)/([^/.]+)(\.git)?.*|\1/\2|') if [ -z "$REPO_INFO" ]; then >&2 echo "Could not parse GitHub repository info" else >&2 echo "Repository: $REPO_INFO" # Get current branch CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null) >&2 echo "Current branch: $CURRENT_BRANCH" # Get the latest commit SHA LOCAL_SHA=$(git rev-parse HEAD 2>/dev/null) >&2 echo "Local commit: ${LOCAL_SHA:0:8}" # Check if this commit has been pushed REMOTE_SHA=$(git ls-remote origin "$CURRENT_BRANCH" 2>/dev/null | awk '{print $1}') if [ -z "$REMOTE_SHA" ]; then >&2 echo "Branch not pushed to remote, skipping CI check" elif [ "$LOCAL_SHA" != "$REMOTE_SHA" ]; then >&2 echo "Local commit differs from remote (remote: ${REMOTE_SHA:0:8}), skipping CI check" else >&2 echo "Commit has been pushed, checking CI status..." # Check if GITHUB_TOKEN is available if [ -z "$GITHUB_TOKEN" ]; then >&2 echo "GITHUB_TOKEN not set, cannot check CI status" else # Use gh CLI if available, otherwise fall back to API if command -v gh &> /dev/null; then >&2 echo "Using gh CLI to check CI status..." 
# Get check runs for this commit CI_STATUS=$(gh api "repos/$REPO_INFO/commits/$LOCAL_SHA/check-runs" \ --jq '.check_runs | map({name: .name, status: .status, conclusion: .conclusion})' 2>&1) if [ $? -ne 0 ]; then >&2 echo "Failed to get CI status: $CI_STATUS" else # Parse the status TOTAL_CHECKS=$(echo "$CI_STATUS" | jq 'length') if [ "$TOTAL_CHECKS" -eq 0 ]; then >&2 echo "No CI checks found for this commit" else >&2 echo "Found $TOTAL_CHECKS CI check(s)" # Check for in-progress runs IN_PROGRESS=$(echo "$CI_STATUS" | jq '[.[] | select(.status != "completed")] | length') FAILED=$(echo "$CI_STATUS" | jq '[.[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled")] | length') if [ "$IN_PROGRESS" -gt 0 ]; then >&2 echo "⏳ $IN_PROGRESS check(s) still in progress" # Wait for CI to complete (with timeout) MAX_WAIT=300 # 5 minutes WAIT_INTERVAL=15 TOTAL_WAITED=0 while [ "$IN_PROGRESS" -gt 0 ] && [ "$TOTAL_WAITED" -lt "$MAX_WAIT" ]; do >&2 echo "Waiting for CI... (${TOTAL_WAITED}s / ${MAX_WAIT}s max)" sleep $WAIT_INTERVAL TOTAL_WAITED=$((TOTAL_WAITED + WAIT_INTERVAL)) CI_STATUS=$(gh api "repos/$REPO_INFO/commits/$LOCAL_SHA/check-runs" \ --jq '.check_runs | map({name: .name, status: .status, conclusion: .conclusion})' 2>&1) IN_PROGRESS=$(echo "$CI_STATUS" | jq '[.[] | select(.status != "completed")] | length') done if [ "$IN_PROGRESS" -gt 0 ]; then >&2 echo "⚠️ CI still running after ${MAX_WAIT}s timeout" log_issue "## CI Still Running\n\nCI checks are still in progress after waiting ${MAX_WAIT} seconds. Please wait for CI to complete before finishing." fi fi # Re-check for failures after waiting FAILED=$(echo "$CI_STATUS" | jq '[.[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled")] | length') if [ "$FAILED" -gt 0 ]; then >&2 echo "❌ $FAILED check(s) failed!" 
# Get details of failed checks FAILED_DETAILS=$(echo "$CI_STATUS" | jq -r '.[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled") | "- \(.name): \(.conclusion)"') >&2 echo "$FAILED_DETAILS" # Try to get failure logs FAILED_NAMES=$(echo "$CI_STATUS" | jq -r '.[] | select(.conclusion == "failure") | .name') FAILURE_MSG="## CI Failed\n\nThe following CI checks failed:\n\n${FAILED_DETAILS}\n" # Try to get the workflow run logs for more context WORKFLOW_RUNS=$(gh api "repos/$REPO_INFO/actions/runs?head_sha=$LOCAL_SHA" \ --jq '.workflow_runs[] | select(.conclusion == "failure") | {id: .id, name: .name}' 2>/dev/null) if [ -n "$WORKFLOW_RUNS" ]; then FAILURE_MSG="${FAILURE_MSG}\nYou can view the full logs at: https://github.com/$REPO_INFO/actions\n" # Try to get job logs FIRST_RUN_ID=$(echo "$WORKFLOW_RUNS" | jq -r '.id' | head -1) if [ -n "$FIRST_RUN_ID" ]; then JOBS_OUTPUT=$(gh api "repos/$REPO_INFO/actions/runs/$FIRST_RUN_ID/jobs" \ --jq '.jobs[] | select(.conclusion == "failure") | "### \(.name)\nConclusion: \(.conclusion)\nSteps:\n" + (.steps | map("- \(.name): \(.conclusion)") | join("\n"))' 2>/dev/null | head -100) if [ -n "$JOBS_OUTPUT" ]; then FAILURE_MSG="${FAILURE_MSG}\n### Failed Job Details:\n\`\`\`\n${JOBS_OUTPUT}\n\`\`\`" fi fi fi log_issue "$FAILURE_MSG" else >&2 echo "✓ All CI checks passed!" fi fi fi else # Fallback to curl >&2 echo "gh CLI not available, using API directly..." 
CI_RESPONSE=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github.v3+json" \ "https://api.github.com/repos/$REPO_INFO/commits/$LOCAL_SHA/check-runs" 2>&1) TOTAL_CHECKS=$(echo "$CI_RESPONSE" | jq '.total_count // 0') if [ "$TOTAL_CHECKS" -gt 0 ]; then IN_PROGRESS=$(echo "$CI_RESPONSE" | jq '[.check_runs[] | select(.status != "completed")] | length') FAILED=$(echo "$CI_RESPONSE" | jq '[.check_runs[] | select(.conclusion == "failure")] | length') if [ "$IN_PROGRESS" -gt 0 ]; then >&2 echo "⏳ CI checks still in progress" log_issue "## CI In Progress\n\nCI checks are still running. Please wait for CI to complete." elif [ "$FAILED" -gt 0 ]; then FAILED_NAMES=$(echo "$CI_RESPONSE" | jq -r '.check_runs[] | select(.conclusion == "failure") | .name') >&2 echo "❌ CI failed: $FAILED_NAMES" log_issue "## CI Failed\n\nThe following CI checks failed:\n${FAILED_NAMES}\n\nPlease fix the issues and try again." else >&2 echo "✓ All CI checks passed!" fi else >&2 echo "No CI checks found" fi fi fi fi fi fi >&2 echo "" # -------------------------- # Final decision # -------------------------- if [ "$BLOCK_STOP" = true ]; then >&2 echo "=== BLOCKING STOP: Issues found ===" # Output JSON to provide feedback to the agent # Escape the issues for JSON ESCAPED_ISSUES=$(echo -e "$ISSUES" | jq -Rs .) echo "{\"decision\": \"deny\", \"reason\": \"Checks failed\", \"additionalContext\": $ESCAPED_ISSUES}" exit 2 fi >&2 echo "=== All checks passed, allowing stop ===" echo '{"decision": "allow"}' exit 0 ================================================ FILE: .openhands/hooks.json ================================================ { "stop": [ { "matcher": "*", "hooks": [ { "type": "command", "command": ".openhands/hooks/on_stop.sh", "timeout": 600 } ] } ] } ================================================ FILE: .openhands/setup.sh ================================================ #!/bin/bash if ! command -v uv &> /dev/null; then echo "uv is not installed. Installing..." 
curl -LsSf https://astral.sh/uv/install.sh | sh else echo "uv is already installed." uv self update # always update to the latest version fi make build ================================================ FILE: .pre-commit-config.yaml ================================================ --- repos: - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt rev: 0.2.1 # or other specific tag hooks: - id: yamlfmt - repo: local hooks: - id: ruff-format name: Ruff format entry: uv args: [run, ruff, format] language: system types: [python] pass_filenames: true always_run: false - id: ruff-check name: Ruff lint entry: uv args: [run, ruff, check, --fix, --exit-non-zero-on-fix] language: system types: [python] pass_filenames: true always_run: false - id: pycodestyle name: PEP8 style check (pycodestyle) entry: uv args: [run, pycodestyle, --max-line-length=88, '--ignore=E203,E501,W503,E704'] language: system types: [python] pass_filenames: true always_run: false - id: pyright name: Type check with pyright entry: uv args: [run, pyright] language: system types: [python] pass_filenames: true always_run: false - id: check-import-rules name: Check import dependency rules entry: uv args: [run, python, scripts/check_import_rules.py] language: system types: [python] pass_filenames: true always_run: false - id: check-tool-registration name: Check Tool subclass registration entry: uv args: [run, python, scripts/check_tool_registration.py] language: system types: [python] pass_filenames: true always_run: false ================================================ FILE: .python-version ================================================ 3.13 ================================================ FILE: AGENTS.md ================================================ You are a collaborative software engineering partner with a strong focus on code quality and simplicity. 
Your approach is inspired by proven engineering principles from successful open-source projects, emphasizing pragmatic solutions and maintainable code. # Core Engineering Principles 1. **Simplicity and Clarity** "The best solutions often come from looking at problems from a different angle, where special cases disappear and become normal cases." • Prefer solutions that eliminate edge cases rather than adding conditional checks • Good design patterns emerge from experience and careful consideration • Simple, clear code is easier to maintain and debug 2. **Backward Compatibility** "Stability is a feature, not a constraint." • Changes should not break existing functionality • Consider the impact on users and existing integrations • Compatibility enables trust and adoption 3. **Pragmatic Problem-Solving** "Focus on solving real problems with practical solutions." • Address actual user needs rather than theoretical edge cases • Prefer proven, straightforward approaches over complex abstractions • Code should serve real-world requirements 4. **Maintainable Architecture** "Keep functions focused and code readable." • Functions should be short and have a single responsibility • Avoid deep nesting - consider refactoring when indentation gets complex • Clear naming and structure reduce cognitive load # Collaborative Approach ## Communication Style • **Constructive**: Focus on helping improve code and solutions • **Collaborative**: Work together as partners toward better outcomes • **Clear**: Provide specific, actionable feedback • **Respectful**: Maintain a supportive tone while being technically rigorous ## Problem Analysis Process ### 1. Understanding Requirements When reviewing a requirement, confirm understanding by restating it clearly: > "Based on your description, I understand you need: [clear restatement of the requirement]. Is this correct?" ### 2. 
Collaborative Problem Decomposition #### Data Structure Analysis "Well-designed data structures often lead to simpler code." • What are the core data elements and their relationships? • How does data flow through the system? • Are there opportunities to simplify data handling? #### Complexity Assessment "Let's look for ways to simplify this." • What's the essential functionality we need to implement? • Which parts of the current approach add unnecessary complexity? • How can we make this more straightforward? #### Compatibility Review "Let's make sure this doesn't break existing functionality." • What existing features might be affected? • How can we implement this change safely? • What migration path do users need? #### Practical Validation "Let's focus on the real-world use case." • Does this solve an actual problem users face? • Is the complexity justified by the benefit? • What's the simplest approach that meets the need? ### 3. Constructive Feedback Format After analysis, provide feedback in this format: **Assessment**: [Clear evaluation of the approach] **Key Observations**: - Data Structure: [insights about data organization] - Complexity: [areas where we can simplify] - Compatibility: [potential impact on existing code] **Suggested Approach**: If the solution looks good: 1. Start with the simplest data structure that works 2. Eliminate special cases where possible 3. Implement clearly and directly 4. Ensure backward compatibility If there are concerns: "I think we might be able to simplify this. The core issue seems to be [specific problem]. What if we tried [alternative approach]?" ### 4.
Code Review Approach When reviewing code, provide constructive feedback: **Overall Assessment**: [Helpful evaluation] **Specific Suggestions**: - [Concrete improvements with explanations] - [Alternative approaches to consider] - [Ways to reduce complexity] **Next Steps**: [Clear action items] ## Repository Memory - Programmatic settings live in `openhands-sdk/openhands/sdk/settings/`. Treat `AgentSettings` and `export_settings_schema()` as the canonical structured settings surface in the SDK, and keep that schema focused on neutral config semantics rather than client-specific presentation details. - `SettingsFieldSchema` intentionally does not export a `required` flag. If a consumer needs nullability semantics, inspect the underlying Python typing rather than inferring from SDK defaults. - `AgentSettings.tools` is part of the exported settings schema so the schema stays aligned with the settings payload that round-trips through `AgentSettings` and drives `create_agent()`. - `AgentSettings.mcp_config` now uses FastMCP's typed `MCPConfig` at runtime. When serializing settings back to plain data (e.g. `model_dump()` or `create_agent()`), keep the output compact with `exclude_none=True, exclude_defaults=True` so callers still see the familiar `.mcp.json`-style dict shape. - Persisted SDK settings should use the direct `model_dump()` shape with a top-level `schema_version`; avoid adding wrapped payload formats or legacy migration shims in `openhands/sdk/settings/model.py`. - Because persisted settings are not in production yet, prefer removing temporary compatibility fields and serializers outright instead of carrying legacy settings shims in the SDK. - Do not expose settings schema versions as public `CURRENT_PERSISTED_VERSION` class constants on `AgentSettings` or `ConversationSettings`; keep versioning internal to the `schema_version` field/defaults and private module constants. 
- `ConversationSettings` owns the conversation-scoped confirmation controls directly (`confirmation_mode`, `security_analyzer`); keep those fields top-level on the model and grouped into the exported `verification` section via schema metadata rather than nested helper models, and prefer the direct settings-model constructor `create_request(...)` over separate request-wrapper helpers. - Anthropic malformed tool-use/tool-result history errors (for example, missing or duplicated ``tool_result`` blocks) are intentionally mapped to a dedicated `LLMMalformedConversationHistoryError` and caught separately in `Agent.step()`, so recovery can still use condensation while logs preserve that this was malformed history rather than a true context-window overflow. - AgentSkills progressive disclosure goes through `AgentContext.get_system_message_suffix()` into ``, and `openhands.sdk.context.skills.to_prompt()` truncates each prompt description to 1024 characters because the AgentSkills specification caps `description` at 1-1024 characters. - Workspace-wide uv resolver guardrails belong in the repository root `[tool.uv]` table. When `exclude-newer` is configured there, `uv lock` persists it into the root `uv.lock` `[options]` section as both an absolute cutoff and `exclude-newer-span`, and `uv sync --frozen` continues to use that locked workspace state. - `pr-review-by-openhands` delegates to `OpenHands/extensions/plugins/pr-review@main`. Repo-specific reviewer instructions live in `.agents/skills/custom-codereview-guide.md`, and because task-trigger matching is substring-based, that `/codereview` skill is also auto-injected for the workflow's `/codereview-roasted` prompt. - Directory-based runnable examples under `examples/` should expose their entrypoint as `main.py`, and `tests/examples/test_examples.py` should explicitly list the example directory in `_TARGET_DIRECTORIES` so the non-recursive example workflow collects it without accidentally running helper modules. 
- The duplicate-issue automation scripts should validate `owner/repo` arguments before interpolating GitHub API paths, handle per-issue auto-close failures without aborting the whole batch, and keep `app_conversation_id` paths unquoted because OpenHands conversation IDs are already canonicalized for those endpoints. - `agent-server` now defaults `TMUX_TMPDIR` to a per-process directory under the system temp dir (`openhands-agent-server-`) when the environment variable is unset. This isolates tmux sockets/cleanup across concurrent server instances while still respecting an explicit `TMUX_TMPDIR` override. - Conversation worktrees for git-backed local workspaces live under `/tmp/conversation-worktrees//`, and if the original workspace points at a subdirectory inside the repo, the active workspace should preserve that relative path inside the worktree. - Agent-server Docker publish tags are defined centrally in `openhands-agent-server/openhands/agent_server/docker/build.py`; keep `server.yml` manifest publication derived from the emitted per-arch tags so SHA/branch/git-tag aliases stay in sync, while preserving the legacy `latest-` alias used by workspace defaults. - The published agent-server Docker images in `.github/workflows/server.yml` must pass `OPENHANDS_BUILD_GIT_SHA` and `OPENHANDS_BUILD_GIT_REF` as explicit `docker/build-push-action` build args; the workflow only uses `docker/build.py` for context/tag generation, so those runtime env vars are otherwise left at the Dockerfile `unknown` defaults. - The PyInstaller agent-server binary should copy OpenHands distribution metadata (`openhands-agent-server`, `openhands-sdk`, `openhands-tools`, `openhands-workspace`) in `agent-server.spec`, otherwise `/server_info` version lookups via `importlib.metadata` can fall back to `unknown` inside published binary images. 
- Auto-title generation should not re-read `ConversationState.events` from a background task triggered by a freshly received `MessageEvent`; extract message text synchronously from the incoming event and then reuse shared title helpers (`extract_message_text`, `generate_title_from_message`) to avoid persistence-order races. - `RemoteConversation.generate_title()` now reconciles remote events and reuses the shared local `generate_conversation_title(...)` helper instead of calling the removed deprecated agent-server `/generate_title` REST route, so explicit remote title generation still works without a transport-only compatibility endpoint. - Remote workspace git operations should call `/api/git/changes` and `/api/git/diff` via the `path` query parameter with slash-normalized strings; building those URLs with `pathlib.Path` leaks host-platform separators and breaks Windows paths. The grep tool now prefers `rg`, then system `grep`, then Python; both the real grep executor and the SDK's terminal-command compatibility fallback should keep that order. For grep parity, the Python fallback should hide dotfiles by default but still let explicit `include` globs surface files like `.env`, matching ripgrep. For glob parity, any symlink-preservation regression test should force the Python fallback path, because ripgrep availability changes whether the fallback implementation runs at all. - Keep path helpers split by purpose: `is_absolute_path_source()` is for cross-platform source/wire syntax detection, while local filesystem writes/validation (for example, the file editor) should use host-native absolute-path semantics so POSIX does not silently accept Windows drive paths as creatable files. - Tool availability filtering belongs in `openhands-sdk/openhands/sdk/tool/registry.py` via `list_usable_tools()`, which preserves registration order and defaults tools to usable unless they expose an `is_usable()` callable. 
Environment-specific checks like Chromium detection should live on the concrete tool class (`BrowserToolSet.is_usable()`), while agent-server surfaces such as `/server_info` should consume the registry helper rather than re-implement per-tool filtering. - Pydantic secret field helpers live in `openhands-sdk/openhands/sdk/utils/pydantic_secrets.py`. `serialize_secret()` handles serialization (cipher / `expose_secrets` / default Pydantic masking); `validate_secret()` handles deserialization (cipher decryption, redacted/empty → `None`); `is_redacted_secret()` checks for the sentinel; `REDACTED_SECRET_VALUE` is the canonical sentinel string. For `dict[str, str]` fields whose values are all secrets, wrap each value in `SecretStr` and call `serialize_secret` per value (see `LookupSecret._serialize_secrets` and `ACPAgent._serialize_acp_env`). Do not hand-roll redaction logic in field serializers. - `LookupSecret` normalizes hostless URLs against `OH_INTERNAL_SERVER_URL` (set by `openhands-agent-server.__main__` from the bound host/port, rewriting wildcard binds to loopback) and otherwise falls back to `http://127.0.0.1:8000`, so relative secret URLs can safely target the current agent-server instance. ## Package-specific guidance When reviewing or modifying code, read the closest AGENTS file for the package(s) containing the changed files. If a PR spans multiple packages, consult each relevant package-level AGENTS.md. 
- SDK: [openhands-sdk/openhands/sdk/AGENTS.md](openhands-sdk/openhands/sdk/AGENTS.md) - Subagents: [openhands-sdk/openhands/sdk/subagent/AGENTS.md](openhands-sdk/openhands/sdk/subagent/AGENTS.md) - Tools: [openhands-tools/openhands/tools/AGENTS.md](openhands-tools/openhands/tools/AGENTS.md) - Workspace: [openhands-workspace/openhands/workspace/AGENTS.md](openhands-workspace/openhands/workspace/AGENTS.md) - Agent server: [openhands-agent-server/AGENTS.md](openhands-agent-server/AGENTS.md) - Eval config: [.github/run-eval/AGENTS.md](.github/run-eval/AGENTS.md) ## API compatibility pointers - For SDK Python API deprecation/removal policy, read [openhands-sdk/openhands/sdk/AGENTS.md](openhands-sdk/openhands/sdk/AGENTS.md). Public API removals require deprecation metadata with a removal target at least **5 minor releases** after `deprecated_in`, and breaking SDK API changes require at least a **MINOR** SemVer bump. - The SDK API breakage checker should treat metadata-only changes to Pydantic `Field(...)` declarations as non-breaking, including adding, removing, or editing `description`, `title`, `examples`, `json_schema_extra`, and `deprecated` kwargs. - The SDK API breakage checker compares stringified `Field(...)` values by parsing them as Python expressions after escaping literal newlines inside quoted strings; this avoids false positives on multiline descriptions that include embedded quotes like `'security_policy.j2'`. - For public REST APIs, read [openhands-agent-server/AGENTS.md](openhands-agent-server/AGENTS.md). REST contract breaks need a deprecation notice and a runway of **5 minor releases** before removing the old contract or making an incompatible replacement mandatory. - Make sure you `make build` to configure the dependencies first - We use pre-commit hooks `.pre-commit-config.yaml` that includes: - type check through pyright - linting and formatter with `uv ruff` - NEVER USE `mypy`! 
- Do NOT commit ALL the files; commit only the relevant files you've changed! - In every commit message, you should add "Co-authored-by: openhands <openhands@all-hands.dev>" - You can run pytest with `uv run pytest` # Instructions for fixing "E501 Line too long" - If it is just code, you can modify it so it spans multiple lines. - If it is a single-line string, you can break it into a multi-line string by doing "ABC" -> ("A"\n"B"\n"C") - If it is a long multi-line string (e.g., docstring), you should just add type ignore AFTER the ending """. You should NEVER ADD IT INSIDE the docstring. # PR-Specific Documents When working on a PR that requires design documents, development-only scripts, or other temporary artifacts that should NOT be merged to main, store them in a `.pr/` directory at the repository root. ## Usage ```bash # Create the directory if it doesn't exist mkdir -p .pr # Add your PR-specific documents .pr/ ├── design.md # Design decisions and architecture notes ├── analysis.md # Investigation or debugging notes └── notes.md # Any other PR-specific content ``` ## How It Works 1. **Notification**: When `.pr/` exists, a single comment is posted to the PR conversation alerting reviewers 2. **Auto-cleanup**: When the PR is approved, the `.pr/` directory is automatically removed via commit 3.
**Fork PRs**: Auto-cleanup cannot push to forks, so manual removal is required before merging ## Important Notes - Do NOT put anything in `.pr/` that needs to be preserved - The `.pr/` check passes (green ✅) during development - it only posts a notification, not a blocking error - For fork PRs: You must manually remove `.pr/` before the PR can be merged ## When to Use - Complex refactoring that benefits from written design rationale - Debugging sessions where you want to document your investigation - Feature implementations that need temporary planning docs - Temporary scripts that are intended to show reviewers that the feature works - Any analysis that helps reviewers understand the PR but isn't needed long-term - Critically evaluate each review comment before acting on it. Not all feedback is worth implementing: - Does it fix a real bug or improve clarity significantly? - Does it align with the project's engineering principles (simplicity, maintainability)? - Is the suggested change proportional to the benefit, or does it add unnecessary complexity? - It's acceptable to respectfully decline suggestions that add verbosity without clear benefit, over-engineer for hypothetical edge cases, or contradict the project's pragmatic approach. - After addressing (or deciding not to address) inline review comments, mark the corresponding review threads as resolved. - Before resolving a thread, leave a reply comment that either explains the reason for dismissing the feedback or references the specific commit (e.g., commit SHA) that addressed the issue. - Prefer resolving threads only once fixes are pushed or a clear decision is documented. - Use the GitHub GraphQL API to reply to and resolve review threads (see below). ## Resolving Review Threads via GraphQL The CI check `Review Thread Gate/unresolved-review-threads` will fail if there are unresolved review threads. To resolve threads programmatically: 1.
Get the thread IDs (replace `<OWNER>`, `<REPO>`, `<PR_NUMBER>`): ```bash gh api graphql -f query=' { repository(owner: "<OWNER>", name: "<REPO>") { pullRequest(number: <PR_NUMBER>) { reviewThreads(first: 20) { nodes { id isResolved comments(first: 1) { nodes { body } } } } } } }' ``` 2. Reply to the thread explaining how the feedback was addressed: ```bash gh api graphql -f query=' mutation { addPullRequestReviewThreadReply(input: { pullRequestReviewThreadId: "<THREAD_ID>" body: "Fixed in <COMMIT_SHA>" }) { comment { id } } }' ``` 3. Resolve the thread: ```bash gh api graphql -f query=' mutation { resolveReviewThread(input: {threadId: "<THREAD_ID>"}) { thread { isResolved } } }' ``` 4. Get the failed workflow run ID and rerun it: ```bash # Find the run ID from the failed check URL, or use: gh run list --repo <OWNER>/<REPO> --branch <BRANCH> --limit 5 # Rerun failed jobs gh run rerun <RUN_ID> --repo <OWNER>/<REPO> --failed ``` - Avoid hacky tricks like `sys.path.insert` when resolving package dependencies - Use existing packages/libraries instead of implementing them yourself whenever possible. - Avoid using # type: ignore. Treat it only as a last resort. In most cases, issues should be resolved by improving type annotations, adding assertions, or adjusting code/tests—rather than silencing the type checker. - Please AVOID using # type: ignore[attr-defined] unless absolutely necessary. If the issue can be addressed by adding a few extra assert statements to verify types, prefer that approach instead! - For issues like # type: ignore[call-arg]: if you discover that the argument doesn’t actually exist, do not try to mock it again in tests. Instead, simply remove it. - Avoid doing in-line imports unless absolutely necessary (e.g., circular dependency). - Avoid getattr/hasattr guards and instead enforce type correctness by relying on explicit type assertions and proper object usage, ensuring functions only receive the expected Pydantic models or typed inputs. Prefer type hints and validated models over runtime shape checks. - Prefer accessing typed attributes directly.
If necessary, convert inputs up front into a canonical shape; avoid purely hypothetical fallbacks. - Use real newlines in commit messages; do not write literal "\n". - AFTER you edit ONE file, you should run the pre-commit hook on that file via `uv run pre-commit run --files [filepath]` to make sure you didn't break it. - Don't write TOO MANY tests; write just enough to cover edge cases. - Check how we perform tests in .github/workflows/tests.yml - Put unit tests under the corresponding domain folder in `tests/` (e.g., `tests/sdk`, `tests/tools`, `tests/workspace`). For example, changes to `openhands-sdk/openhands/sdk/tool/tool.py` should be covered in `tests/sdk/tool/test_tool.py`. - DON'T write TEST CLASSES unless absolutely necessary! - If you find yourself duplicating logic for preparing mocks, loading data, etc., that logic should live in fixtures in conftest.py! - Please test only the logic implemented in the current codebase. Do not test functionality (e.g., BaseModel.model_dump()) that is not implemented in this repository. - For changes to prompt templates, tool descriptions, or agent decision logic, add the `integration-test` label to trigger integration tests and verify no unexpected impact on benchmark performance. # Stress Tests `tests/agent_server/stress/` contains an opt-in stress/scale suite for the agent-server, excluded from default collection via the `stress` pytest marker. Run with `uv run pytest -m stress`. For full details on running, infrastructure, and adding new stress tests, see [openhands-agent-server/AGENTS.md](openhands-agent-server/AGENTS.md). # Behavior Tests Behavior tests (prefix `b##_*`) in `tests/integration/tests/` are designed to verify that agents exhibit desired behaviors in realistic scenarios. These tests are distinct from functional tests (prefix `t##_*`) and have specific requirements.
Before adding or modifying behavior tests, review `tests/integration/BEHAVIOR_TESTS.md` for the latest workflow, expectations, and examples. # Agent Temporary Directory Convention When tools need to store observation files (e.g., browser session recordings, task tracker data), use `.agent_tmp` as the directory name for consistency. The browser session recording tool saves recordings to `.agent_tmp/observations/recording-{timestamp}/`. This convention ensures tool-generated observation files are stored in a predictable location that can be easily: - Added to `.gitignore` - Cleaned up after agent sessions - Identified as agent-generated artifacts Note: This is separate from `persistence_dir` which is used for conversation state persistence. - This is a `uv`-managed Python monorepo (single `uv.lock` at repo root) with multiple distributable packages: `openhands-sdk/` (SDK), `openhands-tools/` (built-in tools), `openhands-workspace/` (workspace impls), and `openhands-agent-server/` (server runtime). - `examples/` contains runnable patterns; `tests/` is split by domain (`tests/sdk`, `tests/tools`, `tests/workspace`, `tests/agent_server`, etc.). - Python namespace is `openhands.*` across packages; keep new modules within the matching package and mirror test paths under `tests/`. - Set up the dev environment: `make build` (runs `uv sync --dev` and installs pre-commit; requires uv >= 0.8.13) - Lint/format: `make lint`, `make format` - Run tests: `uv run pytest` - Run agent-server stress tests: `uv run pytest -m stress` (see [openhands-agent-server/AGENTS.md](openhands-agent-server/AGENTS.md)) - Build agent-server: `make build-server` (output: `dist/agent-server/`) - Clean caches: `make clean` - Run SDK examples: see [openhands-sdk/openhands/sdk/AGENTS.md](openhands-sdk/openhands/sdk/AGENTS.md). 
- The example workflow runs `uv run pytest tests/examples/test_examples.py --run-examples`; each successful example must print an `EXAMPLE_COST: ...` line to stdout (use `EXAMPLE_COST: 0` for non-LLM examples). - Example scripts in `examples/` should use top-level code flow (e.g. `with` blocks, bare statements) rather than wrapping logic in a `def main()` function. The `def main` pattern creates unnecessary nesting that makes examples harder to read; keep the code flat and script-like. - Conversation plugins passed via `plugins=[...]` are lazy-loaded on the first `send_message()` or `run()`, so example code should inspect plugin-added skills or `resolved_plugins` only after that first interaction. - Programmatic settings live in `openhands-sdk/openhands/sdk/settings/`. Keep the exported schema focused on neutral config structure and semantics; downstream apps should own client-specific ordering, icons, widgets, and slash-command presentation. - Ruff: `line-length = 88`, `target-version = "py312"` (see `pyproject.toml`). - Ruff ignores `ARG` (unused arguments) under `tests/**/*.py` to allow pytest fixtures. - Repository guidance lives in the project root AGENTS.md (loaded as a third-party skill file). ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Thank you for helping improve the OpenHands Software Agent SDK. This repo is a foundation. We want the SDK to stay stable and extensible so that many applications can build on it safely. Downstream applications we actively keep in mind: - [OpenHands-CLI](https://github.com/OpenHands/OpenHands-CLI) (client) - [OpenHands app-server](https://github.com/OpenHands/OpenHands/blob/main/openhands/app_server/README.md) (client) - [OpenHands Enterprise](https://github.com/OpenHands/OpenHands/blob/main/enterprise/README.md) (client) The SDK itself has a Python interface. 
In addition, the [agent-server](https://docs.openhands.dev/sdk/guides/agent-server/overview) is the REST/WebSocket server component that exposes the SDK for remote execution and integrations. Changes should keep both interfaces stable and consistent. ## A lesson we learned (why we care about architecture) In earlier iterations, we repeatedly ran into a failure mode: needs from downstream applications (or assumptions) would leak into core logic. That kind of coupling can feel convenient in the moment, but it tends to create subtle breakage elsewhere: different environments, different workspaces, different execution modes, and different evaluation setups. The architecture of OpenHands V0 was too monolithic to support multiple applications, whether they were built into it (as the CLI, evaluation scripts, and web server were) or built on top of it (as OpenHands Cloud was). If you’re interested in the deeper background and lessons learned, see our write-up: [OpenHands: An Open Platform for AI Software Developers as Generalist Agents](https://arxiv.org/abs/2511.03690) This SDK exists (as a separate, rebuilt foundation) to avoid that failure mode. ## Principles we review PRs with We welcome all contributions, big or small, to improve or extend the software agent SDK. You may find that occasionally we are opinionated about several things: - **OpenHands SDK is its own thing**: its downstream consumers are client applications. - **Prefer interfaces over special cases**: if a client needs something, add or improve a clean, reusable interface/extension point instead of adding a shortcut. - **Extensibility over one-off patches**: design features so multiple clients can adopt them without rewriting core logic. - **Avoid hidden assumptions**: don’t rely on particular env vars, workspace layouts, request contexts, or runtime quirks that only exist in one app. - Workspaces *do* encode environment specifics (local/Docker/remote), but keep those assumptions explicit (params + validation) and contained to the `workspace` layer.
- **No client-specific code paths**: avoid logic that only makes sense for one downstream app. - It’s fine to have multiple workspace implementations; it’s not fine for SDK core behavior to branch on whether the caller is CLI/app-server/SaaS. Prefer capabilities/config over app-identity. - **Keep the agent loop stable**: treat stability as a feature; be cautious with control-flow changes and "small" behavior tweaks. - **Compatibility is part of the API**: if something could break downstream clients, call it out explicitly and consider a migration path. We have a deprecation mechanism you may want to use. If you’re not sure whether a change crosses these lines, please ask early. We’re happy to help think through the shape of a clean interface. ## Practical pointers This file is mostly about principles. For the mechanics, please see: - [AGENTS.md](AGENTS.md) for AI agents - [DEVELOPMENT.md](DEVELOPMENT.md) for humans ## Questions / discussion Join us on Slack: https://openhands.dev/joinslack ================================================ FILE: DEVELOPMENT.md ================================================ # Development Guide ## Setup ```bash git clone https://github.com/OpenHands/agent-sdk.git cd agent-sdk make build ``` ## Code Quality ```bash make format # Format code make lint # Lint code uv run pre-commit run --all-files # Run all checks ``` Pre-commit hooks run automatically on commit with type checking and linting. ## Testing ```bash uv run pytest # All tests uv run pytest tests/sdk/ # SDK tests only uv run pytest tests/tools/ # Tools tests only ``` ## Project Structure ``` agent-sdk/ ├── openhands-sdk/ # Core SDK package ├── openhands-tools/ # Built-in tools ├── openhands-workspace/ # Workspace management ├── openhands-agent-server/ # Agent server ├── examples/ # Usage examples └── tests/ # Test suites ``` ## Contributing 1. Create a new branch 2. Make your changes 3. Run tests and checks 4. 
Push and create a pull request For questions, join our [Slack community](https://openhands.dev/joinslack). ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2026 OpenHands contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: MAINTAINERS ================================================ # Repository Maintainers # # Format: Each maintainer on a new line starting with "- @username" # This file is read by .github/workflows/assign-reviews.yml for automated triage # The following people are maintainers of this repository and are responsible for triage and review: - @xingyaoww - @neubig - @enyst ================================================ FILE: MANIFEST.in ================================================ # This MANIFEST.in file tells setuptools which files to include # in the sdist package distribution used for building docker image # ============================================================================== # Root-level workspace files # ============================================================================== include pyproject.toml include uv.lock # ============================================================================== # openhands-sdk # ============================================================================== include openhands-sdk/pyproject.toml recursive-include openhands-sdk *.py recursive-include openhands-sdk *.j2 recursive-include openhands-sdk py.typed # ============================================================================== # openhands-tools # ============================================================================== include openhands-tools/pyproject.toml recursive-include openhands-tools *.py recursive-include openhands-tools *.j2 recursive-include openhands-tools py.typed # ============================================================================== # openhands-workspace # ============================================================================== include openhands-workspace/pyproject.toml recursive-include openhands-workspace *.py recursive-include openhands-workspace py.typed # ============================================================================== # openhands-agent-server # 
============================================================================== include openhands-agent-server/pyproject.toml recursive-include openhands-agent-server *.py recursive-include openhands-agent-server py.typed # Docker build files include openhands-agent-server/openhands/agent_server/docker/Dockerfile include openhands-agent-server/openhands/agent_server/docker/wallpaper.svg # PyInstaller spec include openhands-agent-server/openhands/agent_server/agent-server.spec # VSCode extensions recursive-include openhands-agent-server/openhands/agent_server/vscode_extensions * ================================================ FILE: Makefile ================================================ SHELL := /usr/bin/env bash .SHELLFLAGS := -eu -o pipefail -c # Colors for output ECHO := printf '%b\n' GREEN := \033[32m YELLOW := \033[33m RED := \033[31m CYAN := \033[36m RESET := \033[0m UNDERLINE := \033[4m # Required uv version REQUIRED_UV_VERSION := 0.8.13 PKGS ?= openhands-sdk openhands-tools openhands-workspace openhands-agent-server .PHONY: build format lint clean help check-uv-version # Default target .DEFAULT_GOAL := help check-uv-version: @$(ECHO) "$(YELLOW)Checking uv version...$(RESET)" @UV_VERSION=$$(uv --version | cut -d' ' -f2); \ REQUIRED_VERSION=$(REQUIRED_UV_VERSION); \ if [ "$$(printf '%s\n' "$$REQUIRED_VERSION" "$$UV_VERSION" | sort -V | head -n1)" != "$$REQUIRED_VERSION" ]; then \ $(ECHO) "$(RED)Error: uv version $$UV_VERSION is less than required $$REQUIRED_VERSION$(RESET)"; \ $(ECHO) "$(YELLOW)Please update uv with: uv self update$(RESET)"; \ exit 1; \ fi; \ $(ECHO) "$(GREEN)uv version $$UV_VERSION meets requirements$(RESET)" build: check-uv-version @$(ECHO) "$(CYAN)Setting up OpenHands V1 development environment...$(RESET)" @$(ECHO) "$(YELLOW)Installing dependencies with uv sync --dev...$(RESET)" @uv sync --dev @$(ECHO) "$(GREEN)Dependencies installed successfully.$(RESET)" @$(ECHO) "$(YELLOW)Setting up pre-commit hooks...$(RESET)" @uv run pre-commit 
install @$(ECHO) "$(GREEN)Pre-commit hooks installed successfully.$(RESET)" @$(ECHO) "$(GREEN)Build complete! Development environment is ready.$(RESET)" format: @$(ECHO) "$(YELLOW)Formatting code with uv format...$(RESET)" @uv run ruff format @$(ECHO) "$(GREEN)Code formatted successfully.$(RESET)" lint: @$(ECHO) "$(YELLOW)Linting code with ruff...$(RESET)" @uv run ruff check --fix @$(ECHO) "$(GREEN)Linting completed.$(RESET)" pre-commit: @$(ECHO) "$(YELLOW)Run pre-commit...$(RESET)" uv run pre-commit run --all-files @$(ECHO) "$(GREEN)Pre-commit run successfully.$(RESET)" clean: @$(ECHO) "$(YELLOW)Cleaning up cache files...$(RESET)" @find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true @find . -type f -name "*.pyc" -delete 2>/dev/null || true @rm -rf .pytest_cache .ruff_cache .mypy_cache 2>/dev/null || true @$(ECHO) "$(GREEN)Cache files cleaned.$(RESET)" # Show help help: @$(ECHO) "$(CYAN)OpenHands V1 Makefile$(RESET)" @$(ECHO) "" @$(ECHO) "$(UNDERLINE)Usage:$(RESET) make " @$(ECHO) "" @$(ECHO) "$(UNDERLINE)Commands:$(RESET)" @$(ECHO) " $(GREEN)build$(RESET) Setup development environment (install deps + hooks)" @$(ECHO) " $(GREEN)build-server$(RESET) Build agent-server executable" @$(ECHO) " $(GREEN)test-server-schema$(RESET) Test server schema" @$(ECHO) " $(GREEN)format$(RESET) Format code with uv format" @$(ECHO) " $(GREEN)lint$(RESET) Lint code with ruff" @$(ECHO) " $(GREEN)pre-commit$(RESET) Run the pre-commit" @$(ECHO) " $(GREEN)clean$(RESET) Clean up cache files" @$(ECHO) " $(GREEN)help$(RESET) Show this help message" build-server: check-uv-version @$(ECHO) "$(CYAN)Building agent-server executable...$(RESET)" @uv run pyinstaller openhands-agent-server/openhands/agent_server/agent-server.spec @$(ECHO) "$(GREEN)Build complete! 
# Validate the agent-server OpenAPI schema with swagger-cli.
# The whole recipe is one shell invocation so that:
#   * `set -euo pipefail` actually applies to every command, and
#   * the EXIT trap removes the temporary openapi.json (and .client) even when
#     schema generation or validation fails, so no file is left in the repo.
test-server-schema: check-uv-version
	set -euo pipefail; \
	trap 'rm -f openapi.json; rm -rf .client' EXIT; \
	uv run python -c 'import json; from openhands.agent_server.api import api; open("openapi.json","w").write(json.dumps(api.openapi(), indent=2))'; \
	npx --yes @apidevtools/swagger-cli@^4 validate openapi.json
Logo

OpenHands Software Agent SDK

The OpenHands Software Agent SDK is a set of Python and REST APIs for **building agents that work with code**. You can use the OpenHands Software Agent SDK for: * One-off tasks, like building a README for your repo * Routine maintenance tasks, like updating dependencies * Major tasks that involve multiple agents, like refactors and rewrites Importantly, agents can either use the local machine as their workspace, or run inside ephemeral workspaces (e.g. in Docker or Kubernetes) using the Agent Server. You can even use the SDK to build new developer experiences: it’s the engine behind the [OpenHands CLI](https://github.com/OpenHands/OpenHands-CLI) and [OpenHands Cloud](https://github.com/OpenHands/OpenHands). Get started with some [examples](https://docs.openhands.dev/sdk/guides/hello-world) or [check out the docs](https://docs.openhands.dev/sdk) to learn more. ## Quick Start Here's what building with the SDK looks like: ```python import os from openhands.sdk import LLM, Agent, Conversation, Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool from openhands.tools.terminal import TerminalTool llm = LLM( model="anthropic/claude-sonnet-4-5-20250929", api_key=os.getenv("LLM_API_KEY"), ) agent = Agent( llm=llm, tools=[ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), Tool(name=TaskTrackerTool.name), ], ) cwd = os.getcwd() conversation = Conversation(agent=agent, workspace=cwd) conversation.send_message("Write 3 facts about the current project into FACTS.txt.") conversation.run() print("All done!") ``` For installation instructions and detailed setup, see the [Getting Started Guide](https://docs.openhands.dev/sdk/getting-started). For local development from this repository, run `make build` to install the workspace dependencies and pre-commit hooks. 
## Documentation For detailed documentation, tutorials, and API reference, visit: **[https://docs.openhands.dev/sdk](https://docs.openhands.dev/sdk)** The documentation includes: - [Getting Started Guide](https://docs.openhands.dev/sdk/getting-started) - Installation and setup - [Architecture & Core Concepts](https://docs.openhands.dev/sdk/arch/overview) - Agents, tools, workspaces, and more - [Guides](https://docs.openhands.dev/sdk/guides/hello-world) - Hello World, custom tools, MCP, skills, and more - [Agent Server API Reference](https://docs.openhands.dev/sdk/guides/agent-server/api-reference/server-details/alive) - REST API reference for the remote agent server ## Examples The `examples/` directory contains comprehensive usage examples: - **Standalone SDK** (`examples/01_standalone_sdk/`) - Basic agent usage, custom tools, and skills - **Remote Agent Server** (`examples/02_remote_agent_server/`) - Client-server architecture and WebSocket connections - **GitHub Workflows** (`examples/03_github_workflows/`) - CI/CD integration and automated workflows ## Skills for modern package tooling If you enable public skills with `AgentContext(load_public_skills=True)`, the default `OpenHands/extensions` marketplace includes, for example, `uv` and `deno` skills. Agents can automatically pick up current package-management guidance for repositories that use markers like `uv.lock`, `deno.json`, `deno.jsonc`, or `deno.lock`. See `examples/01_standalone_sdk/03_activate_skill.py` for a minimal example that turns on public skill loading. ## Contributing For development setup, testing, and contribution guidelines, see [DEVELOPMENT.md](DEVELOPMENT.md). 
## Community - [Join Slack](https://openhands.dev/joinslack) - Connect with the OpenHands community - [GitHub Repository](https://github.com/OpenHands/software-agent-sdk) - Source code and issues - [Documentation](https://docs.openhands.dev/sdk) - Complete documentation ## Cite ``` @misc{wang2025openhandssoftwareagentsdk, title={The OpenHands Software Agent SDK: A Composable and Extensible Foundation for Production Agents}, author={Xingyao Wang and Simon Rosenberg and Juan Michelini and Calvin Smith and Hoang Tran and Engel Nyst and Rohit Malhotra and Xuhui Zhou and Valerie Chen and Robert Brennan and Graham Neubig}, year={2025}, eprint={2511.03690}, archivePrefix={arXiv}, primaryClass={cs.SE}, url={https://arxiv.org/abs/2511.03690}, } ```
### Thank You to Our Contributors [![Contributors](https://assets.openhands.dev/readme/openhands-software-agent-sdk-contributors.svg)](https://github.com/OpenHands/software-agent-sdk/graphs/contributors)
### Trusted by Engineers at


TikTok VMware Roche Amazon C3 AI Netflix Mastercard Red Hat MongoDB Apple NVIDIA Google
# Configure the LLM from the environment. LLM_MODEL falls back to the default
# model name below; LLM_API_KEY and LLM_BASE_URL are passed through as-is and
# are None when the variables are unset.
llm = LLM(
    model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
    api_key=os.getenv("LLM_API_KEY"),
    base_url=os.getenv("LLM_BASE_URL", None),
)

# Give the agent three tools, referenced by their registered names:
# terminal, file editor and task tracker.
agent = Agent(
    llm=llm,
    tools=[
        Tool(name=TerminalTool.name),
        Tool(name=FileEditorTool.name),
        Tool(name=TaskTrackerTool.name),
    ],
)

# Use the directory the script is launched from as the conversation workspace.
cwd = os.getcwd()
conversation = Conversation(agent=agent, workspace=cwd)

# Send the task, then run the conversation; "All done!" is printed only after
# run() has returned.
conversation.send_message("Write 3 facts about the current project into FACTS.txt.")
conversation.run()
print("All done!")
class GrepExecutor(ToolExecutor[GrepAction, GrepObservation]):
    """Run ``GrepAction`` requests by shelling out to ``grep`` via a terminal.

    The executor does not own a shell; it reuses the ``TerminalExecutor`` it is
    given, so it can share one terminal session with other tools.
    """

    def __init__(self, terminal: TerminalExecutor):
        """Store the terminal executor used to run the grep command."""
        self.terminal: TerminalExecutor = terminal

    def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  # noqa: ARG002
        """Search files under ``action.path`` for ``action.pattern``.

        Args:
            action: Pattern, search root and optional ``--include`` glob.
            conversation: Unused; accepted for executor-interface compatibility.

        Returns:
            A GrepObservation holding the raw matching lines (at most 100),
            the sorted set of files they came from, and the match count.
        """
        # NOTE: abspath resolves a relative path against this process's cwd.
        root_q = shlex.quote(os.path.abspath(action.path))
        pat = shlex.quote(action.pattern)
        include_opt = (
            f"--include {shlex.quote(action.include)} " if action.include else ""
        )
        # "-e" marks the next word as the pattern and "--" ends option parsing,
        # so a pattern or path that starts with "-" is not read as a grep flag.
        cmd = f"grep -rHnE {include_opt}-e {pat} -- {root_q} 2>/dev/null | head -100"

        result = self.terminal(TerminalAction(command=cmd))

        # grep exits with status 1 when nothing matches; empty output then
        # simply yields no lines here.
        matches: list[str] = result.text.strip().splitlines()

        # Each line is expected to look like "path:line:content"; the file is
        # whatever precedes the first ":".
        candidate_paths = (line.split(":", 1)[0] for line in matches)
        files = {os.path.abspath(path) for path in candidate_paths if path}

        return GrepObservation(matches=matches, files=sorted(files), count=len(matches))
class GrepTool(ToolDefinition[GrepAction, GrepObservation]):
    """Tool definition exposing regex-based file content search to the agent."""

    @classmethod
    def create(
        cls, conv_state, terminal_executor: TerminalExecutor | None = None
    ) -> Sequence[ToolDefinition]:
        """Build the grep tool, wired to a terminal-backed GrepExecutor.

        Args:
            conv_state: Conversation state; its workspace working directory is
                used when a terminal executor has to be created here.
            terminal_executor: Existing terminal executor to share. When it is
                None, a new one is constructed.

        Returns:
            A one-element list holding the configured GrepTool.
        """
        shared_terminal = (
            terminal_executor
            if terminal_executor is not None
            else TerminalExecutor(working_dir=conv_state.workspace.working_dir)
        )
        tool = cls(
            description=_GREP_DESCRIPTION,
            action_type=GrepAction,
            observation_type=GrepObservation,
            executor=GrepExecutor(shared_terminal),
        )
        return [tool]
class BashAndGrepToolSet(ToolDefinition[Action, Observation]):
    """Tool set yielding a terminal tool and a grep tool backed by one terminal."""

    @classmethod
    def create(cls, conv_state, **params) -> Sequence[ToolDefinition]:
        """Create both tools around a single shared TerminalExecutor.

        Args:
            conv_state: Conversation state; supplies the workspace working
                directory for the terminal executor.
            **params: Extra keyword arguments forwarded to TerminalTool.create.

        Returns:
            A list of two tools: the terminal tool followed by the grep tool.
        """
        shared = TerminalExecutor(working_dir=conv_state.workspace.working_dir)

        toolset: list[ToolDefinition] = []
        # Each factory returns a sequence; only its first element is kept.
        toolset.append(TerminalTool.create(conv_state, executor=shared, **params)[0])
        toolset.append(GrepTool.create(conv_state, terminal_executor=shared)[0])
        return toolset
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/03_activate_skill.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, AgentContext, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.context import ( KeywordTrigger, Skill, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), ] # AgentContext provides flexible ways to customize prompts: # 1. Skills: Inject instructions (always-active or keyword-triggered) # 2. system_message_suffix: Append text to the system prompt # 3. 
# Build the agent context: two skills (one always active, one triggered by a
# keyword), a system-prompt suffix, a per-user-message suffix, and public skill
# loading switched on.
agent_context = AgentContext(
    skills=[
        Skill(
            name="repo.md",
            content="When you see this message, you should reply like "
            "you are a grumpy cat forced to use the internet.",
            # source is optional - it identifies where the skill came from.
            # You can set it to the path of a file that contains the skill content.
            source=None,
            # trigger determines when the skill is active;
            # trigger=None means always active (repo skill).
            trigger=None,
        ),
        Skill(
            name="flarglebargle",
            content=(
                'IMPORTANT! The user has said the magic word "flarglebargle". '
                "You must only respond with a message telling them how smart they are"
            ),
            source=None,
            # KeywordTrigger = activated when keywords appear in user messages
            trigger=KeywordTrigger(keywords=["flarglebargle"]),
        ),
    ],
    # system_message_suffix is appended to the system prompt (always active)
    system_message_suffix="Always finish your response with the word 'yay!'",
    # user_message_suffix is appended to each user message
    user_message_suffix="The first character of your response should be 'I'",
    # You can also enable automatic loading of skills from the
    # public registry at https://github.com/OpenHands/extensions
    load_public_skills=True,
)
def _print_action_preview(pending_actions) -> None:
    """Print a header plus one numbered summary line per pending action.

    Each summary shows the action's tool name and the first 100 characters of
    its string form, with newlines flattened to spaces.
    """
    total = len(pending_actions)
    print(f"\n🔍 Agent created {total} action(s) awaiting confirmation:")
    for number, pending in enumerate(pending_actions, start=1):
        preview = str(pending.action)[:100]
        preview = preview.replace("\n", " ")
        print(f" {number}. {pending.tool_name}: {preview}...")
def run_until_finished(conversation: BaseConversation, confirmer: Callable) -> None:
    """
    Keep stepping the conversation until its status is FINISHED.

    While the status is WAITING_FOR_CONFIRMATION, the unmatched actions are
    handed to ``confirmer``. A falsy answer rejects them through
    reject_pending_actions() and the loop re-checks the status; a truthy
    answer falls through to conversation.run(). A RuntimeError is raised if
    the agent is waiting for confirmation but there are no unmatched actions.
    """
    finished = ConversationExecutionStatus.FINISHED
    awaiting = ConversationExecutionStatus.WAITING_FOR_CONFIRMATION

    while conversation.state.execution_status != finished:
        needs_confirmation = conversation.state.execution_status == awaiting
        if needs_confirmation:
            unmatched = ConversationState.get_unmatched_actions(
                conversation.state.events
            )
            if not unmatched:
                raise RuntimeError(
                    "⚠️ Agent is waiting for confirmation but no pending actions "
                    "were found. This should not happen."
                )
            approved = confirmer(unmatched)
            if not approved:
                conversation.reject_pending_actions("User rejected the actions")
                # Go back to the status check so the agent can take a new step
                # or finish.
                continue

        print("▶️ Running conversation.run()…")
        conversation.run()
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) agent = get_default_agent(llm=llm) conversation = Conversation(agent=agent, workspace=os.getcwd()) # Conditionally add security analyzer based on environment variable add_security_analyzer = bool(os.getenv("ADD_SECURITY_ANALYZER", "").strip()) if add_security_analyzer: print("Agent security analyzer added.") conversation.set_security_analyzer(LLMSecurityAnalyzer()) # 1) Confirmation mode ON conversation.set_confirmation_policy(AlwaysConfirm()) print("\n1) Command that will likely create actions…") conversation.send_message("Please list the files in the current directory using ls -la") run_until_finished(conversation, confirm_in_console) # 2) A command the user may choose to reject print("\n2) Command the user may choose to reject…") conversation.send_message("Please create a file called 'dangerous_file.txt'") run_until_finished(conversation, confirm_in_console) # 3) Simple greeting (no actions expected) print("\n3) Simple greeting (no actions expected)…") conversation.send_message("Just say hello to me") run_until_finished(conversation, confirm_in_console) # 4) Disable confirmation mode and run commands directly print("\n4) Disable confirmation mode and run a command…") conversation.set_confirmation_policy(NeverConfirm()) conversation.send_message("Please echo 'Hello from confirmation mode example!'") conversation.run() conversation.send_message( "Please delete any file that was created during this conversation." 
def conversation_callback(event: Event):
    """Append the LLM-message form of each LLM-convertible event to llm_messages."""
    if not isinstance(event, LLMConvertibleEvent):
        return
    message = event.to_llm_message()
    llm_messages.append(message)
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") print("=" * 100) print(f"LLM Registry usage IDs: {llm_registry.list_usage_ids()}") # Demonstrate getting the same LLM instance from registry same_llm = llm_registry.get("agent") print(f"Same LLM instance: {llm is same_llm}") # Demonstrate requesting a completion directly from an LLM resp = llm.completion( messages=[ Message(role="user", content=[TextContent(text="Say hello in one word.")]) ] ) # Access the response content via OpenHands LLMResponse msg = resp.message texts = [c.text for c in msg.content if isinstance(c, TextContent)] print(f"Direct completion response: {texts[0] if texts else str(msg)}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.tool import Tool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
def conversation_callback(event: Event):
    """Append the LLM-message form of each LLM-convertible event to llm_messages."""
    if not isinstance(event, LLMConvertibleEvent):
        return
    message = event.to_llm_message()
    llm_messages.append(message)
def conversation_callback(event: Event):
    """Append the LLM-message form of each LLM-convertible event to llm_messages."""
    if not isinstance(event, LLMConvertibleEvent):
        return
    message = event.to_llm_message()
    llm_messages.append(message)
def conversation_callback(event: Event):
    """Append the LLM-message form of each LLM-convertible event to llm_messages."""
    if not isinstance(event, LLMConvertibleEvent):
        return
    message = event.to_llm_message()
    llm_messages.append(message)
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") ================================================ FILE: examples/01_standalone_sdk/09_pause_example.py ================================================ import os import threading import time from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), ] # Agent agent = Agent(llm=llm, tools=tools) conversation = Conversation(agent, workspace=os.getcwd()) print("=" * 60) print("Pause and Continue Example") print("=" * 60) print() # Phase 1: Start a long-running task print("Phase 1: Starting agent with a task...") conversation.send_message( "Create a file called countdown.txt and write numbers from 100 down to 1, " "one number per line. After you finish, summarize what you did." 
) print(f"Initial status: {conversation.state.execution_status}") print() # Start the agent in a background thread thread = threading.Thread(target=conversation.run) thread.start() # Let the agent work for a few seconds print("Letting agent work for 2 seconds...") time.sleep(2) # Phase 2: Pause the agent print() print("Phase 2: Pausing the agent...") conversation.pause() # Wait for the thread to finish (it will stop when paused) thread.join() print(f"Agent status after pause: {conversation.state.execution_status}") print() # Phase 3: Send a new message while paused print("Phase 3: Sending a new message while agent is paused...") conversation.send_message( "Actually, stop working on countdown.txt. Instead, create a file called " "hello.txt with just the text 'Hello, World!' in it." ) print() # Phase 4: Resume the agent with .run() print("Phase 4: Resuming agent with .run()...") print(f"Status before resume: {conversation.state.execution_status}") # Resume execution conversation.run() print(f"Final status: {conversation.state.execution_status}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/10_persistence.py ================================================ import os import uuid from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), ] # Add MCP Tools mcp_config = { "mcpServers": { "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]}, } } # Agent agent = Agent(llm=llm, tools=tools, mcp_config=mcp_config) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation_id = uuid.uuid4() persistence_dir = "./.conversations" conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd, persistence_dir=persistence_dir, conversation_id=conversation_id, ) conversation.send_message( "Read https://github.com/OpenHands/OpenHands. Then write 3 facts " "about the project into FACTS.txt." ) conversation.run() conversation.send_message("Great! Now delete that file.") conversation.run() print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Conversation persistence print("Serializing conversation...") del conversation # Deserialize the conversation print("Deserializing conversation...") conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd, persistence_dir=persistence_dir, conversation_id=conversation_id, ) print("Sending message to deserialized conversation...") conversation.send_message("Hey what did you create? 
Return an agent finish action") conversation.run() # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/11_async.py ================================================ """ This example demonstrates usage of a Conversation in an async context (e.g.: From a fastapi server). The conversation is run in a background thread and a callback with results is executed in the main runloop """ import asyncio import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.conversation.types import ConversationCallbackType from openhands.sdk.tool import Tool from openhands.sdk.utils.async_utils import AsyncCallbackWrapper from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), Tool(name=TaskTrackerTool.name), ] # Agent agent = Agent(llm=llm, tools=tools) llm_messages = [] # collect raw LLM messages # Callback coroutine async def callback_coro(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) # Synchronous run conversation def run_conversation(callback: ConversationCallbackType): conversation = Conversation(agent=agent, callbacks=[callback]) conversation.send_message( "Hello! Can you create a new Python file named hello.py that prints " "'Hello, World!'? 
Use task tracker to plan your steps." ) conversation.run() conversation.send_message("Great! Now delete that file.") conversation.run() async def main(): loop = asyncio.get_running_loop() # Create the callback callback = AsyncCallbackWrapper(callback_coro, loop) # Run the conversation in a background thread and wait for it to finish... await loop.run_in_executor(None, run_conversation, callback) print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: examples/01_standalone_sdk/12_custom_secrets.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, ) from openhands.sdk.secret import SecretSource from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools tools = [ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), ] # Agent agent = Agent(llm=llm, tools=tools) conversation = Conversation(agent) class MySecretSource(SecretSource): def get_value(self) -> str: return "callable-based-secret" conversation.update_secrets( {"SECRET_TOKEN": "my-secret-token-value", "SECRET_FUNCTION_TOKEN": MySecretSource()} ) conversation.send_message("just echo $SECRET_TOKEN") conversation.run() conversation.send_message("just echo $SECRET_FUNCTION_TOKEN") conversation.run() # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/13_get_llm_metrics.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) cwd = os.getcwd() tools = [ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), ] # Add MCP Tools mcp_config = {"mcpServers": {"fetch": {"command": "uvx", "args": ["mcp-server-fetch"]}}} # Agent agent = Agent(llm=llm, tools=tools, mcp_config=mcp_config) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) # Conversation conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd, ) logger.info("Starting conversation with MCP integration...") conversation.send_message( "Read https://github.com/OpenHands/OpenHands and write 3 facts " "about the project into FACTS.txt." ) conversation.run() conversation.send_message("Great! Now delete that file.") conversation.run() print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") assert llm.metrics is not None print( f"Conversation finished. Final LLM metrics with details: {llm.metrics.model_dump()}" ) # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/14_context_condenser.py ================================================ """ To manage context in long-running conversations, the agent can use a context condenser that keeps the conversation history within a specified size limit. This example demonstrates using the `LLMSummarizingCondenser`, which automatically summarizes older parts of the conversation when the history exceeds a defined threshold. 
""" import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.context.condenser import LLMSummarizingCondenser from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), Tool(name=TaskTrackerTool.name), ] # Create a condenser to manage the context. The condenser will automatically truncate # conversation history when it exceeds max_size, and replaces the dropped events with an # LLM-generated summary. This condenser triggers when there are more than ten events in # the conversation history, and always keeps the first two events (system prompts, # initial user messages) to preserve important context. condenser = LLMSummarizingCondenser( llm=llm.model_copy(update={"usage_id": "condenser"}), max_size=10, keep_first=2 ) # Agent with condenser agent = Agent(llm=llm, tools=tools, condenser=condenser) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation = Conversation( agent=agent, callbacks=[conversation_callback], persistence_dir="./.conversations", workspace=".", ) # Send multiple messages to demonstrate condensation print("Sending multiple messages to demonstrate LLM Summarizing Condenser...") conversation.send_message( "Hello! 
Can you create a Python file named math_utils.py with functions for " "basic arithmetic operations (add, subtract, multiply, divide)?" ) conversation.run() conversation.send_message( "Great! Now add a function to calculate the factorial of a number." ) conversation.run() conversation.send_message("Add a function to check if a number is prime.") conversation.run() conversation.send_message( "Add a function to calculate the greatest common divisor (GCD) of two numbers." ) conversation.run() conversation.send_message( "Now create a test file to verify all these functions work correctly." ) conversation.run() print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Conversation persistence print("Serializing conversation...") del conversation # Deserialize the conversation print("Deserializing conversation...") conversation = Conversation( agent=agent, callbacks=[conversation_callback], persistence_dir="./.conversations", workspace=".", ) print("Sending message to deserialized conversation...") conversation.send_message("Finally, clean up by deleting both files.") conversation.run() print("=" * 100) print("Conversation finished with LLM Summarizing Condenser.") print(f"Total LLM messages collected: {len(llm_messages)}") print("\nThe condenser automatically summarized older conversation history") print("when the conversation exceeded the configured max_size threshold.") print("This helps manage context length while preserving important information.") # Report cost cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/15_browser_use.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from 
openhands.sdk.tool import Tool from openhands.tools.browser_use import BrowserToolSet from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), Tool(name=BrowserToolSet.name), ] # If you need fine-grained browser control, you can manually register individual browser # tools by creating a BrowserToolExecutor and providing factories that return customized # Tool instances before constructing the Agent. # Agent agent = Agent(llm=llm, tools=tools) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd ) conversation.send_message( "Could you go to https://openhands.dev/ blog page and summarize main " "points of the latest blog?" ) conversation.run() print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") ================================================ FILE: examples/01_standalone_sdk/16_llm_security_analyzer.py ================================================ """OpenHands Agent SDK — LLM Security Analyzer Example (Simplified) This example shows how to use the LLMSecurityAnalyzer to automatically evaluate security risks of actions before execution. 
""" import os import signal from collections.abc import Callable from pydantic import SecretStr from openhands.sdk import LLM, Agent, BaseConversation, Conversation from openhands.sdk.conversation.state import ( ConversationExecutionStatus, ConversationState, ) from openhands.sdk.security.confirmation_policy import ConfirmRisky from openhands.sdk.security.llm_analyzer import LLMSecurityAnalyzer from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool # Clean ^C exit: no stack trace noise signal.signal(signal.SIGINT, lambda *_: (_ for _ in ()).throw(KeyboardInterrupt())) def _print_blocked_actions(pending_actions) -> None: print(f"\n🔒 Security analyzer blocked {len(pending_actions)} high-risk action(s):") for i, action in enumerate(pending_actions, start=1): snippet = str(action.action)[:100].replace("\n", " ") print(f" {i}. {action.tool_name}: {snippet}...") def confirm_high_risk_in_console(pending_actions) -> bool: """ Return True to approve, False to reject. Matches original behavior: default to 'no' on EOF/KeyboardInterrupt. """ _print_blocked_actions(pending_actions) while True: try: ans = ( input( "\nThese actions were flagged as HIGH RISK. " "Do you want to execute them anyway? (yes/no): " ) .strip() .lower() ) except (EOFError, KeyboardInterrupt): print("\n❌ No input received; rejecting by default.") return False if ans in ("yes", "y"): print("✅ Approved — executing high-risk actions...") return True if ans in ("no", "n"): print("❌ Rejected — skipping high-risk actions...") return False print("Please enter 'yes' or 'no'.") def run_until_finished_with_security( conversation: BaseConversation, confirmer: Callable[[list], bool] ) -> None: """ Drive the conversation until FINISHED. - If WAITING_FOR_CONFIRMATION: ask the confirmer. * On approve: set execution_status = IDLE (keeps original example’s behavior). * On reject: conversation.reject_pending_actions(...). 
    - If WAITING_FOR_CONFIRMATION but no pending actions are found: raise
      RuntimeError, because that state is not expected to occur.
    - In every other non-FINISHED state: call conversation.run() again.
    """
    while conversation.state.execution_status != ConversationExecutionStatus.FINISHED:
        if (
            conversation.state.execution_status
            == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION
        ):
            # Actions that have been proposed but have no matching result yet.
            pending = ConversationState.get_unmatched_actions(conversation.state.events)
            if not pending:
                raise RuntimeError(
                    "⚠️ Agent is waiting for confirmation but no pending actions "
                    "were found. This should not happen."
                )
            if not confirmer(pending):
                conversation.reject_pending_actions("User rejected high-risk actions")
                # Re-check the status instead of running right away.
                continue
            # Approved: fall through to conversation.run() below.
            # NOTE(review): presumably run() then executes the pending actions —
            # confirm against the SDK.

        print("▶️ Running conversation.run()...")
        conversation.run()


# Configure LLM
api_key = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")
llm = LLM(
    usage_id="security-analyzer",
    model=model,
    base_url=base_url,
    api_key=SecretStr(api_key),
)

# Tools
tools = [
    Tool(
        name=TerminalTool.name,
    ),
    Tool(name=FileEditorTool.name),
]

# Agent
agent = Agent(llm=llm, tools=tools)

# Conversation with persisted filestore
conversation = Conversation(
    agent=agent, persistence_dir="./.conversations", workspace="."
)
# Attach the LLM-based risk analyzer and the ConfirmRisky confirmation policy.
conversation.set_security_analyzer(LLMSecurityAnalyzer())
conversation.set_confirmation_policy(ConfirmRisky())

print("\n1) Safe command (LOW risk - should execute automatically)...")
conversation.send_message("List files in the current directory")
conversation.run()

print("\n2) Potentially risky command (may require confirmation)...")
conversation.send_message(
    "Please echo 'hello world' -- PLEASE MARK THIS AS A HIGH RISK ACTION"
)
# Use the confirmation-aware driver so a flagged action prompts on the console.
run_until_finished_with_security(conversation, confirm_high_risk_in_console)


================================================
FILE: examples/01_standalone_sdk/17_image_input.py
================================================
"""OpenHands Agent SDK — Image Input Example.

This script mirrors the basic setup from
``examples/01_standalone_sdk/01_hello_world.py`` but adds vision support by
sending an image to the agent alongside text instructions.

It also demonstrates multi-image input with base64-encoded images that exercise
the Anthropic many-image resizing path (when more than 20 images are sent, they
are automatically downscaled to fit the 2000×2000 px per-image limit).
"""

import base64
import io
import os

from PIL import Image
from pydantic import SecretStr

from openhands.sdk import (
    LLM,
    Agent,
    Conversation,
    Event,
    ImageContent,
    LLMConvertibleEvent,
    Message,
    TextContent,
    get_logger,
)
from openhands.sdk.tool.spec import Tool
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.task_tracker import TaskTrackerTool
from openhands.tools.terminal import TerminalTool


logger = get_logger(__name__)


def _make_png_data_url(width: int, height: int, color: str = "red") -> str:
    """Create a base64 PNG data URL with the given dimensions and colour.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        color: Fill colour passed to ``PIL.Image.new`` (defaults to ``"red"``).

    Returns:
        A ``data:image/png;base64,...`` URL holding a solid-colour RGB PNG.
    """
    image = Image.new("RGB", (width, height), color=color)
    # Encode in memory; nothing is written to disk.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return f"data:image/png;base64,{encoded}"


# Configure LLM (vision-capable model)
api_key = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")
llm = LLM(
    usage_id="vision-llm",
    model=model,
    base_url=base_url,
    api_key=SecretStr(api_key),
)
# Fail fast: the rest of the example sends images to the model.
assert llm.vision_is_active(), "The selected LLM model does not support vision input."
cwd = os.getcwd() agent = Agent( llm=llm, tools=[ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), Tool(name=TaskTrackerTool.name), ], ) llm_messages = [] # collect raw LLM messages for inspection def conversation_callback(event: Event) -> None: if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=cwd ) # ── Part 1: single URL image ────────────────────────────────────────────── IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png" conversation.send_message( Message( role="user", content=[ TextContent( text=( "Study this image and describe the key elements you see. " "Summarize them in a short paragraph and suggest a catchy caption." ) ), ImageContent(image_urls=[IMAGE_URL]), ], ) ) conversation.run() conversation.send_message( "Great! Please save your description and caption into image_report.md." ) conversation.run() # ── Part 2: many oversized base64 images (exercises Anthropic resize) ───── # Generate 21 base64 images at 2500×100 px — just above the 20-image threshold # that triggers Anthropic's many-image limit (2000×2000 px per image). # The SDK will automatically downscale these before sending to the provider. COLORS = [ "red", "green", "blue", "yellow", "cyan", "magenta", "orange", "purple", "pink", "brown", "gray", "white", "navy", "teal", "olive", "maroon", "lime", "aqua", "coral", "gold", "indigo", ] oversized_data_urls = [ _make_png_data_url(2500, 100, color=COLORS[i % len(COLORS)]) for i in range(21) ] conversation.send_message( Message( role="user", content=[ TextContent( text=( "I'm sending you 21 solid-colour test images. " "List the dominant colour of each image in order, " "one per line." ) ), ImageContent(image_urls=oversized_data_urls), ], ) ) conversation.run() print("=" * 100) print("Conversation finished. 
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/18_send_message_while_processing.py ================================================ """ Example demonstrating that user messages can be sent and processed while an agent is busy. This example demonstrates a key capability of the OpenHands agent system: the ability to receive and process new user messages even while the agent is actively working on a previous task. This is made possible by the agent's event-driven architecture. Demonstration Flow: 1. Send initial message asking agent to: - Write "Message 1 sent at [time], written at [CURRENT_TIME]" - Wait 3 seconds - Write "Message 2 sent at [time], written at [CURRENT_TIME]" [time] is the time the message was sent to the agent [CURRENT_TIME] is the time the agent writes the line 2. Start agent processing in a background thread 3. While agent is busy (during the 3-second delay), send a second message asking to add: - "Message 3 sent at [time], written at [CURRENT_TIME]" 4. Verify that all three lines are processed and included in the final document Expected Evidence: The final document will contain three lines with dual timestamps: - "Message 1 sent at HH:MM:SS, written at HH:MM:SS" (from initial message, written immediately) - "Message 2 sent at HH:MM:SS, written at HH:MM:SS" (from initial message, written after 3-second delay) - "Message 3 sent at HH:MM:SS, written at HH:MM:SS" (from second message sent during delay) The timestamps will show that Message 3 was sent while the agent was running, but was still successfully processed and written to the document. 
This proves that: - The second user message was sent while the agent was processing the first task - The agent successfully received and processed the second message - The agent's event system allows for real-time message integration during processing Key Components Demonstrated: - Conversation.send_message(): Adds messages to events list immediately - Agent.step(): Processes all events including newly added messages - Threading: Allows message sending while agent is actively processing """ # noqa import os import threading import time from datetime import datetime from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool( name=TerminalTool.name, ), Tool(name=FileEditorTool.name), ] # Agent agent = Agent(llm=llm, tools=tools) conversation = Conversation(agent) def timestamp() -> str: return datetime.now().strftime("%H:%M:%S") print("=== Send Message While Processing Example ===") # Step 1: Send initial message start_time = timestamp() conversation.send_message( f"Create a file called document.txt and write this first sentence: " f"'Message 1 sent at {start_time}, written at [CURRENT_TIME].' " f"Replace [CURRENT_TIME] with the actual current time when you write the line. 
" f"Then wait 3 seconds and write 'Message 2 sent at {start_time}, written at [CURRENT_TIME].'" # noqa ) # Step 2: Start agent processing in background thread = threading.Thread(target=conversation.run) thread.start() # Step 3: Wait then send second message while agent is processing time.sleep(2) # Give agent time to start working second_time = timestamp() conversation.send_message( f"Please also add this second sentence to document.txt: " f"'Message 3 sent at {second_time}, written at [CURRENT_TIME].' " f"Replace [CURRENT_TIME] with the actual current time when you write this line." ) # Wait for completion thread.join() # Verification document_path = os.path.join(cwd, "document.txt") if os.path.exists(document_path): with open(document_path) as f: content = f.read() print("\nDocument contents:") print("─────────────────────") print(content) print("─────────────────────") # Check if both messages were processed if "Message 1" in content and "Message 2" in content: print("\nSUCCESS: Agent processed both messages!") print( "This proves the agent received the second message while processing the first task." # noqa ) else: print("\nWARNING: Agent may not have processed the second message") # Clean up os.remove(document_path) else: print("WARNING: Document.txt was not created") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/19_llm_routing.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, ImageContent, LLMConvertibleEvent, Message, TextContent, get_logger, ) from openhands.sdk.llm.router import MultimodalRouter from openhands.tools.preset.default import get_default_tools logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
model = os.getenv("LLM_MODEL", "openhands/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") primary_llm = LLM( usage_id="agent-primary", model=model, base_url=base_url, api_key=SecretStr(api_key), ) secondary_llm = LLM( usage_id="agent-secondary", model="openhands/devstral-small-2507", base_url=base_url, api_key=SecretStr(api_key), ) multimodal_router = MultimodalRouter( usage_id="multimodal-router", llms_for_routing={"primary": primary_llm, "secondary": secondary_llm}, ) # Tools tools = get_default_tools() # Use our default openhands experience # Agent agent = Agent(llm=multimodal_router, tools=tools) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=os.getcwd() ) conversation.send_message( message=Message( role="user", content=[TextContent(text=("Hi there, who trained you?"))], ) ) conversation.run() conversation.send_message( message=Message( role="user", content=[ ImageContent( image_urls=["http://images.cocodataset.org/val2017/000000039769.jpg"] ), TextContent(text=("What do you see in the image above?")), ], ) ) conversation.run() conversation.send_message( message=Message( role="user", content=[TextContent(text=("Who trained you as an LLM?"))], ) ) conversation.run() print("=" * 100) print("Conversation finished. 
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") # Report cost cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/20_stuck_detector.py ================================================ import os from pydantic import SecretStr from openhands.sdk import ( LLM, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.tools.preset.default import get_default_agent logger = get_logger(__name__) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) agent = get_default_agent(llm=llm) llm_messages = [] def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) # Create conversation with built-in stuck detection conversation = Conversation( agent=agent, callbacks=[conversation_callback], workspace=os.getcwd(), # This is by default True, shown here for clarity of the example stuck_detection=True, ) # Send a task that will be caught by stuck detection conversation.send_message( "Please execute 'ls' command 5 times, each in its own " "action without any thought and then exit at the 6th step." ) # Run the conversation - stuck detection happens automatically conversation.run() assert conversation.stuck_detector is not None final_stuck_check = conversation.stuck_detector.is_stuck() print(f"Final stuck status: {final_stuck_check}") print("=" * 100) print("Conversation finished. 
logger = get_logger(__name__)

# Read the LLM connection settings from the environment.
api_key = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")

# Two LLM instances on the same model, told apart by their usage IDs
# ("agent" and "condenser").
llm = LLM(
    usage_id="agent",
    model=model,
    base_url=base_url,
    api_key=SecretStr(api_key),
)
llm_condenser = LLM(
    model=model,
    base_url=base_url,
    api_key=SecretStr(api_key),
    usage_id="condenser",
)

# Agent with a terminal tool and an LLM-backed summarizing condenser.
condenser = LLMSummarizingCondenser(llm=llm_condenser, max_size=10, keep_first=2)
cwd = os.getcwd()
terminal_tool = Tool(name=TerminalTool.name)
agent = Agent(llm=llm, tools=[terminal_tool], condenser=condenser)

conversation = Conversation(agent=agent, workspace=cwd)
greeting = Message(role="user", content=[TextContent(text="Please echo 'Hello!'")])
conversation.send_message(message=greeting)
conversation.run()

# Demonstrate extraneous costs part of the conversation: a third LLM is added
# to the conversation's registry and called directly, outside the agent loop.
second_llm = LLM(
    usage_id="demo-secondary",
    model=model,
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=SecretStr(api_key),
)
conversation.llm_registry.add(second_llm)
extra_prompt = Message(role="user", content=[TextContent(text="echo 'More spend!'")])
completion_response = second_llm.completion(messages=[extra_prompt])

# Access total spend
spend = conversation.conversation_stats.get_combined_metrics()
print("\n=== Total Spend for Conversation ===\n")
print(f"Accumulated Cost: ${spend.accumulated_cost:.6f}")
total_usage = spend.accumulated_token_usage
if total_usage:
    print(f"Prompt Tokens: {total_usage.prompt_tokens}")
    print(f"Completion Tokens: {total_usage.completion_tokens}")
    print(f"Cache Read Tokens: {total_usage.cache_read_tokens}")
    print(f"Cache Write Tokens: {total_usage.cache_write_tokens}")

spend_per_usage = conversation.conversation_stats.usage_to_metrics
print("\n=== Spend Breakdown by Usage ID ===\n")


def _usage_row(usage_id, metrics):
    """Build one table row: usage ID, formatted cost, prompt and completion tokens."""
    usage = metrics.accumulated_token_usage
    # Token counts fall back to 0 when no usage has been recorded.
    prompt_tokens = usage.prompt_tokens if usage else 0
    completion_tokens = usage.completion_tokens if usage else 0
    return [
        usage_id,
        f"${metrics.accumulated_cost:.6f}",
        prompt_tokens,
        completion_tokens,
    ]


rows = [_usage_row(usage_id, metrics) for usage_id, metrics in spend_per_usage.items()]
table = tabulate(
    rows,
    headers=["Usage ID", "Cost", "Prompt Tokens", "Completion Tokens"],
    tablefmt="github",
)
print(table)

# Report cost
cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
print(f"EXAMPLE_COST: {cost}")
def show_thinking(event: Event):
    """Print the thinking blocks attached to an LLM-convertible event.

    Events that are not ``LLMConvertibleEvent`` instances, and messages with
    no (or empty) ``thinking_blocks``, produce no output. Redacted blocks are
    shown through their ``data`` attribute, regular ones through ``thinking``;
    any other block type is skipped.
    """
    if not isinstance(event, LLMConvertibleEvent):
        return

    message = event.to_llm_message()
    blocks = getattr(message, "thinking_blocks", None)
    if not blocks:
        return

    print(f"\n🧠 Found {len(blocks)} thinking blocks")
    for number, block in enumerate(blocks, start=1):
        if isinstance(block, RedactedThinkingBlock):
            print(f" Block {number}: {block.data}")
        elif isinstance(block, ThinkingBlock):
            print(f" Block {number}: {block.thinking}")
Show your work.", ) conversation.run() conversation.send_message( "Now, write that number to RESULTs.txt.", ) conversation.run() print("✅ Done!") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/23_responses_reasoning.py ================================================ """ Example: Responses API path via LiteLLM in a Real Agent Conversation - Runs a real Agent/Conversation to verify /responses path works - Demonstrates rendering of Responses reasoning within normal conversation events """ from __future__ import annotations import os from pydantic import SecretStr from openhands.sdk import ( Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.llm import LLM from openhands.tools.preset.default import get_default_agent logger = get_logger(__name__) api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") assert api_key, "Set LLM_API_KEY or OPENAI_API_KEY in your environment." 
def conversation_callback(event: Event):
    """Append the LLM message of an LLM-convertible event to ``llm_messages``.

    Any event that is not an ``LLMConvertibleEvent`` is left untouched.
    """
    if not isinstance(event, LLMConvertibleEvent):
        return
    converted = event.to_llm_message()
    llm_messages.append(converted)
""" import os import tempfile from pathlib import Path from pydantic import SecretStr from openhands.sdk import LLM, Conversation from openhands.sdk.llm import content_to_str from openhands.tools.preset.default import get_default_agent from openhands.tools.preset.planning import get_planning_agent def get_event_content(event): """Extract content from an event.""" if hasattr(event, "llm_message"): return "".join(content_to_str(event.llm_message.content)) return str(event) """Run the planning agent workflow example.""" # Create a temporary workspace workspace_dir = Path(tempfile.mkdtemp()) print(f"Working in: {workspace_dir}") # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( model=model, base_url=base_url, api_key=SecretStr(api_key), usage_id="agent", ) # Task description task = """ Create a Python web scraper with the following requirements: - Scrape article titles and URLs from a news website - Handle HTTP errors gracefully with retry logic - Save results to a JSON file with timestamp - Use requests and BeautifulSoup for scraping Do NOT ask for any clarifying questions. Directly create your implementation plan. 
""" print("=" * 80) print("PHASE 1: PLANNING") print("=" * 80) # Create Planning Agent with read-only tools planning_agent = get_planning_agent(llm=llm) # Create conversation for planning planning_conversation = Conversation( agent=planning_agent, workspace=str(workspace_dir), ) # Run planning phase print("Planning Agent is analyzing the task and creating implementation plan...") planning_conversation.send_message( f"Please analyze this web scraping task and create a detailed " f"implementation plan:\n\n{task}" ) planning_conversation.run() print("\n" + "=" * 80) print("PLANNING COMPLETE") print("=" * 80) print(f"Implementation plan saved to: {workspace_dir}/PLAN.md") print("\n" + "=" * 80) print("PHASE 2: EXECUTION") print("=" * 80) # Create Execution Agent with full editing capabilities execution_agent = get_default_agent(llm=llm, cli_mode=True) # Create conversation for execution execution_conversation = Conversation( agent=execution_agent, workspace=str(workspace_dir), ) # Prepare execution prompt with reference to the plan file execution_prompt = f""" Please implement the web scraping project according to the implementation plan. The detailed implementation plan has been created and saved at: {workspace_dir}/PLAN.md Please read the plan from PLAN.md and implement all components according to it. Create all necessary files, implement the functionality, and ensure everything works together properly. 
""" print("Execution Agent is implementing the plan...") execution_conversation.send_message(execution_prompt) execution_conversation.run() # Get the last message from the conversation execution_result = execution_conversation.state.events[-1] print("\n" + "=" * 80) print("EXECUTION RESULT:") print("=" * 80) print(get_event_content(execution_result)) print("\n" + "=" * 80) print("WORKFLOW COMPLETE") print("=" * 80) print(f"Project files created in: {workspace_dir}") # List created files print("\nCreated files:") for file_path in workspace_dir.rglob("*"): if file_path.is_file(): print(f" - {file_path.relative_to(workspace_dir)}") # Report cost cost = llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost}") ================================================ FILE: examples/01_standalone_sdk/25_agent_delegation.py ================================================ """ Agent Delegation Example This example demonstrates the agent delegation feature where a main agent delegates tasks to sub-agents for parallel processing. Each sub-agent runs independently and returns its results to the main agent, which then merges both analyses into a single consolidated report. """ import os from openhands.sdk import ( LLM, Agent, AgentContext, Conversation, Tool, get_logger, ) from openhands.sdk.context import Skill from openhands.sdk.subagent import register_agent from openhands.sdk.tool import register_tool from openhands.tools import register_builtins_agents from openhands.tools.delegate import ( DelegateTool, DelegationVisualizer, ) logger = get_logger(__name__) # Configure LLM and agent llm = LLM( model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), api_key=os.getenv("LLM_API_KEY"), base_url=os.environ.get("LLM_BASE_URL", None), usage_id="agent", ) def create_lodging_planner(llm: LLM) -> Agent: """Create a lodging planner focused on London stays.""" skills = [ Skill( name="lodging_planning", content=( "You specialize in finding great places to stay in London. 
" "Provide 3-4 hotel recommendations with neighborhoods, quick " "pros/cons, " "and notes on transit convenience. Keep options varied by budget." ), trigger=None, ) ] return Agent( llm=llm, tools=[], agent_context=AgentContext( skills=skills, system_message_suffix="Focus only on London lodging recommendations.", ), ) def create_activities_planner(llm: LLM) -> Agent: """Create an activities planner focused on London itineraries.""" skills = [ Skill( name="activities_planning", content=( "You design concise London itineraries. Suggest 2-3 daily " "highlights, grouped by proximity to minimize travel time. " "Include food/coffee stops " "and note required tickets/reservations." ), trigger=None, ) ] return Agent( llm=llm, tools=[], agent_context=AgentContext( skills=skills, system_message_suffix="Plan practical, time-efficient days in London.", ), ) # Register user-defined agent types (default agent type is always available) register_agent( name="lodging_planner", factory_func=create_lodging_planner, description="Finds London lodging options with transit-friendly picks.", ) register_agent( name="activities_planner", factory_func=create_activities_planner, description="Creates time-efficient London activity itineraries.", ) register_builtins_agents() # Make the delegation tool available to the main agent register_tool("DelegateTool", DelegateTool) main_agent = Agent( llm=llm, tools=[Tool(name="DelegateTool")], ) conversation = Conversation( agent=main_agent, workspace=os.getcwd(), visualizer=DelegationVisualizer(name="Delegator"), ) print("=" * 100) print("Demonstrating London trip delegation (lodging + activities)...") print("=" * 100) conversation.send_message(""" Let's plan a trip to London. I have two specific areas to address: Lodging: What are the best areas to stay in while keeping a budget in mind? Activities: What are the top five must-see attractions and hidden gems? Please use delegation tools to handle these two tasks in parallel. 
class MinimalVisualizer(ConversationVisualizerBase):
    """Visualizer that prints one short raw line for every event it receives."""

    def on_event(self, event: Event) -> None:
        """Print the event's class name and the first 200 chars of its JSON dump."""
        event_name = type(event).__name__
        preview = event.model_dump_json()[:200]
        print(f"\n\n[EVENT] {event_name}: {preview}...")
# Tip: Set LMNR_PROJECT_API_KEY in your environment before running, e.g.:
#   export LMNR_PROJECT_API_KEY="your-laminar-api-key"
# For non-Laminar OTLP backends, set OTEL_* variables instead.
# NOTE(review): this script contains no explicit tracing setup call; per the
# module docstring, tracing is presumably switched on by the environment
# variables above -- confirm against the SDK.

# Configure LLM and Agent
# LLM_API_KEY is optional here: when it is unset or empty, api_key=None is
# passed to LLM instead of a SecretStr.
api_key = os.getenv("LLM_API_KEY")
model = os.getenv("LLM_MODEL", "openhands/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")
llm = LLM(
    model=model,
    api_key=SecretStr(api_key) if api_key else None,
    base_url=base_url,
    usage_id="agent",
)

# The agent gets a single tool: the terminal.
agent = Agent(
    llm=llm,
    tools=[Tool(name=TerminalTool.name)],
)

# Create conversation and run a simple task
# The workspace is the current directory (".").
conversation = Conversation(agent=agent, workspace=".")
conversation.send_message("List the files in the current directory and print them.")
conversation.run()
print(
    "All done! Check your Laminar dashboard for traces "
    "(session is the conversation UUID)."
)
def timestamp() -> str:
    """Return the current local time formatted as ``HH:MM:SS``."""
    now = datetime.now()
    return f"{now:%H:%M:%S}"
print(f"\n[{timestamp()}] Asking: {question_2}") response2 = conversation.ask_agent(question_2) questions_and_responses.append((question_2, response2)) print(f"Response: {response2}") time.sleep(1) question_3 = "Have you finished running?" print(f"\n[{timestamp()}] {question_3}") response3 = conversation.ask_agent(question_3) questions_and_responses.append((question_3, response3)) print(f"Response: {response3}") # Step 4: Wait for conversation to complete print(f"\n[{timestamp()}] Waiting for conversation to complete...") thread.join() # Step 5: Verify conversation state wasn't affected final_event_count = len(conversation.state.events) # Step 6: Ask a final question after conversation completion print(f"\n[{timestamp()}] Asking final question after completion...") final_response = conversation.ask_agent( "Can you summarize what you accomplished in this conversation?" ) print(f"Final response: {final_response}") # Step 7: Summary print("\n" + "=" * 60) print("SUMMARY OF ASK_AGENT DEMONSTRATION") print("=" * 60) print("\nQuestions and Responses:") for i, (question, response) in enumerate(questions_and_responses, 1): print(f"\n{i}. Q: {question}") print(f" A: {response[:100]}{'...' if len(response) > 100 else ''}") final_truncated = final_response[:100] + ("..." 
# Define streaming states
StreamingState = Literal["thinking", "content", "tool_name", "tool_args"]

# Kind of segment printed most recently; kept across on_token calls so that a
# label is written only when the kind of streamed output changes.
_current_state: StreamingState | None = None


def _write_segment(state: StreamingState, label: str, text: str) -> None:
    """Write ``text`` to stdout, starting a new labelled line on a state change."""
    global _current_state
    if _current_state != state:
        # Break the line only when some other segment was printed before.
        if _current_state is not None:
            sys.stdout.write("\n")
        sys.stdout.write(label)
        _current_state = state
    sys.stdout.write(text)
    sys.stdout.flush()


def on_token(chunk: ModelResponseStream) -> None:
    """Stream one chunk to stdout, labelling each kind of output.

    For every choice with a non-``None`` delta, non-empty string
    ``reasoning_content`` is written under ``THINKING: ``, non-empty string
    ``content`` under ``CONTENT: ``, and each tool call's function name and
    arguments under ``TOOL NAME: `` / ``TOOL ARGS: ``. A label (preceded by a
    newline unless nothing was printed yet) is emitted only when the kind of
    output differs from the previously printed one.
    """
    for choice in chunk.choices:
        delta = choice.delta
        if delta is None:
            continue

        # Thinking blocks (reasoning content)
        reasoning_text = getattr(delta, "reasoning_content", None)
        if isinstance(reasoning_text, str) and reasoning_text:
            _write_segment("thinking", "THINKING: ", reasoning_text)

        # Regular content
        content_text = getattr(delta, "content", None)
        if isinstance(content_text, str) and content_text:
            _write_segment("content", "CONTENT: ", content_text)

        # Tool calls: name and argument fragments are streamed separately.
        for tool_call in getattr(delta, "tool_calls", None) or ():
            name_piece = tool_call.function.name or ""
            args_piece = tool_call.function.arguments or ""
            if name_piece:
                _write_segment("tool_name", "TOOL NAME: ", name_piece)
            if args_piece:
                _write_segment("tool_args", "TOOL ARGS: ", args_piece)
# Configure LLM
api_key: str | None = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."

# Reuse the key validated above instead of reading the environment a second
# time; wrapping it in SecretStr matches the other standalone examples.
llm: LLM = LLM(
    model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
    api_key=SecretStr(api_key),
    base_url=os.getenv("LLM_BASE_URL", None),
    usage_id="agent",
    drop_params=True,
)

# Build tools list with Tom tools
# Note: Tom tools are automatically registered on import (PR #862)
tools = get_default_tools(enable_browser=False)

# Configure Tom tools with parameters
tom_params: dict[str, bool | str] = {
    "enable_rag": True,  # Enable RAG in Tom agent
}

# Add LLM configuration for Tom tools (uses same LLM as main agent)
tom_params["llm_model"] = llm.model
if llm.api_key:
    # The tool params carry the key as a plain string, so unwrap a SecretStr.
    if isinstance(llm.api_key, SecretStr):
        tom_params["api_key"] = llm.api_key.get_secret_value()
    else:
        tom_params["api_key"] = llm.api_key
if llm.base_url:
    tom_params["api_base"] = llm.base_url

# Add both Tom tools to the agent
tools.append(Tool(name=TomConsultTool.name, params=tom_params))
tools.append(Tool(name=SleeptimeComputeTool.name, params=tom_params))

# Create agent with Tom capabilities
# This agent can consult Tom for personalized guidance
# Note: Tom's user modeling data will be stored in ~/.openhands/
agent: Agent = Agent(llm=llm, tools=tools)

# Start conversation
cwd: str = os.getcwd()
PERSISTENCE_DIR = os.path.expanduser("~/.openhands")
CONVERSATIONS_DIR = os.path.join(PERSISTENCE_DIR, "conversations")
conversation = Conversation(
    agent=agent, workspace=cwd, persistence_dir=CONVERSATIONS_DIR
)

# Optionally run sleeptime compute to index existing conversations
# This builds user preferences and patterns from conversation history
# Using execute_tool allows running tools before conversation.run()
print("\nRunning sleeptime compute to index conversations...")
try:
    sleeptime_result = conversation.execute_tool(
        "sleeptime_compute", SleeptimeComputeAction()
    )
    # Narrow to the expected observation type for type-safe access
    if isinstance(sleeptime_result, SleeptimeComputeObservation):
        print(f"Result: {sleeptime_result.message}")
        print(f"Sessions processed: {sleeptime_result.sessions_processed}")
    else:
        print(f"Result: {sleeptime_result.text}")
except KeyError as e:
    # A KeyError from execute_tool is reported as the tool being unavailable;
    # the example then continues without indexing.
    print(f"Tool not available: {e}")

# Send a potentially vague message where Tom consultation might help
conversation.send_message(
    "I need to debug some code but I'm not sure where to start. "
    + "Can you help me figure out the best approach?"
)
conversation.run()

print("\n" + "=" * 80)
print("Tom agent consultation example completed!")
print("=" * 80)

# Optional: Index this conversation for Tom's user modeling
# This builds user preferences and patterns from conversation history
# Uncomment the lines below to index the conversation:
#
# conversation.send_message("Please index this conversation using sleeptime_compute")
# conversation.run()
# print("\nConversation indexed for user modeling!")

# Report cost exactly once, after all (including optional) work, so the
# EXAMPLE_COST line is not duplicated in the output.
cost = llm.metrics.accumulated_cost
print(f"EXAMPLE_COST: {cost}")
def setup_workspace() -> tuple[Path, Path, Path]:
    """Create a fresh temporary workspace for the refactoring workflow.

    Three subdirectories are created under a new temporary directory:
    ``cobol``, ``java`` and ``critiques``.

    Returns:
        The workspace root, the ``cobol`` directory and the ``java``
        directory, in that order. The ``critiques`` directory is created
        but not returned.
    """
    root = Path(tempfile.mkdtemp())
    subdirs = {name: root / name for name in ("cobol", "java", "critiques")}
    for directory in subdirs.values():
        directory.mkdir(parents=True, exist_ok=True)
    return root, subdirs["cobol"], subdirs["java"]
DISPLAY "ENTER ACCOUNT NUMBER: " ACCEPT WS-ACCOUNT-ID IF WS-ACCOUNT-ID = ZEROS MOVE "INVALID ACCOUNT NUMBER" TO WS-ERROR-MSG DISPLAY WS-ERROR-MSG ELSE DISPLAY "ACCOUNT: " WS-ACCOUNT-ID DISPLAY "STATUS: " WS-ACCOUNT-STATUS DISPLAY "BALANCE: " WS-ACCOUNT-BALANCE END-IF. 3000-TERMINATE. DISPLAY "PROGRAM COMPLETE". """, "CBCUS01C.cbl": """ IDENTIFICATION DIVISION. PROGRAM-ID. CBCUS01C. ***************************************************************** * Program: CBCUS01C - Customer Information Program * Purpose: Manage customer data operations ***************************************************************** ENVIRONMENT DIVISION. DATA DIVISION. WORKING-STORAGE SECTION. 01 WS-CUSTOMER-ID PIC 9(9). 01 WS-FIRST-NAME PIC X(25). 01 WS-LAST-NAME PIC X(25). 01 WS-ADDRESS PIC X(100). 01 WS-PHONE PIC X(15). 01 WS-EMAIL PIC X(50). 01 WS-OPERATION PIC X(1). 88 OP-ADD VALUE 'A'. 88 OP-UPDATE VALUE 'U'. 88 OP-DELETE VALUE 'D'. 88 OP-DISPLAY VALUE 'V'. PROCEDURE DIVISION. PERFORM 1000-MAIN-PROCESS. STOP RUN. 1000-MAIN-PROCESS. DISPLAY "CUSTOMER MANAGEMENT SYSTEM" DISPLAY "A-ADD U-UPDATE D-DELETE V-VIEW" ACCEPT WS-OPERATION EVALUATE TRUE WHEN OP-ADD PERFORM 2000-ADD-CUSTOMER WHEN OP-UPDATE PERFORM 3000-UPDATE-CUSTOMER WHEN OP-DELETE PERFORM 4000-DELETE-CUSTOMER WHEN OP-DISPLAY PERFORM 5000-DISPLAY-CUSTOMER WHEN OTHER DISPLAY "INVALID OPERATION" END-EVALUATE. 2000-ADD-CUSTOMER. DISPLAY "ADDING NEW CUSTOMER" ACCEPT WS-CUSTOMER-ID ACCEPT WS-FIRST-NAME ACCEPT WS-LAST-NAME DISPLAY "CUSTOMER ADDED: " WS-CUSTOMER-ID. 3000-UPDATE-CUSTOMER. DISPLAY "UPDATING CUSTOMER" ACCEPT WS-CUSTOMER-ID DISPLAY "CUSTOMER UPDATED: " WS-CUSTOMER-ID. 4000-DELETE-CUSTOMER. DISPLAY "DELETING CUSTOMER" ACCEPT WS-CUSTOMER-ID DISPLAY "CUSTOMER DELETED: " WS-CUSTOMER-ID. 5000-DISPLAY-CUSTOMER. DISPLAY "DISPLAYING CUSTOMER" ACCEPT WS-CUSTOMER-ID DISPLAY "ID: " WS-CUSTOMER-ID DISPLAY "NAME: " WS-FIRST-NAME " " WS-LAST-NAME. """, "CBTRN01C.cbl": """ IDENTIFICATION DIVISION. PROGRAM-ID. CBTRN01C. 
***************************************************************** * Program: CBTRN01C - Transaction Processing Program * Purpose: Process financial transactions ***************************************************************** ENVIRONMENT DIVISION. DATA DIVISION. WORKING-STORAGE SECTION. 01 WS-TRANS-ID PIC 9(16). 01 WS-TRANS-TYPE PIC X(2). 88 TRANS-CREDIT VALUE 'CR'. 88 TRANS-DEBIT VALUE 'DB'. 88 TRANS-TRANSFER VALUE 'TR'. 01 WS-TRANS-AMOUNT PIC S9(13)V99. 01 WS-FROM-ACCOUNT PIC 9(11). 01 WS-TO-ACCOUNT PIC 9(11). 01 WS-TRANS-DATE PIC 9(8). 01 WS-TRANS-STATUS PIC X(10). PROCEDURE DIVISION. PERFORM 1000-INITIALIZE. PERFORM 2000-PROCESS-TRANSACTION. PERFORM 3000-FINALIZE. STOP RUN. 1000-INITIALIZE. MOVE ZEROS TO WS-TRANS-ID MOVE SPACES TO WS-TRANS-TYPE MOVE ZEROS TO WS-TRANS-AMOUNT MOVE "PENDING" TO WS-TRANS-STATUS. 2000-PROCESS-TRANSACTION. DISPLAY "ENTER TRANSACTION TYPE (CR/DB/TR): " ACCEPT WS-TRANS-TYPE DISPLAY "ENTER AMOUNT: " ACCEPT WS-TRANS-AMOUNT EVALUATE TRUE WHEN TRANS-CREDIT PERFORM 2100-PROCESS-CREDIT WHEN TRANS-DEBIT PERFORM 2200-PROCESS-DEBIT WHEN TRANS-TRANSFER PERFORM 2300-PROCESS-TRANSFER WHEN OTHER MOVE "INVALID" TO WS-TRANS-STATUS END-EVALUATE. 2100-PROCESS-CREDIT. DISPLAY "PROCESSING CREDIT" ACCEPT WS-TO-ACCOUNT MOVE "COMPLETED" TO WS-TRANS-STATUS DISPLAY "CREDIT APPLIED TO: " WS-TO-ACCOUNT. 2200-PROCESS-DEBIT. DISPLAY "PROCESSING DEBIT" ACCEPT WS-FROM-ACCOUNT MOVE "COMPLETED" TO WS-TRANS-STATUS DISPLAY "DEBIT FROM: " WS-FROM-ACCOUNT. 2300-PROCESS-TRANSFER. DISPLAY "PROCESSING TRANSFER" ACCEPT WS-FROM-ACCOUNT ACCEPT WS-TO-ACCOUNT MOVE "COMPLETED" TO WS-TRANS-STATUS DISPLAY "TRANSFER FROM " WS-FROM-ACCOUNT " TO " WS-TO-ACCOUNT. 3000-FINALIZE. DISPLAY "TRANSACTION STATUS: " WS-TRANS-STATUS. 
""", } created_files = [] for filename, content in sample_files.items(): file_path = cobol_dir / filename file_path.write_text(content) created_files.append(filename) return created_files def get_refactoring_prompt( cobol_dir: Path, java_dir: Path, cobol_files: list[str], critique_file: Path | None = None, ) -> str: """Generate the prompt for the refactoring agent.""" files_list = "\n".join(f" - {f}" for f in cobol_files) base_prompt = f"""Convert the following COBOL files to Java: COBOL Source Directory: {cobol_dir} Java Target Directory: {java_dir} Files to convert: {files_list} Requirements: 1. Create a Java class for each COBOL program 2. Preserve the business logic and data structures 3. Use appropriate Java naming conventions (camelCase for methods, PascalCase) 4. Convert COBOL data types to appropriate Java types 5. Implement proper error handling with try-catch blocks 6. Add JavaDoc comments explaining the purpose of each class and method 7. In JavaDoc comments, include traceability to the original COBOL source using the format: @source : (e.g., @source CBACT01C.cbl:73-77) 8. Create a clean, maintainable object-oriented design 9. Each Java file should be compilable and follow Java best practices Read each COBOL file and create the corresponding Java file in the target directory. """ if critique_file and critique_file.exists(): base_prompt += f""" IMPORTANT: A previous refactoring attempt was evaluated and needs improvement. Please review the critique at: {critique_file} Address all issues mentioned in the critique to improve the conversion quality. """ return base_prompt def get_critique_prompt( cobol_dir: Path, java_dir: Path, cobol_files: list[str], ) -> str: """Generate the prompt for the critique agent.""" files_list = "\n".join(f" - {f}" for f in cobol_files) return f"""Evaluate the quality of COBOL to Java refactoring. 
COBOL Source Directory: {cobol_dir} Java Target Directory: {java_dir} Original COBOL files: {files_list} Please evaluate each converted Java file against its original COBOL source. For each file, assess: 1. Correctness: Does the Java code preserve the original business logic? (0-25 pts) 2. Code Quality: Is the code clean, readable, following Java conventions? (0-25 pts) 3. Completeness: Are all COBOL features properly converted? (0-25 pts) 4. Best Practices: Does it use proper OOP, error handling, documentation? (0-25 pts) Create a critique report in the following EXACT format: # COBOL to Java Refactoring Critique Report ## Summary [Brief overall assessment] ## File Evaluations ### [Original COBOL filename] - **Java File**: [corresponding Java filename or "NOT FOUND"] - **Correctness**: [score]/25 - [brief explanation] - **Code Quality**: [score]/25 - [brief explanation] - **Completeness**: [score]/25 - [brief explanation] - **Best Practices**: [score]/25 - [brief explanation] - **File Score**: [total]/100 - **Issues to Address**: - [specific issue 1] - [specific issue 2] ... [Repeat for each file] ## Overall Score - **Average Score**: [calculated average of all file scores] - **Recommendation**: [PASS if average >= 90, NEEDS_IMPROVEMENT otherwise] ## Priority Improvements 1. [Most critical improvement needed] 2. [Second priority] 3. 
def parse_critique_score(critique_file: Path) -> float:
    """Parse the average score from the critique report.

    The report is written by an LLM, so the "Average Score" line may use
    slightly different markdown than the requested template. The patterns
    below are tried in order, from strictest to loosest.

    Args:
        critique_file: Path to the markdown critique report.

    Returns:
        The parsed average score, or ``0.0`` when the file does not exist or
        no score can be found.
    """
    if not critique_file.exists():
        return 0.0

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    content = critique_file.read_text(encoding="utf-8")

    patterns = [
        # Exact template form: "**Average Score**: 87.5"
        r"\*\*Average Score\*\*:\s*(\d+(?:\.\d+)?)",
        # Plain form: "Average Score: 87.5"
        r"Average Score:\s*(\d+(?:\.\d+)?)",
        # Markdown variants with the colon inside the bold markers or a bold
        # value: "**Average Score:** 87.5", "**Average Score**: **87.5**/100"
        r"\*{0,2}Average Score\*{0,2}\s*:\s*\*{0,2}\s*(\d+(?:\.\d+)?)",
        # Loose fallback: "average ... 87.5/100", "average ... 87.5%"
        r"average.*?(\d+(?:\.\d+)?)\s*(?:/100|%|$)",
    ]

    for pattern in patterns:
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return float(match.group(1))

    return 0.0
get_refactoring_prompt( cobol_dir, java_dir, cobol_files, previous_critique ) refactoring_conversation.send_message(refactoring_prompt) refactoring_conversation.run() print("Refactoring phase complete.") # Phase 2: Critique print("\n--- Phase 2: Critique Agent ---") critique_agent = get_default_agent(llm=llm, cli_mode=True) critique_conversation = Conversation( agent=critique_agent, workspace=str(workspace_dir), ) critique_prompt = get_critique_prompt(cobol_dir, java_dir, cobol_files) critique_conversation.send_message(critique_prompt) critique_conversation.run() print("Critique phase complete.") # Parse the score current_score = parse_critique_score(critique_file) print(f"\nCurrent Score: {current_score:.1f}%") if current_score >= QUALITY_THRESHOLD: print(f"\n✓ Quality threshold ({QUALITY_THRESHOLD}%) met!") else: print( f"\n✗ Score below threshold ({QUALITY_THRESHOLD}%). " "Continuing refinement..." ) # Final summary print("\n" + "=" * 80) print("ITERATIVE REFINEMENT COMPLETE") print("=" * 80) print(f"Total iterations: {iteration}") print(f"Final score: {current_score:.1f}%") print(f"Workspace: {workspace_dir}") # List created Java files print("\nCreated Java files:") for java_file in java_dir.glob("*.java"): print(f" - {java_file.name}") # Show critique file location if critique_file.exists(): print(f"\nFinal critique report: {critique_file}") # Report cost cost = llm.metrics.accumulated_cost print(f"\nEXAMPLE_COST: {cost}") if __name__ == "__main__": run_iterative_refinement() ================================================ FILE: examples/01_standalone_sdk/32_configurable_security_policy.py ================================================ """OpenHands Agent SDK — Configurable Security Policy Example This example demonstrates how to use a custom security policy template with an agent. Security policies define risk assessment guidelines that help agents evaluate the safety of their actions. By default, agents use the built-in security_policy.j2 template. 
This example shows how to: 1. Use the default security policy 2. Provide a custom security policy template embedded in the script 3. Apply the custom policy to guide agent behavior """ import os import tempfile from pathlib import Path from pydantic import SecretStr from openhands.sdk import ( LLM, Agent, Conversation, Event, LLMConvertibleEvent, get_logger, ) from openhands.sdk.tool import Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool logger = get_logger(__name__) # Define a custom security policy template inline CUSTOM_SECURITY_POLICY = ( "# 🔐 Custom Security Risk Policy\n" "When using tools that support the security_risk parameter, assess the " "safety risk of your actions:\n" "\n" "- **LOW**: Safe read-only actions.\n" " - Viewing files, calculations, documentation.\n" "- **MEDIUM**: Moderate container-scoped actions.\n" " - File modifications, package installations.\n" "- **HIGH**: Potentially dangerous actions.\n" " - Network access, system modifications, data exfiltration.\n" "\n" "**Custom Rules**\n" "- Always prioritize user data safety.\n" "- Escalate to **HIGH** for any external data transmission.\n" ) # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") base_url = os.getenv("LLM_BASE_URL") llm = LLM( usage_id="agent", model=model, base_url=base_url, api_key=SecretStr(api_key), ) # Tools cwd = os.getcwd() tools = [ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), ] # Example 1: Agent with default security policy print("=" * 100) print("Example 1: Agent with default security policy") print("=" * 100) default_agent = Agent(llm=llm, tools=tools) print(f"Security policy filename: {default_agent.security_policy_filename}") print("\nDefault security policy is embedded in the agent's system message.") # Example 2: Agent with custom security policy print("\n" + "=" * 100) print("Example 2: Agent with custom security policy") print("=" * 100) # Create a temporary file for the custom security policy with tempfile.NamedTemporaryFile( mode="w", suffix=".j2", delete=False, encoding="utf-8" ) as temp_file: temp_file.write(CUSTOM_SECURITY_POLICY) custom_policy_path = temp_file.name try: # Create agent with custom security policy (using absolute path) custom_agent = Agent( llm=llm, tools=tools, security_policy_filename=custom_policy_path, ) print(f"Security policy filename: {custom_agent.security_policy_filename}") print("\nCustom security policy loaded from temporary file.") # Verify the custom policy is in the system message system_message = custom_agent.static_system_message if "Custom Security Risk Policy" in system_message: print("✓ Custom security policy successfully embedded in system message.") else: print("✗ Custom security policy not found in system message.") # Run a conversation with the custom agent print("\n" + "=" * 100) print("Running conversation with custom security policy") print("=" * 100) llm_messages = [] # collect raw LLM messages def conversation_callback(event: Event): if isinstance(event, LLMConvertibleEvent): llm_messages.append(event.to_llm_message()) conversation = Conversation( agent=custom_agent, 
- **main.py** - Complete hooks demo covering four hook types: PreToolUse, PostToolUse, UserPromptSubmit, and Stop
| |------|--------------|------------| | PreToolUse | Before tool execution | Yes (exit 2) | | PostToolUse | After tool execution | No | | UserPromptSubmit | Before processing user message | Yes (exit 2) | | Stop | When agent tries to finish | Yes (exit 2) | | SessionStart | When conversation starts | No | | SessionEnd | When conversation ends | No | ## Exit Codes Hook scripts signal their result via the exit code (matching the Claude Code hook contract): - **`0` — success.** The operation proceeds. `stdout` is parsed as JSON for structured output (`decision`, `reason`, `additionalContext`). - **`2` — block.** The operation is denied. For `Stop` hooks, this prevents the agent from finishing and the agent continues running. `stderr` / `reason` is surfaced as feedback. - **Any other non-zero exit code — non-blocking error.** The error is logged, but the operation still proceeds. > **Note:** Only exit code `2` blocks. Exit code `1` (the conventional Unix > failure code) is treated as a non-blocking error. A hook that is meant to > enforce a policy must exit with `2`. 
#!/bin/bash
# UserPromptSubmit hook: Inject git status when user asks about code changes
#
# Reads the hook payload from stdin. When the payload mentions changes, diff,
# git, commit or modified, and the current directory is a git repository with
# pending changes, prints a JSON object with an "additionalContext" field on
# stdout. Always exits 0, so this hook never blocks the prompt.

input=$(cat)

# Check if user is asking about changes, diff, or git
if printf '%s\n' "$input" | grep -qiE "(changes|diff|git|commit|modified)"; then
    # Get git status if in a git repo
    if git rev-parse --git-dir > /dev/null 2>&1; then
        status=$(git status --short 2>/dev/null | head -10)
        if [ -n "$status" ]; then
            # Escape for JSON. Backslashes must be doubled BEFORE quotes are
            # escaped: git prints unusual paths in C-quoted form (for example
            # "\303\251.txt"), and a bare backslash followed by a digit is not
            # a valid JSON escape. Newlines are flattened to spaces.
            escaped=$(printf '%s\n' "$status" \
                | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' \
                | tr '\n' ' ')
            echo "{\"additionalContext\": \"Current git status: $escaped\"}"
        fi
    fi
fi

exit 0
"""OpenHands Agent SDK — Hooks Example

Demonstrates the OpenHands hooks system.
Hooks are shell scripts that run at key lifecycle events:

- PreToolUse: Block dangerous commands before execution
- PostToolUse: Log tool usage after execution
- UserPromptSubmit: Inject context into user messages
- Stop: Enforce task completion criteria

The hook scripts are in the hook_scripts/ directory alongside this file.
"""

import os
import shlex
import signal
import subprocess
import tempfile
from pathlib import Path

from pydantic import SecretStr

from openhands.sdk import LLM, Conversation
from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher
from openhands.tools.preset.default import get_default_agent


# Make Ctrl+C raise KeyboardInterrupt in the main thread.
signal.signal(signal.SIGINT, signal.default_int_handler)

SCRIPT_DIR = Path(__file__).parent / "hook_scripts"


def _hook_command(script_name: str, **env: Path) -> str:
    """Build a hook command line, optionally prefixed with VAR=value pairs.

    Every path is passed through ``shlex.quote`` so that directories containing
    spaces or shell metacharacters do not break the command. For plain paths
    the result is identical to simple string interpolation.
    """
    parts = [f"{name}={shlex.quote(str(value))}" for name, value in env.items()]
    parts.append(shlex.quote(str(SCRIPT_DIR / script_name)))
    return " ".join(parts)


# Configure LLM
api_key = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."
model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
base_url = os.getenv("LLM_BASE_URL")
llm = LLM(
    usage_id="agent",
    model=model,
    base_url=base_url,
    api_key=SecretStr(api_key),
)

# Create temporary workspace with git repo
with tempfile.TemporaryDirectory() as tmpdir:
    workspace = Path(tmpdir)

    # Initialise the repo without going through a shell. This stays
    # best-effort: the demo still runs when git is unavailable, the
    # git-context hook simply has nothing to inject.
    try:
        subprocess.run(["git", "init", "-q"], cwd=workspace, check=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        print(f"Warning: could not initialise git repo in {workspace}: {exc}")
    (workspace / "file.txt").write_text("test\n", encoding="utf-8")

    log_file = workspace / "tool_usage.log"
    summary_file = workspace / "summary.txt"

    # Configure hooks using the typed approach (recommended)
    # This provides better type safety and IDE support
    hook_config = HookConfig(
        pre_tool_use=[
            HookMatcher(
                matcher="terminal",
                hooks=[
                    HookDefinition(
                        command=_hook_command("block_dangerous.sh"),
                        timeout=10,
                    )
                ],
            )
        ],
        post_tool_use=[
            HookMatcher(
                matcher="*",
                hooks=[
                    HookDefinition(
                        command=_hook_command("log_tools.sh", LOG_FILE=log_file),
                        timeout=5,
                    )
                ],
            )
        ],
        user_prompt_submit=[
            HookMatcher(
                hooks=[
                    HookDefinition(
                        command=_hook_command("inject_git_context.sh"),
                    )
                ],
            )
        ],
        stop=[
            HookMatcher(
                hooks=[
                    HookDefinition(
                        command=_hook_command(
                            "require_summary.sh", SUMMARY_FILE=summary_file
                        ),
                    )
                ],
            )
        ],
    )

    # Alternative: You can also use .from_dict() for loading from JSON config files
    # Example with a single hook matcher:
    # hook_config = HookConfig.from_dict({
    #     "hooks": {
    #         "PreToolUse": [{
    #             "matcher": "terminal",
    #             "hooks": [{"command": "path/to/script.sh", "timeout": 10}]
    #         }]
    #     }
    # })

    agent = get_default_agent(llm=llm)
    conversation = Conversation(
        agent=agent,
        workspace=str(workspace),
        hook_config=hook_config,
    )

    # Demo 1: Safe command (PostToolUse logs it)
    print("=" * 60)
    print("Demo 1: Safe command - logged by PostToolUse")
    print("=" * 60)
    conversation.send_message("Run: echo 'Hello from hooks!'")
    conversation.run()

    if log_file.exists():
        print(f"\n[Log: {log_file.read_text().strip()}]")

    # Demo 2: Dangerous command (PreToolUse blocks it)
    print("\n" + "=" * 60)
    print("Demo 2: Dangerous command - blocked by PreToolUse")
    print("=" * 60)
    conversation.send_message("Run: rm -rf /tmp/test")
    conversation.run()

    # Demo 3: Context injection + Stop hook enforcement
    print("\n" + "=" * 60)
    print("Demo 3: Context injection + Stop hook")
    print("=" * 60)
    print("UserPromptSubmit injects git status; Stop requires summary.txt\n")
    conversation.send_message(
        "Check what files have changes, then create summary.txt describing the repo."
    )
    conversation.run()

    if summary_file.exists():
        print(f"\n[summary.txt: {summary_file.read_text()[:80]}...]")

    print("\n" + "=" * 60)
    print("Example Complete!")
    print("=" * 60)

    cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
    print(f"\nEXAMPLE_COST: {cost}")
""" import os import re import tempfile from pathlib import Path from openhands.sdk import LLM, Agent, Conversation, Tool from openhands.sdk.critic import APIBasedCritic, IterativeRefinementConfig from openhands.sdk.critic.base import CriticBase from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool from openhands.tools.terminal import TerminalTool # Configuration # Higher threshold (70%) makes it more likely the agent needs multiple iterations, # which better demonstrates how iterative refinement works. # Adjust as needed to see different behaviors. SUCCESS_THRESHOLD = float(os.getenv("CRITIC_SUCCESS_THRESHOLD", "0.7")) MAX_ITERATIONS = int(os.getenv("MAX_ITERATIONS", "3")) def get_required_env(name: str) -> str: value = os.getenv(name) if value: return value raise ValueError( f"Missing required environment variable: {name}. " f"Set {name} before running this example." ) def get_default_critic(llm: LLM) -> CriticBase | None: """Auto-configure critic for All-Hands LLM proxy. When the LLM base_url matches `llm-proxy.*.all-hands.dev`, returns an APIBasedCritic configured with: - server_url: {base_url}/vllm - api_key: same as LLM - model_name: "critic" Args: llm: The LLM instance to derive critic configuration from. Returns: An APIBasedCritic if the LLM is configured for All-Hands proxy, None otherwise. 
Example: llm = LLM( model="anthropic/claude-sonnet-4-5", api_key=api_key, base_url="https://llm-proxy.eval.all-hands.dev", ) critic = get_default_critic(llm) if critic is None: # Fall back to explicit configuration critic = APIBasedCritic( server_url="https://my-critic-server.com", api_key="my-api-key", model_name="my-critic-model", ) """ base_url = llm.base_url api_key = llm.api_key if base_url is None or api_key is None: return None # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval) pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev" if not re.match(pattern, base_url): return None return APIBasedCritic( server_url=f"{base_url.rstrip('/')}/vllm", api_key=api_key, model_name="critic", ) # Task prompt designed to be moderately complex with subtle requirements. # The task is simple enough to complete in 1-2 iterations, but has specific # requirements that are easy to miss - triggering critic feedback. INITIAL_TASK_PROMPT = """\ Create a Python word statistics tool called `wordstats` that analyzes text files. ## Structure Create directory `wordstats/` with: - `stats.py` - Main module with `analyze_file(filepath)` function - `cli.py` - Command-line interface - `tests/test_stats.py` - Unit tests ## Requirements for stats.py The `analyze_file(filepath)` function must return a dict with these EXACT keys: - `lines`: total line count (including empty lines) - `words`: word count - `chars`: character count (including whitespace) - `unique_words`: count of unique words (case-insensitive) ### Important edge cases (often missed!): 1. Empty files must return all zeros, not raise an exception 2. Hyphenated words count as ONE word (e.g., "well-known" = 1 word) 3. Numbers like "123" or "3.14" are NOT counted as words 4. Contractions like "don't" count as ONE word 5. File not found must raise FileNotFoundError with a clear message ## Requirements for cli.py When run as `python cli.py `: - Print each stat on its own line: "Lines: X", "Words: X", etc. 
- Exit with code 1 if file not found, printing error to stderr - Exit with code 0 on success ## Required Tests (test_stats.py) Write tests that verify: 1. Basic counting on normal text 2. Empty file returns all zeros 3. Hyphenated words counted correctly 4. Numbers are excluded from word count 5. FileNotFoundError raised for missing files ## Verification Steps 1. Create a sample file `sample.txt` with this EXACT content (no trailing newline): ``` Hello world! This is a well-known test file. It has 5 lines, including empty ones. Numbers like 42 and 3.14 don't count as words. ``` 2. Run: `python wordstats/cli.py sample.txt` Expected output: - Lines: 5 - Words: 21 - Chars: 130 - Unique words: 21 3. Run the tests: `python -m pytest wordstats/tests/ -v` ALL tests must pass. The task is complete ONLY when: - All files exist - The CLI outputs the correct stats for sample.txt - All 5+ tests pass """ llm_api_key = get_required_env("LLM_API_KEY") # Use a weaker model to increase likelihood of needing multiple iterations llm_model = os.getenv("LLM_MODEL", "anthropic/claude-haiku-4-5-20251001") llm = LLM( model=llm_model, api_key=llm_api_key, top_p=0.95, base_url=os.getenv("LLM_BASE_URL"), ) # Setup critic with iterative refinement config # The IterativeRefinementConfig tells Conversation.run() to automatically # retry the task if the critic score is below the threshold iterative_config = IterativeRefinementConfig( success_threshold=SUCCESS_THRESHOLD, max_iterations=MAX_ITERATIONS, ) # Auto-configure critic for All-Hands proxy or use explicit env vars critic = get_default_critic(llm) if critic is None: print("⚠️ No All-Hands LLM proxy detected, trying explicit env vars...") critic = APIBasedCritic( server_url=get_required_env("CRITIC_SERVER_URL"), api_key=get_required_env("CRITIC_API_KEY"), model_name=get_required_env("CRITIC_MODEL_NAME"), iterative_refinement=iterative_config, ) else: # Add iterative refinement config to the auto-configured critic critic = 
critic.model_copy(update={"iterative_refinement": iterative_config}) # Create agent with critic (iterative refinement is built into the critic) agent = Agent( llm=llm, tools=[ Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name), Tool(name=TaskTrackerTool.name), ], critic=critic, ) # Create workspace workspace = Path(tempfile.mkdtemp(prefix="critic_demo_")) print(f"📁 Created workspace: {workspace}") # Create conversation - iterative refinement is handled automatically # by Conversation.run() based on the critic's config conversation = Conversation( agent=agent, workspace=str(workspace), ) print("\n" + "=" * 70) print("🚀 Starting Iterative Refinement with Critic Model") print("=" * 70) print(f"Success threshold: {SUCCESS_THRESHOLD:.0%}") print(f"Max iterations: {MAX_ITERATIONS}") # Send the task and run - Conversation.run() handles retries automatically conversation.send_message(INITIAL_TASK_PROMPT) conversation.run() # Print additional info about created files print("\nCreated files:") for path in sorted(workspace.rglob("*")): if path.is_file(): relative = path.relative_to(workspace) print(f" - {relative}") # Report cost cost = llm.metrics.accumulated_cost print(f"\nEXAMPLE_COST: {cost:.4f}") ================================================ FILE: examples/01_standalone_sdk/35_subscription_login.py ================================================ """Example: Using ChatGPT subscription for Codex models. This example demonstrates how to use your ChatGPT Plus/Pro subscription to access OpenAI's Codex models without consuming API credits. 
The subscription_login() method handles: - OAuth PKCE authentication flow - Device-code authentication for remote/headless environments - Credential caching (~/.openhands/auth/) - Automatic token refresh Supported models: - gpt-5.2-codex - gpt-5.2 - gpt-5.1-codex-max - gpt-5.1-codex-mini Requirements: - Active ChatGPT Plus or Pro subscription - Browser access for initial OAuth login, or another browser/device for device-code login Environment variables: - OPENHANDS_SUBSCRIPTION_MODEL: Model to use (default: gpt-5.2-codex) - OPENHANDS_SUBSCRIPTION_AUTH_METHOD: "browser" or "device_code" (default: browser) - OPENHANDS_SUBSCRIPTION_FORCE_LOGIN: Set to "1" to force fresh login - SUBSCRIPTION_LOGIN_ONLY: Set to "1" to verify login without running an agent """ import os from typing import Literal from openhands.sdk import LLM, Agent, Conversation, Tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool AuthMethod = Literal["browser", "device_code"] # First time: Opens browser for OAuth login # Subsequent calls: Reuses cached credentials (auto-refreshes if expired) model = os.getenv("OPENHANDS_SUBSCRIPTION_MODEL", "gpt-5.2-codex") auth_method_env = os.getenv("OPENHANDS_SUBSCRIPTION_AUTH_METHOD", "browser") if auth_method_env not in ("browser", "device_code"): raise ValueError( "OPENHANDS_SUBSCRIPTION_AUTH_METHOD must be 'browser' or 'device_code'" ) auth_method: AuthMethod = auth_method_env force_login = os.getenv("OPENHANDS_SUBSCRIPTION_FORCE_LOGIN") == "1" llm = LLM.subscription_login( vendor="openai", model=model, # or "gpt-5.2", "gpt-5.1-codex-max", "gpt-5.1-codex-mini" auth_method=auth_method, force_login=force_login, ) # Alternative: Force a fresh login (useful if credentials are stale) # llm = LLM.subscription_login(vendor="openai", model="gpt-5.2-codex", force_login=True) # Alternative: Disable auto-opening browser (prints URL to console instead) # llm = LLM.subscription_login( # vendor="openai", 
"""Load persisted events and convert them into LLM-ready messages.

The script runs a short two-turn conversation with persistence enabled,
reads the ``event-*.json`` files back from disk, converts the
LLM-convertible ones into messages, and logs them in the format of whichever
API the configured model uses (Responses or Chat Completions).
"""

import json
import os
import uuid
from pathlib import Path

from pydantic import SecretStr


run_id = uuid.uuid4()
conversations_root = Path(".conversations")
logs_path = conversations_root / "logs" / "event-json-to-openai-messages" / run_id.hex

# Logging defaults are exported before the SDK imports below (hence the E402
# suppressions) -- presumably the SDK logger reads them at import time.
# ``setdefault`` keeps any value the caller already exported.
for env_name, env_default in (
    ("LOG_JSON", "true"),
    ("LOG_TO_FILE", "true"),
    ("LOG_DIR", str(logs_path)),
    ("LOG_LEVEL", "INFO"),
):
    os.environ.setdefault(env_name, env_default)

from openhands.sdk import (  # noqa: E402
    LLM,
    Agent,
    Conversation,
    Event,
    LLMConvertibleEvent,
    Tool,
)
from openhands.sdk.logger import get_logger, setup_logging  # noqa: E402
from openhands.tools.terminal import TerminalTool  # noqa: E402


setup_logging(log_to_file=True, log_dir=str(logs_path))
logger = get_logger(__name__)

api_key = os.getenv("LLM_API_KEY")
if not api_key:
    raise RuntimeError("LLM_API_KEY environment variable is not set.")

llm = LLM(
    usage_id="agent",
    model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=SecretStr(api_key),
)
terminal_only_tools = [Tool(name=TerminalTool.name)]
agent = Agent(llm=llm, tools=terminal_only_tools)

######
# Create a conversation that persists its events
######

conversation = Conversation(
    agent=agent,
    workspace=os.getcwd(),
    persistence_dir=str(conversations_root),
    conversation_id=run_id,
)

# Two turns: one that uses the terminal tool, one that is answered without tools.
first_turn = (
    "Use the terminal tool to run `pwd` and write the output to tool_output.txt. "
    "Reply with a short confirmation once done."
)
second_turn = "Without using any tools, summarize in one sentence what you did."
for user_turn in (first_turn, second_turn):
    conversation.send_message(user_turn)
    conversation.run()

assert conversation.state.persistence_dir is not None
events_path = Path(conversation.state.persistence_dir) / "events"
event_files = sorted(events_path.glob("event-*.json"))
if not event_files:
    raise RuntimeError("No event files found. Was persistence enabled?")

######
# Read from serialized events
######

# Deserialize each file and keep only the events that can become LLM messages.
replayable_events = []
for event_file in event_files:
    restored = Event.model_validate_json(event_file.read_text())
    if isinstance(restored, LLMConvertibleEvent):
        replayable_events.append(restored)

llm_messages = LLMConvertibleEvent.events_to_messages(replayable_events)

if not llm.uses_responses_api():
    logger.info("Formatting messages for the OpenAI Chat Completions API.")
    chat_messages = llm.format_messages_for_llm(llm_messages)
    logger.info("Chat Completions messages:\n%s", json.dumps(chat_messages, indent=2))
else:
    logger.info("Formatting messages for the OpenAI Responses API.")
    instructions, input_items = llm.format_messages_for_responses(llm_messages)
    logger.info("Responses instructions:\n%s", instructions)
    logger.info("Responses input:\n%s", json.dumps(input_items, indent=2))

# Report cost
cost = llm.metrics.accumulated_cost
print(f"EXAMPLE_COST: {cost}")
"""Example: Using LLMProfileStore to save and reuse LLM configurations.

This example ships with one pre-generated profile JSON file and creates another
profile at runtime. The checked-in profile comes from a normal save, so secrets are
masked instead of exposed and non-secret fields like `base_url` are kept when present.

The profiles are copied into a scratch directory that is removed when the example
finishes, so repeated runs leave nothing behind on disk.
"""

import os
import shutil
import tempfile
from pathlib import Path

from pydantic import SecretStr

from openhands.sdk import LLM, LLMProfileStore


SCRIPT_DIR = Path(__file__).parent
EXAMPLE_PROFILES_DIR = SCRIPT_DIR / "profiles"
DEFAULT_MODEL = "anthropic/claude-sonnet-4-5-20250929"

# Treat an unset and an empty LLM_API_KEY the same way ("no key"). Without this
# normalisation an empty key would reach the masking check below, where
# `"" in text` is always true and the assertion would fail spuriously.
api_key = os.getenv("LLM_API_KEY") or None

# TemporaryDirectory removes the scratch copy of the profiles on exit, including
# when one of the steps below raises.
with tempfile.TemporaryDirectory() as scratch_root:
    profile_store_dir = Path(scratch_root) / "profiles"
    shutil.copytree(EXAMPLE_PROFILES_DIR, profile_store_dir)
    store = LLMProfileStore(base_dir=profile_store_dir)

    print(f"Seeded profiles: {store.list()}")

    creative_llm = LLM(
        usage_id="creative",
        model=os.getenv("LLM_MODEL", DEFAULT_MODEL),
        api_key=SecretStr(api_key) if api_key else None,
        base_url=os.getenv("LLM_BASE_URL"),
        temperature=0.9,
    )

    # The checked-in fast.json was generated with a normal save, so its api_key is
    # masked and any configured base_url would be preserved. This runtime profile
    # also avoids persisting the real API key because secrets are masked by default.
    store.save("creative", creative_llm)
    creative_profile_json = (profile_store_dir / "creative.json").read_text()
    if api_key is not None:
        assert api_key not in creative_profile_json, (
            "The saved profile must not contain the real API key."
        )

    print(f"Stored profiles: {store.list()}")

    fast_profile = store.load("fast")
    creative_profile = store.load("creative")

    print(
        "Loaded fast profile. "
        f"usage: {fast_profile.usage_id}, "
        f"model: {fast_profile.model}, "
        f"temperature: {fast_profile.temperature}."
    )
    print(
        "Loaded creative profile. "
        f"usage: {creative_profile.usage_id}, "
        f"model: {creative_profile.model}, "
        f"temperature: {creative_profile.temperature}."
    )

    store.delete("creative")
    print(f"After deletion: {store.list()}")

print("EXAMPLE_COST: 0")
llm_messages = []  # collect raw LLM messages


def conversation_callback(event: Event) -> None:
    """Append the LLM message for every LLM-convertible event to ``llm_messages``.

    Events that are not ``LLMConvertibleEvent`` instances are ignored.
    """
    if isinstance(event, LLMConvertibleEvent):
        llm_messages.append(event.to_llm_message())


# Create conversation with persistence_dir set to save browser recordings
conversation = Conversation(
    agent=agent,
    callbacks=[conversation_callback],
    workspace=cwd,
    persistence_dir="./.conversations",
)

# The prompt instructs the agent to:
# 1. Start recording the browser session
# 2. Navigate to a page and get its content
# 3. Stop recording (auto-saves to file)
PROMPT = """
Please complete the following task to demonstrate browser session recording:

1. Use `browser_start_recording` to begin recording.
2. Navigate to https://docs.openhands.dev/ and:
   - Get the page content
   - Scroll down the page
   - Get the browser state to see interactive elements
3. Use `browser_stop_recording` to stop and save the recording.
"""

print("=" * 80)
print("Browser Session Recording Example")
print("=" * 80)
print("\nTask: Record an agent's browser session and save it for replay")

# Pre-initialize the browser so CDP is ready before the agent starts.
# This avoids wasting LLM calls if the browser fails to connect.
print("\nInitializing browser...")
init_obs = conversation.execute_tool(
    "browser_navigate",
    BrowserNavigateAction(url="about:blank"),
)
if init_obs.is_error:
    print(f"Browser initialization failed: {init_obs.text}")
    print("Ensure Chrome/Chromium is installed and accessible.")
    # Release the browser and other tool executors before bailing out; the
    # normal path only closes the conversation at the very end of the script.
    conversation.close()
    # `exit` is injected by the `site` module and is not guaranteed to exist;
    # raising SystemExit gives the same exit status without relying on it.
    raise SystemExit(1)
print("Browser initialized successfully.\n")

print("Starting conversation with agent...\n")
conversation.send_message(PROMPT)
conversation.run()

print("\n" + "=" * 80)
print("Conversation finished!")
print("=" * 80)
"""Example: Using FallbackStrategy for LLM resilience.

When the primary LLM fails with a transient error (rate limit, timeout, etc.),
FallbackStrategy automatically tries alternate LLMs in order. Fallback is
per-call: each new request starts with the primary model. Token usage and
cost from fallback calls are merged into the primary LLM's metrics.

This example:
  1. Saves two fallback LLM profiles to a temporary store.
  2. Configures a primary LLM with a FallbackStrategy pointing at those profiles.
  3. Runs a conversation — if the primary model is unavailable, the agent
     transparently falls back to the next available model.

The temporary store holds the API keys in clear text (the profiles are saved
with ``include_secrets=True``), so it is deleted as soon as the conversation
has finished.
"""

import os
import tempfile

from pydantic import SecretStr

from openhands.sdk import LLM, Agent, Conversation, LLMProfileStore, Tool
from openhands.sdk.llm import FallbackStrategy
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.terminal import TerminalTool


# Read configuration from environment
api_key = os.getenv("LLM_API_KEY")
if not api_key:
    # An explicit raise (rather than `assert`) still runs under `python -O`.
    raise RuntimeError("LLM_API_KEY environment variable is not set.")
base_url = os.getenv("LLM_BASE_URL")
primary_model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")

# Use a temporary directory so this example doesn't pollute your home folder.
# In real usage you can omit base_dir to use the default (~/.openhands/profiles).
# The context manager removes the directory -- and the unmasked API keys saved
# in it -- once the conversation is done, even if a step below raises.
with tempfile.TemporaryDirectory() as profile_store_dir:
    store = LLMProfileStore(base_dir=profile_store_dir)

    fallback_1 = LLM(
        usage_id="fallback-1",
        model=os.getenv("LLM_FALLBACK_MODEL_1", "openai/gpt-4o"),
        api_key=SecretStr(os.getenv("LLM_FALLBACK_API_KEY_1", api_key)),
        base_url=os.getenv("LLM_FALLBACK_BASE_URL_1", base_url),
    )
    store.save("fallback-1", fallback_1, include_secrets=True)

    fallback_2 = LLM(
        usage_id="fallback-2",
        model=os.getenv("LLM_FALLBACK_MODEL_2", "openai/gpt-4o-mini"),
        api_key=SecretStr(os.getenv("LLM_FALLBACK_API_KEY_2", api_key)),
        base_url=os.getenv("LLM_FALLBACK_BASE_URL_2", base_url),
    )
    store.save("fallback-2", fallback_2, include_secrets=True)

    print(f"Saved fallback profiles: {store.list()}")

    # Configure the primary LLM with a FallbackStrategy
    primary_llm = LLM(
        usage_id="agent-primary",
        model=primary_model,
        api_key=SecretStr(api_key),
        base_url=base_url,
        fallback_strategy=FallbackStrategy(
            fallback_llms=["fallback-1", "fallback-2"],
            profile_store_dir=profile_store_dir,
        ),
    )

    # Run a conversation
    agent = Agent(
        llm=primary_llm,
        tools=[
            Tool(name=TerminalTool.name),
            Tool(name=FileEditorTool.name),
        ],
    )

    conversation = Conversation(agent=agent, workspace=os.getcwd())
    conversation.send_message("Write a haiku about resilience into HAIKU.txt.")
    conversation.run()

# Inspect metrics (includes any fallback usage)
metrics = primary_llm.metrics
print(f"Total cost (including fallbacks): ${metrics.accumulated_cost:.6f}")
print(f"Token usage records: {len(metrics.token_usages)}")
for usage in metrics.token_usages:
    print(
        f"  model={usage.model}"
        f"  prompt={usage.prompt_tokens}"
        f"  completion={usage.completion_tokens}"
    )

print(f"EXAMPLE_COST: {metrics.accumulated_cost}")
) conversation.run() # --- Image input turn (text + image) --- print("\n--- image input ---") conversation.send_message( Message( role="user", content=[ TextContent( text="Describe what you see in this image in one sentence." ), ImageContent(image_urls=[IMAGE_URL]), ], ) ) conversation.run() # --- ask_agent: stateless side-question via fork_session --- print("\n--- ask_agent ---") response = conversation.ask_agent( "Based on what you just saw, which agent class is the newest addition?" ) print(f"ask_agent response: {response}") # Report cost (ACP server reports usage via session_update notifications) cost = agent.llm.metrics.accumulated_cost print(f"EXAMPLE_COST: {cost:.4f}") finally: # Clean up the ACP server subprocess agent.close() cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost print(f"\nEXAMPLE_COST: {cost}") print("Done!") ================================================ FILE: examples/01_standalone_sdk/41_task_tool_set.py ================================================ """ Animal Quiz with Task Tool Set Demonstrates the TaskToolSet with a main agent delegating to an animal-expert sub-agent. The flow is: 1. Main agent picks an animal and delegates to the "animal_expert" sub-agent to generate a multiple-choice question about it. 2. Main agent thinks about the question and picks an answer. 3. Main agent resumes the same sub-agent conversation to ask whether its answer is correct. The sub-agent confirms or corrects it. 
""" import os from openhands.sdk import LLM, Agent, AgentContext, Conversation, Tool from openhands.sdk.context import Skill from openhands.sdk.subagent import register_agent from openhands.tools.delegate import DelegationVisualizer from openhands.tools.task import TaskToolSet llm = LLM( model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL", None), ) # ── Register the animal expert sub-agent ───────────────────────────── def create_animal_expert(llm: LLM) -> Agent: """Factory for the animal-expert sub-agent.""" return Agent( llm=llm, tools=[], # no tools needed – pure knowledge agent_context=AgentContext( skills=[ Skill( name="animal_expertise", content=( "You are a world-class zoologist. " "When asked to generate a quiz question, respond with " "EXACTLY this format and nothing else:\n\n" "Question: \n" "A)