Repository: browserbase/stagehand Branch: main Commit: 2e99c9d9814e Files: 799 Total size: 4.9 MB Directory structure: gitextract_8uatdfoc/ ├── .changeset/ │ ├── config.json │ └── crazy-nights-prove.md ├── .cursorrules ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ ├── actions/ │ │ ├── select-browserbase-region/ │ │ │ └── action.yml │ │ ├── setup-node-pnpm-turbo/ │ │ │ └── action.yml │ │ ├── upload-ctrf-report/ │ │ │ └── action.yml │ │ ├── upload-v8-coverage/ │ │ │ └── action.yml │ │ └── verify-chromium-launch/ │ │ └── action.yml │ ├── pull_request_template │ └── workflows/ │ ├── ci.yml │ ├── claude.yml │ ├── external-contributor-pr-approval-handoff.yml │ ├── external-contributor-pr.yml │ ├── feature-parity.yml │ ├── release.yml │ ├── stagehand-server-v3-release.yml │ ├── stagehand-server-v3-sea-build.yml │ ├── stagehand-server-v4-release.yml │ ├── stagehand-server-v4-sea-build.yml │ └── stainless.yml ├── .gitignore ├── .prettierignore ├── .prettierrc ├── .vscode/ │ └── settings.json ├── CHANGELOG.md ├── LICENSE ├── README.md ├── claude.md ├── eslint.config.mjs ├── package.json ├── packages/ │ ├── README.md │ ├── cli/ │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── package.json │ │ ├── src/ │ │ │ └── index.ts │ │ ├── tests/ │ │ │ ├── cli.test.ts │ │ │ └── mode.test.ts │ │ ├── tsconfig.json │ │ ├── tsup.config.ts │ │ └── vitest.config.ts │ ├── core/ │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── examples/ │ │ │ ├── 2048.ts │ │ │ ├── CHANGELOG.md │ │ │ ├── actionable_observe_example.ts │ │ │ ├── agent-custom-tools.ts │ │ │ ├── agent_stream_example.ts │ │ │ ├── cua-example.ts │ │ │ ├── custom_client_aisdk.ts │ │ │ ├── custom_client_langchain.ts │ │ │ ├── custom_client_openai.ts │ │ │ ├── example.ts │ │ │ ├── external_clients/ │ │ │ │ ├── aisdk.ts │ │ │ │ ├── customOpenAI.ts │ │ │ │ └── langchain.ts │ │ │ ├── form_filling_sensible.ts │ │ │ ├── google_enter.ts │ │ │ ├── instructions.ts │ │ │ ├── integrations/ │ │ │ │ ├── exa.ts │ │ 
│ │ └── supabase.ts │ │ │ ├── mcp.ts │ │ │ ├── operator-example.ts │ │ │ ├── oss-cua-example.ts │ │ │ ├── parameterizeApiKey.ts │ │ │ ├── persist_logs_example.ts │ │ │ ├── tsconfig.json │ │ │ ├── v3/ │ │ │ │ ├── cuaReplay.ts │ │ │ │ ├── deepLocator.ts │ │ │ │ ├── dropdown.ts │ │ │ │ ├── highlight.ts │ │ │ │ ├── patchright.ts │ │ │ │ ├── playwright.ts │ │ │ │ ├── puppeteer.ts │ │ │ │ ├── recordVideo.ts │ │ │ │ ├── returnXpath.ts │ │ │ │ ├── shadowRoot.ts │ │ │ │ ├── targetedExtract.ts │ │ │ │ └── v3_agent.ts │ │ │ ├── v3_example.ts │ │ │ └── wordle.ts │ │ ├── lib/ │ │ │ ├── CHANGELOG.md │ │ │ ├── inference.ts │ │ │ ├── inferenceLogUtils.ts │ │ │ ├── logger.ts │ │ │ ├── modelUtils.ts │ │ │ ├── prompt.ts │ │ │ ├── utils.ts │ │ │ ├── v3/ │ │ │ │ ├── agent/ │ │ │ │ │ ├── AgentClient.ts │ │ │ │ │ ├── AgentProvider.ts │ │ │ │ │ ├── AnthropicCUAClient.ts │ │ │ │ │ ├── GoogleCUAClient.ts │ │ │ │ │ ├── MicrosoftCUAClient.ts │ │ │ │ │ ├── OpenAICUAClient.ts │ │ │ │ │ ├── prompts/ │ │ │ │ │ │ └── agentSystemPrompt.ts │ │ │ │ │ ├── tools/ │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── act.ts │ │ │ │ │ │ ├── ariaTree.ts │ │ │ │ │ │ ├── braveSearch.ts │ │ │ │ │ │ ├── browserbaseSearch.ts │ │ │ │ │ │ ├── click.ts │ │ │ │ │ │ ├── clickAndHold.ts │ │ │ │ │ │ ├── dragAndDrop.ts │ │ │ │ │ │ ├── extract.ts │ │ │ │ │ │ ├── fillFormVision.ts │ │ │ │ │ │ ├── fillform.ts │ │ │ │ │ │ ├── goto.ts │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ ├── keys.ts │ │ │ │ │ │ ├── navback.ts │ │ │ │ │ │ ├── screenshot.ts │ │ │ │ │ │ ├── scroll.ts │ │ │ │ │ │ ├── think.ts │ │ │ │ │ │ ├── type.ts │ │ │ │ │ │ └── wait.ts │ │ │ │ │ └── utils/ │ │ │ │ │ ├── actionMapping.ts │ │ │ │ │ ├── captchaSolver.ts │ │ │ │ │ ├── coordinateNormalization.ts │ │ │ │ │ ├── cuaKeyMapping.ts │ │ │ │ │ ├── googleCustomToolHandler.ts │ │ │ │ │ ├── handleDoneToolCall.ts │ │ │ │ │ ├── imageCompression.ts │ │ │ │ │ ├── messageProcessing.ts │ │ │ │ │ ├── screenshotHandler.ts │ │ │ │ │ ├── validateExperimentalFeatures.ts │ │ │ │ │ ├── 
variables.ts │ │ │ │ │ └── xpath.ts │ │ │ │ ├── api.ts │ │ │ │ ├── cache/ │ │ │ │ │ ├── ActCache.ts │ │ │ │ │ ├── AgentCache.ts │ │ │ │ │ ├── CacheStorage.ts │ │ │ │ │ ├── serverAgentCache.ts │ │ │ │ │ └── utils.ts │ │ │ │ ├── cli.js │ │ │ │ ├── dom/ │ │ │ │ │ ├── a11yScripts/ │ │ │ │ │ │ └── index.ts │ │ │ │ │ ├── genA11yScripts.ts │ │ │ │ │ ├── genDomScripts.ts │ │ │ │ │ ├── genLocatorScripts.ts │ │ │ │ │ ├── genScreenshotScripts.ts │ │ │ │ │ ├── global.d.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── locatorScripts/ │ │ │ │ │ │ ├── counts.ts │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ ├── scripts.ts │ │ │ │ │ │ ├── selectors.ts │ │ │ │ │ │ ├── waitForSelector.ts │ │ │ │ │ │ ├── xpathParser.ts │ │ │ │ │ │ └── xpathResolver.ts │ │ │ │ │ ├── piercer.entry.ts │ │ │ │ │ ├── piercer.runtime.ts │ │ │ │ │ ├── rerenderMissingShadows.entry.ts │ │ │ │ │ ├── rerenderMissingShadows.runtime.ts │ │ │ │ │ └── screenshotScripts/ │ │ │ │ │ ├── index.ts │ │ │ │ │ └── resolveMaskRect.ts │ │ │ │ ├── external_clients/ │ │ │ │ │ ├── aisdk.ts │ │ │ │ │ └── customOpenAI.ts │ │ │ │ ├── flowlogger/ │ │ │ │ │ ├── EventEmitter.ts │ │ │ │ │ ├── EventSink.ts │ │ │ │ │ ├── EventStore.ts │ │ │ │ │ ├── FlowLogger.ts │ │ │ │ │ └── prettify.ts │ │ │ │ ├── handlers/ │ │ │ │ │ ├── actHandler.ts │ │ │ │ │ ├── extractHandler.ts │ │ │ │ │ ├── handlerUtils/ │ │ │ │ │ │ ├── actHandlerUtils.ts │ │ │ │ │ │ └── timeoutGuard.ts │ │ │ │ │ ├── observeHandler.ts │ │ │ │ │ ├── v3AgentHandler.ts │ │ │ │ │ └── v3CuaAgentHandler.ts │ │ │ │ ├── index.ts │ │ │ │ ├── launch/ │ │ │ │ │ ├── browserbase.ts │ │ │ │ │ └── local.ts │ │ │ │ ├── llm/ │ │ │ │ │ ├── AnthropicClient.ts │ │ │ │ │ ├── CerebrasClient.ts │ │ │ │ │ ├── GoogleClient.ts │ │ │ │ │ ├── GroqClient.ts │ │ │ │ │ ├── LLMClient.ts │ │ │ │ │ ├── LLMProvider.ts │ │ │ │ │ ├── OpenAIClient.ts │ │ │ │ │ └── aisdk.ts │ │ │ │ ├── logger.ts │ │ │ │ ├── mcp/ │ │ │ │ │ ├── connection.ts │ │ │ │ │ └── utils.ts │ │ │ │ ├── runtimePaths.ts │ │ │ │ ├── shutdown/ │ │ │ │ │ ├── 
cleanupLocal.ts │ │ │ │ │ ├── supervisor.ts │ │ │ │ │ └── supervisorClient.ts │ │ │ │ ├── timeoutConfig.ts │ │ │ │ ├── types/ │ │ │ │ │ ├── private/ │ │ │ │ │ │ ├── agent.ts │ │ │ │ │ │ ├── api.ts │ │ │ │ │ │ ├── cache.ts │ │ │ │ │ │ ├── evaluator.ts │ │ │ │ │ │ ├── handlers.ts │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ ├── internal.ts │ │ │ │ │ │ ├── locator.ts │ │ │ │ │ │ ├── network.ts │ │ │ │ │ │ ├── shutdown.ts │ │ │ │ │ │ ├── shutdownErrors.ts │ │ │ │ │ │ └── snapshot.ts │ │ │ │ │ └── public/ │ │ │ │ │ ├── agent.ts │ │ │ │ │ ├── api.ts │ │ │ │ │ ├── apiErrors.ts │ │ │ │ │ ├── context.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── locator.ts │ │ │ │ │ ├── logs.ts │ │ │ │ │ ├── methods.ts │ │ │ │ │ ├── metrics.ts │ │ │ │ │ ├── model.ts │ │ │ │ │ ├── options.ts │ │ │ │ │ ├── page.ts │ │ │ │ │ ├── screenshotTypes.ts │ │ │ │ │ └── sdkErrors.ts │ │ │ │ ├── understudy/ │ │ │ │ │ ├── a11y/ │ │ │ │ │ │ └── snapshot/ │ │ │ │ │ │ ├── a11yTree.ts │ │ │ │ │ │ ├── activeElement.ts │ │ │ │ │ │ ├── capture.ts │ │ │ │ │ │ ├── coordinateResolver.ts │ │ │ │ │ │ ├── domTree.ts │ │ │ │ │ │ ├── focusSelectors.ts │ │ │ │ │ │ ├── index.ts │ │ │ │ │ │ ├── sessions.ts │ │ │ │ │ │ ├── treeFormatUtils.ts │ │ │ │ │ │ └── xpathUtils.ts │ │ │ │ │ ├── a11yInvocation.ts │ │ │ │ │ ├── cdp.ts │ │ │ │ │ ├── consoleMessage.ts │ │ │ │ │ ├── context.ts │ │ │ │ │ ├── cookies.ts │ │ │ │ │ ├── deepLocator.ts │ │ │ │ │ ├── executionContextRegistry.ts │ │ │ │ │ ├── fileUploadUtils.ts │ │ │ │ │ ├── frame.ts │ │ │ │ │ ├── frameLocator.ts │ │ │ │ │ ├── frameRegistry.ts │ │ │ │ │ ├── initScripts.ts │ │ │ │ │ ├── lifecycleWatcher.ts │ │ │ │ │ ├── locator.ts │ │ │ │ │ ├── locatorInvocation.ts │ │ │ │ │ ├── navigationResponseTracker.ts │ │ │ │ │ ├── networkManager.ts │ │ │ │ │ ├── page.ts │ │ │ │ │ ├── piercer.ts │ │ │ │ │ ├── response.ts │ │ │ │ │ ├── screenshotUtils.ts │ │ │ │ │ └── selectorResolver.ts │ │ │ │ ├── v3.ts │ │ │ │ └── zodCompat.ts │ │ │ └── v3Evaluator.ts │ │ ├── package.json │ │ ├── scripts/ │ │ │ ├── 
build-cjs.ts │ │ │ ├── build-esm.ts │ │ │ ├── coverage.ts │ │ │ ├── gen-version.ts │ │ │ ├── normalize-v8-coverage.ts │ │ │ ├── prepare.js │ │ │ ├── test-core.ts │ │ │ ├── test-e2e.ts │ │ │ └── test-utils.ts │ │ ├── tests/ │ │ │ ├── cache-variables.test.ts │ │ │ ├── integration/ │ │ │ │ ├── agent-abort-signal.spec.ts │ │ │ │ ├── agent-cache-self-heal.spec.ts │ │ │ │ ├── agent-callbacks.spec.ts │ │ │ │ ├── agent-captcha-autosolve.spec.ts │ │ │ │ ├── agent-experimental-validation.spec.ts │ │ │ │ ├── agent-hybrid-mode.spec.ts │ │ │ │ ├── agent-message-continuation.spec.ts │ │ │ │ ├── agent-streaming.spec.ts │ │ │ │ ├── cdp-close-api-region.spec.ts │ │ │ │ ├── cdp-connection-close.spec.ts │ │ │ │ ├── cdp-session-detached.spec.ts │ │ │ │ ├── click-count.spec.ts │ │ │ │ ├── connect-to-existing-browser.spec.ts │ │ │ │ ├── context-addInitScript.spec.ts │ │ │ │ ├── context-extra-http-headers.spec.ts │ │ │ │ ├── cookies.spec.ts │ │ │ │ ├── default-page-tracking.spec.ts │ │ │ │ ├── downloads.spec.ts │ │ │ │ ├── flowLogger.spec.ts │ │ │ │ ├── frame-get-location-and-click.spec.ts │ │ │ │ ├── iframe-ctx-addInitScript-race.spec.ts │ │ │ │ ├── iframe-ctx-addInitScript.spec.ts │ │ │ │ ├── keep-alive.child.ts │ │ │ │ ├── keep-alive.spec.ts │ │ │ │ ├── keyboard.spec.ts │ │ │ │ ├── locator-backend-node-id.spec.ts │ │ │ │ ├── locator-content-methods.spec.ts │ │ │ │ ├── locator-count-iframe.spec.ts │ │ │ │ ├── locator-count.spec.ts │ │ │ │ ├── locator-fill.spec.ts │ │ │ │ ├── locator-input-methods.spec.ts │ │ │ │ ├── locator-nth.spec.ts │ │ │ │ ├── locator-select-option.spec.ts │ │ │ │ ├── logger-initialization.spec.ts │ │ │ │ ├── multi-instance-logger.spec.ts │ │ │ │ ├── nested-div.spec.ts │ │ │ │ ├── page-addInitScript.spec.ts │ │ │ │ ├── page-console.spec.ts │ │ │ │ ├── page-drag-and-drop.spec.ts │ │ │ │ ├── page-extra-http-headers.spec.ts │ │ │ │ ├── page-goto-response.spec.ts │ │ │ │ ├── page-hover.spec.ts │ │ │ │ ├── page-screenshot.spec.ts │ │ │ │ ├── page-scroll.spec.ts │ │ │ │ 
├── page-send-cdp.spec.ts │ │ │ │ ├── perform-understudy-method.spec.ts │ │ │ │ ├── setinputfiles.spec.ts │ │ │ │ ├── shadow-iframe-oopif.spec.ts │ │ │ │ ├── shadow-iframe-spif.spec.ts │ │ │ │ ├── testUtils.ts │ │ │ │ ├── text-selector-innermost.spec.ts │ │ │ │ ├── timeouts.spec.ts │ │ │ │ ├── user-data-dir.spec.ts │ │ │ │ ├── v3.config.ts │ │ │ │ ├── v3.dynamic.config.ts │ │ │ │ ├── v3.playwright.config.ts │ │ │ │ ├── wait-for-selector.spec.ts │ │ │ │ ├── wait-for-timeout.spec.ts │ │ │ │ └── xpath-for-location-deep.spec.ts │ │ │ └── unit/ │ │ │ ├── agent-captcha-hooks.test.ts │ │ │ ├── agent-execution-model.test.ts │ │ │ ├── api-multiregion.test.ts │ │ │ ├── browserbase-session-accessors.test.ts │ │ │ ├── cache-llm-resolution.test.ts │ │ │ ├── captcha-solver.test.ts │ │ │ ├── cdp-connection-close.test.ts │ │ │ ├── context-extra-http-headers.test.ts │ │ │ ├── cookies.test.ts │ │ │ ├── flowlogger-capturing-cdp.test.ts │ │ │ ├── flowlogger-capturing-llm.test.ts │ │ │ ├── flowlogger-eventstore.test.ts │ │ │ ├── helpers/ │ │ │ │ └── mockCDPSession.ts │ │ │ ├── llm-provider.test.ts │ │ │ ├── model-deprecation.test.ts │ │ │ ├── model-utils.test.ts │ │ │ ├── openai-cua-client.test.ts │ │ │ ├── page-extra-http-headers.test.ts │ │ │ ├── page-snapshot.test.ts │ │ │ ├── public-api/ │ │ │ │ ├── export-surface.test.ts │ │ │ │ ├── llm-and-agents.test.ts │ │ │ │ ├── public-error-types.test.ts │ │ │ │ ├── public-types.test.ts │ │ │ │ ├── runtime-utils.test.ts │ │ │ │ ├── schema-utils.test.ts │ │ │ │ ├── timeout-error-types.test.ts │ │ │ │ ├── tool-type-export.test.ts │ │ │ │ └── v3-core.test.ts │ │ │ ├── safety-confirmation.test.ts │ │ │ ├── snapshot-a11y-resolvers.test.ts │ │ │ ├── snapshot-a11y-tree-utils.test.ts │ │ │ ├── snapshot-capture-orchestration.test.ts │ │ │ ├── snapshot-cbor.test.ts │ │ │ ├── snapshot-dom-session-builders.test.ts │ │ │ ├── snapshot-dom-tree-utils.test.ts │ │ │ ├── snapshot-focus-selectors-utils.test.ts │ │ │ ├── snapshot-frame-merge.test.ts │ │ │ ├── 
snapshot-tree-format-utils.test.ts │ │ │ ├── snapshot-xpath-utils.test.ts │ │ │ ├── timeout-handlers.test.ts │ │ │ ├── understudy-command-exception.test.ts │ │ │ ├── xpath-parser.test.ts │ │ │ ├── xpath-resolver.test.ts │ │ │ └── zod-enum-compatibility.test.ts │ │ ├── tsconfig.json │ │ ├── vitest.cjs.config.mjs │ │ ├── vitest.config.ts │ │ └── vitest.esm.config.mjs │ ├── docs/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── docs.json │ │ ├── language-selector.js │ │ ├── package.json │ │ ├── scripts/ │ │ │ ├── runtimePaths.js │ │ │ └── sync-sdk-docs.js │ │ ├── snippets/ │ │ │ ├── excalidraw.mdx │ │ │ └── v3-banner.mdx │ │ ├── v2/ │ │ │ ├── basics/ │ │ │ │ ├── act.mdx │ │ │ │ ├── agent.mdx │ │ │ │ ├── extract.mdx │ │ │ │ └── observe.mdx │ │ │ ├── best-practices/ │ │ │ │ ├── agent-fallbacks.mdx │ │ │ │ ├── build-agent.mdx │ │ │ │ ├── caching.mdx │ │ │ │ ├── computer-use.mdx │ │ │ │ ├── contributing.mdx │ │ │ │ ├── cost-optimization.mdx │ │ │ │ ├── deployments.mdx │ │ │ │ ├── mcp-integrations.mdx │ │ │ │ ├── playwright-interop.mdx │ │ │ │ ├── prompting-best-practices.mdx │ │ │ │ ├── speed-optimization.mdx │ │ │ │ ├── usecase-observe.mdx │ │ │ │ ├── user-data.mdx │ │ │ │ ├── using-multiple-tabs.mdx │ │ │ │ └── working-with-iframes.mdx │ │ │ ├── configuration/ │ │ │ │ ├── browser.mdx │ │ │ │ ├── evals.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── models.mdx │ │ │ │ └── observability.mdx │ │ │ ├── first-steps/ │ │ │ │ ├── ai-rules.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ └── quickstart.mdx │ │ │ ├── integrations/ │ │ │ │ ├── crew-ai/ │ │ │ │ │ ├── configuration.mdx │ │ │ │ │ └── introduction.mdx │ │ │ │ ├── langchain/ │ │ │ │ │ ├── configuration.mdx │ │ │ │ │ └── introduction.mdx │ │ │ │ ├── mcp/ │ │ │ │ │ ├── configuration.mdx │ │ │ │ │ ├── introduction.mdx │ │ │ │ │ ├── setup.mdx │ │ │ │ │ └── tools.mdx │ │ │ │ └── vercel/ │ │ │ │ ├── configuration.mdx │ │ │ │ └── introduction.mdx │ │ │ └── references/ │ │ │ ├── act.mdx │ │ │ ├── agent.mdx │ │ │ ├── 
extract.mdx │ │ │ ├── observe.mdx │ │ │ └── stagehand.mdx │ │ └── v3/ │ │ ├── basics/ │ │ │ ├── act.mdx │ │ │ ├── agent.mdx │ │ │ ├── evals.mdx │ │ │ ├── extract.mdx │ │ │ └── observe.mdx │ │ ├── best-practices/ │ │ │ ├── agent-fallbacks.mdx │ │ │ ├── caching.mdx │ │ │ ├── computer-use.mdx │ │ │ ├── cost-optimization.mdx │ │ │ ├── deployments.mdx │ │ │ ├── deterministic-agent.mdx │ │ │ ├── history.mdx │ │ │ ├── mcp-integrations.mdx │ │ │ ├── prompting-best-practices.mdx │ │ │ ├── speed-optimization.mdx │ │ │ ├── usecase-observe.mdx │ │ │ ├── user-data.mdx │ │ │ └── using-multiple-tabs.mdx │ │ ├── configuration/ │ │ │ ├── browser.mdx │ │ │ ├── logging.mdx │ │ │ ├── models.mdx │ │ │ └── observability.mdx │ │ ├── first-steps/ │ │ │ ├── ai-rules.mdx │ │ │ ├── installation.mdx │ │ │ ├── introduction.mdx │ │ │ └── quickstart.mdx │ │ ├── integrations/ │ │ │ ├── convex/ │ │ │ │ ├── configuration.mdx │ │ │ │ └── introduction.mdx │ │ │ ├── crew-ai/ │ │ │ │ ├── configuration.mdx │ │ │ │ └── introduction.mdx │ │ │ ├── langchain/ │ │ │ │ ├── configuration.mdx │ │ │ │ └── introduction.mdx │ │ │ ├── mcp/ │ │ │ │ ├── configuration.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── setup.mdx │ │ │ │ └── tools.mdx │ │ │ ├── playwright.mdx │ │ │ ├── puppeteer.mdx │ │ │ ├── selenium.mdx │ │ │ └── vercel/ │ │ │ ├── configuration.mdx │ │ │ └── introduction.mdx │ │ ├── migrations/ │ │ │ ├── python.mdx │ │ │ └── v2.mdx │ │ ├── references/ │ │ │ ├── act.mdx │ │ │ ├── agent.mdx │ │ │ ├── context.mdx │ │ │ ├── deeplocator.mdx │ │ │ ├── extract.mdx │ │ │ ├── locator.mdx │ │ │ ├── observe.mdx │ │ │ ├── page.mdx │ │ │ ├── response.mdx │ │ │ └── stagehand.mdx │ │ └── sdk/ │ │ ├── go.mdx │ │ ├── java.mdx │ │ ├── python.mdx │ │ └── ruby.mdx │ ├── evals/ │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── args.ts │ │ ├── assets/ │ │ │ ├── cart.html │ │ │ └── peeler.html │ │ ├── browserbaseCleanup.ts │ │ ├── cli.ts │ │ ├── datasets/ │ │ │ ├── gaia/ │ │ │ │ └── GAIA_web.jsonl │ │ │ ├── onlineMind2Web/ │ │ │ │ 
└── onlineMind2Web.jsonl │ │ │ ├── webtailbench/ │ │ │ │ └── WebTailBench_data.jsonl │ │ │ └── webvoyager/ │ │ │ └── WebVoyager_data.jsonl │ │ ├── env.ts │ │ ├── evals.config.json │ │ ├── index.eval.ts │ │ ├── initV3.ts │ │ ├── lib/ │ │ │ └── AISdkClientWrapped.ts │ │ ├── llm_clients/ │ │ │ ├── hn_aisdk.ts │ │ │ ├── hn_customOpenAI.ts │ │ │ └── hn_langchain.ts │ │ ├── logger.ts │ │ ├── package.json │ │ ├── run.ts │ │ ├── runtimePaths.ts │ │ ├── scoring.ts │ │ ├── scripts/ │ │ │ ├── build-cli.ts │ │ │ ├── build-esm.ts │ │ │ └── test-evals.ts │ │ ├── suites/ │ │ │ ├── gaia.ts │ │ │ ├── onlineMind2Web.ts │ │ │ ├── webtailbench.ts │ │ │ └── webvoyager.ts │ │ ├── summary.ts │ │ ├── taskConfig.ts │ │ ├── tasks/ │ │ │ ├── agent/ │ │ │ │ ├── alibaba_supplier_search.ts │ │ │ │ ├── all_recipes.ts │ │ │ │ ├── amazon_shoes_cart.ts │ │ │ │ ├── apple_trade_in.ts │ │ │ │ ├── apple_tv.ts │ │ │ │ ├── arxiv_gpt_report.ts │ │ │ │ ├── columbia_tuition.ts │ │ │ │ ├── flipkart_laptops.ts │ │ │ │ ├── gaia.ts │ │ │ │ ├── github.ts │ │ │ │ ├── github_react_version.ts │ │ │ │ ├── google_flights.ts │ │ │ │ ├── google_maps.ts │ │ │ │ ├── google_maps_2.ts │ │ │ │ ├── google_maps_3.ts │ │ │ │ ├── google_shopping.ts │ │ │ │ ├── hotel_booking.ts │ │ │ │ ├── hotels_paris_amenities.ts │ │ │ │ ├── hugging_face.ts │ │ │ │ ├── iframe_form.ts │ │ │ │ ├── iframe_form_multiple.ts │ │ │ │ ├── instacart_organic_bananas.ts │ │ │ │ ├── kayak.ts │ │ │ │ ├── kfc_tenders_combo.ts │ │ │ │ ├── kith.ts │ │ │ │ ├── made_in_china_supplier.ts │ │ │ │ ├── nba_trades.ts │ │ │ │ ├── nvidia_hgx_driver.ts │ │ │ │ ├── oed_word_search.ts │ │ │ │ ├── onlineMind2Web.ts │ │ │ │ ├── radiotimes_tv_schedule.ts │ │ │ │ ├── redfin_apartment_rental.ts │ │ │ │ ├── sf_library_card.ts │ │ │ │ ├── sf_library_card_multiple.ts │ │ │ │ ├── sign_in.ts │ │ │ │ ├── steam_games.ts │ │ │ │ ├── thegamer_opinion_article.ts │ │ │ │ ├── trailhead_superbadge.ts │ │ │ │ ├── trivago.ts │ │ │ │ ├── trustpilot_hr_companies.ts │ │ │ │ ├── ubereats.ts │ │ 
│ │ ├── uniqlo_mens_blazers.ts │ │ │ │ ├── webmd_audiologist_search.ts │ │ │ │ ├── webmd_ovulation_calculator.ts │ │ │ │ ├── webtailbench.ts │ │ │ │ └── webvoyager.ts │ │ │ ├── allrecipes.ts │ │ │ ├── amazon_add_to_cart.ts │ │ │ ├── apple.ts │ │ │ ├── arxiv.ts │ │ │ ├── bidnet.ts │ │ │ ├── checkboxes.ts │ │ │ ├── combination_sauce.ts │ │ │ ├── costar.ts │ │ │ ├── csr_in_oopif.ts │ │ │ ├── csr_in_spif.ts │ │ │ ├── custom_dropdown.ts │ │ │ ├── dropdown.ts │ │ │ ├── extract_aigrant_companies.ts │ │ │ ├── extract_aigrant_targeted.ts │ │ │ ├── extract_aigrant_targeted_2.ts │ │ │ ├── extract_apartments.ts │ │ │ ├── extract_area_codes.ts │ │ │ ├── extract_baptist_health.ts │ │ │ ├── extract_capacitor_info.ts │ │ │ ├── extract_collaborators.ts │ │ │ ├── extract_csa.ts │ │ │ ├── extract_geniusee.ts │ │ │ ├── extract_geniusee_2.ts │ │ │ ├── extract_github_commits.ts │ │ │ ├── extract_github_stars.ts │ │ │ ├── extract_hamilton_weather.ts │ │ │ ├── extract_jfk_links.ts │ │ │ ├── extract_jstor_news.ts │ │ │ ├── extract_memorial_healthcare.ts │ │ │ ├── extract_nhl_stats.ts │ │ │ ├── extract_partners.ts │ │ │ ├── extract_press_releases.ts │ │ │ ├── extract_professional_info.ts │ │ │ ├── extract_public_notices.ts │ │ │ ├── extract_recipe.ts │ │ │ ├── extract_regulations_table.ts │ │ │ ├── extract_repo_name.ts │ │ │ ├── extract_resistor_info.ts │ │ │ ├── extract_rockauto.ts │ │ │ ├── extract_single_link.ts │ │ │ ├── extract_snowshoeing_destinations.ts │ │ │ ├── extract_staff_members.ts │ │ │ ├── extract_zillow.ts │ │ │ ├── google_flights.ts │ │ │ ├── heal_custom_dropdown.ts │ │ │ ├── heal_scroll_50.ts │ │ │ ├── heal_simple_google_search.ts │ │ │ ├── hidden_input_dropdown.ts │ │ │ ├── history.ts │ │ │ ├── homedepot.ts │ │ │ ├── iframe_form_filling.ts │ │ │ ├── iframe_hn.ts │ │ │ ├── iframe_same_proc.ts │ │ │ ├── iframe_scroll.ts │ │ │ ├── iframes_nested.ts │ │ │ ├── imdb_movie_details.ts │ │ │ ├── instructions.ts │ │ │ ├── ionwave.ts │ │ │ ├── ionwave_observe.ts │ │ │ ├── login.ts │ 
│ │ ├── multi_tab.ts │ │ │ ├── namespace_xpath.ts │ │ │ ├── nested_iframes_2.ts │ │ │ ├── next_chunk.ts │ │ │ ├── no_js_click.ts │ │ │ ├── nonsense_action.ts │ │ │ ├── observe_amazon_add_to_cart.ts │ │ │ ├── observe_github.ts │ │ │ ├── observe_iframes1.ts │ │ │ ├── observe_iframes2.ts │ │ │ ├── observe_simple_google_search.ts │ │ │ ├── observe_taxes.ts │ │ │ ├── observe_vantechjournal.ts │ │ │ ├── observe_yc_startup.ts │ │ │ ├── oopif_in_csr.ts │ │ │ ├── oopif_in_osr.ts │ │ │ ├── os_dropdown.ts │ │ │ ├── osr_in_oopif.ts │ │ │ ├── osr_in_spif.ts │ │ │ ├── panamcs.ts │ │ │ ├── peeler_complex.ts │ │ │ ├── prev_chunk.ts │ │ │ ├── radio_btn.ts │ │ │ ├── rakuten_jp.ts │ │ │ ├── sciquest.ts │ │ │ ├── scroll_50.ts │ │ │ ├── scroll_75.ts │ │ │ ├── shadow_dom.ts │ │ │ ├── simple_google_search.ts │ │ │ ├── spif_in_csr.ts │ │ │ ├── spif_in_osr.ts │ │ │ ├── stock_x.ts │ │ │ ├── tab_handling.ts │ │ │ ├── ted_talk.ts │ │ │ ├── vanta_h.ts │ │ │ ├── vantechjournal.ts │ │ │ ├── wichita.ts │ │ │ └── wikipedia.ts │ │ ├── tsconfig.json │ │ ├── types/ │ │ │ ├── evals.ts │ │ │ └── screenshotCollector.ts │ │ ├── utils/ │ │ │ ├── ScreenshotCollector.ts │ │ │ └── imageResize.ts │ │ └── utils.ts │ ├── server-v3/ │ │ ├── CHANGELOG.md │ │ ├── README.md │ │ ├── SDK_RELEASE_WORKFLOW.md │ │ ├── openapi.v3.yaml │ │ ├── package.json │ │ ├── scripts/ │ │ │ ├── build-sea.ts │ │ │ ├── gen-openapi.ts │ │ │ ├── runtimePaths.ts │ │ │ └── test-server.ts │ │ ├── src/ │ │ │ ├── lib/ │ │ │ │ ├── InMemorySessionStore.ts │ │ │ │ ├── SessionStore.ts │ │ │ │ ├── auth.ts │ │ │ │ ├── env.ts │ │ │ │ ├── errorHandler.ts │ │ │ │ ├── header.ts │ │ │ │ ├── logging/ │ │ │ │ │ └── index.ts │ │ │ │ ├── response.ts │ │ │ │ ├── sessionStoreManager.ts │ │ │ │ ├── stream.ts │ │ │ │ └── utils.ts │ │ │ ├── routes/ │ │ │ │ ├── healthcheck.ts │ │ │ │ ├── readiness.ts │ │ │ │ └── v1/ │ │ │ │ └── sessions/ │ │ │ │ ├── _id/ │ │ │ │ │ ├── act.ts │ │ │ │ │ ├── agentExecute.ts │ │ │ │ │ ├── end.ts │ │ │ │ │ ├── extract.ts │ │ │ │ │ ├── 
navigate.ts │ │ │ │ │ ├── observe.ts │ │ │ │ │ └── replay.ts │ │ │ │ └── start.ts │ │ │ ├── sea-entry.ts │ │ │ ├── server.ts │ │ │ └── types/ │ │ │ ├── error.ts │ │ │ ├── fastify.d.ts │ │ │ ├── model.ts │ │ │ └── rrweb.ts │ │ ├── test/ │ │ │ └── integration/ │ │ │ ├── api-server-cache.test.ts │ │ │ ├── utils.ts │ │ │ └── v3/ │ │ │ ├── act.test.ts │ │ │ ├── agentExecute.test.ts │ │ │ ├── end.test.ts │ │ │ ├── extract.test.ts │ │ │ ├── multiRegion.test.ts │ │ │ ├── navigate.test.ts │ │ │ ├── observe.test.ts │ │ │ ├── replay.test.ts │ │ │ └── start.test.ts │ │ ├── tsconfig.json │ │ ├── tsconfig.tests.json │ │ └── vitest.config.ts │ └── server-v4/ │ ├── CHANGELOG.md │ ├── README.md │ ├── openapi.v4.yaml │ ├── package.json │ ├── scripts/ │ │ ├── build-sea.ts │ │ ├── gen-openapi.ts │ │ ├── runtimePaths.ts │ │ └── test-server.ts │ ├── src/ │ │ ├── routes/ │ │ │ ├── healthcheck.ts │ │ │ ├── readiness.ts │ │ │ └── v4/ │ │ │ ├── browsersession/ │ │ │ │ ├── _id/ │ │ │ │ │ ├── end.ts │ │ │ │ │ └── index.ts │ │ │ │ ├── action/ │ │ │ │ │ ├── _actionId.ts │ │ │ │ │ └── index.ts │ │ │ │ ├── activePage.ts │ │ │ │ ├── addCookies.ts │ │ │ │ ├── addInitScript.ts │ │ │ │ ├── awaitActivePage.ts │ │ │ │ ├── browserbaseDebugURL.ts │ │ │ │ ├── browserbaseSessionID.ts │ │ │ │ ├── browserbaseSessionURL.ts │ │ │ │ ├── clearCookies.ts │ │ │ │ ├── configuredViewport.ts │ │ │ │ ├── connectURL.ts │ │ │ │ ├── cookies.ts │ │ │ │ ├── getFullFrameTreeByMainFrameId.ts │ │ │ │ ├── index.ts │ │ │ │ ├── newPage.ts │ │ │ │ ├── pages.ts │ │ │ │ ├── resolvePageByMainFrameId.ts │ │ │ │ ├── routes.ts │ │ │ │ ├── setExtraHTTPHeaders.ts │ │ │ │ └── shared.ts │ │ │ ├── page/ │ │ │ │ ├── action/ │ │ │ │ │ ├── _actionId.ts │ │ │ │ │ └── index.ts │ │ │ │ ├── addInitScript.ts │ │ │ │ ├── asProtocolFrameTree.ts │ │ │ │ ├── click.ts │ │ │ │ ├── close.ts │ │ │ │ ├── dragAndDrop.ts │ │ │ │ ├── enableCursorOverlay.ts │ │ │ │ ├── evaluate.ts │ │ │ │ ├── frames.ts │ │ │ │ ├── getFullFrameTree.ts │ │ │ │ ├── getOrdinal.ts │ 
│ │ │ ├── goBack.ts │ │ │ │ ├── goForward.ts │ │ │ │ ├── goto.ts │ │ │ │ ├── hover.ts │ │ │ │ ├── keyPress.ts │ │ │ │ ├── listAllFrameIds.ts │ │ │ │ ├── mainFrame.ts │ │ │ │ ├── mainFrameId.ts │ │ │ │ ├── reload.ts │ │ │ │ ├── routes.ts │ │ │ │ ├── screenshot.ts │ │ │ │ ├── scroll.ts │ │ │ │ ├── sendCDP.ts │ │ │ │ ├── setExtraHTTPHeaders.ts │ │ │ │ ├── setViewportSize.ts │ │ │ │ ├── shared.ts │ │ │ │ ├── snapshot.ts │ │ │ │ ├── targetId.ts │ │ │ │ ├── title.ts │ │ │ │ ├── type.ts │ │ │ │ ├── url.ts │ │ │ │ ├── waitForLoadState.ts │ │ │ │ ├── waitForMainLoadState.ts │ │ │ │ ├── waitForSelector.ts │ │ │ │ └── waitForTimeout.ts │ │ │ └── pluginUtils.ts │ │ ├── schemas/ │ │ │ └── v4/ │ │ │ ├── browserSession.ts │ │ │ └── page.ts │ │ ├── sea-entry.ts │ │ ├── server.ts │ │ └── types/ │ │ ├── error.ts │ │ ├── fastify.d.ts │ │ ├── model.ts │ │ └── rrweb.ts │ ├── test/ │ │ └── integration/ │ │ ├── utils.ts │ │ └── v4/ │ │ ├── browsersession.test.ts │ │ └── page.test.ts │ ├── tsconfig.json │ ├── tsconfig.tests.json │ └── vitest.config.ts ├── pnpm-workspace.yaml ├── stainless.yml ├── tsconfig.base.json ├── tsconfig.json └── turbo.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .changeset/config.json ================================================ { "$schema": "https://unpkg.com/@changesets/config@2.1.1/schema.json", "commit": false, "fixed": [], "linked": [], "baseBranch": "main", "updateInternalDependencies": "patch", "access": "public", "changelog": [ "@changesets/changelog-github", { "repo": "browserbase/stagehand" } ], "snapshot": { "useCalculatedVersion": true, "prereleaseTemplate": "alpha-{commit}", "tag": "alpha" } } ================================================ FILE: .changeset/crazy-nights-prove.md ================================================ --- "@browserbasehq/stagehand": patch --- apply user defined toolTimeout to all agent tools 
(other than wait & think tools) ================================================ FILE: .cursorrules ================================================ # Stagehand Project This is a project that uses Stagehand V3, a browser automation framework with AI-powered `act`, `extract`, `observe`, and `agent` methods. The main class can be imported as `Stagehand` from `@browserbasehq/stagehand`. **Key Classes:** - `Stagehand`: Main orchestrator class providing `act`, `extract`, `observe`, and `agent` methods - `context`: A `V3Context` object that manages browser contexts and pages - `page`: Individual page objects accessed via `stagehand.context.pages()[i]` or created with `stagehand.context.newPage()` ## Initialize ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", // or "BROWSERBASE" verbose: 2, // 0, 1, or 2 model: "openai/gpt-4.1-mini", // or any supported model }); await stagehand.init(); // Access the browser context and pages const page = stagehand.context.pages()[0]; const context = stagehand.context; // Create new pages if needed const page2 = await stagehand.context.newPage(); ``` ## Act Actions are called on the `stagehand` instance (not the page). 
Use atomic, specific instructions: ```typescript // Act on the current active page await stagehand.act("click the sign in button"); // Act on a specific page (when you need to target a page that isn't currently active) await stagehand.act("click the sign in button", { page: page2 }); ``` **Important:** Act instructions should be atomic and specific: - ✅ Good: "Click the sign in button" or "Type 'hello' into the search input" - ❌ Bad: "Order me pizza" or "Type in the search bar and hit enter" (multi-step) ### Observe + Act Pattern (Recommended) Cache the results of `observe` to avoid unexpected DOM changes: ```typescript const instruction = "Click the sign in button"; // Get candidate actions const actions = await stagehand.observe(instruction); // Execute the first action await stagehand.act(actions[0]); ``` To target a specific page: ```typescript const actions = await stagehand.observe("select blue as the favorite color", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Extract Extract data from pages using natural language instructions. The `extract` method is called on the `stagehand` instance. 
### Basic Extraction (with schema) ```typescript import { z } from "zod"; // Extract with explicit schema const data = await stagehand.extract( "extract all apartment listings with prices and addresses", z.object({ listings: z.array( z.object({ price: z.string(), address: z.string(), }), ), }), ); console.log(data.listings); ``` ### Simple Extraction (without schema) ```typescript // Extract returns a default object with 'extraction' field const result = await stagehand.extract("extract the sign in button text"); console.log(result); // Output: { extraction: "Sign in" } // Or destructure directly const { extraction } = await stagehand.extract( "extract the sign in button text", ); console.log(extraction); // "Sign in" ``` ### Targeted Extraction Extract data from a specific element using a selector: ```typescript const reason = await stagehand.extract( "extract the reason why script injection fails", z.string(), { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" }, ); ``` ### URL Extraction When extracting links or URLs, use `z.string().url()`: ```typescript const { links } = await stagehand.extract( "extract all navigation links", z.object({ links: z.array(z.string().url()), }), ); ``` ### Extracting from a Specific Page ```typescript // Extract from a specific page (when you need to target a page that isn't currently active) const data = await stagehand.extract( "extract the placeholder text on the name field", { page: page2 }, ); ``` ## Observe Plan actions before executing them. 
Returns an array of candidate actions: ```typescript // Get candidate actions on the current active page const [action] = await stagehand.observe("Click the sign in button"); // Execute the action await stagehand.act(action); ``` Observing on a specific page: ```typescript // Target a specific page (when you need to target a page that isn't currently active) const actions = await stagehand.observe("find the next page button", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Agent Use the `agent` method to autonomously execute complex, multi-step tasks. ### Basic Agent Usage ```typescript const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); const agent = stagehand.agent({ model: "google/gemini-2.0-flash", executionModel: "google/gemini-2.0-flash", }); const result = await agent.execute({ instruction: "Search for the stock price of NVDA", maxSteps: 20, }); console.log(result.message); ``` ### Computer Use Agent (CUA) For more advanced scenarios using computer-use models: ```typescript const agent = stagehand.agent({ mode: "cua", // Enable Computer Use Agent mode model: "anthropic/claude-sonnet-4-20250514", // or "google/gemini-2.5-computer-use-preview-10-2025" systemPrompt: `You are a helpful assistant that can use a web browser. 
Do not ask follow up questions, the user will trust your judgement.`, }); await agent.execute({ instruction: "Apply for a library card at the San Francisco Public Library", maxSteps: 30, }); ``` ### Agent with Custom Model Configuration ```typescript const agent = stagehand.agent({ model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GEMINI_API_KEY, }, systemPrompt: `You are a helpful assistant.`, }); ``` ### Agent with Integrations (MCP/External Tools) ```typescript const agent = stagehand.agent({ integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], systemPrompt: `You have access to the Exa search tool.`, }); ``` ## Advanced Features ### DeepLocator (XPath Targeting) Target specific elements across shadow DOM and iframes: ```typescript await page .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p") .highlight({ durationMs: 5000, contentColor: { r: 255, g: 0, b: 0 }, }); ``` ### Multi-Page Workflows ```typescript const page1 = stagehand.context.pages()[0]; await page1.goto("https://example.com"); const page2 = await stagehand.context.newPage(); await page2.goto("https://example2.com"); // Act/extract/observe operate on the current active page by default // Pass { page } option to target a specific page await stagehand.act("click button", { page: page1 }); await stagehand.extract("get title", { page: page2 }); ``` ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Detailed descriptions help us resolve faster title: '' labels: '' assignees: '' --- **Before submitting an issue, please:** - [ ] Check the [documentation](https://docs.stagehand.dev/) for relevant information - [ ] Search existing [issues](https://github.com/browserbase/stagehand/issues) to avoid duplicates ## Environment Information Please provide the following information to help us reproduce and resolve your issue: 
**Stagehand:** - Language/SDK: [TypeScript, Python, MCP…] - Stagehand version: [e.g., 1.0.0] **AI Provider:** - Provider: [e.g., OpenAI, Anthropic, Azure OpenAI] - Model: [e.g., gpt-4o, claude-sonnet-4-6] ## Issue Description ``` [Describe the current behavior here] ``` ### Steps to Reproduce 1. 2. 3. ### Minimal Reproduction Code ```tsx // Your minimal reproduction code here import { Stagehand } from '@browserbasehq/stagehand'; const stagehand = new Stagehand({ // IMPORTANT: include your stagehand config }); // Steps that reproduce the issue ``` ### Error Messages / Log trace ``` [Paste error messages/logs here] ``` ### Screenshots / Videos ``` [Attach screenshots or videos here] ``` ### Related Issues Are there any related issues or PRs? - Related to: #[issue number] - Duplicate of: #[issue number] - Blocks: #[issue number] ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Are you willing to contribute to implementing this feature or fix?** - [ ] Yes, I can submit a PR - [ ] Yes, but I need guidance - [ ] No, I cannot contribute at this time ================================================ FILE: .github/actions/select-browserbase-region/action.yml ================================================ name: Select Browserbase region description: Select a Browserbase region based on a weighted distribution. inputs: distribution: description: Comma-separated region=weight list (e.g. 
us-west-2=40,us-east-1=20). required: true outputs: region: description: Selected region. value: ${{ steps.select.outputs.region }} runs: using: composite steps: - id: select shell: bash run: | dist="${{ inputs.distribution }}" if [ -z "$dist" ]; then echo "BROWSERBASE_REGION_DISTRIBUTION is empty" exit 1 fi IFS=',' read -r -a entries <<< "$dist" total=0 regions=() weights=() for entry in "${entries[@]}"; do region="${entry%%=*}" weight="${entry#*=}" region="$(printf '%s' "$region" | tr -d '[:space:]')" weight="$(printf '%s' "$weight" | tr -d '[:space:]')" if [ -z "$region" ] || [ -z "$weight" ]; then echo "Invalid region distribution entry: $entry" exit 1 fi if ! [[ "$region" =~ ^[A-Za-z0-9-]+$ ]]; then echo "Invalid region value: $region" exit 1 fi if ! [[ "$weight" =~ ^[0-9]+$ ]]; then echo "Invalid weight for region $region: $weight" exit 1 fi # Force base 10 so zero-padded weights such as 08 or 010 are not read as octal by shell arithmetic. weight=$((10#$weight)) regions+=("$region") weights+=("$weight") total=$((total + weight)) done if [ "$total" -le 0 ]; then echo "Invalid total weight: $total" exit 1 fi # RANDOM alone tops out at 32767; combine two draws into a 30-bit value so totals above that can still reach every region. roll=$(( ((RANDOM << 15) | RANDOM) % total )) cumulative=0 chosen="" for i in "${!regions[@]}"; do cumulative=$((cumulative + weights[i])) if [ "$roll" -lt "$cumulative" ]; then chosen="${regions[i]}" break fi done if [ -z "$chosen" ]; then echo "Failed to choose Browserbase region" exit 1 fi echo "Selected Browserbase region: $chosen" echo "region=$chosen" >> "$GITHUB_OUTPUT" echo "BROWSERBASE_REGION=$chosen" >> "$GITHUB_ENV" ================================================ FILE: .github/actions/setup-node-pnpm-turbo/action.yml ================================================ name: Setup Node, pnpm, and Turbo cache description: Configure pnpm and Node.js with caching, restore Turbo cache, and install dependencies. inputs: node-version: description: Node.js version to use. required: false default: "20.x" use-prebuilt-artifacts: description: Whether to download pre-built package from build artifacts. 
required: false default: "true" restore-turbo-cache: description: Whether to restore the local .turbo cache. required: false default: "true" runs: using: composite steps: - uses: pnpm/action-setup@v4 - name: Set up Node.js uses: actions/setup-node@v6 with: node-version: ${{ inputs.node-version }} cache: 'pnpm' cache-dependency-path: '**/pnpm-lock.yaml' - name: Restore Turbo cache if: ${{ inputs.restore-turbo-cache == 'true' }} uses: actions/cache/restore@v4 with: path: .turbo key: ${{ runner.os }}-turbo-${{ hashFiles('pnpm-lock.yaml', 'pnpm-workspace.yaml', 'package.json', 'turbo.json') }}-${{ github.sha }} restore-keys: | ${{ runner.os }}-turbo-${{ hashFiles('pnpm-lock.yaml', 'pnpm-workspace.yaml', 'package.json', 'turbo.json') }}- - name: Install dependencies shell: bash run: pnpm install --frozen-lockfile --prefer-offline - name: Download build artifacts if: ${{ inputs.use-prebuilt-artifacts == 'true' }} uses: actions/download-artifact@v4 with: name: build-artifacts path: . merge-multiple: true - name: Prepare test output directories shell: bash run: | mkdir -p "${GITHUB_WORKSPACE}/ctrf" if [ -n "${NODE_V8_COVERAGE:-}" ]; then mkdir -p "$NODE_V8_COVERAGE" fi ================================================ FILE: .github/actions/upload-ctrf-report/action.yml ================================================ name: Upload CTRF report description: Upload CTRF report artifact. inputs: name: description: Report path (used as artifact name when sanitized). required: true path: description: Optional explicit path (defaults to name). 
required: false default: "" runs: using: composite steps: - name: Normalize inputs id: normalize shell: bash run: | name="${{ inputs.name }}" echo "name=${name//\//-}" >> "$GITHUB_OUTPUT" if [ -n "${{ inputs.path }}" ]; then echo "path=${{ inputs.path }}" >> "$GITHUB_OUTPUT" else echo "path=${{ inputs.name }}" >> "$GITHUB_OUTPUT" fi - name: Upload CTRF report artifact uses: actions/upload-artifact@v4 with: name: ${{ steps.normalize.outputs.name }} # package.json anchors uploaded paths to the repository root. path: | package.json ${{ steps.normalize.outputs.path }} ================================================ FILE: .github/actions/upload-v8-coverage/action.yml ================================================ name: Upload V8 coverage description: Upload V8 coverage artifacts. inputs: name: description: Artifact name. required: true path: description: Coverage path to upload (defaults to name). required: false default: "" runs: using: composite steps: - name: Normalize artifact name id: normalize shell: bash run: | name="${{ inputs.name }}" echo "name=${name//\//-}" >> "$GITHUB_OUTPUT" if [ -n "${{ inputs.path }}" ]; then echo "path=${{ inputs.path }}" >> "$GITHUB_OUTPUT" else echo "path=${{ inputs.name }}" >> "$GITHUB_OUTPUT" fi - name: Upload coverage artifact uses: actions/upload-artifact@v4 with: name: ${{ steps.normalize.outputs.name }} # package.json anchors uploaded paths to the repository root. path: | package.json ${{ steps.normalize.outputs.path }} ================================================ FILE: .github/actions/verify-chromium-launch/action.yml ================================================ name: Verify Chromium launch description: Validate that Chromium can start, connect to CDP, and read the page title. inputs: chrome-path: description: Path to Chromium/Chrome binary. required: false default: "/usr/bin/chromium" max-attempts: description: Number of launch attempts before failing. 
required: false default: "3" timeout-ms: description: Milliseconds to wait for DevTools and CDP per attempt. required: false default: "30000" runs: using: composite steps: - shell: bash run: | set -euo pipefail max_attempts="${{ inputs.max-attempts }}" attempt=1 while [ "$attempt" -le "$max_attempts" ]; do if [ -n "${{ inputs.chrome-path }}" ]; then pkill -f "${{ inputs.chrome-path }}" >/dev/null 2>&1 || true fi if node - <<'NODE' const { spawn } = require("node:child_process"); const workspace = process.env.GITHUB_WORKSPACE; if (workspace) { process.chdir(workspace); } const chrome = "${{ inputs.chrome-path }}"; const timeoutMs = Number("${{ inputs.timeout-ms }}"); const wsPrefix = "DevTools listening on "; const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); let proc; let wsUrl; const waitForWsUrl = async () => { const deadline = Date.now() + timeoutMs; while (!wsUrl) { if (Date.now() > deadline) { throw new Error( `❌ Chromium did not expose CDP WS URL within timeout (${timeoutMs}ms)`, ); } await sleep(250); } return wsUrl; }; const cleanup = () => { if (proc && !proc.killed) { proc.kill("SIGKILL"); } }; (async () => { try { const startTime = Date.now(); const args = [ '--ash-no-nudges', '--block-new-web-contents', '--deny-permission-prompts', '--disable-breakpad', '--disable-client-side-phishing-detection', '--disable-component-update', '--disable-components=AcceptCHFrame,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,InterestFeedContentSuggestions,CalculateNativeWinOcclusion,BackForwardCache,HeavyAdPrivacyMitigations,LazyFrameLoading,ImprovedCookieControls,PrivacySandboxSettings4,AutofillServerCommunication,CertificateTransparencyComponentUpdater,DestroyProfileOnBrowserClose,CrashReporting,OverscrollHistoryNavigation,InfiniteSessionRestore', '--disable-datasaver-prompt', '--disable-default-apps', '--disable-desktop-notifications', '--disable-domain-reliability', '--disable-external-intent-requests', '--disable-hang-monitor', 
'--disable-infobars', '--disable-notifications', '--disable-popup-blocking', '--disable-print-preview', '--disable-prompt-on-repost', '--disable-search-engine-choice-screen', '--disable-session-crashed-bubble', '--disable-speech-api', '--disable-speech-synthesis-api', '--hide-crash-restore-bubble', '--metrics-recording-only', '--no-default-browser-check', '--no-first-run', '--no-pings', '--noerrdialogs', '--safebrowsing-disable-auto-update', '--silent-debugger-extension-api', '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"', '--suppress-message-center-popups', "--disable-background-networking", "--disable-default-apps", "--disable-dev-shm-usage", "--disable-extensions", "--disable-notifications", "--disable-setuid-sandbox", "--disable-site-isolation-trials", "--disable-sync", "--disable-web-security", "--headless=new", "--no-default-browser-check", "--no-first-run", "--no-sandbox", "--no-zygote", "--password-store=basic", "--remote-debugging-port=0", "--test-type=gpu", "--use-mock-keychain", "about:blank", ]; proc = spawn(chrome, args, { stdio: ["ignore", "pipe", "pipe"] }); const lineBuffers = { stdout: "", stderr: "" }; const onData = (stream) => (data) => { const text = data.toString(); if (stream === "stderr") { process.stderr.write(text); } else { process.stdout.write(text); } lineBuffers[stream] += text; const lines = lineBuffers[stream].split(/\r?\n/); lineBuffers[stream] = lines.pop() ?? 
""; for (const line of lines) { const idx = line.indexOf(wsPrefix); if (idx === -1) continue; const rest = line.slice(idx + wsPrefix.length).trim(); const candidate = rest.split(/\s+/)[0]; if ( candidate.startsWith("ws://") || candidate.startsWith("wss://") ) { wsUrl = candidate; } } }; proc.stdout.on("data", onData("stdout")); proc.stderr.on("data", onData("stderr")); const url = await waitForWsUrl(); const wsFoundMs = Date.now() - startTime; const wsFoundSec = (wsFoundMs / 1000).toFixed(2); const connectStart = Date.now(); const path = require("node:path"); const workspaceRoot = process.env.GITHUB_WORKSPACE || process.cwd(); const playwrightPath = path.join( workspaceRoot, "packages/core/node_modules/playwright", ); console.log( `✅ CDP Url found after ${wsFoundSec}s, connecting with playwright...`, ); const { chromium } = require(playwrightPath); const browser = await chromium.connectOverCDP(url, { timeout: timeoutMs, }); const context = browser.contexts()[0]; if (!context) { throw new Error("❌ No browser context available after CDP connect"); } const page = context.pages()[0]; if (!page) { throw new Error("❌ No page available after CDP connect"); } const remainingMs = timeoutMs - (Date.now() - connectStart); if (remainingMs <= 0) { throw new Error( `❌ CDP connect + verify timed out after ${timeoutMs}ms`, ); } const sum = await Promise.race([ page.evaluate("1 + 1"), new Promise((_, reject) => setTimeout( () => reject( new Error( `❌ CDP connect + verify timed out after ${timeoutMs}ms`, ), ), remainingMs, ), ), ]); if (sum !== 2) { throw new Error(`❌ Unexpected eval result: ${sum}`); } const totalMs = Date.now() - startTime; const connectMs = Date.now() - connectStart; const totalSec = (totalMs / 1000).toFixed(2); const connectSec = (connectMs / 1000).toFixed(2); console.log( `✅ Chromium launched in ${wsFoundSec}s and CDP connected in ${connectSec}s (total: ${totalSec}s)`, ); await browser.close(); cleanup(); process.exit(0); } catch (err) { cleanup(); 
console.error(err instanceof Error ? err.message : String(err)); process.exit(1); } })(); NODE then if [ "$attempt" -gt 1 ]; then echo "⚠️ Chromium launch succeeded after ${attempt} attempts; GitHub Actions runner may be constrained." fi exit 0 fi echo "⚠️ Chromium launch attempt ${attempt} failed." attempt=$((attempt + 1)) sleep 2 done echo "❌ Failed to launch Chromium before running Stagehand; GitHub Actions runner is likely overloaded." exit 1 ================================================ FILE: .github/pull_request_template ================================================ # why # what changed # test plan ================================================ FILE: .github/workflows/ci.yml ================================================ name: Tests on: pull_request: types: - opened - synchronize - labeled - unlabeled paths-ignore: - "packages/docs/**" permissions: contents: read actions: write env: BROWSERBASE_FLOW_LOGS: "1" LLM_MAX_MS: "15000" EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-haiku-4-5" EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-sonnet-4-6" EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent" EVAL_MAX_CONCURRENCY: 25 EVAL_TRIAL_COUNT: 3 LOCAL_SESSION_LIMIT_PER_E2E_TEST: 2 BROWSERBASE_SESSION_LIMIT_PER_E2E_TEST: 3 BROWSERBASE_REGION_DISTRIBUTION: "us-west-2=30,us-east-1=30,eu-central-1=20,ap-southeast-1=20" # percentage of load for each region when running e2e tests against prod CHROME_PATH: /usr/bin/chromium # GitHub Actions runners ship with stable Chromium by default BROWSERBASE_CDP_CONNECT_MAX_MS: "10000" BROWSERBASE_SESSION_CREATE_MAX_MS: "60000" PUPPETEER_SKIP_DOWNLOAD: "1" PLAYWRIGHT_SKIP_DOWNLOAD: "1" TURBO_TELEMETRY_DISABLED: "1" concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: determine-changes: runs-on: ubuntu-latest outputs: core: ${{ steps.filter.outputs.core }} cli: ${{ steps.filter.outputs.cli }} evals: ${{ steps.filter.outputs.evals 
}} server: ${{ steps.filter.outputs.server }} docs-only: ${{ steps.filter.outputs.docs-only }} steps: - name: Check out repository code uses: actions/checkout@v4 - name: Log GitHub API rate limit env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | headers_file=$(mktemp) body_file=$(mktemp) curl -sSL \ -D "$headers_file" \ -o "$body_file" \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ https://api.github.com/rate_limit cat "$headers_file" echo "" cat "$body_file" remaining=$(jq -r '.rate.remaining' "$body_file") if [ "$remaining" -eq 0 ]; then reset_epoch=$(jq -r '.rate.reset' "$body_file") reset_utc=$(date -u -d "@$reset_epoch" +"%Y-%m-%d %H:%M:%S") reset_pacific=$(TZ=America/Los_Angeles date -d "@$reset_epoch" +"%Y-%m-%d %H:%M:%S %Z") echo "Github API rate limited until: ${reset_pacific} (${reset_utc} UTC)" >> "$GITHUB_STEP_SUMMARY" echo "GitHub API rate limit exhausted." exit 1 fi - uses: dorny/paths-filter@v3 id: filter with: filters: | core: - '.github/workflows/ci.yml' - 'packages/core/**' - 'package.json' - 'pnpm-lock.yaml' - 'turbo.json' cli: - 'packages/cli/**' - 'packages/core/**' - 'package.json' - 'pnpm-lock.yaml' evals: - 'packages/evals/**' - 'package.json' - 'pnpm-lock.yaml' server: - 'packages/server-v3/**' - 'packages/server-v4/**' - 'packages/core/**' - 'package.json' - 'pnpm-lock.yaml' - 'pnpm-workspace.yaml' - '.github/workflows/ci.yml' docs-only: - '**/*.md' - 'examples/**' - '!packages/**/*.md' determine-evals: needs: [determine-changes] runs-on: ubuntu-latest outputs: skip-all-evals: ${{ steps.check-labels.outputs.skip-all-evals }} eval-categories: ${{ steps.check-labels.outputs.eval-categories }} steps: - id: check-labels run: | categories=() declare -A seen add_category() { local category="$1" if [[ -z "${seen[$category]:-}" ]]; then categories+=("$category") seen["$category"]=1 fi } emit_categories() { local json="[" for category in "${categories[@]}"; do 
json+="\"${category}\"," done json="${json%,}" json+="]" echo "eval-categories=$json" >> $GITHUB_OUTPUT } # Check if skip-evals label is present if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-evals') }}" == "true" ]]; then echo "skip-evals label found - skipping all evals" echo "skip-all-evals=true" >> $GITHUB_OUTPUT emit_categories exit 0 fi # Skip evals if only docs/examples changed if [[ "${{ needs.determine-changes.outputs.docs-only }}" == "true" && "${{ needs.determine-changes.outputs.core }}" == "false" && "${{ needs.determine-changes.outputs.evals }}" == "false" ]]; then echo "Only docs/examples changed - skipping evals" echo "skip-all-evals=true" >> $GITHUB_OUTPUT emit_categories exit 0 fi # Check for skip-regression-evals label if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-regression-evals') }}" == "true" ]]; then echo "skip-regression-evals label found - regression evals will be skipped" else echo "Regression evals will run by default" add_category "regression" fi # Check for specific labels echo "skip-all-evals=false" >> $GITHUB_OUTPUT if [[ "${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" == "true" ]]; then add_category "combination" fi if [[ "${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" == "true" ]]; then add_category "extract" fi if [[ "${{ contains(github.event.pull_request.labels.*.name, 'act') }}" == "true" ]]; then add_category "act" fi if [[ "${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" == "true" ]]; then add_category "observe" fi if [[ "${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" == "true" ]]; then add_category "targeted_extract" fi if [[ "${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" == "true" ]]; then add_category "agent" fi emit_categories run-lint: name: Lint runs-on: ubuntu-latest needs: [run-build] steps: - name: Check out repository code uses: actions/checkout@v4 - uses: 
./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" node-version: 20.x - name: Run Lint run: pnpm exec turbo run lint cancel-after-lint-failure: name: Cancel after lint failure runs-on: ubuntu-latest needs: [run-lint] if: ${{ always() && needs.run-lint.result == 'failure' }} continue-on-error: true steps: - name: Cancel workflow run env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | curl -sSfL -X POST \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/cancel" run-build: name: Build runs-on: ubuntu-latest steps: - name: Check out repository code uses: actions/checkout@v4 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" node-version: 20.x - name: Run Build run: pnpm exec turbo run build - name: Save Turbo cache if: always() uses: actions/cache/save@v4 with: path: .turbo key: ${{ runner.os }}-turbo-${{ hashFiles('pnpm-lock.yaml', 'pnpm-workspace.yaml', 'package.json', 'turbo.json') }}-${{ github.sha }} - name: Upload build artifacts uses: actions/upload-artifact@v4 with: name: build-artifacts include-hidden-files: true # package.json is included to anchor artifact paths at repo root. 
path: | package.json packages/core/dist/** packages/core/lib/version.ts packages/core/lib/dom/build/** packages/core/lib/v3/dom/build/** packages/cli/dist/** packages/evals/dist/** packages/server-v3/dist/** packages/server-v3/openapi.v3.yaml packages/server-v4/dist/** packages/server-v4/openapi.v4.yaml retention-days: 1 run-cli-tests: name: CLI Tests runs-on: ubuntu-latest needs: [run-build, determine-changes] if: needs.determine-changes.outputs.cli == 'true' steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Run CLI Tests run: pnpm exec turbo run test:cli --filter=@browserbasehq/browse-cli discover-core-tests: runs-on: ubuntu-latest needs: [determine-changes] if: needs.determine-changes.outputs.core == 'true' outputs: core-tests: ${{ steps.set-matrix.outputs.core-tests }} has-core-tests: ${{ steps.set-matrix.outputs.has-core-tests }} steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" restore-turbo-cache: "false" - name: Discover core test files id: set-matrix run: | core_json=$(pnpm --filter @browserbasehq/stagehand --silent run test:core -- --list) echo "core-tests=$core_json" >> $GITHUB_OUTPUT if [ "$core_json" = "[]" ]; then echo "has-core-tests=false" >> $GITHUB_OUTPUT else echo "has-core-tests=true" >> $GITHUB_OUTPUT fi echo "Found core tests: $core_json" core-unit-tests: name: core/${{ matrix.test.name }} runs-on: ubuntu-latest needs: [run-build, discover-core-tests] if: needs.discover-core-tests.outputs.has-core-tests == 'true' env: STAGEHAND_BROWSER_TARGET: local STAGEHAND_SERVER_TARGET: local strategy: fail-fast: false max-parallel: 100 matrix: test: ${{ fromJson(needs.discover-core-tests.outputs.core-tests) }} steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: 
use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Run Vitest - ${{ matrix.test.name }} run: | pnpm exec turbo run test:core --only --filter=@browserbasehq/stagehand -- "${{ matrix.test.path }}" - uses: ./.github/actions/upload-ctrf-report if: always() with: name: ctrf/core-unit/${{ matrix.test.name }}.json - uses: ./.github/actions/upload-v8-coverage if: always() with: name: coverage/core-unit/${{ matrix.test.name }} discover-server-tests: runs-on: ubuntu-latest needs: [determine-changes] if: needs.determine-changes.outputs.server == 'true' outputs: integration-tests: ${{ steps.set-matrix.outputs.integration-tests }} has-integration-tests: ${{ steps.set-matrix.outputs.has-integration-tests }} steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" restore-turbo-cache: "false" - name: Discover server test files id: set-matrix run: | int_json=$(pnpm --filter @browserbasehq/stagehand-server-v3 --silent run test:server -- --list integration) echo "integration-tests=$int_json" >> $GITHUB_OUTPUT if [ "$int_json" = "[]" ]; then echo "has-integration-tests=false" >> $GITHUB_OUTPUT else echo "has-integration-tests=true" >> $GITHUB_OUTPUT fi echo "Found server integration tests: $int_json" build-server-sea: name: Build SEA binary (tests, v3) uses: ./.github/workflows/stagehand-server-v3-sea-build.yml needs: [run-build] with: matrix: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v3-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v3-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v3-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v3-darwin-x64","include_sourcemaps":false}, 
{"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v3-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v3-win32-arm64.exe","include_sourcemaps":false}, {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v3-linux-x64-sourcemap","include_sourcemaps":true} ] use-prebuilt-artifacts: "true" restore-turbo-cache: "false" node-version: "20.x" upload-only-binary: stagehand-server-v3-linux-x64-sourcemap server-integration-tests: name: server/v3/integration/${{ matrix.test.name }} runs-on: ubuntu-latest needs: [build-server-sea, discover-server-tests, run-build] if: needs.discover-server-tests.outputs.has-integration-tests == 'true' strategy: fail-fast: false matrix: test: ${{ fromJson(needs.discover-server-tests.outputs.integration-tests) }} env: BB_ENV: local STAGEHAND_BASE_URL: http://stagehand-api.localhost:3106 STAGEHAND_BROWSER_TARGET: local STAGEHAND_SERVER_TARGET: sea OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # Used only for testing /start with env: BROWSERBASE remote browser BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Download SEA binary uses: actions/download-artifact@v4 with: name: stagehand-server-v3-linux-x64-sourcemap path: . 
- name: Ensure SEA binary is present and executable shell: bash run: | set -euo pipefail test -f packages/server-v3/dist/sea/stagehand-server-v3-linux-x64-sourcemap chmod +x packages/server-v3/dist/sea/stagehand-server-v3-linux-x64-sourcemap - name: Run server integration test - ${{ matrix.test.name }} env: SEA_BINARY_NAME: stagehand-server-v3-linux-x64-sourcemap run: | pnpm exec turbo run test:server --only --filter=@browserbasehq/stagehand-server-v3 -- "${{ matrix.test.path }}" - uses: ./.github/actions/upload-ctrf-report if: always() with: name: ctrf/server-v3-integration/${{ matrix.test.name }}.json - uses: ./.github/actions/upload-v8-coverage if: always() with: name: coverage/server-v3-integration/${{ matrix.test.name }} discover-e2e-tests: runs-on: ubuntu-latest needs: [determine-changes] if: needs.determine-changes.outputs.core == 'true' outputs: e2e-tests: ${{ steps.set-matrix.outputs.e2e-tests }} has-e2e-tests: ${{ steps.set-matrix.outputs.has-e2e-tests }} steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" restore-turbo-cache: "false" - name: Discover e2e test files id: set-matrix run: | e2e_json=$(pnpm --filter @browserbasehq/stagehand --silent run test:e2e -- --list) echo "e2e-tests=$e2e_json" >> $GITHUB_OUTPUT if [ "$e2e_json" = "[]" ]; then echo "has-e2e-tests=false" >> $GITHUB_OUTPUT else echo "has-e2e-tests=true" >> $GITHUB_OUTPUT fi echo "Found e2e tests: $e2e_json" run-e2e-local-tests: name: e2e/local/${{ matrix.test.name }} needs: [run-build, discover-e2e-tests] runs-on: ubuntu-latest timeout-minutes: 50 if: > needs.discover-e2e-tests.outputs.has-e2e-tests == 'true' && github.event.pull_request.head.repo.full_name == github.repository env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BROWSERBASE_API_KEY: ${{ 
secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true STAGEHAND_BROWSER_TARGET: local STAGEHAND_SERVER_TARGET: local strategy: fail-fast: false max-parallel: 20 matrix: test: ${{ fromJson(needs.discover-e2e-tests.outputs.e2e-tests) }} steps: - name: Check out repository code uses: actions/checkout@v4 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - uses: ./.github/actions/verify-chromium-launch - name: Run local E2E Tests - ${{ matrix.test.name }} run: | pnpm exec turbo run test:e2e --only --filter=@browserbasehq/stagehand -- "${{ matrix.test.path }}" - uses: ./.github/actions/upload-ctrf-report if: always() with: name: ctrf/e2e-local/${{ matrix.test.name }}.json - uses: ./.github/actions/upload-v8-coverage if: always() with: name: coverage/e2e-local/${{ matrix.test.name }} run-e2e-bb-tests: name: e2e/bb/${{ matrix.test.name }} needs: [run-build, discover-e2e-tests] runs-on: ubuntu-latest timeout-minutes: 50 if: > needs.discover-e2e-tests.outputs.has-e2e-tests == 'true' && github.event.pull_request.head.repo.full_name == github.repository env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true STAGEHAND_BROWSER_TARGET: browserbase STAGEHAND_SERVER_TARGET: local strategy: fail-fast: false max-parallel: 100 matrix: test: ${{ fromJson(needs.discover-e2e-tests.outputs.e2e-tests) }} steps: - name: Check out repository code uses: actions/checkout@v4 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Select Browserbase region uses: ./.github/actions/select-browserbase-region with: distribution: ${{ env.BROWSERBASE_REGION_DISTRIBUTION 
}} - name: Run E2E Tests (browserbase) - ${{ matrix.test.name }} run: | pnpm exec turbo run test:e2e --only --filter=@browserbasehq/stagehand -- "${{ matrix.test.path }}" - uses: ./.github/actions/upload-ctrf-report if: always() with: name: ctrf/e2e-bb/${{ matrix.test.name }}.json - uses: ./.github/actions/upload-v8-coverage if: always() with: name: coverage/e2e-bb/${{ matrix.test.name }} run-evals: name: evals/${{ matrix.category }} needs: [run-build, determine-evals, run-e2e-bb-tests] if: >- ${{ always() && needs.run-build.result == 'success' && needs.determine-evals.result == 'success' && needs.run-e2e-bb-tests.result != 'failure' && needs.run-e2e-bb-tests.result != 'cancelled' && needs.determine-evals.outputs.skip-all-evals != 'true' && needs.determine-evals.outputs.eval-categories != '[]' }} runs-on: ubuntu-latest timeout-minutes: 90 strategy: fail-fast: false matrix: category: ${{ fromJson(needs.determine-evals.outputs.eval-categories) }} env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} STAGEHAND_BROWSER_TARGET: browserbase STAGEHAND_SERVER_TARGET: local steps: - name: Check out repository code uses: actions/checkout@v4 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Select Browserbase region uses: ./.github/actions/select-browserbase-region with: distribution: ${{ env.BROWSERBASE_REGION_DISTRIBUTION }} - name: Run Evals - ${{ matrix.category }} id: run-evals env: NODE_V8_COVERAGE: coverage/evals/${{ matrix.category }} run: | log_file="$(mktemp)" set +e pnpm exec turbo run test:evals --only --filter=@browserbasehq/stagehand-evals -- "${{ matrix.category }}" -t "${EVAL_TRIAL_COUNT}" -c 
"${EVAL_MAX_CONCURRENCY}" 2>&1 | tee "$log_file" eval_status=${PIPESTATUS[0]} set -e summary_block="$( awk ' /^=========================SUMMARY=========================$/ { capture=1 } capture { print } /^Evaluation summary written to / { capture=0 } ' "$log_file" )" if [ -n "$summary_block" ]; then # The summary spans several lines, so it is written with the delimiter form of the GITHUB_OUTPUT file command. { echo "summary_text<<EVAL_SUMMARY_EOF" echo "$summary_block" echo "EVAL_SUMMARY_EOF" } >> "$GITHUB_OUTPUT" fi exit "$eval_status" - name: Log Evals Performance - ${{ matrix.category }} env: EVAL_STDOUT_SUMMARY: ${{ steps.run-evals.outputs.summary_text }} run: | if [ -n "${EVAL_STDOUT_SUMMARY:-}" ]; then echo "### Evals Summary (${{ matrix.category }})" >> "$GITHUB_STEP_SUMMARY" echo '```' >> "$GITHUB_STEP_SUMMARY" printf '%s\n' "$EVAL_STDOUT_SUMMARY" >> "$GITHUB_STEP_SUMMARY" echo '```' >> "$GITHUB_STEP_SUMMARY" fi if [ -f eval-summary.json ]; then # Read the summary only after confirming it exists, so a missing file reaches the explicit failure message instead of aborting on a jq error. experimentName=$(jq -r '.experimentName' eval-summary.json) echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" category_score=$(jq ".categories[\"${{ matrix.category }}\"]" eval-summary.json) echo "${{ matrix.category }} category score: $category_score%" if (( $(echo "$category_score < 80" | bc -l) )); then echo "${{ matrix.category }} category score is below 80%. Failing CI." exit 1 fi else echo "Eval summary not found for ${{ matrix.category }} category. Failing CI." 
exit 1 fi - uses: ./.github/actions/upload-ctrf-report if: always() with: name: ctrf/evals/${{ matrix.category }}.json - uses: ./.github/actions/upload-v8-coverage if: always() with: name: coverage/evals/${{ matrix.category }} merge-coverage: name: Code Coverage Report runs-on: ubuntu-latest needs: - core-unit-tests - run-e2e-local-tests - run-e2e-bb-tests - run-evals - server-integration-tests # if: always() if: false steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "true" restore-turbo-cache: "false" - name: Download V8 coverage artifacts uses: actions/download-artifact@v4 continue-on-error: true with: pattern: coverage-* path: . merge-multiple: true - name: Download CTRF artifacts uses: actions/download-artifact@v4 continue-on-error: true with: pattern: ctrf-* path: . merge-multiple: true - name: Generate merged coverage report run: | pnpm run coverage:merge - name: Upload merged coverage report if: always() id: upload-coverage-artifact uses: actions/upload-artifact@v4 with: name: coverage-merged # package.json is included to anchor artifact paths at repo root. path: | package.json coverage/merged - name: Add coverage summary to job summary if: always() shell: bash run: | echo "### Code Coverage" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" if [ -f coverage/merged/coverage-summary.txt ]; then echo '```' >> "$GITHUB_STEP_SUMMARY" cat coverage/merged/coverage-summary.txt >> "$GITHUB_STEP_SUMMARY" echo '```' >> "$GITHUB_STEP_SUMMARY" else echo "Coverage summary not available." 
>> "$GITHUB_STEP_SUMMARY" fi if [ -n "${{ steps.upload-coverage-artifact.outputs.artifact-url }}" ]; then echo "" >> "$GITHUB_STEP_SUMMARY" echo "[Download full HTML coverage report](${{ steps.upload-coverage-artifact.outputs.artifact-url }})" >> "$GITHUB_STEP_SUMMARY" fi - name: Publish merged CTRF report if: always() uses: ctrf-io/github-test-reporter@v1 with: report-path: './ctrf/**/*.json' summary: true summary-report: false summary-delta-report: true test-report: false failed-report: false insights-report: true flaky-rate-report: true fail-rate-report: true slowest-report: true previous-results-report: true fetch-previous-results: true baseline: 1 previous-results-max: 1 max-workflow-runs-to-check: 5 max-previous-runs-to-fetch: 1 upload-artifact: true artifact-name: ctrf-report-merged env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Compute coverage status metrics if: always() id: coverage-status shell: bash run: | set -euo pipefail shopt -s globstar nullglob tests_failed=0 ctrf_files=(ctrf/**/*.json) if [ "${#ctrf_files[@]}" -gt 0 ]; then tests_failed=$(jq -s '[.[].results.summary.failed // 0] | add' "${ctrf_files[@]}") fi total_coverage=0 if [ -f coverage/merged/coverage-summary.txt ]; then total_coverage=$(awk '/^Lines/ {gsub(/%/,"",$3); print $3}' coverage/merged/coverage-summary.txt) fi echo "tests_failed=${tests_failed}" >> "$GITHUB_OUTPUT" echo "total_coverage=${total_coverage}" >> "$GITHUB_OUTPUT" - name: Set coverage status if: always() continue-on-error: true shell: bash env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} RUN_ID: ${{ github.run_id }} PULL_NUMBER: ${{ github.event.pull_request.number }} TESTS_FAILED: ${{ steps.coverage-status.outputs.tests_failed }} TOTAL_COVERAGE: ${{ steps.coverage-status.outputs.total_coverage }} run: | set -euo pipefail repo="${GITHUB_REPOSITORY}" sha="${GITHUB_SHA}" tests_failed="${TESTS_FAILED:-0}" total_coverage="${TOTAL_COVERAGE:-0}" state="success" if [ -n "${PULL_NUMBER:-}" ]; then 
target_url="https://github.com/${repo}/pull/${PULL_NUMBER}/checks?check_run_id=${RUN_ID}" else target_url="https://github.com/${repo}/actions/runs/${RUN_ID}" fi description="non-blocking report: ${tests_failed} tests failed. ${total_coverage}% coverage" payload=$(jq -n \ --arg state "$state" \ --arg target_url "$target_url" \ --arg description "$description" \ --arg context "Measured coverage" \ '{state: $state, target_url: $target_url, description: $description, context: $context}') curl -sSfL -X POST \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "https://api.github.com/repos/${repo}/statuses/${sha}" \ -d "$payload" ================================================ FILE: .github/workflows/claude.yml ================================================ name: Claude Code on: issue_comment: types: [created] pull_request_review_comment: types: [created] issues: types: [opened, assigned] pull_request_review: types: [submitted] env: BROWSERBASE_FLOW_LOGS: "1" jobs: claude: if: | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) runs-on: ubuntu-latest permissions: contents: write pull-requests: write issues: write id-token: write actions: write # Required for Claude to read CI results on PRs / rerun actions that failed steps: - name: Checkout repository uses: actions/checkout@v6 with: fetch-depth: 1 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" restore-turbo-cache: "false" node-version: 20.x - name: Run Build run: | pnpm exec turbo run build - name: Run Claude Code id: claude uses: 
anthropics/claude-code-action@v1 with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} # This is an optional setting that allows Claude to read CI results on PRs additional_permissions: | actions: read track_progress: true # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. prompt: 'Make sure "turbo run lint" and "turbo run build" pass before pushing and make sure to check present CI status for the branch and fix any easy failures. Prefer using the Github MCP tools over bash for Github operations, fall back to Bash(gh) for anything not supported by the MCP tools.' branch_prefix: 'claude-' # Optional: Add claude_args to customize behavior and configuration # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md # or https://code.claude.com/docs/en/cli-reference for available options claude_args: | --allowed-tools mcp__github_inline_comment__create_inline_comment,Bash,View,Glob,GlobTool,GrepTool,Grep,BatchTool,WebSearch,LS,Edit,MultiEdit,Write,Read # consider adding in the future: # - https://github.com/anthropics/claude-code-action/blob/main/examples/test-failure-analysis.yml # - https://github.com/anthropics/claude-code-action/blob/main/examples/ci-failure-auto-fix.yml # - https://github.com/anthropics/claude-code-action/blob/main/examples/issue-deduplication.yml ================================================ FILE: .github/workflows/external-contributor-pr-approval-handoff.yml ================================================ name: External Contributor PR Approval Handoff on: pull_request_review: types: - submitted permissions: contents: read pull-requests: read jobs: capture-approved-review: runs-on: ubuntu-latest steps: - name: Write approval handoff payload uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const fs = require('fs'); const pr = context.payload.pull_request; const review = 
context.payload.review; const shouldClaim = review.state === 'approved' && pr.head.repo.full_name !== context.payload.repository.full_name; const payload = { shouldClaim, prNumber: pr.number, reviewer: review.user?.login || '', reviewId: review.id, approvedSha: review.commit_id || pr.head.sha, }; fs.writeFileSync('approval-handoff.json', JSON.stringify(payload)); - name: Upload approval handoff artifact uses: actions/upload-artifact@v4 with: name: approved-review path: approval-handoff.json retention-days: 1 ================================================ FILE: .github/workflows/external-contributor-pr.yml ================================================ name: External Contributor PR on: pull_request_target: types: - opened - reopened - synchronize - closed workflow_run: workflows: - External Contributor PR Approval Handoff types: - completed permissions: actions: read contents: write pull-requests: write issues: write env: ECPR_LIB: | (() => { const LABELS = [ { name: 'external-contributor', color: '8b949e', description: 'Tracks PRs mirrored from external contributor forks.' }, { name: 'external-contributor:awaiting-approval', color: 'd29922', description: 'Waiting for a stagehand team member to approve the latest external commit.' }, { name: 'external-contributor:mirrored', color: '1f6feb', description: 'An internal mirrored PR currently exists for this external contributor PR.' }, { name: 'external-contributor:stale', color: 'db6d28', description: 'The mirrored PR is stale and waiting for a fresh approval to refresh.' }, { name: 'external-contributor:completed', color: '2da44e', description: 'The mirrored PR has been merged and the external contributor flow is complete.' 
}, ]; const MANAGED_LABELS = new Set(LABELS.map((label) => label.name)); const MANAGED_COMMENT_AUTHOR = 'github-actions[bot]'; const CLAIM_RE = //; const OWNED_RE = //; const NOTICE_MARKER = ''; const NOTICE_LINES = [ 'This PR is from an external contributor and must be approved by a stagehand team member with write access before CI can run.', 'Approving the latest commit mirrors it into an internal PR owned by the approver.', 'If new commits are pushed later, the internal PR stays open but is marked stale until someone approves the latest external commit and refreshes it.', ]; async function ensureLabels(github, context) { for (const label of LABELS) { try { await github.rest.issues.getLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label.name }); } catch (error) { if (error.status !== 404) throw error; try { await github.rest.issues.createLabel({ owner: context.repo.owner, repo: context.repo.repo, name: label.name, color: label.color, description: label.description, }); } catch (createError) { if (createError.status !== 422) throw createError; } } } } async function listComments(github, context, issueNumber) { return github.paginate(github.rest.issues.listComments, { owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, per_page: 100, }); } function isManagedComment(comment) { return comment.user?.login === MANAGED_COMMENT_AUTHOR; } function defaultManagedBranch(prNumber) { return `external-contributor-pr-${prNumber}`; } function sanitizeManagedBranch(prNumber, branch) { const fallback = defaultManagedBranch(prNumber); if (!branch) return fallback; const allowed = new RegExp(`^external-contributor-pr-${prNumber}(?:-[A-Za-z0-9._-]+)?$`); return allowed.test(branch) ? 
branch : fallback; } async function upsertComment(github, context, issueNumber, marker, lines) { const comments = await listComments(github, context, issueNumber); const body = [marker, ...lines].join('\n'); const existing = comments.find((comment) => isManagedComment(comment) && comment.body?.includes(marker)); if (!existing) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, body }); return; } if (existing.body !== body) { await github.rest.issues.updateComment({ owner: context.repo.owner, repo: context.repo.repo, comment_id: existing.id, body }); } } async function syncLabels(github, context, issueNumber, desiredLabels) { const { data: issue } = await github.rest.issues.get({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, }); const existingNames = issue.labels.map((label) => typeof label === 'string' ? label : label.name).filter(Boolean); const preserved = existingNames.filter((label) => !MANAGED_LABELS.has(label)); await github.rest.issues.setLabels({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, labels: [...preserved, ...desiredLabels], }); } async function findLatestClaim(github, context, issueNumber) { const comments = await listComments(github, context, issueNumber); return [...comments] .reverse() .map((comment) => { if (!isManagedComment(comment)) return null; const match = comment.body?.match(CLAIM_RE); if (!match) return null; const sourcePrNumber = issueNumber; return { ownedPrNumber: Number(match[1]), sourceSha: match[2], claimer: match[3], branch: sanitizeManagedBranch(sourcePrNumber, match[4]), }; }) .find(Boolean); } async function externalLifecycle({ github, context }) { const pr = context.payload.pull_request; await ensureLabels(github, context); if (context.payload.action === 'opened' || context.payload.action === 'reopened') { await upsertComment(github, context, pr.number, NOTICE_MARKER, NOTICE_LINES); const 
latestClaim = await findLatestClaim(github, context, pr.number); if (context.payload.action === 'reopened' && latestClaim && latestClaim.sourceSha === pr.head.sha) { const { data: ownedPr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, pull_number: latestClaim.ownedPrNumber, }); if (ownedPr.state === 'open') { await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:mirrored']); await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pr.number, body: `This external contributor PR is already mirrored to ${ownedPr.html_url}. Closing it again so discussion stays on the internal PR until fresh commits require another approval.`, }); await github.rest.pulls.update({ owner: context.repo.owner, repo: context.repo.repo, pull_number: pr.number, state: 'closed' }); return; } } await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:awaiting-approval']); return; } const latestClaim = await findLatestClaim(github, context, pr.number); if (!latestClaim || latestClaim.sourceSha === pr.head.sha) return; const { data: ownedPr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, pull_number: latestClaim.ownedPrNumber, }); if (ownedPr.state !== 'open') return; await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:awaiting-approval']); await syncLabels(github, context, ownedPr.number, ['external-contributor', 'external-contributor:stale']); await upsertComment(github, context, ownedPr.number, '', [ `This mirrored PR is stale because the original external contributor PR #${pr.number} received new commits (\`${latestClaim.sourceSha}\` -> \`${pr.head.sha}\`).`, `Original PR: ${pr.html_url}`, '', 'Approve the latest external commit to refresh this same internal PR in place.', ]); if (pr.state === 'closed') { await github.rest.pulls.update({ owner: 
context.repo.owner, repo: context.repo.repo, pull_number: pr.number, state: 'open' }); } await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: ownedPr.number, body: `New commits landed on external contributor PR #${pr.number} (\`${latestClaim.sourceSha}\` -> \`${pr.head.sha}\`). This mirrored PR stays open but is now stale until the latest external commit is approved and copied over.`, }); await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pr.number, body: `New commits were pushed to this external contributor PR (\`${latestClaim.sourceSha}\` -> \`${pr.head.sha}\`). The mirrored PR ${ownedPr.html_url} remains open but is marked stale. A stagehand team member with write access must approve the latest commit to refresh that internal PR.`, }); } async function prepareClaim({ github, context, core, artifactPath }) { const fs = require('fs'); const handoff = JSON.parse(fs.readFileSync(artifactPath, 'utf8')); core.setOutput('should-claim', 'false'); if (!handoff.shouldClaim || !handoff.prNumber || !handoff.reviewer || !handoff.approvedSha) return; const { data: pr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, pull_number: Number(handoff.prNumber), }); if (pr.head.repo.full_name === context.payload.repository.full_name || pr.state !== 'open') return; const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({ owner: context.repo.owner, repo: context.repo.repo, username: handoff.reviewer, }); if (!new Set(['admin', 'maintain', 'write']).has(permission.permission)) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pr.number, body: `@${handoff.reviewer} submitted an approving review, but only stagehand team members with write access can claim external contributor PRs. 
A maintainer with write access must approve the latest commit to proceed.`, }); return; } if (pr.head.sha !== handoff.approvedSha) return; const latestClaim = await findLatestClaim(github, context, pr.number); const branch = sanitizeManagedBranch(pr.number, latestClaim?.branch); const title = `[Claimed #${pr.number}] ${pr.title}`; const body = [ `Mirrored from external contributor PR #${pr.number} after approval by @${handoff.reviewer}.`, '', `Original author: @${pr.user.login}`, `Original PR: ${pr.html_url}`, `Approved source head SHA: \`${pr.head.sha}\``, '', `@${pr.user.login}, please continue any follow-up discussion on this mirrored PR. When the external PR gets new commits, this same internal PR will be marked stale until the latest external commit is approved and refreshed here.`, '', '## Original description', pr.body?.trim() || '_No description provided._', '', ``, ].join('\n'); const { data: ownedPrs } = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'all', head: `${context.repo.owner}:${branch}`, base: 'main', per_page: 100, }); core.setOutput('should-claim', 'true'); core.setOutput('claimer', handoff.reviewer); core.setOutput('pr-number', String(pr.number)); core.setOutput('source-sha', pr.head.sha); core.setOutput('previous-source-sha', latestClaim?.sourceSha || ''); core.setOutput('branch', branch); core.setOutput('title', title); core.setOutput('body', body); core.setOutput('owned-pr-number', ownedPrs[0] ? String(ownedPrs[0].number) : ''); core.setOutput('owned-pr-merged', ownedPrs[0]?.merged_at ? 
'true' : 'false'); } async function finalizeClaim({ github, context, input }) { await ensureLabels(github, context); const { prNumber, sourceSha, branch, claimer, title, body, existingNumber, existingMerged, refreshStatus, refreshReason, } = input; if (refreshStatus !== 'updated') { if (existingNumber) { await syncLabels(github, context, Number(existingNumber), ['external-contributor', 'external-contributor:stale']); await upsertComment(github, context, Number(existingNumber), '', [ `This mirrored PR could not be refreshed automatically after approval by @${claimer}.`, '', `Refresh reason: \`${refreshReason || 'unknown'}\``, 'Resolve the branch manually, then keep using this same mirrored PR.', ]); } await syncLabels(github, context, prNumber, ['external-contributor', 'external-contributor:awaiting-approval']); await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: prNumber, body: `The latest approval by @${claimer} could not refresh the mirrored PR automatically (${refreshReason || 'unknown reason'}). 
The external PR stays open, and the mirrored PR should be updated manually before work continues.`, }); return; } let ownedPr; if (existingNumber && !existingMerged) { const { data } = await github.rest.pulls.update({ owner: context.repo.owner, repo: context.repo.repo, pull_number: Number(existingNumber), title, body, base: 'main', state: 'open', }); ownedPr = data; } else { const { data } = await github.rest.pulls.create({ owner: context.repo.owner, repo: context.repo.repo, title, body, head: branch, base: 'main', }); ownedPr = data; } await github.rest.issues.addAssignees({ owner: context.repo.owner, repo: context.repo.repo, issue_number: ownedPr.number, assignees: [claimer], }); await syncLabels(github, context, prNumber, ['external-contributor', 'external-contributor:mirrored']); await syncLabels(github, context, ownedPr.number, ['external-contributor', 'external-contributor:mirrored']); await upsertComment(github, context, ownedPr.number, '', [ `This mirrored PR tracks external contributor PR #${prNumber} at source SHA \`${sourceSha}\`, approved by @${claimer}.`, `Original PR: ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}`, '', 'When the external PR gets new commits, this same internal PR will be refreshed in place after the latest external commit is approved.', ]); const marker = ``; const comments = await listComments(github, context, prNumber); if (!comments.some((comment) => comment.body?.includes(marker))) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: prNumber, body: [marker, `This PR was approved by @${claimer} and mirrored to ${ownedPr.html_url}. 
All further discussion should happen on that PR.`].join('\n'), }); } const { data: externalPr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, pull_number: prNumber, }); if (externalPr.state !== 'closed') { await github.rest.pulls.update({ owner: context.repo.owner, repo: context.repo.repo, pull_number: prNumber, state: 'closed' }); } } async function syncOwnedPr({ github, context }) { const pr = context.payload.pull_request; const match = pr.body?.match(OWNED_RE); if (!match) return; const sourcePrNumber = Number(match[1]); const sourceSha = match[2]; await ensureLabels(github, context); const { data: externalPr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, pull_number: sourcePrNumber, }); if (context.payload.action === 'reopened') { await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:mirrored']); await syncLabels(github, context, sourcePrNumber, ['external-contributor', 'external-contributor:mirrored']); if (externalPr.state !== 'closed') { await github.rest.pulls.update({ owner: context.repo.owner, repo: context.repo.repo, pull_number: sourcePrNumber, state: 'closed' }); } return; } if (pr.merged) { await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:completed']); await syncLabels(github, context, sourcePrNumber, ['external-contributor', 'external-contributor:completed']); await upsertComment(github, context, pr.number, '', [ `This mirrored PR has been merged into \`main\`. The original external PR ${externalPr.html_url} is now completed.`, ]); await upsertComment(github, context, sourcePrNumber, ``, [ `The mirrored PR ${pr.html_url} has been merged into \`main\`. 
This original external contributor PR will stay closed as completed.`, ]); return; } await syncLabels(github, context, pr.number, ['external-contributor', 'external-contributor:stale']); await syncLabels(github, context, sourcePrNumber, ['external-contributor', 'external-contributor:awaiting-approval']); if (externalPr.head.sha !== sourceSha) { await upsertComment(github, context, pr.number, '', [ `This mirrored PR is stale because the original external PR ${externalPr.html_url} now points at a different source SHA.`, 'Approve the latest external commit to refresh this same internal PR.', ]); return; } if (externalPr.state === 'closed') { await github.rest.pulls.update({ owner: context.repo.owner, repo: context.repo.repo, pull_number: sourcePrNumber, state: 'open' }); } await upsertComment(github, context, sourcePrNumber, ``, [ `The mirrored PR ${pr.html_url} was closed without merge. This original PR has been reopened and is awaiting a fresh approving review from a stagehand team member with write access.`, ]); await upsertComment(github, context, pr.number, '', [ `This mirrored PR was closed without merge. 
The original external PR ${externalPr.html_url} has been reopened and relabeled as awaiting approval.`, ]); } return { externalLifecycle, prepareClaim, finalizeClaim, syncOwnedPr }; })() concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.event.workflow_run.id }} cancel-in-progress: false jobs: manage-external-pr: if: github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository runs-on: ubuntu-latest steps: - name: Sync external PR lifecycle if: github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'synchronize' uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const lib = eval(process.env.ECPR_LIB); await lib.externalLifecycle({ github, context }); claim-approved-pr: if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest steps: - name: Download approval handoff artifact uses: actions/download-artifact@v4 with: name: approved-review path: approval-handoff github-token: ${{ secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} run-id: ${{ github.event.workflow_run.id }} - name: Prepare approved claim id: prepare-claim uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const lib = eval(process.env.ECPR_LIB); await lib.prepareClaim({ github, context, core, artifactPath: 'approval-handoff/approval-handoff.json' }); - name: Checkout repository for branch operations if: steps.prepare-claim.outputs.should-claim == 'true' uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: true - name: Refresh internal branch if: steps.prepare-claim.outputs.should-claim == 'true' id: refresh-branch continue-on-error: true env: INTERNAL_BRANCH: ${{ steps.prepare-claim.outputs.branch }} PR_NUMBER: ${{ steps.prepare-claim.outputs.pr-number }} PREVIOUS_SOURCE_SHA: ${{ 
steps.prepare-claim.outputs.previous-source-sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -uo pipefail refresh_status="conflict" refresh_reason="unknown" write_outputs() { echo "refresh-status=${refresh_status}" >> "$GITHUB_OUTPUT" if [ -n "${refresh_reason}" ]; then echo "reason=${refresh_reason}" >> "$GITHUB_OUTPUT" fi } trap write_outputs EXIT if ! git config user.name "github-actions[bot]"; then refresh_reason="git-config-failed" exit 0 fi if ! git config user.email "41898282+github-actions[bot]@users.noreply.github.com"; then refresh_reason="git-config-failed" exit 0 fi if ! git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"; then refresh_reason="remote-auth-failed" exit 0 fi if ! git fetch origin "pull/${PR_NUMBER}/head:refs/remotes/origin/external-pr-head-${PR_NUMBER}"; then refresh_reason="fetch-external-failed" exit 0 fi external_ref="refs/remotes/origin/external-pr-head-${PR_NUMBER}" branch_exists=false if git ls-remote --exit-code --heads origin "${INTERNAL_BRANCH}" >/dev/null 2>&1; then branch_exists=true if ! git fetch origin "${INTERNAL_BRANCH}:refs/remotes/origin/${INTERNAL_BRANCH}"; then refresh_reason="fetch-internal-failed" exit 0 fi fi if [ "${branch_exists}" = false ]; then if ! git checkout -B "${INTERNAL_BRANCH}" "${external_ref}"; then refresh_reason="checkout-failed" exit 0 fi if ! git push --force-with-lease origin "HEAD:refs/heads/${INTERNAL_BRANCH}"; then refresh_reason="push-failed" exit 0 fi refresh_status="updated" refresh_reason="" exit 0 fi if ! git checkout -B "${INTERNAL_BRANCH}" "refs/remotes/origin/${INTERNAL_BRANCH}"; then refresh_reason="checkout-failed" exit 0 fi if [ -z "${PREVIOUS_SOURCE_SHA}" ]; then refresh_reason="missing-previous-source" exit 0 fi if git rebase --onto "${external_ref}" "${PREVIOUS_SOURCE_SHA}" "${INTERNAL_BRANCH}"; then if ! 
git push --force-with-lease origin "HEAD:refs/heads/${INTERNAL_BRANCH}"; then refresh_reason="push-failed" exit 0 fi refresh_status="updated" refresh_reason="" exit 0 fi git rebase --abort || true refresh_reason="rebase-conflict" - name: Finalize approved claim if: always() && steps.prepare-claim.outputs.should-claim == 'true' uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const lib = eval(process.env.ECPR_LIB); await lib.finalizeClaim({ github, context, input: { prNumber: Number('${{ steps.prepare-claim.outputs.pr-number }}'), sourceSha: ${{ toJson(steps.prepare-claim.outputs.source-sha) }}, branch: ${{ toJson(steps.prepare-claim.outputs.branch) }}, claimer: ${{ toJson(steps.prepare-claim.outputs.claimer) }}, title: ${{ toJson(steps.prepare-claim.outputs.title) }}, body: ${{ toJson(steps.prepare-claim.outputs.body) }}, existingNumber: ${{ toJson(steps.prepare-claim.outputs.owned-pr-number) }}, existingMerged: '${{ steps.prepare-claim.outputs.owned-pr-merged }}' === 'true', refreshStatus: ${{ toJson(steps.refresh-branch.outputs.refresh-status) }}, refreshReason: ${{ toJson(steps.refresh-branch.outputs.reason) }}, }, }); sync-owned-pr: if: github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name == github.repository && (github.event.action == 'closed' || github.event.action == 'reopened') runs-on: ubuntu-latest steps: - name: Sync mirrored PR lifecycle uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const lib = eval(process.env.ECPR_LIB); await lib.syncOwnedPr({ github, context }); ================================================ FILE: .github/workflows/feature-parity.yml ================================================ name: Feature Parity on: pull_request: types: - opened - synchronize - labeled - unlabeled paths-ignore: - "packages/docs/**" jobs: check-parity-label: runs-on: ubuntu-latest if: github.event.action == 'labeled' && 
github.event.label.name == 'parity' permissions: contents: read pull-requests: write issues: write steps: - name: Check out repository code uses: actions/checkout@v4 - name: Check user permissions uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({ owner: context.repo.owner, repo: context.repo.repo, username: context.actor }); const hasWriteAccess = ['admin', 'write'].includes(permission.permission); if (!hasWriteAccess) { // Remove the parity label if user doesn't have write access await github.rest.issues.removeLabel({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, name: 'parity' }); // Add a comment explaining why the label was removed await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `❌ **Parity Label Removed**\n\n@${context.actor}, you do not have sufficient permissions to add the 'parity' label. 
Only users with write access can trigger feature parity issues.\n\nIf you believe this feature should be implemented in the Python SDK, please ask a maintainer to add the label.`
              });

              throw new Error(`User ${context.actor} does not have write access to add parity label`);
            }

            console.log(`User ${context.actor} has ${permission.permission} access - proceeding with parity workflow`);

      # Mint a GitHub App installation token for the cross-repository issue creation below.
      - name: Generate GitHub App token
        id: generate-token
        uses: actions/create-github-app-token@v1
        with:
          app-id: ${{ secrets.PARITY_APP_ID }}
          private-key: ${{ secrets.PARITY_APP_PRIVATE_KEY }}
          owner: browserbase
          repositories: stagehand

      # Builds an issue from the PR's title, description, changed files and latest
      # comments, files it in browserbase/stagehand-python, then links it on the PR.
      - name: Create issue in Python SDK repository
        uses: actions/github-script@v7
        with:
          github-token: ${{ steps.generate-token.outputs.token }}
          script: |
            const { data: pullRequest } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: context.issue.number,
            });

            // Get PR comments for additional context. Paginate so that the last three
            // entries really are the newest comments: one unpaginated call returns only
            // the first page, which holds the oldest comments.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            // Format comments for the issue description
            let commentsSection = '';
            if (comments.length > 0) {
              commentsSection = '\n\n## Recent Comments\n\n';
              comments.slice(-3).forEach(comment => {
                // Author and body may be missing in the API response; fall back
                // instead of throwing while formatting.
                const author = comment.user?.login ?? 'unknown';
                const text = comment.body ?? '';
                commentsSection += `**@${author}** commented:\n`;
                commentsSection += `${text.substring(0, 500)}${text.length > 500 ? '...' : ''}\n\n`;
              });
            }

            // Get list of changed files for context (paginated so large PRs are not truncated)
            const files = await github.paginate(github.rest.pulls.listFiles, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: context.issue.number,
              per_page: 100,
            });

            const changedFiles = files.map(file => `- \`${file.filename}\``).join('\n');

            const issueTitle = `[Feature Parity] ${pullRequest.title}`;
            const issueBody = `## Feature Parity Request This issue was automatically created from a pull request in the TypeScript Stagehand repository that was labeled with 'parity'.
### Original PR Details - **PR**: #${context.issue.number} - ${pullRequest.title} - **Author**: @${pullRequest.user.login} - **Link**: ${pullRequest.html_url} ### Description ${pullRequest.body || 'No description provided.'} ### Changed Files ${changedFiles} ${commentsSection} ### Action Required Please review the changes in the original PR and implement equivalent functionality in the Python SDK if applicable. --- *This issue was automatically generated by the Feature Parity workflow.*`; // Create the issue in the Python repository const { data: issue } = await github.rest.issues.create({ owner: 'browserbase', repo: 'stagehand-python', title: issueTitle, body: issueBody, labels: ['parity'] }); console.log(`Created issue: ${issue.html_url}`); // Add a comment to the original PR confirming the issue was created await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `🔄 **Feature Parity Issue Created**\n\nAn issue has been automatically created in the Python SDK repository to track parity implementation:\n${issue.html_url}` }); ================================================ FILE: .github/workflows/release.yml ================================================ name: Release on: push: branches: - main permissions: contents: write pull-requests: write id-token: write concurrency: ${{ github.workflow }}-${{ github.ref }} jobs: release: name: Release runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 with: fetch-depth: 0 - uses: ./.github/actions/setup-node-pnpm-turbo with: use-prebuilt-artifacts: "false" - name: Configure npm registry for Trusted Publishing uses: actions/setup-node@v6 with: node-version: 20.x registry-url: "https://registry.npmjs.org" - name: Update npm for Trusted Publishing run: npm install -g npm@latest - name: Run Lint & Build run: pnpm exec turbo run lint && pnpm exec turbo run build - name: Create Release Pull Request or Publish to npm id: 
changesets uses: changesets/action@v1 with: publish: pnpm run release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Publish Canary if: github.ref == 'refs/heads/main' run: | git checkout main pnpm run release-canary env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/stagehand-server-v3-release.yml ================================================ name: Release stagehand/server-v3 on: push: branches: - main paths: - .changeset/** workflow_dispatch: permissions: contents: write concurrency: ${{ github.workflow }}-${{ github.ref }} env: OAS_PATH: packages/server-v3/openapi.v3.yaml jobs: detect: name: Detect server-v3 release (changesets) runs-on: ubuntu-latest outputs: release: ${{ steps.meta.outputs.release }} version: ${{ steps.meta.outputs.version }} tag: ${{ steps.meta.outputs.tag }} steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 fetch-tags: true - uses: ./.github/actions/setup-node-pnpm-turbo env: PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1" with: use-prebuilt-artifacts: "false" - name: Determine release metadata id: meta shell: bash run: | set -euo pipefail latest_tag="$(git tag -l 'stagehand-server-v3/v*' --sort=-v:refname | head -n 1 || true)" rm -f changeset-status.json if [ -n "${latest_tag}" ]; then pnpm changeset status --since "${latest_tag}" --output changeset-status.json else pnpm changeset status --output changeset-status.json fi node <<'NODE' const fs = require('fs'); const status = JSON.parse(fs.readFileSync('changeset-status.json', 'utf8')); const changesets = Array.isArray(status.changesets) ? status.changesets : []; const releases = Array.isArray(status.releases) ? 
status.releases : []; const shouldRelease = changesets.some((cs) => (cs.releases || []).some((r) => r?.name === '@browserbasehq/stagehand-server-v3') ); const serverRelease = releases.find((r) => r?.name === '@browserbasehq/stagehand-server-v3'); if (shouldRelease && !serverRelease?.newVersion) { throw new Error( 'Expected @browserbasehq/stagehand-server-v3 to have a computed newVersion in changeset-status.json.' ); } const release = shouldRelease ? 'true' : 'false'; const version = shouldRelease ? serverRelease.newVersion : ''; const tag = `stagehand-server-v3/v${version}`; const out = process.env.GITHUB_OUTPUT; fs.appendFileSync(out, `release=${release}\n`); fs.appendFileSync(out, `version=${version}\n`); fs.appendFileSync(out, `tag=${tag}\n`); NODE - name: Create stagehand/server-v3 tag if: steps.meta.outputs.release == 'true' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash run: | set -euo pipefail TAG="${{ steps.meta.outputs.tag }}" VERSION="${{ steps.meta.outputs.version }}" TARGET_SHA="${{ github.sha }}" git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" # Try to fetch the tag if it exists on remote; ignore failure for new tags git fetch --force origin "refs/tags/${TAG}:refs/tags/${TAG}" 2>/dev/null || true if git rev-parse -q --verify "refs/tags/${TAG}" >/dev/null; then echo "Tag already exists: ${TAG}" exit 0 fi git tag -a "${TAG}" "${TARGET_SHA}" -m "stagehand/server-v3 v${VERSION}" git push origin "${TAG}" build_binaries: name: Build SEA binaries needs: detect if: needs.detect.outputs.release == 'true' uses: ./.github/workflows/stagehand-server-v3-sea-build.yml with: matrix: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v3-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v3-linux-arm64","include_sourcemaps":false}, 
{"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v3-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v3-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v3-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v3-win32-arm64.exe","include_sourcemaps":false} ] release: name: Publish GitHub Release needs: [detect, build_binaries] if: needs.detect.outputs.release == 'true' runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 fetch-tags: false - name: Prepare release assets directory run: mkdir -p release-assets - name: Prepare stagehand/server-v3 release assets run: | set -euo pipefail cp "${{ env.OAS_PATH }}" "release-assets/openapi.v3.stagehand-server-v3-${{ needs.detect.outputs.version }}.yaml" - name: Download SEA binary artifacts uses: actions/download-artifact@v4 with: pattern: stagehand-server-v3-* path: . merge-multiple: true - name: Collect SEA binaries shell: bash run: | set -euo pipefail shopt -s nullglob for f in packages/server-v3/dist/sea/stagehand-server-v3-*; do cp "$f" release-assets/ done - name: Create checksums shell: bash run: | set -euo pipefail cd release-assets # Only checksum binaries (exclude openapi yaml). Avoid failing if no matches. 
shopt -s nullglob files=(stagehand-server-v3-*) bins=() for f in "${files[@]}"; do [[ "$f" == *openapi* ]] && continue [[ -f "$f" ]] && bins+=("$f") done : > checksums.sha256 if [ "${#bins[@]}" -gt 0 ]; then shasum -a 256 "${bins[@]}" > checksums.sha256 fi - name: Publish stagehand/server-v3 GitHub release uses: softprops/action-gh-release@v2 with: tag_name: ${{ needs.detect.outputs.tag }} name: stagehand/server-v3 v${{ needs.detect.outputs.version }} generate_release_notes: true files: | release-assets/openapi.v3.stagehand-server-v3-${{ needs.detect.outputs.version }}.yaml release-assets/stagehand-server-v3-* release-assets/checksums.sha256 ================================================ FILE: .github/workflows/stagehand-server-v3-sea-build.yml ================================================ name: Stagehand Server v3 SEA Build on: workflow_call: inputs: matrix: description: "JSON matrix include list for SEA binaries." required: false type: string default: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v3-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v3-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v3-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v3-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v3-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v3-win32-arm64.exe","include_sourcemaps":false} ] use-prebuilt-artifacts: description: "Whether to download pre-built package artifacts." required: false type: string default: "false" restore-turbo-cache: description: "Whether to restore local .turbo cache." 
required: false type: string default: "true" node-version: description: "Node.js version for setup." required: false type: string default: "20.x" upload-only-binary: description: "Upload only this binary (empty => upload all)." required: false type: string default: "" workflow_dispatch: inputs: matrix: description: "JSON matrix include list for SEA binaries." required: false default: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v3-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v3-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v3-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v3-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v3-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v3-win32-arm64.exe","include_sourcemaps":false} ] use-prebuilt-artifacts: description: "Whether to download pre-built package artifacts." required: false type: string default: "false" restore-turbo-cache: description: "Whether to restore local .turbo cache." required: false type: string default: "true" node-version: description: "Node.js version for setup." required: false type: string default: "20.x" upload-only-binary: description: "Upload only this binary (empty => upload all)." 
required: false type: string default: "" jobs: build_binaries: name: Build SEA binaries (${{ matrix.binary_name }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: ${{ fromJson(inputs.matrix) }} steps: - name: Checkout repository uses: actions/checkout@v6 with: fetch-depth: 1 fetch-tags: false - uses: ./.github/actions/setup-node-pnpm-turbo env: PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1" PLAYWRIGHT_SKIP_DOWNLOAD: "1" PUPPETEER_SKIP_DOWNLOAD: "1" with: use-prebuilt-artifacts: ${{ inputs.use-prebuilt-artifacts }} restore-turbo-cache: ${{ inputs.restore-turbo-cache }} node-version: ${{ inputs.node-version }} - name: Build SEA binary (ESM) env: SEA_TARGET_PLATFORM: ${{ matrix.platform }} SEA_TARGET_ARCH: ${{ matrix.arch }} SEA_BINARY_NAME: ${{ matrix.binary_name }} SEA_INCLUDE_SOURCEMAPS: ${{ matrix.include_sourcemaps && '1' || '0' }} run: pnpm exec turbo run build:sea:esm --filter=@browserbasehq/stagehand-server-v3 - name: Verify SEA binary exists shell: bash run: | test -f "packages/server-v3/dist/sea/${{ matrix.binary_name }}" - name: Verify SEA binary launches cleanly shell: bash env: RUNNER_ARCH: ${{ runner.arch }} run: | set -euo pipefail binary="packages/server-v3/dist/sea/${{ matrix.binary_name }}" matrix_arch="${{ matrix.arch }}" runner_arch="$(echo "${RUNNER_ARCH}" | tr '[:upper:]' '[:lower:]')" if [[ "${matrix_arch}" != "${runner_arch}" ]]; then echo "Runner arch (${runner_arch}) does not match matrix arch (${matrix_arch})." echo "Launch verification must run on same-arch runners." exit 1 fi if [[ "${{ matrix.platform }}" != "win32" ]]; then chmod +x "${binary}" fi port="$((30000 + RANDOM % 10000))" log_file="$(mktemp)" launched="false" cleanup() { if [[ -n "${pid:-}" ]] && kill -0 "${pid}" 2>/dev/null; then kill "${pid}" 2>/dev/null || true wait "${pid}" 2>/dev/null || true fi } trap cleanup EXIT PORT="${port}" "${binary}" >"${log_file}" 2>&1 & pid=$! for _ in {1..30}; do if ! 
kill -0 "${pid}" 2>/dev/null; then wait "${pid}" 2>/dev/null || true echo "SEA binary exited before becoming healthy." cat "${log_file}" exit 1 fi if curl --silent --show-error --fail "http://127.0.0.1:${port}/healthz" >/dev/null; then launched="true" break fi sleep 1 done if [[ "${launched}" != "true" ]]; then echo "SEA binary did not become healthy within 30 seconds." cat "${log_file}" exit 1 fi - name: Upload artifact uses: actions/upload-artifact@v4 if: ${{ inputs.upload-only-binary == '' || matrix.binary_name == inputs.upload-only-binary }} with: name: ${{ matrix.binary_name }} # package.json is included to anchor artifact paths at repo root. path: | package.json packages/server-v3/dist/sea/${{ matrix.binary_name }} retention-days: 7 ================================================ FILE: .github/workflows/stagehand-server-v4-release.yml ================================================ name: Release stagehand/server-v4 on: push: branches: - main paths: - .changeset/** workflow_dispatch: permissions: contents: write concurrency: ${{ github.workflow }}-${{ github.ref }} env: OAS_PATH: packages/server-v4/openapi.v4.yaml jobs: detect: name: Detect server-v4 release (changesets) runs-on: ubuntu-latest outputs: release: ${{ steps.meta.outputs.release }} version: ${{ steps.meta.outputs.version }} tag: ${{ steps.meta.outputs.tag }} steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 fetch-tags: true - uses: ./.github/actions/setup-node-pnpm-turbo env: PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1" with: use-prebuilt-artifacts: "false" - name: Determine release metadata id: meta shell: bash run: | set -euo pipefail latest_tag="$(git tag -l 'stagehand-server-v4/v*' --sort=-v:refname | head -n 1 || true)" rm -f changeset-status.json if [ -n "${latest_tag}" ]; then pnpm changeset status --since "${latest_tag}" --output changeset-status.json else pnpm changeset status --output changeset-status.json fi node <<'NODE' const fs = require('fs'); const status 
= JSON.parse(fs.readFileSync('changeset-status.json', 'utf8')); const changesets = Array.isArray(status.changesets) ? status.changesets : []; const releases = Array.isArray(status.releases) ? status.releases : []; const shouldRelease = changesets.some((cs) => (cs.releases || []).some((r) => r?.name === '@browserbasehq/stagehand-server-v4') ); const serverRelease = releases.find((r) => r?.name === '@browserbasehq/stagehand-server-v4'); if (shouldRelease && !serverRelease?.newVersion) { throw new Error( 'Expected @browserbasehq/stagehand-server-v4 to have a computed newVersion in changeset-status.json.' ); } const release = shouldRelease ? 'true' : 'false'; const version = shouldRelease ? serverRelease.newVersion : ''; const tag = `stagehand-server-v4/v${version}`; const out = process.env.GITHUB_OUTPUT; fs.appendFileSync(out, `release=${release}\n`); fs.appendFileSync(out, `version=${version}\n`); fs.appendFileSync(out, `tag=${tag}\n`); NODE - name: Create stagehand/server-v4 tag if: steps.meta.outputs.release == 'true' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash run: | set -euo pipefail TAG="${{ steps.meta.outputs.tag }}" VERSION="${{ steps.meta.outputs.version }}" TARGET_SHA="${{ github.sha }}" git config user.name "github-actions[bot]" git config user.email "41898282+github-actions[bot]@users.noreply.github.com" # Try to fetch the tag if it exists on remote; ignore failure for new tags git fetch --force origin "refs/tags/${TAG}:refs/tags/${TAG}" 2>/dev/null || true if git rev-parse -q --verify "refs/tags/${TAG}" >/dev/null; then echo "Tag already exists: ${TAG}" exit 0 fi git tag -a "${TAG}" "${TARGET_SHA}" -m "stagehand/server-v4 v${VERSION}" git push origin "${TAG}" build_binaries: name: Build SEA binaries needs: detect if: needs.detect.outputs.release == 'true' uses: ./.github/workflows/stagehand-server-v4-sea-build.yml with: matrix: | [ 
{"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v4-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v4-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v4-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v4-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v4-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v4-win32-arm64.exe","include_sourcemaps":false} ] release: name: Publish GitHub Release needs: [detect, build_binaries] if: needs.detect.outputs.release == 'true' runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 fetch-tags: false - name: Prepare release assets directory run: mkdir -p release-assets - name: Prepare stagehand/server-v4 release assets run: | set -euo pipefail cp "${{ env.OAS_PATH }}" "release-assets/openapi.v4.stagehand-server-v4-${{ needs.detect.outputs.version }}.yaml" - name: Download SEA binary artifacts uses: actions/download-artifact@v4 with: pattern: stagehand-server-v4-* path: . merge-multiple: true - name: Collect SEA binaries shell: bash run: | set -euo pipefail shopt -s nullglob for f in packages/server-v4/dist/sea/stagehand-server-v4-*; do cp "$f" release-assets/ done - name: Create checksums shell: bash run: | set -euo pipefail cd release-assets # Only checksum binaries (exclude openapi yaml). Avoid failing if no matches. 
shopt -s nullglob files=(stagehand-server-v4-*) bins=() for f in "${files[@]}"; do [[ "$f" == *openapi* ]] && continue [[ -f "$f" ]] && bins+=("$f") done : > checksums.sha256 if [ "${#bins[@]}" -gt 0 ]; then shasum -a 256 "${bins[@]}" > checksums.sha256 fi - name: Publish stagehand/server-v4 GitHub release uses: softprops/action-gh-release@v2 with: tag_name: ${{ needs.detect.outputs.tag }} name: stagehand/server-v4 v${{ needs.detect.outputs.version }} generate_release_notes: true files: | release-assets/openapi.v4.stagehand-server-v4-${{ needs.detect.outputs.version }}.yaml release-assets/stagehand-server-v4-* release-assets/checksums.sha256 ================================================ FILE: .github/workflows/stagehand-server-v4-sea-build.yml ================================================ name: Stagehand Server v4 SEA Build on: workflow_call: inputs: matrix: description: "JSON matrix include list for SEA binaries." required: false type: string default: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v4-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v4-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v4-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v4-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v4-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v4-win32-arm64.exe","include_sourcemaps":false} ] use-prebuilt-artifacts: description: "Whether to download pre-built package artifacts." required: false type: string default: "false" restore-turbo-cache: description: "Whether to restore local .turbo cache." 
required: false type: string default: "true" node-version: description: "Node.js version for setup." required: false type: string default: "20.x" upload-only-binary: description: "Upload only this binary (empty => upload all)." required: false type: string default: "" workflow_dispatch: inputs: matrix: description: "JSON matrix include list for SEA binaries." required: false default: | [ {"os":"ubuntu-latest","platform":"linux","arch":"x64","binary_name":"stagehand-server-v4-linux-x64","include_sourcemaps":false}, {"os":"ubuntu-24.04-arm","platform":"linux","arch":"arm64","binary_name":"stagehand-server-v4-linux-arm64","include_sourcemaps":false}, {"os":"macos-15","platform":"darwin","arch":"arm64","binary_name":"stagehand-server-v4-darwin-arm64","include_sourcemaps":false}, {"os":"macos-15-intel","platform":"darwin","arch":"x64","binary_name":"stagehand-server-v4-darwin-x64","include_sourcemaps":false}, {"os":"windows-latest","platform":"win32","arch":"x64","binary_name":"stagehand-server-v4-win32-x64.exe","include_sourcemaps":false}, {"os":"windows-11-arm","platform":"win32","arch":"arm64","binary_name":"stagehand-server-v4-win32-arm64.exe","include_sourcemaps":false} ] use-prebuilt-artifacts: description: "Whether to download pre-built package artifacts." required: false type: string default: "false" restore-turbo-cache: description: "Whether to restore local .turbo cache." required: false type: string default: "true" node-version: description: "Node.js version for setup." required: false type: string default: "20.x" upload-only-binary: description: "Upload only this binary (empty => upload all)." 
required: false type: string default: "" jobs: build_binaries: name: Build SEA binaries (${{ matrix.binary_name }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: ${{ fromJson(inputs.matrix) }} steps: - name: Checkout repository uses: actions/checkout@v6 with: fetch-depth: 1 fetch-tags: false - uses: ./.github/actions/setup-node-pnpm-turbo env: PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1" PLAYWRIGHT_SKIP_DOWNLOAD: "1" PUPPETEER_SKIP_DOWNLOAD: "1" with: use-prebuilt-artifacts: ${{ inputs.use-prebuilt-artifacts }} restore-turbo-cache: ${{ inputs.restore-turbo-cache }} node-version: ${{ inputs.node-version }} - name: Build SEA binary (ESM) env: SEA_TARGET_PLATFORM: ${{ matrix.platform }} SEA_TARGET_ARCH: ${{ matrix.arch }} SEA_BINARY_NAME: ${{ matrix.binary_name }} SEA_INCLUDE_SOURCEMAPS: ${{ matrix.include_sourcemaps && '1' || '0' }} run: pnpm exec turbo run build:sea:esm --filter=@browserbasehq/stagehand-server-v4 - name: Verify SEA binary exists shell: bash run: | test -f "packages/server-v4/dist/sea/${{ matrix.binary_name }}" - name: Verify SEA binary launches cleanly shell: bash env: RUNNER_ARCH: ${{ runner.arch }} run: | set -euo pipefail binary="packages/server-v4/dist/sea/${{ matrix.binary_name }}" matrix_arch="${{ matrix.arch }}" runner_arch="$(echo "${RUNNER_ARCH}" | tr '[:upper:]' '[:lower:]')" if [[ "${matrix_arch}" != "${runner_arch}" ]]; then echo "Runner arch (${runner_arch}) does not match matrix arch (${matrix_arch})." echo "Launch verification must run on same-arch runners." exit 1 fi if [[ "${{ matrix.platform }}" != "win32" ]]; then chmod +x "${binary}" fi port="$((30000 + RANDOM % 10000))" log_file="$(mktemp)" launched="false" cleanup() { if [[ -n "${pid:-}" ]] && kill -0 "${pid}" 2>/dev/null; then kill "${pid}" 2>/dev/null || true wait "${pid}" 2>/dev/null || true fi } trap cleanup EXIT PORT="${port}" "${binary}" >"${log_file}" 2>&1 & pid=$! for _ in {1..30}; do if ! 
kill -0 "${pid}" 2>/dev/null; then wait "${pid}" 2>/dev/null || true echo "SEA binary exited before becoming healthy." cat "${log_file}" exit 1 fi if curl --silent --show-error --fail "http://127.0.0.1:${port}/healthz" >/dev/null; then launched="true" break fi sleep 1 done if [[ "${launched}" != "true" ]]; then echo "SEA binary did not become healthy within 30 seconds." cat "${log_file}" exit 1 fi - name: Upload artifact uses: actions/upload-artifact@v4 if: ${{ inputs.upload-only-binary == '' || matrix.binary_name == inputs.upload-only-binary }} with: name: ${{ matrix.binary_name }} # package.json is included to anchor artifact paths at repo root. path: | package.json packages/server-v4/dist/sea/${{ matrix.binary_name }} retention-days: 7 ================================================ FILE: .github/workflows/stainless.yml ================================================ name: Build SDKs for pull request on: pull_request: types: - opened - synchronize - reopened - closed concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} cancel-in-progress: true env: STAINLESS_ORG: ${{ vars.STAINLESS_ORG }} STAINLESS_PROJECT: ${{ vars.STAINLESS_PROJECT }} OAS_PATH: packages/server-v3/openapi.v3.yaml jobs: preview: if: github.event.action != 'closed' runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 2 - name: Run preview builds uses: stainless-api/upload-openapi-spec-action/preview@v1 with: stainless_api_key: ${{ secrets.STAINLESS_API_KEY }} org: ${{ env.STAINLESS_ORG }} project: ${{ env.STAINLESS_PROJECT }} oas_path: ${{ env.OAS_PATH }} config_path: stainless.yml merge: if: github.event.action == 'closed' && github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 2 - 
name: Run merge build uses: stainless-api/upload-openapi-spec-action/merge@v1 with: stainless_api_key: ${{ secrets.STAINLESS_API_KEY }} org: ${{ env.STAINLESS_ORG }} project: ${{ env.STAINLESS_PROJECT }} oas_path: ${{ env.OAS_PATH }} config_path: stainless.yml ================================================ FILE: .gitignore ================================================ node_modules/ /test-results/ /playwright-report/ /blob-report/ /playwright/.cache/ screenshot.png .DS_STORE .cache/ .env downloads/ dist/ .browserbase/ packages/evals/**/public packages/core/lib/dom/build/ packages/core/lib/v3/dom/build/ packages/evals/public *.tgz evals/playground.ts tmp/ eval-summary.json package-lock.json evals/deterministic/tests/BrowserContext/tmp-test.har packages/core/lib/version.ts packages/core/test-results/ /examples/inference_summary /inference_summary .turbo .idea coverage/ ctrf/ .stagehand-sea/ ================================================ FILE: .prettierignore ================================================ pnpm-lock.yaml README.md **/*.json docs/ .github/ dist/ node_modules/ lib/dom/build/ lib/v3/dom/build/ packages/core/dist/ packages/core/lib/dom/build/ packages/core/lib/v3/dom/build/ packages/cli/dist/ packages/evals/dist/ packages/docs/ *.min.js .browserbase/ .browserbase/** **/.browserbase/ **/.browserbase/** stainless.yml openapi.*.yaml ================================================ FILE: .prettierrc ================================================ {} ================================================ FILE: .vscode/settings.json ================================================ { "editor.defaultFormatter": "esbenp.prettier-vscode", "editor.formatOnSave": true } ================================================ FILE: CHANGELOG.md ================================================ # @browserbasehq/stagehand ## 3.0.0 ### Major Changes - Removes internal Playwright dependency - A generous 20-40% speed increase across `act`, `extract`, & `observe` calls - 
Compatibility with Playwright, Puppeteer, and Patchright - Automatic action caching (agent, stagehand.act). Go from CUA → deterministic scripts without inference - A suite of non-AI primitives: - `page` - `locator` (built-in closed-mode shadow root traversal, with xpaths & css selectors) - `frameLocator` - `deepLocator` (crosses iframes & shadow roots) - bun compatibility - Simplified extract schemas - CSS selector support (id-based support coming soon) - Targeted extract and observe across iframes & shadow roots - More intuitive type names (observeResult is now action, act accepts an instruction string instead of an action string, solidified ModelConfiguration) Check the [migration guide](https://docs.stagehand.dev/v3/migrations/v2) for more information ## 2.5.0 ### Minor Changes - [#981](https://github.com/browserbase/stagehand/pull/981) [`8244ab2`](https://github.com/browserbase/stagehand/commit/8244ab247cd679962685ae2f7c54e874ce1fa614) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for `stagehand.agent` to interact with MCP servers as well as custom tools to be passed in. For more information, reference the [MCP integrations documentation](https://docs.stagehand.dev/best-practices/mcp-integrations) ### Patch Changes - [#959](https://github.com/browserbase/stagehand/pull/959) [`09b5e1e`](https://github.com/browserbase/stagehand/commit/09b5e1e9c23c845903686db6665cc968ac34efbb) Thanks [@filip-michalsky](https://github.com/filip-michalsky)! - add webvoyager evals - [#1049](https://github.com/browserbase/stagehand/pull/1049) [`e3734b9`](https://github.com/browserbase/stagehand/commit/e3734b9c98352d5f0a4eca49791b0bbf2130ab41) Thanks [@miguelg719](https://github.com/miguelg719)! - Support local MCP server connections - [#1025](https://github.com/browserbase/stagehand/pull/1025) [`be85b19`](https://github.com/browserbase/stagehand/commit/be85b19679a826f19702e00f0aae72fce1118ec8) Thanks [@tkattkat](https://github.com/tkattkat)! 
- add support for custom baseUrl within openai provider - [#1040](https://github.com/browserbase/stagehand/pull/1040) [`88d1565`](https://github.com/browserbase/stagehand/commit/88d1565c65bb65a104fea2d5f5e862bbbda69677) Thanks [@miguelg719](https://github.com/miguelg719)! - Allow OpenAI CUA to take in an optional baseURL - [#1046](https://github.com/browserbase/stagehand/pull/1046) [`ab5d6ed`](https://github.com/browserbase/stagehand/commit/ab5d6ede19aabc059badc4247f1cb2c6c9e71bae) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for gpt-5 in operator agent ## 2.4.4 ### Patch Changes - [#1012](https://github.com/browserbase/stagehand/pull/1012) [`9e8c173`](https://github.com/browserbase/stagehand/commit/9e8c17374fdc8fbe7f26e6cf802c36bd14f11039) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix disabling api validation whenever a customLLM client is provided ## 2.4.3 ### Patch Changes - [#951](https://github.com/browserbase/stagehand/pull/951) [`f45afdc`](https://github.com/browserbase/stagehand/commit/f45afdccc8680650755fee66ffbeac32b41e075d) Thanks [@miguelg719](https://github.com/miguelg719)! - Patch GPT-5 new api format - [#954](https://github.com/browserbase/stagehand/pull/954) [`261bba4`](https://github.com/browserbase/stagehand/commit/261bba43fa79ac3af95328e673ef3e9fced3279b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for shadow DOMs (open & closed mode) when experimental: true - [#944](https://github.com/browserbase/stagehand/pull/944) [`8de7bd8`](https://github.com/browserbase/stagehand/commit/8de7bd8635c2051cd8025e365c6c8aa83d81c7e7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - Bump zod version compatibility and add pathing spec - [#919](https://github.com/browserbase/stagehand/pull/919) [`3d80421`](https://github.com/browserbase/stagehand/commit/3d804210a106a6828c7fa50f8b765b10afd4cc6a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- enable scrolling inside of iframes - [#963](https://github.com/browserbase/stagehand/pull/963) [`0ead63d`](https://github.com/browserbase/stagehand/commit/0ead63d6526f6c286362b74b6407c8bebc900e69) Thanks [@tkattkat](https://github.com/tkattkat)! - Properly handle images in evaluator + clean up response parsing logic - [#961](https://github.com/browserbase/stagehand/pull/961) [`8422828`](https://github.com/browserbase/stagehand/commit/8422828c4cd5fd5ebcf348cfbdb40c768bb76dd9) Thanks [@tkattkat](https://github.com/tkattkat)! - Add more evals for stagehand agent - [#946](https://github.com/browserbase/stagehand/pull/946) [`b769206`](https://github.com/browserbase/stagehand/commit/b7692060f98a2f49aeeefb90d8789ed034b08ec2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: unable to act on/get content from some same process iframes - [#962](https://github.com/browserbase/stagehand/pull/962) [`72d2683`](https://github.com/browserbase/stagehand/commit/72d2683202af7e578d98367893964b33e0828de5) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - handle namespaced elements in xpath build step ## 2.4.2 ### Patch Changes - [#865](https://github.com/browserbase/stagehand/pull/865) [`6b4e6e3`](https://github.com/browserbase/stagehand/commit/6b4e6e3f31d5496cf15728e9018eddeb04839542) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improve type safety for trimTrailingTextNode - [#897](https://github.com/browserbase/stagehand/pull/897) [`e77d018`](https://github.com/browserbase/stagehand/commit/e77d0188683ebf596dfb78dfafbbca1dc32993f0) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix selfHeal to remember initially received arguments - [#920](https://github.com/browserbase/stagehand/pull/920) [`c20adb9`](https://github.com/browserbase/stagehand/commit/c20adb95539fed8c56a4aa413262a9c65a8e6474) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix: tab handling on API - [#882](https://github.com/browserbase/stagehand/pull/882) [`b86df93`](https://github.com/browserbase/stagehand/commit/b86df93b9136aae96292121a29c25f3d74d84bf7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove elements that don't have xpaths from observe response - [#905](https://github.com/browserbase/stagehand/pull/905) [`023c2c2`](https://github.com/browserbase/stagehand/commit/023c2c273b46d3792d7e5d3c902089487b16b531) Thanks [@tkattkat](https://github.com/tkattkat)! - Delete old images from anthropic cua client - [#925](https://github.com/browserbase/stagehand/pull/925) [`8c28647`](https://github.com/browserbase/stagehand/commit/8c2864755ecd05c8f7de235d4198deec0dd5f78e) Thanks [@miguelg719](https://github.com/miguelg719)! - Remove \_refreshPageFromApi() - [#887](https://github.com/browserbase/stagehand/pull/887) [`87e09c6`](https://github.com/browserbase/stagehand/commit/87e09c618940f364ec8af00455a19a17ec63cbd3) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: allow xpaths with prepended 'xpath=' for targeted extract - [#864](https://github.com/browserbase/stagehand/pull/864) [`a611115`](https://github.com/browserbase/stagehand/commit/a61111525d70b450bdfc43f112380f44899c9e97) Thanks [@miguelg719](https://github.com/miguelg719)! - Temporarily patch custom clients serialization error on api - [#881](https://github.com/browserbase/stagehand/pull/881) [`69913fe`](https://github.com/browserbase/stagehand/commit/69913fe1dfb8201ae2aeffa5f049fb46ab02cbc2) Thanks [@miguelg719](https://github.com/miguelg719)! - Pass sdk version number to API for debugging - [#913](https://github.com/browserbase/stagehand/pull/913) [`b1b83a1`](https://github.com/browserbase/stagehand/commit/b1b83a1d334fe76e5f5f9dd32dc92c16b7d40ce6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- move iframe out of 'experimental' - [#891](https://github.com/browserbase/stagehand/pull/891) [`be8497c`](https://github.com/browserbase/stagehand/commit/be8497cb6b142cc893cea9692b8c47bd19514c60) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: nested iframe xpath bug - [#883](https://github.com/browserbase/stagehand/pull/883) [`98704c9`](https://github.com/browserbase/stagehand/commit/98704c9ed225ca25bbde4bb3dc286936e9c54471) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add timeout for JS click - [#907](https://github.com/browserbase/stagehand/pull/907) [`04978bd`](https://github.com/browserbase/stagehand/commit/04978bdd30d2edcbc69eb9fd91358a16975ea2eb) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - store mapping of CDP frame ID -> page ## 2.4.1 ### Patch Changes - [#856](https://github.com/browserbase/stagehand/pull/856) [`8a43c5a`](https://github.com/browserbase/stagehand/commit/8a43c5a86d4da40cfaedd9cf2e42186928bdf946) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - set download behaviour by default - [#857](https://github.com/browserbase/stagehand/pull/857) [`890ffcc`](https://github.com/browserbase/stagehand/commit/890ffccac5e0a60ade64a46eb550c981ffb3e84a) Thanks [@miguelg719](https://github.com/miguelg719)! - return "not-supported" for elements inside the shadow-dom - [#844](https://github.com/browserbase/stagehand/pull/844) [`64c1072`](https://github.com/browserbase/stagehand/commit/64c10727bda50470483a3eb175c02842db0923a1) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't automatically close tabs - [#860](https://github.com/browserbase/stagehand/pull/860) [`b077d3f`](https://github.com/browserbase/stagehand/commit/b077d3f48a97f47a71ccc79ae39b41e7f07f9c04) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Set default schema on extract options with no schema - [#842](https://github.com/browserbase/stagehand/pull/842) [`8bcb5d7`](https://github.com/browserbase/stagehand/commit/8bcb5d77debf6bf7601fd5c090efd7fde75c5d5e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improved handling for OS level dropdowns - [#846](https://github.com/browserbase/stagehand/pull/846) [`7bf10c5`](https://github.com/browserbase/stagehand/commit/7bf10c55b267078fe847c1d7f7a60d604f9c7c94) Thanks [@miguelg719](https://github.com/miguelg719)! - Filter attaching to target worker / shared_worker ## 2.4.0 ### Minor Changes - [#819](https://github.com/browserbase/stagehand/pull/819) [`6a18c1e`](https://github.com/browserbase/stagehand/commit/6a18c1ee1e46d55c6e90c4d5572e17ed8daa140c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - try playwright click and fall back to JS click event ### Patch Changes - [#826](https://github.com/browserbase/stagehand/pull/826) [`124e0d3`](https://github.com/browserbase/stagehand/commit/124e0d3bb54ddb6738ede6d7aa99a945ef1cacd1) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where we are unable to take actions on text nodes - [#818](https://github.com/browserbase/stagehand/pull/818) [`1660751`](https://github.com/browserbase/stagehand/commit/1660751cd14cb5b27d44f8167216afb8d1c3c45c) Thanks [@miguelg719](https://github.com/miguelg719)! - Added CUA support for Claude 4 models - [#821](https://github.com/browserbase/stagehand/pull/821) [`cadac9d`](https://github.com/browserbase/stagehand/commit/cadac9da09123d12e5d496a0e8b12660964c1b33) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use playwright instead of playwright test - [#832](https://github.com/browserbase/stagehand/pull/832) [`759da55`](https://github.com/browserbase/stagehand/commit/759da55775eb2df81d56ae18c0f386fd9b02a9f0) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix \_refreshPageFromAPI to use parametrized apiKey - [#810](https://github.com/browserbase/stagehand/pull/810) [`a175a51`](https://github.com/browserbase/stagehand/commit/a175a519b8c14300db6f1ed30709e113d18e99db) Thanks [@miguelg719](https://github.com/miguelg719)! - Update logos - [#822](https://github.com/browserbase/stagehand/pull/822) [`8527a80`](https://github.com/browserbase/stagehand/commit/8527a80522c3eedb9516a6caa1a0e4e4be981a3d) Thanks [@miguelg719](https://github.com/miguelg719)! - Add model with date tag for OpenAI CUA - [#833](https://github.com/browserbase/stagehand/pull/833) [`55fca2f`](https://github.com/browserbase/stagehand/commit/55fca2f7da63cc0ef6e27b45a33f63c666cdce7e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - adjust stagehandLogger.warn() level to be 1 instead of 0 ## 2.3.1 ### Patch Changes - [#796](https://github.com/browserbase/stagehand/pull/796) [`12a99b3`](https://github.com/browserbase/stagehand/commit/12a99b398d8a4c3eea3ca69a3cf793faaaf4aea3) Thanks [@miguelg719](https://github.com/miguelg719)! - Added an experimental flag to enable the newest and most experimental features - [#807](https://github.com/browserbase/stagehand/pull/807) [`2451797`](https://github.com/browserbase/stagehand/commit/2451797f64c0efa4a72fd70265110003c8d0a6cd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - include version number in StagehandDefaultError message - [#803](https://github.com/browserbase/stagehand/pull/803) [`1d631a5`](https://github.com/browserbase/stagehand/commit/1d631a57a197390f672b718ae5199991ab27cfb1) Thanks [@miguelg719](https://github.com/miguelg719)! - Enable session affinity for cache optimization - [#804](https://github.com/browserbase/stagehand/pull/804) [`9c398bb`](https://github.com/browserbase/stagehand/commit/9c398bb9ec2d10bdb53ad5aa7e3b58cce24fdb2b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)!
- update operatorResponseSchema based on new openai spec - [#786](https://github.com/browserbase/stagehand/pull/786) [`c19ad7f`](https://github.com/browserbase/stagehand/commit/c19ad7f1e082e91fdeaa9c2ef63767a5a2b3a195) Thanks [@miguelg719](https://github.com/miguelg719)! - Handle reroute to account for rollout ## 2.3.0 ### Minor Changes - [#737](https://github.com/browserbase/stagehand/pull/737) [`6ef6073`](https://github.com/browserbase/stagehand/commit/6ef60730cab0ad9025f44b6eeb2c83751d1dcd35) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - deprecate useTextExtract and remove functionality ### Patch Changes - [#741](https://github.com/browserbase/stagehand/pull/741) [`5680d25`](https://github.com/browserbase/stagehand/commit/5680d2509352c383ad502c9f4fabde01fa638833) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use safeparse for zod validation - [#783](https://github.com/browserbase/stagehand/pull/783) [`4de92a8`](https://github.com/browserbase/stagehand/commit/4de92a8af461fc95063faf39feee1d49259f58ba) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix the readme logo link ## 2.2.1 ### Patch Changes - [#721](https://github.com/browserbase/stagehand/pull/721) [`be8652e`](https://github.com/browserbase/stagehand/commit/be8652e770b57fdb3299fa0b2efa4eb0e816434e) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix stagehand.close() functionality to include calling browser.close() - [#724](https://github.com/browserbase/stagehand/pull/724) [`6b413b7`](https://github.com/browserbase/stagehand/commit/6b413b7ad00b13ca0bd53ee2e7393023821408b6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm refine step in extract - [#712](https://github.com/browserbase/stagehand/pull/712) [`7eafbd9`](https://github.com/browserbase/stagehand/commit/7eafbd9b1a73b37effa444929767df7c592caf02) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- deprecated `onlyVisible` param and remove its functionality - [#725](https://github.com/browserbase/stagehand/pull/725) [`1b50aa6`](https://github.com/browserbase/stagehand/commit/1b50aa61cf0a429dd6cb2760a08f7f698a50454b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - dont overwrite .describe() when user defines a zod schema with z.string().url().describe() - [#717](https://github.com/browserbase/stagehand/pull/717) [`f2b7f1f`](https://github.com/browserbase/stagehand/commit/f2b7f1f284eef1f96753319b66c7d0b273a6f8cd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't publish uncompiled ts to npm - [#719](https://github.com/browserbase/stagehand/pull/719) [`c8d672f`](https://github.com/browserbase/stagehand/commit/c8d672f7c410c256defbc2e87ead99239837aa28) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix `Invalid schema for response_format` error when extracting links - [#722](https://github.com/browserbase/stagehand/pull/722) [`bebf204`](https://github.com/browserbase/stagehand/commit/bebf2044502333c694743078c5b0c9deae11fb79) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - replace NBSP with regular space & remove special characters from dom+a11y tree - [#714](https://github.com/browserbase/stagehand/pull/714) [`37d6810`](https://github.com/browserbase/stagehand/commit/37d6810a704773d0383a86f98f5f17c7d5b21975) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix the native AI SDK client implementation to optionally take in an API key ## 2.2.0 ### Minor Changes - [#655](https://github.com/browserbase/stagehand/pull/655) [`8814af9`](https://github.com/browserbase/stagehand/commit/8814af9ece99fddc3dd9fb32671d0513a3a00c67) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - extract links - [#675](https://github.com/browserbase/stagehand/pull/675) [`35c55eb`](https://github.com/browserbase/stagehand/commit/35c55ebf6c2867801a0a6f6988a883c8cb90cf9a) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Added Gemini 2.5 Flash to Google supported models - [#668](https://github.com/browserbase/stagehand/pull/668) [`5c6d2cf`](https://github.com/browserbase/stagehand/commit/5c6d2cf89c9fbf198485506ed9ed75e07aec5cd4) Thanks [@miguelg719](https://github.com/miguelg719)! - Added a new class - Stagehand Evaluator - that wraps around a Stagehand object to determine whether a task is successful or not. Currently used for agent evals ### Patch Changes - [#706](https://github.com/browserbase/stagehand/pull/706) [`18ac6fb`](https://github.com/browserbase/stagehand/commit/18ac6fba30f45b7557cecb890f4e84c75de8383c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove unused fillInVariables fn - [#692](https://github.com/browserbase/stagehand/pull/692) [`6b95248`](https://github.com/browserbase/stagehand/commit/6b95248d6e02e5304ce4dd60499e31fc42af57eb) Thanks [@miguelg719](https://github.com/miguelg719)! - Updated the list of OpenAI models (4.1, o3...) - [#688](https://github.com/browserbase/stagehand/pull/688) [`7d81b3c`](https://github.com/browserbase/stagehand/commit/7d81b3c951c1f3dfc46845aefcc26ff175299bca) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - wrap page.evaluate to make sure we have injected browser side scripts before calling them - [#664](https://github.com/browserbase/stagehand/pull/664) [`b5ca00a`](https://github.com/browserbase/stagehand/commit/b5ca00a25ad0c33a5f4d3198e1bc59edb9956e7c) Thanks [@miguelg719](https://github.com/miguelg719)! - remove unnecessary log - [#683](https://github.com/browserbase/stagehand/pull/683) [`8f0f97b`](https://github.com/browserbase/stagehand/commit/8f0f97bc491e23ff0078c802aaf509fd04173c37) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- use javascript click instead of playwright - [#705](https://github.com/browserbase/stagehand/pull/705) [`346ef5d`](https://github.com/browserbase/stagehand/commit/346ef5d0132dc1418dac18d26640a8df0435af57) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed removing a hanging observation map that is no longer used - [#698](https://github.com/browserbase/stagehand/pull/698) [`c145bc1`](https://github.com/browserbase/stagehand/commit/c145bc1d90ffd0d71c412de3af1c26c121e0b101) Thanks [@sameelarif](https://github.com/sameelarif)! - Fixing LLM client support to natively integrate with AI SDK - [#687](https://github.com/browserbase/stagehand/pull/687) [`edd6d3f`](https://github.com/browserbase/stagehand/commit/edd6d3feb47aac9f312a5edad78bf850ae1541db) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed the schema input for Gemini's response model - [#678](https://github.com/browserbase/stagehand/pull/678) [`5ec43d8`](https://github.com/browserbase/stagehand/commit/5ec43d8b9568c0f86b3e24bd83d1826c837656ed) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow form filling when form is not top-most element - [#694](https://github.com/browserbase/stagehand/pull/694) [`b8cc164`](https://github.com/browserbase/stagehand/commit/b8cc16405b712064a54c8cd591750368a47f35ea) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add telemetry for cua agents to stagehand.metrics - [#699](https://github.com/browserbase/stagehand/pull/699) [`d9f4243`](https://github.com/browserbase/stagehand/commit/d9f4243f6a8c8d4f3003ad6589f7eb4da6d23d0f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm deprecated primitives from stagehand object - [#710](https://github.com/browserbase/stagehand/pull/710) [`9f4ab76`](https://github.com/browserbase/stagehand/commit/9f4ab76a0c1f0c2171290765c48c3bcea5b50e0f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)!
- support targeted extract for domExtract - [#677](https://github.com/browserbase/stagehand/pull/677) [`bc5a731`](https://github.com/browserbase/stagehand/commit/bc5a731241f7f4c5040dd672d8e3787555766421) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixes a redundant unnecessary log ## 2.1.0 ### Minor Changes - [#659](https://github.com/browserbase/stagehand/pull/659) [`f9a435e`](https://github.com/browserbase/stagehand/commit/f9a435e938daccfb2e54ca23fad8ef75128a4486) Thanks [@miguelg719](https://github.com/miguelg719)! - Added native support for Google Generative models (Gemini) ### Patch Changes - [#647](https://github.com/browserbase/stagehand/pull/647) [`ca5467d`](https://github.com/browserbase/stagehand/commit/ca5467de7d31bfb270b6b625224a926c52c97900) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - collapse redundant text nodes into parent elements - [#636](https://github.com/browserbase/stagehand/pull/636) [`9037430`](https://github.com/browserbase/stagehand/commit/903743097367ba6bb12baa9f0fa8f7985f543fdc) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix token act metrics and inference logging being misplaced as observe metrics and inference logging - [#648](https://github.com/browserbase/stagehand/pull/648) [`169e7ea`](https://github.com/browserbase/stagehand/commit/169e7ea9e229503ae5958eaa4511531578ee3841) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add mapping of node id -> url - [#654](https://github.com/browserbase/stagehand/pull/654) [`57a9853`](https://github.com/browserbase/stagehand/commit/57a98538381e0e54fbb734b43c50d61fd0d567df) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix repeated up & down scrolling bug for clicks inside `act` - [#624](https://github.com/browserbase/stagehand/pull/624) [`cf167a4`](https://github.com/browserbase/stagehand/commit/cf167a437865e8e8bdb8739d22c3b3bb84e185de) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- export stagehand error classes so they can be referenced from @dist - [#640](https://github.com/browserbase/stagehand/pull/640) [`178f5f0`](https://github.com/browserbase/stagehand/commit/178f5f0a8fecd876adfb4e29983853bdf7ec72fd) Thanks [@yash1744](https://github.com/yash1744)! - Added support for stagehand agents to automatically redirect to https://google.com when the page URL is empty or set to about:blank, preventing empty screenshots and saving tokens. - [#661](https://github.com/browserbase/stagehand/pull/661) [`bf823a3`](https://github.com/browserbase/stagehand/commit/bf823a36930b0686b416a42302ef8c021b4aba75) Thanks [@kamath](https://github.com/kamath)! - fix press enter - [#633](https://github.com/browserbase/stagehand/pull/633) [`86724f6`](https://github.com/browserbase/stagehand/commit/86724f6fb0abc7292423ac5bd0bebcd352f95940) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix the getBrowser logic for redundant api calls and throw informed errors - [#656](https://github.com/browserbase/stagehand/pull/656) [`c630373`](https://github.com/browserbase/stagehand/commit/c630373dede4c775875834bfb860436ba2ea48d2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - parse out % signs from variables in act - [#637](https://github.com/browserbase/stagehand/pull/637) [`944bbbf`](https://github.com/browserbase/stagehand/commit/944bbbfe8bfb357b4910584447a93f6f402c3826) Thanks [@kamath](https://github.com/kamath)! - Fix: forward along the stack trace in StagehandDefaultError ## 2.0.0 ### Major Changes - [#591](https://github.com/browserbase/stagehand/pull/591) [`e234a0f`](https://github.com/browserbase/stagehand/commit/e234a0f80bf4c07bcc57265da216cbc4ab3bd19d) Thanks [@miguelg719](https://github.com/miguelg719)! - Announcing **Stagehand 2.0**! 🎉 We're thrilled to announce the release of Stagehand 2.0, bringing significant improvements to make browser automation more powerful, faster, and easier to use than ever before. 
### 🚀 New Features - **Introducing `stagehand.agent`**: A powerful new way to integrate SOTA Computer use models or Browserbase's [Open Operator](https://operator.browserbase.com) into Stagehand with one line of code! Perfect for multi-step workflows and complex interactions. [Learn more](https://docs.stagehand.dev/concepts/agent) - **Lightning-fast `act` and `extract`**: Major performance improvements to make your automations run significantly faster. - **Enhanced Logging**: Better visibility into what's happening during automation with improved logging and debugging capabilities. - **Comprehensive Documentation**: A completely revamped documentation site with better examples, guides, and best practices. - **Improved Error Handling**: More descriptive errors and better error recovery to help you debug issues faster. ### 🛠️ Developer Experience - **Better TypeScript Support**: Enhanced type definitions and better IDE integration - **Better Error Messages**: Clearer, more actionable error messages to help you debug faster - **Improved Caching**: More reliable action caching for better performance We're excited to see what you build with Stagehand 2.0! For questions or support, join our [Slack community](https://stagehand.dev/slack). For more details, check out our [documentation](https://docs.stagehand.dev). ### Minor Changes - [#588](https://github.com/browserbase/stagehand/pull/588) [`ba9efc5`](https://github.com/browserbase/stagehand/commit/ba9efc5580a536bc3c158e507a6c6695825c2834) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for offloading agent tasks to the API. - [#600](https://github.com/browserbase/stagehand/pull/600) [`11e015d`](https://github.com/browserbase/stagehand/commit/11e015daac56dc961b8c8d54ce360fd00d4fee38) Thanks [@sameelarif](https://github.com/sameelarif)! - Added a `stagehand.history` array which stores an array of `act`, `extract`, `observe`, and `goto` calls made. 
Since this history array is stored on the `StagehandPage` level, it will capture methods even if indirectly called by an agent. - [#601](https://github.com/browserbase/stagehand/pull/601) [`1d22604`](https://github.com/browserbase/stagehand/commit/1d2260401e27bae25779a55bb2ed7b7153c34fd0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add custom error classes - [#599](https://github.com/browserbase/stagehand/pull/599) [`75d8fb3`](https://github.com/browserbase/stagehand/commit/75d8fb36a67cd84eb55b509bf959edc7b05059da) Thanks [@miguelg719](https://github.com/miguelg719)! - cleaner logging with pino - [#609](https://github.com/browserbase/stagehand/pull/609) [`c92295d`](https://github.com/browserbase/stagehand/commit/c92295d8424dac1a4f81066ca260ade2d5fce80b) Thanks [@kamath](https://github.com/kamath)! - Removed deprecated fields and methods from Stagehand constructor and added cdpUrl to localBrowserLaunchOptions for custom CDP URLs support. - [#571](https://github.com/browserbase/stagehand/pull/571) [`73d6736`](https://github.com/browserbase/stagehand/commit/73d67368b88002c17814e46e75a99456bf355c4e) Thanks [@miguelg719](https://github.com/miguelg719)! - You can now use Computer Using Agents (CUA) natively in Stagehand for both Anthropic and OpenAI models! This unlocks a brand new frontier of applications for Stagehand users 🤘 - [#619](https://github.com/browserbase/stagehand/pull/619) [`7b0b996`](https://github.com/browserbase/stagehand/commit/7b0b9969a58014ae3e99b2054e4463b785073cfd) Thanks [@sameelarif](https://github.com/sameelarif)! - add disablePino flag to stagehand constructor params - [#620](https://github.com/browserbase/stagehand/pull/620) [`566e587`](https://github.com/browserbase/stagehand/commit/566e5877a1861e0eae5a118d34efe09d43a37098) Thanks [@kamath](https://github.com/kamath)! - You can now pass in an OpenAI instance as an `llmClient` to the Stagehand constructor! 
This allows you to use Stagehand with any OpenAI-compatible model, like Ollama, Gemini, etc., as well as OpenAI wrappers like Braintrust. - [#586](https://github.com/browserbase/stagehand/pull/586) [`c57dc19`](https://github.com/browserbase/stagehand/commit/c57dc19c448b8c2aab82953291f4e38f202c4729) Thanks [@sameelarif](https://github.com/sameelarif)! - Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider. ### Patch Changes - [#580](https://github.com/browserbase/stagehand/pull/580) [`179e17c`](https://github.com/browserbase/stagehand/commit/179e17c2d1c9837de49c776d9850a330a759e73f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - refactor \_performPlaywrightMethod - [#608](https://github.com/browserbase/stagehand/pull/608) [`71ee10d`](https://github.com/browserbase/stagehand/commit/71ee10d50cb46e83d43fd783e1404569e6f317cf) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - added support for "scrolling to next/previous chunk" - [#594](https://github.com/browserbase/stagehand/pull/594) [`e483484`](https://github.com/browserbase/stagehand/commit/e48348412a6e651967ba22d097d5308af0e8d0a8) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - pass observeHandler into actHandler - [#569](https://github.com/browserbase/stagehand/pull/569) [`17e8b40`](https://github.com/browserbase/stagehand/commit/17e8b40f94b30f6e253443a4bbb8a3e364e58e38) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - you can now call stagehand.metrics to get token usage metrics. you can also set logInferenceToFile in stagehand config to log the entire call/response history from stagehand & the LLM. 
- [#617](https://github.com/browserbase/stagehand/pull/617) [`affa564`](https://github.com/browserbase/stagehand/commit/affa5646658399ab71ed08c1b9ce0fd776b46fca) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use a11y tree for default extract - [#589](https://github.com/browserbase/stagehand/pull/589) [`0c4b1e7`](https://github.com/browserbase/stagehand/commit/0c4b1e7e6ff4b8a60af4a2d0d2056bff847227d5) Thanks [@miguelg719](https://github.com/miguelg719)! - Added CDP support for screenshots, find more about the benefits here: https://docs.browserbase.com/features/screenshots#why-use-cdp-for-screenshots%3F - [#584](https://github.com/browserbase/stagehand/pull/584) [`c7c1a80`](https://github.com/browserbase/stagehand/commit/c7c1a8066be33188ba1e900828045db61410025c) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix to remove unnecessary healthcheck ping on sdk - [#616](https://github.com/browserbase/stagehand/pull/616) [`2a27e1c`](https://github.com/browserbase/stagehand/commit/2a27e1c8e967befbbbb05ea71369878ac1573658) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed new opened tab handling for CUA models - [#582](https://github.com/browserbase/stagehand/pull/582) [`dfd24e6`](https://github.com/browserbase/stagehand/commit/dfd24e638ef3723d3a8a3a33ff7942af0ac4745f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - support api usage for extract with no args - [#563](https://github.com/browserbase/stagehand/pull/563) [`98166d7`](https://github.com/browserbase/stagehand/commit/98166d76d30bc67d6b04b3d5c39f78f92c254b49) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - support scrolling in `act` - [#598](https://github.com/browserbase/stagehand/pull/598) [`53889d4`](https://github.com/browserbase/stagehand/commit/53889d4b6e772098beaba2e1ee5a24e6f07706bb) Thanks [@miguelg719](https://github.com/miguelg719)!
- Fix the open operator handler to work with anthropic - [#605](https://github.com/browserbase/stagehand/pull/605) [`b8beaec`](https://github.com/browserbase/stagehand/commit/b8beaec451a03eaa5d12281fe7c8d4eb9c9d7e81) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for resuming a Stagehand session created on the API. - [#612](https://github.com/browserbase/stagehand/pull/612) [`cd36068`](https://github.com/browserbase/stagehand/commit/cd3606854c465747c78b44763469dfdfa16db1b0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove all logic related to dom based act - [#577](https://github.com/browserbase/stagehand/pull/577) [`4fdbf63`](https://github.com/browserbase/stagehand/commit/4fdbf6324a0dc68568bba73ea4d9018b2ed67849) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove debugDom - [#603](https://github.com/browserbase/stagehand/pull/603) [`2a14a60`](https://github.com/browserbase/stagehand/commit/2a14a607f3e7fa3ca9a02670afdc7e60ccfbfb3f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm unused handlePossiblePageNavigation - [#614](https://github.com/browserbase/stagehand/pull/614) [`a59eaef`](https://github.com/browserbase/stagehand/commit/a59eaef67c2f4a0cb07bb0046fe7e93e2ba4dc41) Thanks [@kamath](https://github.com/kamath)! - override whatwg-url to avoid punycode warning - [#573](https://github.com/browserbase/stagehand/pull/573) [`c24f3c9`](https://github.com/browserbase/stagehand/commit/c24f3c9a58873c3920fab0f9891c2bf5245c9b5e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - return act result in actFromObserve ## 1.14.0 ### Minor Changes - [#518](https://github.com/browserbase/stagehand/pull/518) [`516725f`](https://github.com/browserbase/stagehand/commit/516725fc1c5d12d22caac0078a118c77bfe033a8) Thanks [@sameelarif](https://github.com/sameelarif)! - `act()` can now use `observe()` under the hood, resulting in significant performance improvements. 
To opt-in to this change, set `slowDomBasedAct: false` in `ActOptions`. - [#483](https://github.com/browserbase/stagehand/pull/483) [`8c9445f`](https://github.com/browserbase/stagehand/commit/8c9445fde9724ae33eeeb1234fd5b9bbd418bfdb) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - When using `textExtract`, you can now do targeted extraction by passing an xpath string into extract via the `selector` parameter. This limits the dom processing step to a target element, reducing tokens and increasing speed. For example: ```typescript const weatherData = await stagehand.page.extract({ instruction: "extract the weather data for Sun, Feb 23 at 11PM", schema: z.object({ temperature: z.string(), weather_description: z.string(), wind: z.string(), humidity: z.string(), barometer: z.string(), visibility: z.string(), }), modelName, useTextExtract, selector: xpath, // xpath of the element to extract from }); ``` - [#556](https://github.com/browserbase/stagehand/pull/556) [`499a72d`](https://github.com/browserbase/stagehand/commit/499a72dc56009791ce065270b854b12fc5570050) Thanks [@kamath](https://github.com/kamath)! - You can now set a timeout for dom-based stagehand act! Do this in `act` with `timeoutMs` as a parameter, or set a global param to `actTimeoutMs` in Stagehand config. - [#544](https://github.com/browserbase/stagehand/pull/544) [`55c9673`](https://github.com/browserbase/stagehand/commit/55c9673c5948743b804d70646f425a61818c7789) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - you can now deterministically get the full text representation of a webpage by calling `extract()` (with no arguments) - [#538](https://github.com/browserbase/stagehand/pull/538) [`d898d5b`](https://github.com/browserbase/stagehand/commit/d898d5b9e1c3b80e62e72d36d1754b3e50d5a2b4) Thanks [@sameelarif](https://github.com/sameelarif)! - Added `gpt-4.5-preview` and `claude-3-7-sonnet-latest` as supported models.
- [#523](https://github.com/browserbase/stagehand/pull/523) [`44cf7cc`](https://github.com/browserbase/stagehand/commit/44cf7cc9ac1209c97d9153281970899b10a2ddc9) Thanks [@kwt00](https://github.com/kwt00)! You can now natively run Cerebras LLMs! `cerebras-llama-3.3-70b` and `cerebras-llama-3.1-8b` are now supported models as long as `CEREBRAS_API_KEY` is set in your environment. - [#542](https://github.com/browserbase/stagehand/pull/542) [`cf7fe66`](https://github.com/browserbase/stagehand/commit/cf7fe665e6d1eeda97582ee2816f1dc3a66c6152) Thanks [@sankalpgunturi](https://github.com/sankalpgunturi)! You can now natively run Groq LLMs! `groq-llama-3.3-70b-versatile` and `groq-llama-3.3-70b-specdec` are now supported models as long as `GROQ_API_KEY` is set in your environment. ### Patch Changes - [#506](https://github.com/browserbase/stagehand/pull/506) [`e521645`](https://github.com/browserbase/stagehand/commit/e5216455ce3fc2a4f4f7aa5614ecc92354eb670c) Thanks [@miguelg719](https://github.com/miguelg719)! - fixing 5s timeout on actHandler - [#535](https://github.com/browserbase/stagehand/pull/535) [`3782054`](https://github.com/browserbase/stagehand/commit/3782054734dcd0346f84003ddd8e0e484b379459) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding backwards compatibility to new act->observe pipeline by accepting actOptions - [#508](https://github.com/browserbase/stagehand/pull/508) [`270f666`](https://github.com/browserbase/stagehand/commit/270f6669f1638f52fd5cd3f133f76446ced6ef9f) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed stagehand to support multiple pages with an enhanced context - [#559](https://github.com/browserbase/stagehand/pull/559) [`18533ad`](https://github.com/browserbase/stagehand/commit/18533ad824722e4e699323248297e184bae9254e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix: continuously adjusting chunk size inside `act` - [#554](https://github.com/browserbase/stagehand/pull/554) [`5f1868b`](https://github.com/browserbase/stagehand/commit/5f1868bd95478b3eb517319ebca7b0af4e91d144) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix targeted extract issue with scrollintoview and not chunking correctly - [#555](https://github.com/browserbase/stagehand/pull/555) [`fc5e8b6`](https://github.com/browserbase/stagehand/commit/fc5e8b6c5a606da96e6ed572dc8ffc6caef57576) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where processAllOfDom doesn't scroll to end of page when there is dynamic content - [#552](https://github.com/browserbase/stagehand/pull/552) [`a25a4cb`](https://github.com/browserbase/stagehand/commit/a25a4cb538d64f50b5bd834dd88e8e6086a73078) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - accept xpaths with 'xpath=' prepended to the front in addition to xpaths without - [#534](https://github.com/browserbase/stagehand/pull/534) [`f0c162a`](https://github.com/browserbase/stagehand/commit/f0c162a6b4d1ac72c42f26462d7241a08b5c4e0a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - call this.end() if the process exists - [#528](https://github.com/browserbase/stagehand/pull/528) [`c820bfc`](https://github.com/browserbase/stagehand/commit/c820bfcfc9571fea90afd1595775c5946118cfaf) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - handle attempt to close session that has already been closed when using the api - [#520](https://github.com/browserbase/stagehand/pull/520) [`f49eebd`](https://github.com/browserbase/stagehand/commit/f49eebd98c1d61413a3ea4c798595db601d55da8) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Performing act from a 'not-supported' ObserveResult will now throw an informed error ## 1.13.1 ### Patch Changes - [#509](https://github.com/browserbase/stagehand/pull/509) [`a7d345e`](https://github.com/browserbase/stagehand/commit/a7d345e75434aebb656e1aa5aa61caed00dc99a8) Thanks [@miguelg719](https://github.com/miguelg719)! - Bun runs will now throw a more informed error ## 1.13.0 ### Minor Changes - [#486](https://github.com/browserbase/stagehand/pull/486) [`33f2b3f`](https://github.com/browserbase/stagehand/commit/33f2b3f8deff86ac2073b6d35b7413b0aeaba2f9) Thanks [@sameelarif](https://github.com/sameelarif)! - [Unreleased] Parameterized offloading Stagehand method calls to the Stagehand API. In the future, this will allow for better observability and debugging experience. - [#494](https://github.com/browserbase/stagehand/pull/494) [`9ba4b0b`](https://github.com/browserbase/stagehand/commit/9ba4b0b563cbc77d40cac31c11e17e365a9d1749) Thanks [@pkiv](https://github.com/pkiv)! - Added LocalBrowserLaunchOptions to provide comprehensive configuration options for local browser instances. Deprecated the top-level headless option in favor of using localBrowserLaunchOptions.headless - [#500](https://github.com/browserbase/stagehand/pull/500) [`a683fab`](https://github.com/browserbase/stagehand/commit/a683fab9ca90c45d78f6602a228c2d3219b776dc) Thanks [@miguelg719](https://github.com/miguelg719)! - Including Iframes in ObserveResults. This appends any iframe(s) found in the page to the end of observe results on any observe call. - [#504](https://github.com/browserbase/stagehand/pull/504) [`577662e`](https://github.com/browserbase/stagehand/commit/577662e985a6a6b0477815853d98610f3a6b567d) Thanks [@sameelarif](https://github.com/sameelarif)! - Enabled support for Browserbase captcha solving after page navigations. This can be enabled with the new constructor parameter: `waitForCaptchaSolves`. 
- [#496](https://github.com/browserbase/stagehand/pull/496) [`28ca9fb`](https://github.com/browserbase/stagehand/commit/28ca9fbc6f3cdc88437001108a9a6c4388ba0303) Thanks [@sameelarif](https://github.com/sameelarif)! - Fixed browserbaseSessionCreateParams not being passed in to the API initialization payload. ### Patch Changes - [#459](https://github.com/browserbase/stagehand/pull/459) [`62a29ee`](https://github.com/browserbase/stagehand/commit/62a29eea982bbb855e2f885c09ac4c1334f3e0dc) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - create a11y + dom hybrid input for observe - [#463](https://github.com/browserbase/stagehand/pull/463) [`e40bf6f`](https://github.com/browserbase/stagehand/commit/e40bf6f517331fc9952c3c9f2683b7e02ffb9735) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - include 'Scrollable' annotations in a11y-dom hybrid - [#480](https://github.com/browserbase/stagehand/pull/480) [`4c07c44`](https://github.com/browserbase/stagehand/commit/4c07c444f0e71faf54413b2eeab760c7916a36e3) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding a fallback try on actFromObserveResult to use the description from observe and call regular act. - [#487](https://github.com/browserbase/stagehand/pull/487) [`2c855cf`](https://github.com/browserbase/stagehand/commit/2c855cffdfa2b0af9924612b9c59df7b65df6443) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - update refine extraction prompt to ensure correct schema is used - [#497](https://github.com/browserbase/stagehand/pull/497) [`945ed04`](https://github.com/browserbase/stagehand/commit/945ed0426d34d2cb833aec8ba67bd4cba6c3b660) Thanks [@kamath](https://github.com/kamath)! - add gpt 4o november snapshot ## 1.12.0 ### Minor Changes - [#426](https://github.com/browserbase/stagehand/pull/426) [`bbbcee7`](https://github.com/browserbase/stagehand/commit/bbbcee7e7d86f5bf90cbb93f2ac9ad5935f15896) Thanks [@miguelg719](https://github.com/miguelg719)! - Observe got a major upgrade. 
Now it will return a suggested playwright method with any necessary arguments for the generated candidate elements. It also includes a major speedup when using a11y tree processing for context. - [#452](https://github.com/browserbase/stagehand/pull/452) [`16837ec`](https://github.com/browserbase/stagehand/commit/16837ece839e192fbf7b68bec128dd02f22c2613) Thanks [@kamath](https://github.com/kamath)! - add o3-mini to availablemodel - [#441](https://github.com/browserbase/stagehand/pull/441) [`1032d7d`](https://github.com/browserbase/stagehand/commit/1032d7d7d9c1ef8f30183c9019ea8324f1bdd5c6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow act to accept observe output ### Patch Changes - [#458](https://github.com/browserbase/stagehand/pull/458) [`da2e5d1`](https://github.com/browserbase/stagehand/commit/da2e5d1314b7504877fd50090e6a4b47f44fb9f6) Thanks [@miguelg719](https://github.com/miguelg719)! - Updated getAccessibilityTree() to make sure it doesn't skip useful nodes. Improved getXPathByResolvedObjectId() to account for text nodes and not skip generation - [#448](https://github.com/browserbase/stagehand/pull/448) [`b216072`](https://github.com/browserbase/stagehand/commit/b2160723923ed78eba83e75c7270634ca7d217de) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improve handling of radio button clicks - [#445](https://github.com/browserbase/stagehand/pull/445) [`5bc514f`](https://github.com/browserbase/stagehand/commit/5bc514fc18e6634b1c81553bbc1e8b7d71b67d34) Thanks [@miguelg719](https://github.com/miguelg719)! - Adding back useAccessibilityTree param to observe with a deprecation warning/error indicating to use onlyVisible instead ## 1.11.0 ### Minor Changes - [#428](https://github.com/browserbase/stagehand/pull/428) [`5efeb5a`](https://github.com/browserbase/stagehand/commit/5efeb5ad44852efe7b260862729a5ac74eaa0228) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- temporarily remove vision ## 1.10.1 ### Patch Changes - [#422](https://github.com/browserbase/stagehand/pull/422) [`a2878d0`](https://github.com/browserbase/stagehand/commit/a2878d0acaf393b37763fb0c07b1a24043f7eb8d) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixing a build type error for async functions being called inside evaluate for observeHandler. ## 1.10.0 ### Minor Changes - [#412](https://github.com/browserbase/stagehand/pull/412) [`4aa4813`](https://github.com/browserbase/stagehand/commit/4aa4813ad62cefc333a04ea6b1004f5888dec70f) Thanks [@miguelg719](https://github.com/miguelg719)! - Includes a new format to get website context using accessibility (a11y) trees. The new context is provided optionally with the flag useAccessibilityTree for observe tasks. - [#417](https://github.com/browserbase/stagehand/pull/417) [`1f2b2c5`](https://github.com/browserbase/stagehand/commit/1f2b2c57d93e3b276c61224e1e26c65c2cb50e12) Thanks [@sameelarif](https://github.com/sameelarif)! - Simplify Stagehand method calls by allowing a simple string input instead of an options object. - [#405](https://github.com/browserbase/stagehand/pull/405) [`0df1e23`](https://github.com/browserbase/stagehand/commit/0df1e233d4ad4ba39da457b6ed85916d8d20e12e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - in ProcessAllOfDom, scroll on large scrollable elements instead of just the root DOM - [#373](https://github.com/browserbase/stagehand/pull/373) [`ff00965`](https://github.com/browserbase/stagehand/commit/ff00965160d568ae0bc3ca437c01f95b5c6e9039) Thanks [@sameelarif](https://github.com/sameelarif)! - Allow the input of custom instructions into the constructor so that users can guide, or provide guardrails to, the LLM in making decisions. ### Patch Changes - [#386](https://github.com/browserbase/stagehand/pull/386) [`2cee0a4`](https://github.com/browserbase/stagehand/commit/2cee0a45ae2b48d1de6543b196e338e7021e59fe) Thanks [@kamath](https://github.com/kamath)! 
- add demo gif - [#362](https://github.com/browserbase/stagehand/pull/362) [`9c20de3`](https://github.com/browserbase/stagehand/commit/9c20de3e66f0ac20374d5e5e02eb107c620a2263) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - reduce collisions and improve accuracy of textExtract - [#413](https://github.com/browserbase/stagehand/pull/413) [`737b4b2`](https://github.com/browserbase/stagehand/commit/737b4b208c9214e8bb22535ab7a8daccf37610d9) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove topMostElement check when verifying visibility of text nodes - [#388](https://github.com/browserbase/stagehand/pull/388) [`e93561d`](https://github.com/browserbase/stagehand/commit/e93561d7875210ce7bd7fe841fb52decf6011fb3) Thanks [@kamath](https://github.com/kamath)! - Export LLMClient type ## 1.9.0 ### Minor Changes - [#374](https://github.com/browserbase/stagehand/pull/374) [`207244e`](https://github.com/browserbase/stagehand/commit/207244e3a46c4474d4d28db039eab131164790ca) Thanks [@sameelarif](https://github.com/sameelarif)! - Pass in a Stagehand Page object into the `on("popup")` listener to allow for multi-page handling. - [#367](https://github.com/browserbase/stagehand/pull/367) [`75c0e20`](https://github.com/browserbase/stagehand/commit/75c0e20cde54951399753e0fa841df463e1271b8) Thanks [@kamath](https://github.com/kamath)! - Logger in LLMClient is inherited by default from Stagehand. Named rather than positional arguments are used in implemented LLMClients. - [#381](https://github.com/browserbase/stagehand/pull/381) [`db2ef59`](https://github.com/browserbase/stagehand/commit/db2ef5997664e81b1dfb5ca992392362f2d3bab1) Thanks [@kamath](https://github.com/kamath)! - make logs only sync - [#385](https://github.com/browserbase/stagehand/pull/385) [`5899ec2`](https://github.com/browserbase/stagehand/commit/5899ec2c4b73c636bfd8120ec3aac225af7dd949) Thanks [@sameelarif](https://github.com/sameelarif)! 
- Moved the LLMClient logger parameter to the createChatCompletion method options. - [#364](https://github.com/browserbase/stagehand/pull/364) [`08907eb`](https://github.com/browserbase/stagehand/commit/08907ebbc2cb47cfc3151946764656a7f4ce99c6) Thanks [@kamath](https://github.com/kamath)! - exposed llmClient in stagehand constructor ### Patch Changes - [#383](https://github.com/browserbase/stagehand/pull/383) [`a77efcc`](https://github.com/browserbase/stagehand/commit/a77efccfde3a3948013eda3a52935e8a21d45b3e) Thanks [@sameelarif](https://github.com/sameelarif)! - Unified LLM input/output types for reduced dependence on OpenAI types - [`b7b3701`](https://github.com/browserbase/stagehand/commit/b7b370160bf35b09f5dc132f6e86f6e34fb70a85) Thanks [@kamath](https://github.com/kamath)! - Fix $1-types exposed to the user - [#353](https://github.com/browserbase/stagehand/pull/353) [`5c6f14b`](https://github.com/browserbase/stagehand/commit/5c6f14bade201e08cb86d2e14e246cb65707f7ee) Thanks [@kamath](https://github.com/kamath)! - Throw custom error if context is referenced without initialization, remove act/extract handler from index - [#360](https://github.com/browserbase/stagehand/pull/360) [`89841fc`](https://github.com/browserbase/stagehand/commit/89841fc42ae82559baddfe2a9593bc3260c082a2) Thanks [@kamath](https://github.com/kamath)! - Remove stagehand nav entirely - [#379](https://github.com/browserbase/stagehand/pull/379) [`b1c6579`](https://github.com/browserbase/stagehand/commit/b1c657976847de86d82324030f90c2f6a1f3f976) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't require LLM Client to use non-ai stagehand functions - [#371](https://github.com/browserbase/stagehand/pull/371) [`30e7d09`](https://github.com/browserbase/stagehand/commit/30e7d091445004c71aec1748d3a7d75fb86d1f11) Thanks [@kamath](https://github.com/kamath)! 
- pretty readme :) - [#382](https://github.com/browserbase/stagehand/pull/382) [`a41271b`](https://github.com/browserbase/stagehand/commit/a41271baf351e20f4c79b4b654d8a947b615a121) Thanks [@sameelarif](https://github.com/sameelarif)! - Added example implementation of the Vercel AI SDK as an LLMClient - [#344](https://github.com/browserbase/stagehand/pull/344) [`c1cf345`](https://github.com/browserbase/stagehand/commit/c1cf34535ed30262989b1dbe262fb0414cdf8230) Thanks [@kamath](https://github.com/kamath)! - Remove duplicate logging and expose Page/BrowserContext types ## 1.8.0 ### Minor Changes - [#324](https://github.com/browserbase/stagehand/pull/324) [`cd23fa3`](https://github.com/browserbase/stagehand/commit/cd23fa33450107f29cb1ddb6edadfc769d336aa5) Thanks [@kamath](https://github.com/kamath)! - Move stagehand.act() -> stagehand.page.act() and deprecate stagehand.act() - [#319](https://github.com/browserbase/stagehand/pull/319) [`bacbe60`](https://github.com/browserbase/stagehand/commit/bacbe608058304bfa1f0ab049da4d8aa90e8d6f7) Thanks [@kamath](https://github.com/kamath)! - We now wrap playwright page/context within StagehandPage and StagehandContext objects. This helps us augment the Stagehand experience by being able to augment the underlying Playwright - [#324](https://github.com/browserbase/stagehand/pull/324) [`cd23fa3`](https://github.com/browserbase/stagehand/commit/cd23fa33450107f29cb1ddb6edadfc769d336aa5) Thanks [@kamath](https://github.com/kamath)! - moves extract and act -> page and deprecates stagehand.extract and stagehand.observe ### Patch Changes - [#320](https://github.com/browserbase/stagehand/pull/320) [`c0cdd0e`](https://github.com/browserbase/stagehand/commit/c0cdd0e985d66f0464d2e70b7d0cb343b0efbd3f) Thanks [@kamath](https://github.com/kamath)! 
- bug fix: set this.env to LOCAL if BROWSERBASE_API_KEY is not defined - [#325](https://github.com/browserbase/stagehand/pull/325) [`cc46f34`](https://github.com/browserbase/stagehand/commit/cc46f345c0a1dc0af4abae7e207833df17da50e7) Thanks [@pkiv](https://github.com/pkiv)! - only start domdebug if enabled ## 1.7.0 ### Minor Changes - [#316](https://github.com/browserbase/stagehand/pull/316) [`902e633`](https://github.com/browserbase/stagehand/commit/902e633e126a58b80b757ea0ecada01a7675a473) Thanks [@kamath](https://github.com/kamath)! - rename browserbaseResumeSessionID -> browserbaseSessionID - [#296](https://github.com/browserbase/stagehand/pull/296) [`f11da27`](https://github.com/browserbase/stagehand/commit/f11da27a20409c240ceeea2003d520f676def61a) Thanks [@kamath](https://github.com/kamath)! - - Deprecate fields in `init` in favor of constructor options - Deprecate `initFromPage` in favor of `browserbaseResumeSessionID` in constructor - Rename `browserBaseSessionCreateParams` -> `browserbaseSessionCreateParams` - [#304](https://github.com/browserbase/stagehand/pull/304) [`0b72f75`](https://github.com/browserbase/stagehand/commit/0b72f75f6a62aaeb28b0c488ae96db098d6a2846) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add textExtract: an optional, text based approach to the existing extract method. textExtract often performs better on long form extraction tasks. By default `extract` uses the existing approach `domExtract`. - [#298](https://github.com/browserbase/stagehand/pull/298) [`55f0cd2`](https://github.com/browserbase/stagehand/commit/55f0cd2fe7976e800833ec6e41e9af62d88d09d5) Thanks [@kamath](https://github.com/kamath)! - Add sessionId to public params ### Patch Changes - [#283](https://github.com/browserbase/stagehand/pull/283) [`b902192`](https://github.com/browserbase/stagehand/commit/b902192bc7ff8eb02c85150c1fe6f89c2a95b211) Thanks [@sameelarif](https://github.com/sameelarif)! 
- allowed customization of eval config via .env - [#299](https://github.com/browserbase/stagehand/pull/299) [`fbe2300`](https://github.com/browserbase/stagehand/commit/fbe23007176488043c2415519f25021612fff989) Thanks [@sameelarif](https://github.com/sameelarif)! - log playwright actions for better debugging ## 1.6.0 ### Minor Changes - [#286](https://github.com/browserbase/stagehand/pull/286) [`9605836`](https://github.com/browserbase/stagehand/commit/9605836ee6b8207ed7dc9146e12ced1c78630d59) Thanks [@kamath](https://github.com/kamath)! - minor improvement in action + new eval case - [#279](https://github.com/browserbase/stagehand/pull/279) [`d6d7057`](https://github.com/browserbase/stagehand/commit/d6d70570623a718354797ef83aa8489eacc085d1) Thanks [@kamath](https://github.com/kamath)! - Add support for o1-mini and o1-preview in OpenAIClient - [#282](https://github.com/browserbase/stagehand/pull/282) [`5291797`](https://github.com/browserbase/stagehand/commit/529179724a53bf2fd578a4012fd6bc6b7348d1ae) Thanks [@kamath](https://github.com/kamath)! - Added eslint for stricter type checking. Streamlined most of the internal types throughout the cache, llm, and handlers. This should make it easier to add new LLMs down the line, maintain and update the existing code, and make it easier to add new features in the future. Types can be checked by running `npx eslint .` from the project directory. ### Patch Changes - [#270](https://github.com/browserbase/stagehand/pull/270) [`6b10b3b`](https://github.com/browserbase/stagehand/commit/6b10b3b1160649b19f50d66588395ceb679b3d68) Thanks [@sameelarif](https://github.com/sameelarif)! - add close link to readme - [#288](https://github.com/browserbase/stagehand/pull/288) [`5afa0b9`](https://github.com/browserbase/stagehand/commit/5afa0b940a9f379a3719a5bbae249dd2a9ef8380) Thanks [@kamath](https://github.com/kamath)! 
- add multi-region support for browserbase - [#284](https://github.com/browserbase/stagehand/pull/284) [`474217c`](https://github.com/browserbase/stagehand/commit/474217cfaff8e68614212b66baa62d35493fd2ce) Thanks [@kamath](https://github.com/kamath)! - Build wasn't working, this addresses tsc failure. - [#236](https://github.com/browserbase/stagehand/pull/236) [`85483fe`](https://github.com/browserbase/stagehand/commit/85483fe091544fc079015c62b6923b03f8b9caa7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - reduce chunk size ## 1.5.0 ### Minor Changes - [#266](https://github.com/browserbase/stagehand/pull/266) [`0e8f34f`](https://github.com/browserbase/stagehand/commit/0e8f34fc15aee91c548d09534deaccc8adca7c4d) Thanks [@kamath](https://github.com/kamath)! - Install wasn't working from NPM due to misconfigured build step. This attempts to fix that. ## 1.4.0 ### Minor Changes - [#253](https://github.com/browserbase/stagehand/pull/253) [`598cae2`](https://github.com/browserbase/stagehand/commit/598cae230c7b8d4e31ae22fd63047a91b63e51b8) Thanks [@sameelarif](https://github.com/sameelarif)! - clean up contexts after use ### Patch Changes - [#225](https://github.com/browserbase/stagehand/pull/225) [`a2366fe`](https://github.com/browserbase/stagehand/commit/a2366feb023180fbb2ccc7a8379692f9f8347fe5) Thanks [@sameelarif](https://github.com/sameelarif)! - Ensuring cross-platform compatibility with tmp directories - [#249](https://github.com/browserbase/stagehand/pull/249) [`7d06d43`](https://github.com/browserbase/stagehand/commit/7d06d43f2b9a477fed35793d7479de9b183e8d53) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix broken evals - [#227](https://github.com/browserbase/stagehand/pull/227) [`647eefd`](https://github.com/browserbase/stagehand/commit/647eefd651852eec495faa1b8f4dbe6b1da17999) Thanks [@kamath](https://github.com/kamath)! 
- Fix debugDom still showing chunks when set to false - [#250](https://github.com/browserbase/stagehand/pull/250) [`5886620`](https://github.com/browserbase/stagehand/commit/5886620dd1b0a57c68bf810cf130df2ca0a50a69) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add ci specific evals - [#222](https://github.com/browserbase/stagehand/pull/222) [`8dff026`](https://github.com/browserbase/stagehand/commit/8dff02674df7a6448f2262c7e212b58c03be57bc) Thanks [@sameelarif](https://github.com/sameelarif)! - Streamline type definitions and fix existing typescript errors - [#232](https://github.com/browserbase/stagehand/pull/232) [`b9f9949`](https://github.com/browserbase/stagehand/commit/b9f99494021e6a9e2487b77bb64ed0a491751400) Thanks [@kamath](https://github.com/kamath)! - Minor changes to package.json and tsconfig, mainly around the build process. Also add more type defs and remove unused dependencies. ## 1.3.0 ### Minor Changes - [#195](https://github.com/browserbase/stagehand/pull/195) [`87a6305`](https://github.com/browserbase/stagehand/commit/87a6305d9a2faf1ab5915965913bc14d5cc15772) Thanks [@kamath](https://github.com/kamath)! - - Adds structured and more standardized JSON logging - Doesn't init cache if `enableCaching` is false, preventing `tmp/.cache` from being created - Updates bundling for browser-side code to support NextJS and serverless ## 1.2.0 ### Minor Changes - [#179](https://github.com/browserbase/stagehand/pull/179) [`0031871`](https://github.com/browserbase/stagehand/commit/0031871d5a6d6180f272a68b88a8634e5a991785) Thanks [@navidkpr](https://github.com/navidkpr)! - Fixes: The last big change we pushed out, introduced a small regression. As a result, the gray outline showing the elements Stagehand is looking out is missing. This commit fixes that. We now process selectorMap properly now (using the updated type Record

The AI Browser Automation Framework
Read the Docs

MIT License Discord Community

browserbase%2Fstagehand | Trendshift

Ask DeepWiki

If you're looking for the Python implementation, you can find it here

Vibe code Stagehand with Director
## What is Stagehand? Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. ## Why Stagehand? Most existing browser automation tools either require you to write low-level code in a framework like Selenium, Playwright, or Puppeteer, or use high-level agents that can be unpredictable in production. By letting developers choose what to write in code vs. natural language (and bridging the gap between the two) Stagehand is the natural choice for browser automations in production. 1. **Choose when to write code vs. natural language**: use AI when you want to navigate unfamiliar pages, and use code when you know exactly what you want to do. 2. **Go from AI-driven to repeatable workflows**: Stagehand lets you preview AI actions before running them, and also helps you easily cache repeatable actions to save time and tokens. 3. **Write once, run forever**: Stagehand's auto-caching combined with self-healing remembers previous actions, runs without LLM inference, and knows when to involve AI whenever the website changes and your automation breaks. 
## Getting Started Start with Stagehand with one line of code, or check out our [Quickstart Guide](https://docs.stagehand.dev/v3/first-steps/quickstart) for more information: ```bash npx create-browser-app ``` ## Example Here's how to build a sample browser automation with Stagehand: ```typescript // Stagehand's CDP engine provides an optimized, low level interface to the browser built for automation const page = stagehand.context.pages()[0]; await page.goto("https://github.com/browserbase"); // Use act() to execute individual actions await stagehand.act("click on the stagehand repo"); // Use agent() for multi-step tasks const agent = stagehand.agent(); await agent.execute("Get to the latest PR"); // Use extract() to get structured data from the page const { author, title } = await stagehand.extract( "extract the author and title of the PR", z.object({ author: z.string().describe("The username of the PR author"), title: z.string().describe("The title of the PR"), }), ); ``` ## Documentation Visit [docs.stagehand.dev](https://docs.stagehand.dev) to view the full documentation. ### Build and Run from Source ```bash git clone https://github.com/browserbase/stagehand.git cd stagehand pnpm install pnpm run build pnpm run example # run the blank script at ./examples/example.ts ``` Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run: ```bash cp .env.example .env nano .env # Edit the .env file to add API keys ``` ### Installing from a branch You can install and build Stagehand directly from a github branch using [gitpkg](https://github.com/EqualMa/gitpkg) In your project's `package.json` set: ```json "@browserbasehq/stagehand": "https://gitpkg.now.sh/browserbase/stagehand/packages/core?", ``` ## Contributing > [!NOTE] > We highly value contributions to Stagehand! For questions or support, please join our [Discord community](https://stagehand.dev/discord). 
At a high level, we're focused on improving reliability, extensibility, speed, and cost in that order of priority. If you're interested in contributing, **bug fixes and small improvements are the best way to get started**. For more involved features, we strongly recommend reaching out to [Miguel Gonzalez](https://x.com/miguel_gonzf) or [Paul Klein](https://x.com/pk_iv) in our [Discord community](https://stagehand.dev/discord) before starting to ensure that your contribution aligns with our goals. ## Acknowledgements We'd like to thank the following people for their major contributions to Stagehand: - [Paul Klein](https://github.com/pkiv) - [Sean McGuire](https://github.com/seanmcguire12) - [Miguel Gonzalez](https://github.com/miguelg719) - [Sameel Arif](https://github.com/sameelarif) - [Thomas Katwan](https://github.com/tkattkat) - [Filip Michalsky](https://github.com/filip-michalsky) - [Anirudh Kamath](https://github.com/kamath) - [Jeremy Press](https://x.com/jeremypress) - [Navid Pour](https://github.com/navidpour) ## License Licensed under the MIT License. Copyright 2025 Browserbase, Inc. ================================================ FILE: claude.md ================================================ # Stagehand Project This is a project that uses Stagehand V3, a browser automation framework with AI-powered `act`, `extract`, `observe`, and `agent` methods. The main class can be imported as `Stagehand` from `@browserbasehq/stagehand`. 
**Key Classes:** - `Stagehand`: Main orchestrator class providing `act`, `extract`, `observe`, and `agent` methods - `context`: A `V3Context` object that manages browser contexts and pages - `page`: Individual page objects accessed via `stagehand.context.pages()[i]` or created with `stagehand.context.newPage()` ## Initialize ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", // or "BROWSERBASE" verbose: 2, // 0, 1, or 2 model: "openai/gpt-4.1-mini", // or any supported model }); await stagehand.init(); // Access the browser context and pages const page = stagehand.context.pages()[0]; const context = stagehand.context; // Create new pages if needed const page2 = await stagehand.context.newPage(); ``` ## Act Actions are called on the `stagehand` instance (not the page). Use atomic, specific instructions: ```typescript // Act on the current active page await stagehand.act("click the sign in button"); // Act on a specific page (when you need to target a page that isn't currently active) await stagehand.act("click the sign in button", { page: page2 }); ``` **Important:** Act instructions should be atomic and specific: - ✅ Good: "Click the sign in button" or "Type 'hello' into the search input" - ❌ Bad: "Order me pizza" or "Type in the search bar and hit enter" (multi-step) ### Observe + Act Pattern (Recommended) Cache the results of `observe` to avoid unexpected DOM changes: ```typescript const instruction = "Click the sign in button"; // Get candidate actions const actions = await stagehand.observe(instruction); // Execute the first action await stagehand.act(actions[0]); ``` To target a specific page: ```typescript const actions = await stagehand.observe("select blue as the favorite color", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Extract Extract data from pages using natural language instructions. The `extract` method is called on the `stagehand` instance. 
### Basic Extraction (with schema) ```typescript import { z } from "zod"; // Extract with explicit schema const data = await stagehand.extract( "extract all apartment listings with prices and addresses", z.object({ listings: z.array( z.object({ price: z.string(), address: z.string(), }), ), }), ); console.log(data.listings); ``` ### Simple Extraction (without schema) ```typescript // Extract returns a default object with 'extraction' field const result = await stagehand.extract("extract the sign in button text"); console.log(result); // Output: { extraction: "Sign in" } // Or destructure directly const { extraction } = await stagehand.extract( "extract the sign in button text", ); console.log(extraction); // "Sign in" ``` ### Targeted Extraction Extract data from a specific element using a selector: ```typescript const reason = await stagehand.extract( "extract the reason why script injection fails", z.string(), { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" }, ); ``` ### URL Extraction When extracting links or URLs, use `z.string().url()`: ```typescript const { links } = await stagehand.extract( "extract all navigation links", z.object({ links: z.array(z.string().url()), }), ); ``` ### Extracting from a Specific Page ```typescript // Extract from a specific page (when you need to target a page that isn't currently active) const data = await stagehand.extract( "extract the placeholder text on the name field", { page: page2 }, ); ``` ## Observe Plan actions before executing them. 
Returns an array of candidate actions: ```typescript // Get candidate actions on the current active page const [action] = await stagehand.observe("Click the sign in button"); // Execute the action await stagehand.act(action); ``` Observing on a specific page: ```typescript // Target a specific page (when you need to target a page that isn't currently active) const actions = await stagehand.observe("find the next page button", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Agent Use the `agent` method to autonomously execute complex, multi-step tasks. ### Basic Agent Usage ```typescript const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); const agent = stagehand.agent({ model: "google/gemini-2.0-flash", executionModel: "google/gemini-2.0-flash", }); const result = await agent.execute({ instruction: "Search for the stock price of NVDA", maxSteps: 20, }); console.log(result.message); ``` ### Computer Use Agent (CUA) For more advanced scenarios using computer-use models: ```typescript const agent = stagehand.agent({ mode: "cua", // Enable Computer Use Agent mode model: "anthropic/claude-sonnet-4-20250514", // or "google/gemini-2.5-computer-use-preview-10-2025" systemPrompt: `You are a helpful assistant that can use a web browser. 
Do not ask follow up questions, the user will trust your judgement.`, }); await agent.execute({ instruction: "Apply for a library card at the San Francisco Public Library", maxSteps: 30, }); ``` ### Agent with Custom Model Configuration ```typescript const agent = stagehand.agent({ model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GEMINI_API_KEY, }, systemPrompt: `You are a helpful assistant.`, }); ``` ### Agent with Integrations (MCP/External Tools) ```typescript const agent = stagehand.agent({ integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], systemPrompt: `You have access to the Exa search tool.`, }); ``` ### Agent Hybrid Mode Hybrid mode uses both DOM-based and coordinate-based tools (act, click, type, dragAndDrop) for visual interactions. This requires `experimental: true` and models that support reliable coordinate-based actions. **Recommended models for hybrid mode:** - `google/gemini-3-flash-preview` - `anthropic/claude-sonnet-4-20250514`, `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-haiku-4-5-20251001` ```typescript const stagehand = new Stagehand({ env: "LOCAL", experimental: true, // Required for hybrid mode }); await stagehand.init(); const agent = stagehand.agent({ mode: "hybrid", model: "google/gemini-3-flash-preview", }); await agent.execute({ instruction: "Click the submit button and fill the form", maxSteps: 20, highlightCursor: true, // Enabled by default in hybrid mode }); ``` **Agent modes:** - `"dom"` (default): Uses DOM-based tools (act, fillForm) - works with any model - `"hybrid"`: Uses both DOM-based and coordinate-based tools (act, click, type, dragAndDrop) - requires grounding-capable models - `"cua"`: Uses Computer Use Agent providers ## Advanced Features ### DeepLocator (XPath Targeting) Target specific elements across shadow DOM and iframes: ```typescript await page .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p") .highlight({ durationMs: 5000, 
contentColor: { r: 255, g: 0, b: 0 }, }); ``` ### Multi-Page Workflows ```typescript const page1 = stagehand.context.pages()[0]; await page1.goto("https://example.com"); const page2 = await stagehand.context.newPage(); await page2.goto("https://example2.com"); // Act/extract/observe operate on the current active page by default // Pass { page } option to target a specific page await stagehand.act("click button", { page: page1 }); await stagehand.extract("get title", { page: page2 }); ``` ================================================ FILE: eslint.config.mjs ================================================ import globals from "globals"; import pluginJs from "@eslint/js"; import tseslint from "typescript-eslint"; import security from "eslint-plugin-security"; /** @type {import('eslint').Linter.Config[]} */ export default [ { files: ["**/*.{js,mjs,cjs,ts}"] }, { languageOptions: { globals: globals.browser } }, { files: ["packages/core/scripts/**/*.{js,cjs,mjs}"], languageOptions: { globals: globals.node }, }, { files: [ "packages/server-v3/scripts/**/*.{js,cjs,mjs,ts}", "packages/server-v4/scripts/**/*.{js,cjs,mjs,ts}", ], languageOptions: { globals: globals.node }, }, { files: ["packages/cli/**/*.{js,cjs,mjs,ts}"], languageOptions: { globals: globals.node }, }, { ignores: [ "**/dist/**", "**/node_modules/**", "packages/core/lib/dom/build/**", "packages/core/lib/v3/dom/build/**", "packages/core/lib/v4/dom/build/**", "packages/core/scripts/prepare.js", "**/*.config.js", "**/*.config.mjs", ".browserbase/**", "**/.browserbase/**", "**/*.json", "stainless.yml", "packages/server-v3/openapi.v3.yaml", "packages/server-v4/openapi.v4.yaml", ], }, pluginJs.configs.recommended, ...tseslint.configs.recommended, { plugins: { security, }, rules: { "no-eval": "error", "no-implied-eval": "error", "no-new-func": "error", "security/detect-eval-with-expression": "error", "preserve-caught-error": "error", "no-restricted-syntax": [ "error", { selector: 
"CallExpression[callee.name='Function']", message: "Dynamic function construction is prohibited.", }, { selector: "NewExpression[callee.name='Function']", message: "Dynamic function construction is prohibited.", }, { selector: "CallExpression[callee.object.name='window'][callee.property.name='Function']", message: "Dynamic function construction via window.Function is prohibited.", }, { selector: "CallExpression[callee.object.name='globalThis'][callee.property.name='Function']", message: "Dynamic function construction via globalThis.Function is prohibited.", }, ], }, }, { files: ["packages/cli/**/*.{js,cjs,mjs,ts}"], rules: { "no-empty": ["error", { allowEmptyCatch: true }], }, }, ]; ================================================ FILE: package.json ================================================ { "name": "stagehand-workspace", "version": "0.0.0", "private": true, "description": "Stagehand monorepo workspace", "type": "module", "scripts": { "build": "turbo run build", "build:full": "turbo run build", "build:cjs": "turbo run build:cjs", "build:cli": "turbo run build:cli", "build:esm": "turbo run build:esm", "build:sea": "turbo run build:sea:esm", "build:sea:esm": "turbo run build:sea:esm", "build:sea:cjs": "turbo run build:sea:cjs", "lint": "turbo run lint", "format": "prettier --write .", "prettier": "prettier --write .", "eslint": "eslint .", "test": "turbo run test:core test:e2e test:server test:evals test:cli", "test:core": "turbo run test:core --", "test:core:local": "STAGEHAND_BROWSER_TARGET=local pnpm run test:core --", "test:core:bb": "STAGEHAND_BROWSER_TARGET=browserbase pnpm run test:core --", "test:e2e": "turbo run test:e2e --", "test:e2e:local": "STAGEHAND_BROWSER_TARGET=local pnpm run test:e2e --", "test:e2e:bb": "STAGEHAND_BROWSER_TARGET=browserbase pnpm run test:e2e --", "test:server": "turbo run test:server --", "test:server:sea": "STAGEHAND_SERVER_TARGET=sea pnpm run test:server --", "test:server:local": "STAGEHAND_SERVER_TARGET=local pnpm run 
test:server --", "test:server:remote": "STAGEHAND_SERVER_TARGET=remote pnpm run test:server --", "test:evals": "turbo run test:evals --", "test:evals:local": "STAGEHAND_BROWSER_TARGET=local pnpm run test:evals --", "test:evals:bb": "STAGEHAND_BROWSER_TARGET=browserbase pnpm run test:evals --", "coverage:merge": "pnpm -w exec tsx packages/core/scripts/coverage.ts merge", "docs": "turbo run docs", "dev": "turbo run dev", "example": "pnpm --filter @browserbasehq/stagehand run example --", "cache:clear": "turbo run build --force", "prepare": "node packages/core/scripts/prepare.js", "release": "turbo run build && changeset publish", "release-canary": "turbo run build && changeset version --snapshot && changeset publish --tag alpha" }, "devDependencies": { "@changesets/changelog-github": "^0.5.0", "@changesets/cli": "^2.27.9", "@eslint/js": "^10.0.1", "c8": "^10.1.3", "dotenv": "^17.3.1", "esbuild": "0.27.2", "eslint": "^10.0.2", "eslint-plugin-security": "^3.0.1", "globals": "^15.13.0", "junit-to-ctrf": "^0.0.14", "prettier": "^3.2.5", "source-map": "^0.7.4", "tsx": "^4.19.4", "turbo": "^2.8.10", "typescript": "5.8.3", "typescript-eslint": "^8.56.1" }, "repository": { "type": "git", "url": "git+https://github.com/browserbase/stagehand.git" }, "bugs": { "url": "https://github.com/browserbase/stagehand/issues" }, "homepage": "https://stagehand.dev", "overrides": { "whatwg-url": "^14.0.0", "jwa": "^2.0.1", "zod": "4.2.1", "tsx": "4.19.4" }, "engines": { "node": "^20.19.0 || >=22.12.0" }, "packageManager": "pnpm@9.15.0+sha512.76e2379760a4328ec4415815bcd6628dee727af3779aaa4c914e3944156c4299921a89f976381ee107d41f12cfa4b66681ca9c718f0668fa0831ed4c6d8ba56c" } ================================================ FILE: packages/README.md ================================================ # Stagehand Packages This directory contains the Stagehand monorepo packages: - **core** - The main Stagehand package - **evals** - Evals CLI - **docs** - [Docs](https://docs.stagehand.dev) - 
**server** - Fastify server wrapping the core package for different language clients ================================================ FILE: packages/cli/CHANGELOG.md ================================================ # @browserbasehq/browse-cli ## 0.2.0 ### Minor Changes - [#1816](https://github.com/browserbase/stagehand/pull/1816) [`687d54a`](https://github.com/browserbase/stagehand/commit/687d54addad5625f28d51c6994170c7b629871f2) Thanks [@shrey150](https://github.com/shrey150)! - Add `--context-id` and `--persist` flags to `browse open` for loading and persisting Browserbase Contexts across sessions - [#1793](https://github.com/browserbase/stagehand/pull/1793) [`e38c13b`](https://github.com/browserbase/stagehand/commit/e38c13b7526b140b693152ef1ffda88a74e9c425) Thanks [@shrey150](https://github.com/shrey150)! - Initial release of browse CLI - browser automation for AI agents ### Patch Changes - [#1806](https://github.com/browserbase/stagehand/pull/1806) [`f8c7738`](https://github.com/browserbase/stagehand/commit/f8c773898f4d97e8854cc67a0b18eb7d1cdd7b75) Thanks [@shrey150](https://github.com/shrey150)! 
- Fix `browse env` showing stale mode after `browse env remote` - Updated dependencies [[`505e8c6`](https://github.com/browserbase/stagehand/commit/505e8c6736f3706328dbc8df670c49a018058388), [`2f43ffa`](https://github.com/browserbase/stagehand/commit/2f43ffac11778152d17e4c44405770cc32c3ec8c), [`63ee247`](https://github.com/browserbase/stagehand/commit/63ee247ac6bf2992046d4f6b2759f46b15643e36), [`7dc35f5`](https://github.com/browserbase/stagehand/commit/7dc35f5e25689e6518d68b25ef71536d2781c8aa), [`335cf47`](https://github.com/browserbase/stagehand/commit/335cf4730e73bce33e92331d04bda4b0fd42685d), [`6ba0a1d`](https://github.com/browserbase/stagehand/commit/6ba0a1db7fc2d5d5a2f8927b1417d8f1d15eda10), [`4ff3bb8`](https://github.com/browserbase/stagehand/commit/4ff3bb831a6ef6e2d57148e7afb68ea8d23e395d), [`c27054b`](https://github.com/browserbase/stagehand/commit/c27054bbd0508431ade91d655f89efc87bbf5867), [`2abf5b9`](https://github.com/browserbase/stagehand/commit/2abf5b90f1e2bb1442509ef3a686b6128c9cdcf6), [`7817fcc`](https://github.com/browserbase/stagehand/commit/7817fcc315eee4455ce04567cf56c9ec801caf0b), [`7390508`](https://github.com/browserbase/stagehand/commit/73905088c5ed5923d276da9cce2efd0a0a3a46eb), [`611f43a`](https://github.com/browserbase/stagehand/commit/611f43ac8d4c580216d55d2b217c14a9a9c11013), [`521a10e`](https://github.com/browserbase/stagehand/commit/521a10e3698fc5631e219947bc90dad0f8bddaa8), [`2402a3c`](https://github.com/browserbase/stagehand/commit/2402a3c4d50270391b3e6440f4385cdcf5e1eb64)]: - @browserbasehq/stagehand@3.2.0 ================================================ FILE: packages/cli/README.md ================================================ # Browse CLI Browser automation CLI for AI agents. Built on [Stagehand](https://github.com/browserbase/stagehand), providing raw browser control without requiring LLM integration. ## Installation ```bash npm install -g @browserbasehq/browse-cli ``` Requires Chrome/Chromium installed on the system. 
## Quick Start ```bash # Navigate to a URL (auto-starts browser daemon) browse open https://example.com # Take a snapshot to get element refs browse snapshot -c # Click an element by ref browse click @0-5 # Type text browse type "Hello, world!" # Take a screenshot browse screenshot ./page.png # Stop the browser browse stop ``` ## How It Works Browse uses a daemon architecture for fast, stateful interactions: 1. **First command** auto-starts a Chrome browser daemon 2. **Subsequent commands** reuse the same browser session 3. **State persists** between commands (cookies, refs, etc.) 4. **Multiple sessions** supported via `--session` or `BROWSE_SESSION` env var ### Self-Healing Sessions The CLI automatically recovers from stale sessions. If the daemon or Chrome crashes: 1. Detects the failure 2. Cleans up stale processes and files 3. Restarts the daemon 4. Retries the command Agents don't need to handle recovery - commands "just work". ## Commands ### Navigation ```bash browse open <url> [--wait load|domcontentloaded|networkidle] [-t|--timeout ms] browse reload browse back browse forward ``` The `--timeout` flag (default: 30000ms) controls how long to wait for the page load state.
Use longer timeouts for slow-loading pages: ```bash browse open https://slow-site.com --timeout 60000 ``` ### Click Actions ```bash browse click [-b left|right|middle] [-c count] # Click by ref (e.g., @0-5) browse click_xy [--button] [--xpath] # Click at coordinates ``` ### Coordinate Actions ```bash browse hover [--xpath] browse scroll [--xpath] browse drag [--steps n] [--xpath] ``` ### Keyboard ```bash browse type [-d delay] [--mistakes] browse press # e.g., Enter, Tab, Cmd+A ``` ### Forms ```bash browse fill [--no-press-enter] browse select browse highlight [-d duration] ``` ### Page Info ```bash browse get url browse get title browse get text browse get html browse get value browse get box # Returns center coordinates browse snapshot [-c|--compact] # Accessibility tree with refs browse screenshot [path] [-f|--full-page] [-t png|jpeg] ``` ### Waiting ```bash browse wait load [state] browse wait selector [-t timeout] [-s visible|hidden|attached|detached] browse wait timeout ``` ### Multi-Tab ```bash browse pages # List all tabs browse newpage [url] # Open new tab browse tab_switch # Switch to tab by index browse tab_close [n] # Close tab (default: last) ``` ### Network Capture Capture HTTP requests to the filesystem for inspection: ```bash browse network on # Start capturing requests browse network off # Stop capturing browse network path # Get capture directory path browse network clear # Clear captured requests ``` Captured requests are saved as directories: ``` /tmp/browse-default-network/ 001-GET-api.github.com-repos/ request.json # method, url, headers, body response.json # status, headers, body, duration ``` ### Daemon Control ```bash browse start # Explicitly start daemon browse stop [--force] # Stop daemon browse status # Check daemon status browse env [target] # Show or switch environment: local | remote ``` ### Environment Switching (Local vs Remote) Use environment switching when an agent should keep the same command flow, but the browser runtime needs 
to change: - `local` runs Chrome on your machine (best for local debugging/dev loops) - `remote` runs a Browserbase session (best for anti-bot hardening and cloud runs) ```bash # Show active environment (if running) and desired environment for next start browse env # Switch current session to Browserbase (restarts daemon if needed) browse env remote # Switch back to local Chrome browse env local ``` Behavior details: - Environment is scoped per `--session` - `browse env <target>` persists an override and restarts the daemon - `browse stop` clears the override so next start falls back to env-var-based auto detection - Auto detection defaults to: - `remote` when `BROWSERBASE_API_KEY` is set - `local` otherwise ## Global Options | Option | Description | |--------|-------------| | `--session <name>` | Session name for multiple browsers (default: "default") | | `--headless` | Run Chrome in headless mode | | `--headed` | Run Chrome with visible window (default) | | `--ws <url>` | Connect to existing Chrome via CDP WebSocket | | `--json` | Output as JSON | ## Environment Variables | Variable | Description | |----------|-------------| | `BROWSE_SESSION` | Default session name (alternative to `--session`) | | `BROWSERBASE_API_KEY` | Browserbase API key (required for `browse env remote`) | | `BROWSERBASE_PROJECT_ID` | Browserbase project ID (optional, passed through if set) | ## Element References After running `browse snapshot`, you can reference elements by their ref ID: ```bash # Get snapshot with refs browse snapshot -c # Output includes refs like [0-5], [1-2], etc.
# RootWebArea "Example" url="https://example.com" # [0-0] link "Home" # [0-1] link "About" # [0-2] button "Sign In" # Click using ref (multiple formats supported) browse click @0-2 # @ prefix browse click 0-2 # Plain ref browse click ref=0-2 # Explicit prefix ``` The full snapshot output includes mappings: - **xpathMap**: Cross-frame XPath selectors - **cssMap**: Fast CSS selectors when available - **urlMap**: Extracted URLs from links ## Multiple Sessions Run multiple browser instances simultaneously: ```bash # Terminal 1 BROWSE_SESSION=session1 browse open https://google.com # Terminal 2 BROWSE_SESSION=session2 browse open https://github.com # Or use --session flag browse --session work open https://slack.com browse --session personal open https://twitter.com ``` ## Direct CDP Connection Connect to an existing Chrome instance: ```bash # Start Chrome with remote debugging google-chrome --remote-debugging-port=9222 # Connect via WebSocket browse --ws ws://localhost:9222/devtools/browser/... open https://example.com ``` ## Optimal AI Workflow 1. **Navigate** to target page (browser auto-starts) 2. **Snapshot** to get the accessibility tree with refs 3. **Click/Fill** using refs directly (e.g., `@0-5`) 4. **Re-snapshot** after actions to verify state changes 5. **Stop** when done ```bash browse open https://example.com browse snapshot -c # [0-5] textbox: Search # [0-8] button: Submit browse fill @0-5 "my query" browse click @0-8 browse snapshot -c # Verify result browse stop ``` ## Troubleshooting ### Chrome not found The CLI uses your system Chrome/Chromium. 
If not found: ```bash # macOS - Install Chrome or set path export CHROME_PATH=/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome # Linux - Install chromium sudo apt install chromium-browser ``` ### Stale daemon If the daemon becomes unresponsive: ```bash browse stop --force ``` ### Permission denied on socket ```bash # Clean up stale socket files rm /tmp/browse-*.sock /tmp/browse-*.pid ``` ## Platform Support - macOS (Intel and Apple Silicon) - Linux (x64 and arm64) Windows support requires WSL or TCP socket implementation. ## Development ```bash # Clone and setup (in monorepo) cd packages/cli pnpm install # Install dependencies first! pnpm run build # Build the CLI # Run without building (for development) pnpm run dev -- # Or with tsx directly npx tsx src/index.ts # Run linting and formatting pnpm run lint pnpm run format ``` ## License MIT - see [LICENSE](./LICENSE) ## Related - [Stagehand](https://github.com/browserbase/stagehand) - AI web browser automation framework - [Browserbase](https://browserbase.com) - Cloud browser infrastructure ================================================ FILE: packages/cli/package.json ================================================ { "name": "@browserbasehq/browse-cli", "version": "0.2.0", "description": "Browser automation CLI for AI agents, built on Stagehand", "type": "commonjs", "license": "MIT", "author": "Browserbase ", "repository": { "type": "git", "url": "git+https://github.com/browserbase/stagehand.git", "directory": "packages/cli" }, "bugs": { "url": "https://github.com/browserbase/stagehand/issues" }, "homepage": "https://github.com/browserbase/stagehand/tree/main/packages/cli#readme", "keywords": [ "browser", "automation", "cli", "ai", "agent", "chrome", "cdp", "web-scraping", "testing", "stagehand" ], "engines": { "node": "^20.19.0 || >=22.12.0" }, "publishConfig": { "access": "public" }, "main": "./dist/index.js", "bin": { "browse": "./dist/index.js" }, "files": [ "dist", "README.md", "LICENSE" ], 
"scripts": { "build": "tsup", "dev": "tsx src/index.ts", "browse": "tsx src/index.ts", "typecheck": "tsc --noEmit", "eslint": "eslint .", "lint": "cd ../.. && prettier --check packages/cli && cd packages/cli && pnpm run eslint && pnpm run typecheck", "test": "vitest run", "test:cli": "vitest run", "test:watch": "vitest", "prepublishOnly": "pnpm run build" }, "dependencies": { "@browserbasehq/stagehand": "workspace:*", "commander": "^12.0.0", "dotenv": "^16.4.5", "pino": "^9.6.0", "pino-pretty": "^13.0.0", "ws": "^8.18.0" }, "devDependencies": { "@types/node": "^20.11.30", "devtools-protocol": "^0.0.1464554", "eslint": "^10.0.2", "tsup": "^8.2.1", "tsx": "^4.10.5", "typescript": "5.8.3", "vitest": "^4.0.8" } } ================================================ FILE: packages/cli/src/index.ts ================================================ /** * Browse CLI - Browser automation for AI agents * * Usage: * browse [options] [args...] * * The CLI runs a daemon process that maintains browser state between commands. * Multiple sessions can run simultaneously using --session or BROWSE_SESSION env var. */ import { Command } from "commander"; import { Stagehand, type Page as BrowsePage } from "@browserbasehq/stagehand"; import { promises as fs } from "fs"; import * as path from "path"; import * as os from "os"; import * as net from "net"; import { spawn } from "child_process"; import * as readline from "readline"; import type { Protocol } from "devtools-protocol"; import { version as VERSION } from "../package.json"; const program = new Command(); // Type aliases type BrowseContext = Stagehand["context"]; // ==================== DAEMON INFRASTRUCTURE ==================== const SOCKET_DIR = os.tmpdir(); function getSocketPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.sock`); } function getLockPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.lock`); } /** * Acquire an exclusive lock for daemon operations. 
* Uses O_EXCL for atomic file creation to prevent race conditions. */ async function acquireLock( session: string, timeoutMs: number = 10000, ): Promise { const lockPath = getLockPath(session); const startTime = Date.now(); while (Date.now() - startTime < timeoutMs) { try { // O_EXCL ensures atomic creation - fails if file exists const handle = await fs.open(lockPath, "wx"); await handle.write(String(process.pid)); await handle.close(); return true; } catch (err: unknown) { if ((err as NodeJS.ErrnoException).code === "EEXIST") { // Lock exists - check if holder is still alive try { const holderPid = parseInt(await fs.readFile(lockPath, "utf-8")); process.kill(holderPid, 0); // Throws if process doesn't exist // Process exists, wait and retry await new Promise((r) => setTimeout(r, 100)); } catch { // Lock holder is dead, remove stale lock try { await fs.unlink(lockPath); } catch {} } continue; } throw err; } } return false; } async function releaseLock(session: string): Promise { try { await fs.unlink(getLockPath(session)); } catch {} } /** * Check if a socket is actually connectable (not just exists on disk). */ async function isSocketConnectable( socketPath: string, timeoutMs: number, ): Promise { return new Promise((resolve) => { const client = net.createConnection(socketPath); const timeout = setTimeout(() => { client.destroy(); resolve(false); }, timeoutMs); client.on("connect", () => { clearTimeout(timeout); client.destroy(); resolve(true); }); client.on("error", () => { clearTimeout(timeout); resolve(false); }); }); } /** * Wait for socket to become connectable with exponential backoff. 
*/ async function waitForSocketReady( socketPath: string, timeoutMs: number, ): Promise { const startTime = Date.now(); let delay = 50; while (Date.now() - startTime < timeoutMs) { if (await isSocketConnectable(socketPath, 500)) return; await new Promise((r) => setTimeout(r, delay)); delay = Math.min(delay * 1.5, 500); } throw new Error(`Socket not ready after ${timeoutMs}ms`); } function getPidPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.pid`); } function getWsPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.ws`); } function getChromePidPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.chrome.pid`); } function getNetworkDir(session: string): string { return path.join(SOCKET_DIR, `browse-${session}-network`); } function getModePath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.mode`); } function getModeOverridePath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.mode-override`); } function getContextPath(session: string): string { return path.join(SOCKET_DIR, `browse-${session}.context`); } type BrowseMode = "browserbase" | "local"; function hasBrowserbaseCredentials(): boolean { return Boolean(process.env.BROWSERBASE_API_KEY); } function assertModeSupported(mode: BrowseMode): void { if (mode === "browserbase" && !hasBrowserbaseCredentials()) { throw new Error( "Remote mode requires BROWSERBASE_API_KEY. Set the env var or run `browse env local`.", ); } } function toModeTarget(mode: BrowseMode): "local" | "remote" { return mode === "browserbase" ? "remote" : "local"; } async function readCurrentMode(session: string): Promise { try { const mode = (await fs.readFile(getModePath(session), "utf-8")).trim(); if (mode === "browserbase" || mode === "local") { return mode; } } catch { // File may not exist yet. 
} return null; } /** Determine desired mode: explicit override > env var detection */ async function getDesiredMode(session: string): Promise { try { const override = ( await fs.readFile(getModeOverridePath(session), "utf-8") ).trim(); if (override === "browserbase" || override === "local") return override; } catch {} return hasBrowserbaseCredentials() ? "browserbase" : "local"; } async function isDaemonRunning(session: string): Promise { try { const pidFile = getPidPath(session); const pid = parseInt(await fs.readFile(pidFile, "utf-8")); process.kill(pid, 0); // Check if process exists // Also verify socket exists and is actually connectable const socketPath = getSocketPath(session); await fs.access(socketPath); // Verify socket is actually connectable (not just exists on disk) return await isSocketConnectable(socketPath, 500); } catch { return false; } } /** Daemon state files — cleaned on both startup (stale) and shutdown. */ const DAEMON_STATE_FILES = (session: string) => [ getSocketPath(session), getPidPath(session), getWsPath(session), getChromePidPath(session), getLockPath(session), getModePath(session), ]; async function cleanupStaleFiles(session: string): Promise { const files = [ ...DAEMON_STATE_FILES(session), // Context is client-written config, only cleaned on full shutdown getContextPath(session), ]; for (const file of files) { try { await fs.unlink(file); } catch {} } } /** Like cleanupStaleFiles but preserves client-written config (context). 
*/ async function cleanupDaemonStateFiles(session: string): Promise { for (const file of DAEMON_STATE_FILES(session)) { try { await fs.unlink(file); } catch {} } } /** Find and kill Chrome processes for this session */ async function killChromeProcesses(session: string): Promise { try { const { exec } = await import("child_process"); const { promisify } = await import("util"); const execAsync = promisify(exec); if (process.platform === "darwin" || process.platform === "linux") { // Find Chrome processes with our user data dir pattern const { stdout } = await execAsync( `pgrep -f "browse-${session}" || true`, ); const pids = stdout.trim().split("\n").filter(Boolean); for (const pid of pids) { try { process.kill(parseInt(pid), "SIGTERM"); } catch {} } return pids.length > 0; } return false; } catch { return false; } } interface DaemonRequest { command: string; args: unknown[]; } interface DaemonResponse { success: boolean; result?: unknown; error?: string; } // ==================== DAEMON SERVER ==================== // Default viewport matching Stagehand core const DEFAULT_VIEWPORT = { width: 1288, height: 711 }; async function runDaemon(session: string, headless: boolean): Promise { // Only clean daemon state files (socket, pid, etc.), not client-written config (context) await cleanupDaemonStateFiles(session); // Write daemon PID file and initial mode so status is immediately available await fs.writeFile(getPidPath(session), String(process.pid)); await fs.writeFile(getModePath(session), await getDesiredMode(session)); // Browser state (initialized lazily on first command) let stagehand: Stagehand | null = null; let context: BrowseContext | null = null; let isInitializing = false; /** * Lazy browser initialization - called on first command (like agent-browser) * This allows daemon to signal "started" immediately without waiting for browser */ async function ensureBrowserInitialized(): Promise<{ stagehand: Stagehand; context: BrowseContext; }> { if (stagehand && 
context) { return { stagehand, context }; } // Prevent concurrent initialization if (isInitializing) { // Wait for initialization to complete while (isInitializing) { await new Promise((resolve) => setTimeout(resolve, 100)); } if (stagehand && context) { return { stagehand, context }; } throw new Error("Browser initialization failed"); } isInitializing = true; try { const desiredMode = await getDesiredMode(session); assertModeSupported(desiredMode); const useBrowserbase = desiredMode === "browserbase"; // Read context config if present (written by `browse open --context-id`) let contextConfig: { id: string; persist?: boolean } | null = null; try { const raw = await fs.readFile(getContextPath(session), "utf-8"); contextConfig = JSON.parse(raw); } catch {} stagehand = new Stagehand({ env: useBrowserbase ? "BROWSERBASE" : "LOCAL", verbose: 0, disablePino: true, ...(useBrowserbase ? { disableAPI: true, ...(contextConfig ? { browserbaseSessionCreateParams: { browserSettings: { context: contextConfig, }, }, } : {}), } : { localBrowserLaunchOptions: { headless, viewport: DEFAULT_VIEWPORT, }, }), }); // Persist mode so status command can report it await fs.writeFile(getModePath(session), desiredMode); await stagehand.init(); context = stagehand.context; // Try to save Chrome info for reference (best effort) try { const wsUrl = stagehand.connectURL(); await fs.writeFile(getWsPath(session), wsUrl); } catch {} // Store session name for network capture networkSession = session; return { stagehand, context }; } finally { isInitializing = false; } } // Create Unix socket server const socketPath = getSocketPath(session); const server = net.createServer((conn) => { const rl = readline.createInterface({ input: conn }); rl.on("line", async (line) => { let response: DaemonResponse; try { const request: DaemonRequest = JSON.parse(line); // Lazy browser initialization on first command (like agent-browser) const { stagehand: sh, context: ctx } = await ensureBrowserInitialized(); const 
// ==================== REF MAP (cached from last snapshot) ====================

/** Cached ref maps from the last snapshot - allows @ref syntax in commands */
let refMap: {
  xpathMap: Record<string, string>;
  urlMap: Record<string, string>;
} = {
  xpathMap: {},
  urlMap: {},
};

// ==================== NETWORK CAPTURE STATE ====================

/** A request observed via CDP, as persisted to `request.json`. */
interface PendingRequest {
  id: string;
  timestamp: string;
  method: string;
  url: string;
  headers: Record<string, string>;
  body: string | null;
  resourceType: string;
}

let networkEnabled = false;
let networkDir: string | null = null;
let networkCounter = 0;
let networkSession: string | null = null;
const pendingRequests = new Map<string, PendingRequest>();

/**
 * Sanitize a string for use in a filename.
 *
 * Every character outside `[a-zA-Z0-9.-]` becomes `-`, runs of `-` are
 * collapsed, and leading/trailing dashes are removed. The result is truncated
 * to `maxLen` characters and never ends in a dash, even when the cut lands
 * right after one.
 *
 * @param str - Raw text (hostname, path segment, ...).
 * @param maxLen - Maximum length of the result (default 30).
 * @returns The sanitized string; may be empty.
 */
function sanitizeForFilename(str: string, maxLen: number = 30): string {
  return (
    str
      .replace(/[^a-zA-Z0-9.-]/g, "-")
      .replace(/-+/g, "-")
      .replace(/^-|-$/g, "")
      .slice(0, maxLen)
      // Truncation can expose a dash that was previously in the middle.
      .replace(/-$/, "")
  );
}

/**
 * Generate a directory name for a request:
 * `<counter padded to 3>-<method>-<host>-<first path segment>`.
 *
 * Falls back to `unknown` for an empty host, `root` for an empty path slug,
 * and `<counter>-<method>-unknown` when the URL cannot be parsed.
 */
function getRequestDirName(
  counter: number,
  method: string,
  url: string,
): string {
  const prefix = `${String(counter).padStart(3, "0")}-${method}`;
  try {
    const parsed = new URL(url);
    // Hostname is empty for schemes like data: or file:.
    const domain = sanitizeForFilename(parsed.hostname, 30) || "unknown";
    const firstSegment = parsed.pathname.split("/").filter(Boolean)[0] ?? "root";
    // A segment made only of stripped characters sanitizes to "".
    const pathSlug = sanitizeForFilename(firstSegment, 20) || "root";
    return `${prefix}-${domain}-${pathSlug}`;
  } catch {
    return `${prefix}-unknown`;
  }
}

/**
 * Write request data to filesystem.
 *
 * Creates a fresh directory under `networkDir` (consuming one value of
 * `networkCounter`) and stores the request as `request.json`.
 *
 * @returns The directory path, or `null` when capture has no target
 *   directory or the write failed (the failure is logged, not thrown).
 */
async function writeRequestToFs(
  request: PendingRequest,
): Promise<string | null> {
  if (!networkDir) return null;

  const dirName = getRequestDirName(
    networkCounter++,
    request.method,
    request.url,
  );
  const requestDir = path.join(networkDir, dirName);

  try {
    await fs.mkdir(requestDir, { recursive: true });

    const requestData = {
      id: request.id,
      timestamp: request.timestamp,
      method: request.method,
      url: request.url,
      headers: request.headers,
      body: request.body,
      resourceType: request.resourceType,
    };

    await fs.writeFile(
      path.join(requestDir, "request.json"),
      JSON.stringify(requestData, null, 2),
    );

    return requestDir;
  } catch (err) {
    console.error("Failed to write request:", err);
    return null;
  }
}

/**
 * Write response data to filesystem as `response.json` inside `requestDir`.
 * Failures are logged and swallowed so a capture problem never rejects.
 */
async function writeResponseToFs(
  requestDir: string,
  response: {
    id: string;
    status: number;
    statusText: string;
    headers: Record<string, string>;
    mimeType: string;
    body: string | null;
    duration: number;
    error?: string;
  },
): Promise<void> {
  try {
    await fs.writeFile(
      path.join(requestDir, "response.json"),
      JSON.stringify(response, null, 2),
    );
  } catch (err) {
    console.error("Failed to write response:", err);
  }
}
/**
 * Parse a ref from a selector argument.
 * Supports: @0-3, @[0-3], [0-3], 0-3, ref=0-3
 *
 * @param selector - Raw selector argument from the command line.
 * @returns The bare ref (e.g. `"0-3"`), or `null` when the argument is not a
 *   ref. An empty ref such as `"@"` or `"ref="` also yields `null`, so the
 *   caller never sees an empty string.
 */
function parseRef(selector: string): string | null {
  let ref: string | null = null;

  if (selector.startsWith("@")) {
    const rest = selector.slice(1);
    ref = rest.startsWith("[") && rest.endsWith("]") ? rest.slice(1, -1) : rest;
  } else if (selector.startsWith("ref=")) {
    ref = selector.slice(4);
  } else if (/^\[\d+-\d+]$/.test(selector)) {
    // Only bracketed digit pairs count; "[data-x]" stays a CSS selector.
    ref = selector.slice(1, -1);
  } else if (/^\d+-\d+$/.test(selector)) {
    ref = selector;
  }

  return ref ? ref : null;
}

/**
 * Resolve a selector - if it's a ref, look up from refMap.
 * Always uses XPath since CSS selectors cannot cross shadow DOM boundaries
 * and can cause issues with dynamically generated class names.
 *
 * @param selector - A ref in any form accepted by `parseRef`, or a selector.
 * @returns The cached XPath for a ref, or the selector unchanged.
 * @throws Error when the selector is missing/empty, or the ref is not in the
 *   ref map cached from the last snapshot.
 */
function resolveSelector(selector: string): string {
  if (typeof selector !== "string" || selector.length === 0) {
    throw new Error("Selector is required");
  }

  const ref = parseRef(selector);
  if (ref === null) {
    return selector;
  }

  // Own-property check: refs are user input, and a plain index would resolve
  // inherited keys such as "constructor" to non-string values.
  const xpath = Object.prototype.hasOwnProperty.call(refMap.xpathMap, ref)
    ? refMap.xpathMap[ref]
    : undefined;
  if (!xpath) {
    throw new Error(
      `Unknown ref "${ref}" - run snapshot first to populate refs (have ${Object.keys(refMap.xpathMap).length} refs)`,
    );
  }
  return xpath;
}
// ==================== COMMAND EXECUTION ====================

/** Response metadata captured from `Network.responseReceived`. */
interface CapturedResponseMeta {
  status: number;
  statusText: string;
  headers: Record<string, string>;
  mimeType: string;
}

// Per-request capture bookkeeping, keyed by CDP requestId. Kept at module
// level so that re-enabling capture reuses one set of listeners and maps.
const networkRequestStartTimes = new Map<string, number>();
/** Pending/finished request.json writes; awaited before writing a response. */
const networkRequestDirs = new Map<string, Promise<string | null>>();
const networkResponseMeta = new Map<string, CapturedResponseMeta>();
/** CDP sessions that already have capture listeners attached. */
const networkInstrumentedSessions = new WeakSet<object>();

/** Drop all bookkeeping for one finished (or failed) request. */
function forgetNetworkRequest(requestId: string): void {
  pendingRequests.delete(requestId);
  networkRequestStartTimes.delete(requestId);
  networkRequestDirs.delete(requestId);
  networkResponseMeta.delete(requestId);
}

/** Reset the request counter and every per-request map. */
function resetNetworkTracking(): void {
  networkCounter = 0;
  pendingRequests.clear();
  networkRequestStartTimes.clear();
  networkRequestDirs.clear();
  networkResponseMeta.clear();
}

/**
 * Turn on network capture for the page's main-frame CDP session.
 *
 * Listeners are attached at most once per CDP session; they check the
 * `networkEnabled` flag, so disable/enable cycles do not stack handlers and
 * write each request twice.
 */
async function enableNetworkCapture(
  page: BrowsePage,
): Promise<Record<string, unknown>> {
  if (networkEnabled && networkDir) {
    return { enabled: true, path: networkDir, alreadyEnabled: true };
  }

  const session = networkSession || "default";
  const dir = getNetworkDir(session);
  networkDir = dir;
  await fs.mkdir(dir, { recursive: true });
  resetNetworkTracking();

  const cdpSession = page.mainFrame().session;
  await cdpSession.send("Network.enable", {
    maxTotalBufferSize: 10000000,
    maxResourceBufferSize: 5000000,
  });

  if (!networkInstrumentedSessions.has(cdpSession)) {
    networkInstrumentedSessions.add(cdpSession);

    cdpSession.on(
      "Network.requestWillBeSent",
      (params: Protocol.Network.RequestWillBeSentEvent) => {
        if (!networkEnabled || !networkDir) return;

        const request: PendingRequest = {
          id: params.requestId,
          timestamp: new Date().toISOString(),
          method: params.request.method,
          url: params.request.url,
          headers: params.request.headers || {},
          body: params.request.postData || null,
          resourceType: params.type || "Other",
        };

        pendingRequests.set(params.requestId, request);
        networkRequestStartTimes.set(params.requestId, Date.now());
        // Store the promise, not the result: a fast response may finish
        // before request.json has been written.
        networkRequestDirs.set(params.requestId, writeRequestToFs(request));
      },
    );

    cdpSession.on(
      "Network.responseReceived",
      (params: Protocol.Network.ResponseReceivedEvent) => {
        if (!networkEnabled) return;
        const { status, statusText, headers, mimeType } = params.response;
        networkResponseMeta.set(params.requestId, {
          status,
          statusText,
          headers: headers ?? {},
          mimeType,
        });
      },
    );

    cdpSession.on(
      "Network.loadingFinished",
      async (params: Protocol.Network.LoadingFinishedEvent) => {
        if (!networkEnabled) return;

        const requestId = params.requestId;
        const requestDir = await networkRequestDirs.get(requestId);
        if (!requestDir || !pendingRequests.has(requestId)) {
          forgetNetworkRequest(requestId);
          return;
        }

        const startTime = networkRequestStartTimes.get(requestId) ?? Date.now();
        const duration = Date.now() - startTime;

        let body: string | null = null;
        try {
          const result =
            await cdpSession.send<Protocol.Network.GetResponseBodyResponse>(
              "Network.getResponseBody",
              { requestId },
            );
          body = result.body || null;
          if (result.base64Encoded && body) {
            // Binary payloads are recorded as a short marker only.
            body = `[base64] ${body.slice(0, 100)}...`;
          }
        } catch {
          // Body not available (e.g., for redirects)
        }

        const meta = networkResponseMeta.get(requestId);
        await writeResponseToFs(requestDir, {
          id: requestId,
          status: meta?.status ?? 0,
          statusText: meta?.statusText ?? "",
          headers: meta?.headers ?? {},
          mimeType: meta?.mimeType ?? "",
          body,
          duration,
        });
        forgetNetworkRequest(requestId);
      },
    );

    cdpSession.on(
      "Network.loadingFailed",
      async (params: Protocol.Network.LoadingFailedEvent) => {
        if (!networkEnabled) return;

        const requestId = params.requestId;
        const requestDir = await networkRequestDirs.get(requestId);
        if (!requestDir) {
          forgetNetworkRequest(requestId);
          return;
        }

        const startTime = networkRequestStartTimes.get(requestId) ?? Date.now();
        const meta = networkResponseMeta.get(requestId);
        await writeResponseToFs(requestDir, {
          id: requestId,
          status: meta?.status ?? 0,
          statusText: "Failed",
          headers: meta?.headers ?? {},
          mimeType: meta?.mimeType ?? "",
          body: null,
          duration: Date.now() - startTime,
          error: params.errorText || "Unknown error",
        });
        forgetNetworkRequest(requestId);
      },
    );
  }

  networkEnabled = true;
  return { enabled: true, path: dir };
}

/**
 * Execute one browse command against the browser context.
 *
 * @param context - Browser context that owns the pages.
 * @param command - Command name (e.g. `open`, `click`, `snapshot`).
 * @param args - Positional arguments; each case casts them to its own tuple.
 * @param stagehand - Needed by `click`, `fill` and `select`, which go through
 *   `stagehand.act` with a pre-built action (deterministic, no LLM call).
 * @returns A JSON-serializable result object specific to the command.
 * @throws Error for an unknown command or sub-type, a missing active page,
 *   a missing Stagehand instance, or invalid arguments.
 */
async function executeCommand(
  context: BrowseContext,
  command: string,
  args: unknown[],
  stagehand?: Stagehand,
): Promise<unknown> {
  // Use awaitActivePage() like stagehand.act() does - handles popups and waits for page to be ready
  const needsPage = command !== "pages" && command !== "newpage";
  const page = needsPage
    ? await context.awaitActivePage()
    : context.activePage();
  if (!page && needsPage) {
    throw new Error("No active page");
  }
  // Single narrowing point: every command that dereferences this is guarded
  // by the check above; "pages" and "newpage" never touch it.
  const activePage = page!;

  /** Run a deterministic element action through stagehand.act. */
  const actOn = async (
    selector: string,
    method: string,
    description: string,
    methodArgs: string[],
  ): Promise<void> => {
    if (!stagehand) {
      throw new Error("Stagehand instance not available");
    }
    await stagehand.act({
      selector: resolveSelector(selector),
      description,
      method,
      arguments: methodArgs,
    });
  };

  switch (command) {
    // Navigation
    case "open": {
      const [url, waitUntil, timeout] = args as [string, string?, number?];
      await activePage.goto(url, {
        waitUntil: waitUntil as "load" | "domcontentloaded" | "networkidle",
        timeoutMs: timeout ?? 30000,
      });
      return { url: activePage.url() };
    }
    case "reload": {
      await activePage.reload();
      return { url: activePage.url() };
    }
    case "back": {
      await activePage.goBack();
      return { url: activePage.url() };
    }
    case "forward": {
      await activePage.goForward();
      return { url: activePage.url() };
    }

    // Click by ref - uses stagehand.act with Action type (skips LLM, uses deterministic path)
    case "click": {
      // NOTE(review): the CLI also sends { button, clickCount, force } as a
      // second argument; it is not forwarded here - confirm whether intended.
      const [selector] = args as [string];
      await actOn(selector, "click", "click element", []);
      return { clicked: true };
    }

    // Click by coordinates
    case "click_xy": {
      const [x, y, opts] = args as [
        number,
        number,
        { button?: string; clickCount?: number; returnXPath?: boolean },
      ];
      const result = await activePage.click(x, y, {
        button: (opts?.button as "left" | "right" | "middle") ?? "left",
        clickCount: opts?.clickCount ?? 1,
      });
      if (opts?.returnXPath) {
        return { clicked: true, xpath: result };
      }
      return { clicked: true };
    }

    case "hover": {
      const [x, y, opts] = args as [number, number, { returnXPath?: boolean }];
      const result = await activePage.hover(x, y);
      if (opts?.returnXPath) {
        return { hovered: true, xpath: result };
      }
      return { hovered: true };
    }

    case "scroll": {
      const [x, y, deltaX, deltaY, opts] = args as [
        number,
        number,
        number,
        number,
        { returnXPath?: boolean },
      ];
      const result = await activePage.scroll(x, y, deltaX, deltaY);
      if (opts?.returnXPath) {
        return { scrolled: true, xpath: result };
      }
      return { scrolled: true };
    }

    case "drag": {
      const [fromX, fromY, toX, toY, opts] = args as [
        number,
        number,
        number,
        number,
        {
          steps?: number;
          delay?: number;
          button?: string;
          returnXPath?: boolean;
        },
      ];
      const [fromXpath, toXpath] = await activePage.dragAndDrop(
        fromX,
        fromY,
        toX,
        toY,
        {
          button: (opts?.button as "left" | "right" | "middle") ?? "left",
          steps: opts?.steps ?? 10,
          delay: opts?.delay ?? 0,
          returnXpath: opts?.returnXPath,
        },
      );
      if (opts?.returnXPath) {
        return {
          dragged: true,
          xpath: fromXpath,
          fromXpath,
          toXpath,
        };
      }
      return { dragged: true };
    }

    // Keyboard
    case "type": {
      const [text, opts] = args as [
        string,
        { delay?: number; mistakes?: boolean },
      ];
      await activePage.type(text, {
        delay: opts?.delay,
        withMistakes: opts?.mistakes,
      });
      return { typed: true };
    }
    case "press": {
      const [key] = args as [string];
      await activePage.keyPress(key);
      return { pressed: key };
    }

    // Element actions - use stagehand.act with Action type for reliable interaction
    case "fill": {
      const [selector, value, opts] = args as [
        string,
        string,
        { pressEnter?: boolean }?,
      ];
      await actOn(selector, "fill", "fill element", [value]);
      if (opts?.pressEnter) {
        await activePage.keyPress("Enter");
      }
      return { filled: true, pressedEnter: opts?.pressEnter ?? false };
    }
    case "select": {
      const [selector, values] = args as [string, string[]];
      // selectOption takes the first value as argument
      await actOn(selector, "selectOption", "select option", [
        values?.[0] ?? "",
      ]);
      return { selected: values };
    }
    case "highlight": {
      const [selector, duration] = args as [string, number?];
      await activePage
        .deepLocator(resolveSelector(selector))
        .highlight({ durationMs: duration ?? 2000 });
      return { highlighted: true };
    }

    // Page info
    case "get": {
      const [what, selector] = args as [string, string?];
      // Lazily built so "url"/"title" work without a selector, and the
      // element queries fail with a clear message when it is missing.
      const locator = () => {
        if (!selector) {
          throw new Error(`get ${what} requires a selector`);
        }
        return activePage.deepLocator(resolveSelector(selector));
      };
      switch (what) {
        case "url":
          return { url: activePage.url() };
        case "title":
          return { title: await activePage.title() };
        case "text":
          return { text: await locator().textContent() };
        case "html":
          return { html: await locator().innerHtml() };
        case "value":
          return { value: await locator().inputValue() };
        case "box": {
          const { x, y } = await locator().centroid();
          return { x: Math.round(x), y: Math.round(y) };
        }
        case "visible":
          return { visible: await locator().isVisible() };
        case "checked":
          return { checked: await locator().isChecked() };
        default:
          throw new Error(`Unknown get type: ${what}`);
      }
    }

    // Screenshot
    case "screenshot": {
      const [opts] = args as [
        {
          path?: string;
          fullPage?: boolean;
          type?: string;
          quality?: number;
          clip?: object;
          animations?: string;
          caret?: string;
        },
      ];
      const buffer = await activePage.screenshot({
        fullPage: opts?.fullPage,
        type: opts?.type as "png" | "jpeg" | undefined,
        quality: opts?.quality,
        clip: opts?.clip as
          | { x: number; y: number; width: number; height: number }
          | undefined,
        animations: opts?.animations as "disabled" | "allow" | undefined,
        caret: opts?.caret as "hide" | "initial" | undefined,
        timeout: 10000,
      });
      if (opts?.path) {
        await fs.writeFile(opts.path, buffer);
        return { saved: opts.path };
      }
      return { base64: buffer.toString("base64") };
    }

    // Snapshot
    case "snapshot": {
      const [compact] = args as [boolean?];
      const snapshot = await activePage.snapshot();
      // Cache the maps so later commands can use @ref selectors.
      refMap = {
        xpathMap: snapshot.xpathMap ?? {},
        urlMap: snapshot.urlMap ?? {},
      };
      if (compact) {
        return { tree: snapshot.formattedTree };
      }
      return {
        tree: snapshot.formattedTree,
        xpathMap: snapshot.xpathMap,
        urlMap: snapshot.urlMap,
      };
    }

    // Viewport
    case "viewport": {
      const [width, height, scale] = args as [number, number, number?];
      await activePage.setViewportSize(width, height, {
        deviceScaleFactor: scale ?? 1,
      });
      return { viewport: { width, height } };
    }

    // Eval
    case "eval": {
      const [expr] = args as [string];
      const result = await activePage.evaluate(expr);
      return { result };
    }

    // Element state
    case "is": {
      const [check, selector] = args as [string, string];
      const locator = activePage.deepLocator(resolveSelector(selector));
      switch (check) {
        case "visible":
          return { visible: await locator.isVisible() };
        case "checked":
          return { checked: await locator.isChecked() };
        default:
          throw new Error(`Unknown check: ${check}`);
      }
    }

    // Wait
    case "wait": {
      const [type, arg, opts] = args as [
        string,
        string?,
        { timeout?: number; state?: string }?,
      ];
      switch (type) {
        case "load":
          await activePage.waitForLoadState(
            (arg as "load" | "domcontentloaded" | "networkidle") ?? "load",
            opts?.timeout ?? 30000,
          );
          break;
        case "selector":
          await activePage.waitForSelector(resolveSelector(arg!), {
            state:
              (opts?.state as "attached" | "detached" | "visible" | "hidden") ??
              "visible",
            timeout: opts?.timeout ?? 30000,
          });
          break;
        case "timeout": {
          const ms = Number.parseInt(String(arg), 10);
          if (!Number.isFinite(ms) || ms < 0) {
            throw new Error(`Invalid wait timeout: ${arg}`);
          }
          await activePage.waitForTimeout(ms);
          break;
        }
        default:
          throw new Error(`Unknown wait type: ${type}`);
      }
      return { waited: true };
    }

    // Cursor
    case "cursor": {
      await activePage.enableCursorOverlay();
      return { cursor: "enabled" };
    }

    // Multi-page
    case "pages": {
      const pages = context.pages();
      return {
        pages: pages.map((p: BrowsePage, i: number) => ({
          index: i,
          url: p.url(),
          targetId: p.targetId(),
        })),
      };
    }
    case "newpage": {
      const [url] = args as [string?];
      const newPage = await context.newPage(url);
      return {
        created: true,
        url: newPage.url(),
        targetId: newPage.targetId(),
      };
    }
    case "tab_switch": {
      const [index] = args as [number];
      const pages = context.pages();
      if (index < 0 || index >= pages.length) {
        throw new Error(
          `Tab index ${index} out of range (0-${pages.length - 1})`,
        );
      }
      context.setActivePage(pages[index]);
      return { switched: true, index, url: pages[index].url() };
    }
    case "tab_close": {
      const [index] = args as [number?];
      const pages = context.pages();
      // Defaults to the last tab when no index is given.
      const targetIndex = index ?? pages.length - 1;
      if (targetIndex < 0 || targetIndex >= pages.length) {
        throw new Error(
          `Tab index ${targetIndex} out of range (0-${pages.length - 1})`,
        );
      }
      if (pages.length === 1) {
        throw new Error("Cannot close the last tab");
      }
      await pages[targetIndex].close();
      return { closed: true, index: targetIndex };
    }

    // Debug: show current ref map
    case "refs": {
      return {
        count: Object.keys(refMap.xpathMap).length,
        xpathMap: refMap.xpathMap,
        urlMap: refMap.urlMap,
      };
    }

    // Network capture commands
    case "network_enable":
      return enableNetworkCapture(activePage);

    case "network_disable": {
      if (!networkEnabled) {
        return { enabled: false, alreadyDisabled: true };
      }
      try {
        await activePage.mainFrame().session.send("Network.disable");
      } catch {
        // Best effort: the flag below stops capture even if CDP refuses.
      }
      networkEnabled = false;
      return { enabled: false, path: networkDir };
    }

    case "network_path": {
      if (!networkDir) {
        const session = networkSession || "default";
        return { path: getNetworkDir(session), enabled: false };
      }
      return { path: networkDir, enabled: networkEnabled };
    }

    case "network_clear": {
      if (!networkDir) {
        return { cleared: false, error: "Network capture not enabled" };
      }
      try {
        const entries = await fs.readdir(networkDir, { withFileTypes: true });
        for (const entry of entries) {
          if (entry.isDirectory()) {
            await fs.rm(path.join(networkDir, entry.name), { recursive: true });
          }
        }
        resetNetworkTracking();
        return { cleared: true, path: networkDir };
      } catch (err) {
        return {
          cleared: false,
          error: err instanceof Error ? err.message : String(err),
        };
      }
    }

    // Daemon control
    case "stop": {
      // Deferred so the response is written before shutdown begins.
      process.nextTick(() => {
        process.emit("SIGTERM");
      });
      return { stopping: true };
    }

    default:
      throw new Error(`Unknown command: ${command}`);
  }
}
// ==================== CLIENT ====================

/**
 * Send a single command to the session's daemon over its socket and wait for
 * the one-line JSON reply.
 *
 * @returns The `result` field of a successful response.
 * @throws Error with the daemon's error message on a failed response;
 *   `Connection failed: ...` on a socket error; `Command timeout` after 60s;
 *   `Invalid response from daemon` when the reply is not valid JSON;
 *   `Connection closed before response` when the socket closes without a reply.
 */
async function sendCommandOnce(
  session: string,
  command: string,
  args: unknown[],
): Promise<unknown> {
  return new Promise<unknown>((resolve, reject) => {
    const socketPath = getSocketPath(session);
    const client = net.createConnection(socketPath);
    const rl = readline.createInterface({ input: client });
    let done = false;

    const timeout = setTimeout(() => {
      fail(new Error("Command timeout"));
    }, 60000);

    /** Release timer, reader and socket exactly once. */
    const cleanup = (): boolean => {
      if (done) return false;
      done = true;
      clearTimeout(timeout);
      rl.close();
      client.destroy();
      return true;
    };

    const fail = (err: Error): void => {
      if (cleanup()) reject(err);
    };

    rl.on("line", (line) => {
      let response: DaemonResponse;
      try {
        response = JSON.parse(line) as DaemonResponse;
      } catch {
        // A throw here would be an uncaught exception inside an event
        // handler; surface it as a rejection instead.
        fail(new Error("Invalid response from daemon"));
        return;
      }
      if (!cleanup()) return;
      if (response.success) {
        resolve(response.result);
      } else {
        reject(new Error(response.error));
      }
    });

    rl.on("error", () => {});

    client.on("connect", () => {
      const request: DaemonRequest = { command, args };
      client.write(JSON.stringify(request) + "\n");
    });

    client.on("error", (err) => {
      fail(new Error(`Connection failed: ${err.message}`));
    });

    // Without this, a daemon that dies mid-command leaves the caller waiting
    // for the full timeout. No-op when a reply or error already settled.
    client.on("close", () => {
      fail(new Error("Connection closed before response"));
    });
  });
}
/**
 * Send command with automatic retry and daemon restart on connection failure.
 *
 * After each connection failure the next recovery step runs and the command
 * is sent again:
 *   1. brief wait (socket might be temporarily unavailable),
 *   2. restart daemon without cleanup,
 *   3. full cleanup (kill Chrome, remove stale files) and restart.
 * The command is re-sent after step 3 as well, so a successful full restart
 * is actually used.
 *
 * @throws The original error for `stop` and for non-connection errors;
 *   `Max retries exceeded ...` once every recovery step has been used.
 */
async function sendCommand(
  session: string,
  command: string,
  args: unknown[],
  headless: boolean = false,
): Promise<unknown> {
  const recoverySteps: Array<() => Promise<void>> = [
    () => new Promise<void>((r) => setTimeout(r, 200)),
    () => ensureDaemon(session, headless),
    async () => {
      await killChromeProcesses(session);
      await cleanupStaleFiles(session);
      await ensureDaemon(session, headless);
    },
  ];

  for (let attempt = 0; ; attempt++) {
    try {
      return await sendCommandOnce(session, command, args);
    } catch (err) {
      // Never restart a daemon just to stop it.
      if (command === "stop") {
        throw err;
      }

      const errMsg = err instanceof Error ? err.message : String(err);
      const isConnectionError =
        errMsg.includes("ENOENT") ||
        errMsg.includes("ECONNREFUSED") ||
        errMsg.includes("Connection failed");
      if (!isConnectionError) {
        throw err;
      }

      const recover = recoverySteps[attempt];
      if (!recover) {
        throw new Error(
          `Max retries exceeded for command ${command} on session ${session}: ${errMsg}`,
        );
      }
      await recover();
    }
  }
}

/**
 * Ask the daemon to stop (ignoring failure if it is already down), wait
 * briefly for it to exit, then remove the session's stale files.
 */
async function stopDaemonAndCleanup(session: string): Promise<void> {
  try {
    await sendCommandOnce(session, "stop", []);
  } catch {
    // Daemon may already be down.
  }
  await new Promise<void>((r) => setTimeout(r, 500));
  await cleanupStaleFiles(session);
}
/**
 * Make sure a daemon for `session` is running in the desired mode.
 *
 * A running daemon in the right mode is reused; one in a different mode is
 * stopped first. Otherwise a detached daemon process is spawned under the
 * session lock and this resolves once its socket accepts connections.
 *
 * @throws Error when the mode is unsupported, the lock cannot be acquired,
 *   the child errors or exits before it is ready, or startup times out.
 */
async function ensureDaemon(session: string, headless: boolean): Promise<void> {
  const wantMode = await getDesiredMode(session);
  assertModeSupported(wantMode);

  // True when a usable daemon is already up. A daemon running in another
  // mode is stopped, and false is returned so the caller spawns a new one.
  const reuseOrStopExisting = async (): Promise<boolean> => {
    const running = await isDaemonRunning(session);
    if (!running) return false;
    // Missing mode file means daemon predates mode support, which was local-only.
    const currentMode = (await readCurrentMode(session)) ?? "local";
    if (currentMode === wantMode) return true;
    await stopDaemonAndCleanup(session);
    return false;
  };

  if (await reuseOrStopExisting()) {
    return;
  }

  // Acquire lock before spawning to prevent race conditions
  if (!(await acquireLock(session))) {
    throw new Error(`Timeout acquiring lock for session ${session}`);
  }

  try {
    // Re-check after acquiring lock (another process may have started daemon)
    if (await reuseOrStopExisting()) {
      return;
    }

    const daemonArgs = ["--session", session, "daemon"];
    if (headless) {
      daemonArgs.push("--headless");
    }

    const child = spawn(process.argv[0], [process.argv[1], ...daemonArgs], {
      detached: true,
      // Avoid piping stdout for detached daemon startup. Deep-locator internals
      // can log via console fallback, and writing to a broken pipe crashes daemon.
      stdio: ["ignore", "ignore", "ignore"],
    });
    child.unref();

    await new Promise<void>((resolve, reject) => {
      let settled = false;

      // First outcome wins; later signals are ignored.
      const finish = (err?: Error) => {
        if (settled) return;
        settled = true;
        clearTimeout(timeout);
        child.off("error", onError);
        child.off("exit", onExit);
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      };

      const onError = (err: Error) => {
        finish(err);
      };
      const onExit = (code: number | null, signal: string | null) => {
        const reason = `Daemon exited before ready (code=${code ?? "null"}, signal=${signal ?? "null"})`;
        finish(new Error(reason));
      };

      const timeout = setTimeout(() => {
        finish(new Error("Timeout waiting for daemon to start"));
      }, 30000);

      child.once("error", onError);
      child.once("exit", onExit);

      // Readiness is determined by socket connectivity, not daemon stdout.
      waitForSocketReady(getSocketPath(session), 28000).then(
        () => finish(),
        (err) => finish(err instanceof Error ? err : new Error(String(err))),
      );
    });
  } finally {
    await releaseLock(session);
  }
}
// ==================== CLI INTERFACE ====================

/** Global options accepted by every `browse` subcommand. */
interface GlobalOpts {
  ws?: string;
  headless?: boolean;
  headed?: boolean;
  json?: boolean;
  session?: string;
}

/**
 * Resolve the session name: `--session`, then `BROWSE_SESSION`, then
 * `"default"`. Empty strings are treated as unset so a blank flag or env var
 * cannot produce a nameless session.
 */
function getSession(opts: GlobalOpts): string {
  return opts.session || process.env.BROWSE_SESSION || "default";
}

/** Headless only when `--headless` is set and `--headed` does not override it. */
function isHeadless(opts: GlobalOpts): boolean {
  return opts.headless === true && opts.headed !== true;
}

/**
 * Print a command result. Strings are printed raw unless `json` is set;
 * everything else is printed as pretty-printed JSON.
 */
function output(data: unknown, json: boolean): void {
  const printRaw = !json && typeof data === "string";
  console.log(printRaw ? data : JSON.stringify(data, null, 2));
}

/**
 * Run a command either directly over a CDP WebSocket (`--ws`) or through the
 * session daemon (started on demand).
 *
 * @returns The command's result object.
 */
async function runCommand(command: string, args: unknown[]): Promise<unknown> {
  const opts = program.opts<GlobalOpts>();
  const session = getSession(opts);
  const headless = isHeadless(opts);

  // If --ws provided, bypass daemon and connect directly
  if (opts.ws) {
    const stagehand = new Stagehand({
      env: "LOCAL",
      verbose: 0,
      disablePino: true,
      localBrowserLaunchOptions: {
        cdpUrl: opts.ws,
      },
    });
    await stagehand.init();
    try {
      // Pass the instance through: click/fill/select need stagehand.act and
      // would otherwise always fail with "Stagehand instance not available".
      return await executeCommand(stagehand.context, command, args, stagehand);
    } finally {
      await stagehand.close();
    }
  }

  await ensureDaemon(session, headless);
  return sendCommand(session, command, args, headless);
}
/**
 * Parse a numeric CLI argument, failing loudly instead of forwarding NaN.
 *
 * @param raw - Text as received from commander.
 * @param label - Name used in the error message.
 * @param integer - Parse as a base-10 integer instead of a float.
 */
function parseNumberArg(raw: string, label: string, integer = false): number {
  const value = integer ? Number.parseInt(raw, 10) : Number.parseFloat(raw);
  if (!Number.isFinite(value)) {
    throw new Error(`Invalid ${label}: ${raw}`);
  }
  return value;
}

/**
 * Shared action body for simple commands: build the args, run the command,
 * print the result; on failure print `Error: ...` and exit with status 1.
 * Args are built inside the try block so parse errors are reported the same way.
 */
async function runAndPrint(
  command: string,
  buildArgs: () => unknown[] = () => [],
): Promise<void> {
  const opts = program.opts<GlobalOpts>();
  try {
    const result = await runCommand(command, buildArgs());
    output(result, opts.json ?? false);
  } catch (e) {
    console.error("Error:", e instanceof Error ? e.message : e);
    process.exit(1);
  }
}

// NOTE(review): the <placeholder> names below were missing from the text
// this was reconstructed from and are inferred from the action signatures;
// they only affect help output, but confirm against the published CLI docs.
program
  .name("browse")
  .description("Browser automation CLI for AI agents")
  .version(VERSION)
  .option(
    "--ws <url>",
    "CDP WebSocket URL (bypasses daemon, direct connection)",
  )
  .option("--headless", "Run Chrome in headless mode")
  .option("--headed", "Run Chrome with visible window (default)")
  .option("--json", "Output as JSON", false)
  .option(
    "--session <name>",
    "Session name for multiple browsers (or use BROWSE_SESSION env var)",
  );

// ==================== DAEMON COMMANDS ====================

program
  .command("start")
  .description("Start browser daemon (auto-started by other commands)")
  .action(async () => {
    const opts = program.opts<GlobalOpts>();
    const session = getSession(opts);
    if (await isDaemonRunning(session)) {
      console.log(JSON.stringify({ status: "already running", session }));
      return;
    }
    await ensureDaemon(session, isHeadless(opts));
    console.log(JSON.stringify({ status: "started", session }));
  });

program
  .command("stop")
  .description("Stop browser daemon")
  .option("--force", "Force kill Chrome processes if daemon is unresponsive")
  .action(async (cmdOpts) => {
    const opts = program.opts<GlobalOpts>();
    const session = getSession(opts);

    // Clear any explicit env override so next start uses env var detection
    try {
      await fs.unlink(getModeOverridePath(session));
    } catch {
      // No override file to remove.
    }

    try {
      await sendCommand(session, "stop", []);
      console.log(JSON.stringify({ status: "stopped", session }));
    } catch {
      if (cmdOpts.force) {
        await killChromeProcesses(session);
        await cleanupStaleFiles(session);
        console.log(JSON.stringify({ status: "force stopped", session }));
      } else {
        console.log(JSON.stringify({ status: "not running", session }));
      }
    }
  });

program
  .command("status")
  .description("Check daemon status")
  .action(async () => {
    const opts = program.opts<GlobalOpts>();
    const session = getSession(opts);
    const running = await isDaemonRunning(session);
    let wsUrl: string | null = null;
    let mode: BrowseMode | null = null;
    if (running) {
      try {
        wsUrl = await fs.readFile(getWsPath(session), "utf-8");
      } catch {
        // The ws file is optional; report null when it is absent.
      }
      mode = await readCurrentMode(session);
    }
    console.log(JSON.stringify({ running, session, wsUrl, mode }));
  });

program
  .command("env [target]")
  .description("Show or switch browser environment (local | remote)")
  .action(async (target?: string) => {
    const opts = program.opts<GlobalOpts>();
    const session = getSession(opts);

    // No target: report the current and desired modes.
    if (!target) {
      let mode: string | null = null;
      const desiredMode = await getDesiredMode(session);
      if (await isDaemonRunning(session)) {
        mode = toModeTarget((await readCurrentMode(session)) ?? desiredMode);
      }
      console.log(
        JSON.stringify({
          mode: mode ?? "not running",
          desired: toModeTarget(desiredMode),
          session,
        }),
      );
      return;
    }

    const modeMap: Record<string, BrowseMode> = {
      local: "local",
      remote: "browserbase",
    };
    const mapped = modeMap[target];
    if (!mapped) {
      console.error("Usage: browse env [local|remote]");
      process.exit(1);
    }

    try {
      assertModeSupported(mapped);
    } catch (err) {
      console.error(err instanceof Error ? err.message : String(err));
      process.exit(1);
    }

    await fs.writeFile(getModeOverridePath(session), mapped);

    if (await isDaemonRunning(session)) {
      const currentMode = (await readCurrentMode(session)) ?? "local";
      if (currentMode === mapped) {
        console.log(
          JSON.stringify({
            mode: toModeTarget(mapped),
            session,
            restarted: false,
          }),
        );
        return;
      }
      await stopDaemonAndCleanup(session);
    }

    await ensureDaemon(session, isHeadless(opts));
    console.log(
      JSON.stringify({
        mode: toModeTarget(mapped),
        session,
        restarted: true,
      }),
    );
  });

program
  .command("refs")
  .description("Show cached ref map from last snapshot")
  .action(() => runAndPrint("refs"));

program
  .command("daemon")
  .description("Run as daemon (internal use)")
  .action(async () => {
    const opts = program.opts<GlobalOpts>();
    await runDaemon(getSession(opts), isHeadless(opts));
  });

// ==================== NAVIGATION ====================

program
  .command("open <url>")
  .alias("goto")
  .description("Navigate to URL")
  .option(
    "--wait <state>",
    "Wait state: load, domcontentloaded, networkidle",
    "load",
  )
  .option("-t, --timeout <ms>", "Navigation timeout in milliseconds", "30000")
  .option(
    "--context-id <id>",
    "Browserbase context ID to load browser state (remote mode only)",
  )
  .option(
    "--persist",
    "Persist context changes back after session ends (requires --context-id)",
    false,
  )
  .action(async (url: string, cmdOpts) => {
    const opts = program.opts<GlobalOpts>();
    try {
      // Validate context flags
      if (cmdOpts.persist && !cmdOpts.contextId) {
        console.error("Error: --persist requires --context-id");
        process.exit(1);
      }

      const session = getSession(opts);

      if (cmdOpts.contextId) {
        // Contexts only work with Browserbase remote sessions
        const desiredMode = await getDesiredMode(session);
        if (desiredMode === "local") {
          console.error(
            "Error: --context-id is only supported in remote mode. Run `browse env remote` first.",
          );
          process.exit(1);
        }

        const newConfig = JSON.stringify({
          id: cmdOpts.contextId,
          persist: cmdOpts.persist ?? false,
        });

        // If daemon is already running with a different context, restart it
        // (context is baked into the Browserbase session at creation time)
        if (await isDaemonRunning(session)) {
          let currentConfig: string | null = null;
          try {
            currentConfig = await fs.readFile(getContextPath(session), "utf-8");
          } catch {
            // No context file yet: treated as a different context.
          }
          if (currentConfig !== newConfig) {
            await stopDaemonAndCleanup(session);
          }
        }

        await fs.writeFile(getContextPath(session), newConfig);
      } else {
        // No --context-id: clear any stale context file so the daemon starts clean
        try {
          await fs.unlink(getContextPath(session));
        } catch {
          // Nothing to clear.
        }
      }

      const result = await runCommand("open", [
        url,
        cmdOpts.wait,
        parseNumberArg(cmdOpts.timeout, "timeout", true),
      ]);
      output(result, opts.json ?? false);
    } catch (e) {
      console.error("Error:", e instanceof Error ? e.message : e);
      process.exit(1);
    }
  });

program
  .command("reload")
  .description("Reload current page")
  .action(() => runAndPrint("reload"));

program
  .command("back")
  .description("Go back in history")
  .action(() => runAndPrint("back"));

program
  .command("forward")
  .description("Go forward in history")
  .action(() => runAndPrint("forward"));

// ==================== CLICK ACTIONS ====================

program
  .command("click <ref>")
  .description("Click element by ref (e.g., @0-5, 0-5, or CSS/XPath selector)")
  .option("-b, --button <button>", "Mouse button: left, right, middle", "left")
  .option("-c, --count <n>", "Click count", "1")
  .option(
    "-f, --force",
    "Force click even if element has no layout (uses synthetic event)",
  )
  .action((ref: string, cmdOpts) =>
    runAndPrint("click", () => [
      ref,
      {
        button: cmdOpts.button,
        clickCount: parseNumberArg(cmdOpts.count, "click count", true),
        force: cmdOpts.force,
      },
    ]),
  );

program
  .command("click_xy <x> <y>")
  .description("Click at exact coordinates")
  .option("-b, --button <button>", "Mouse button: left, right, middle", "left")
  .option("-c, --count <n>", "Click count", "1")
  .option("--xpath", "Return XPath of clicked element")
  .action((x: string, y: string, cmdOpts) =>
    runAndPrint("click_xy", () => [
      parseNumberArg(x, "x"),
      parseNumberArg(y, "y"),
      {
        button: cmdOpts.button,
        clickCount: parseNumberArg(cmdOpts.count, "click count", true),
        returnXPath: cmdOpts.xpath,
      },
    ]),
  );

// ==================== COORDINATE ACTIONS ====================

program
  .command("hover <x> <y>")
  .description("Hover at coordinates")
  .option("--xpath", "Return XPath of hovered element")
  .action((x: string, y: string, cmdOpts) =>
    runAndPrint("hover", () => [
      parseNumberArg(x, "x"),
      parseNumberArg(y, "y"),
      { returnXPath: cmdOpts.xpath },
    ]),
  );

program
  .command("scroll <x> <y> <deltaX> <deltaY>")
  .description("Scroll at coordinates")
  .option("--xpath", "Return XPath of scrolled element")
  .action((x: string, y: string, dx: string, dy: string, cmdOpts) =>
    runAndPrint("scroll", () => [
      parseNumberArg(x, "x"),
      parseNumberArg(y, "y"),
      parseNumberArg(dx, "deltaX"),
      parseNumberArg(dy, "deltaY"),
      { returnXPath: cmdOpts.xpath },
    ]),
  );

program
  .command("drag <fromX> <fromY> <toX> <toY>")
  .description("Drag from one point to another")
  .option("-b, --button <button>", "Mouse button: left, right, middle", "left")
  .option("--steps <n>", "Number of intermediate drag steps", "10")
  .option("--delay <ms>", "Delay between drag steps in milliseconds", "0")
  .option("--xpath", "Return XPath of source and target elements")
  .action((fx: string, fy: string, tx: string, ty: string, cmdOpts) =>
    runAndPrint("drag", () => [
      parseNumberArg(fx, "fromX"),
      parseNumberArg(fy, "fromY"),
      parseNumberArg(tx, "toX"),
      parseNumberArg(ty, "toY"),
      {
        button: cmdOpts.button,
        steps: parseNumberArg(cmdOpts.steps, "steps", true),
        delay: parseNumberArg(cmdOpts.delay, "delay", true),
        returnXPath: cmdOpts.xpath,
      },
    ]),
  );

// ==================== KEYBOARD ====================

program
  .command("type <text>")
  .description("Type text")
  .option("-d, --delay <ms>", "Delay between keystrokes")
  .option("--mistakes", "Enable human-like typing with mistakes")
  .action((text: string, cmdOpts) =>
    runAndPrint("type", () => [
      text,
      {
        delay: cmdOpts.delay
          ? parseNumberArg(cmdOpts.delay, "delay", true)
          : undefined,
        mistakes: cmdOpts.mistakes,
      },
    ]),
  );

program
  .command("press <key>")
  .alias("key")
  .description("Press key (e.g., Enter, Tab, Escape, Cmd+A)")
  .action((key: string) => runAndPrint("press", () => [key]));

// ==================== ELEMENT ACTIONS ====================

program
  .command("fill <selector> <value>")
  .description("Fill input element (presses Enter by default)")
  .option("--no-press-enter", "Don't press Enter after filling")
  .action((selector: string, value: string, cmdOpts) =>
    runAndPrint("fill", () => [
      selector,
      value,
      // commander sets pressEnter to false only when --no-press-enter is given.
      { pressEnter: cmdOpts.pressEnter !== false },
    ]),
  );

program
  .command("select <selector> <values...>")
  .description("Select option(s)")
  .action((selector: string, values: string[]) =>
    runAndPrint("select", () => [selector, values]),
  );

program
  .command("highlight <selector>")
  .description("Highlight element")
  .option("-d, --duration <ms>", "Duration", "2000")
  .action((selector: string, cmdOpts) =>
    runAndPrint("highlight", () => [
      selector,
      parseNumberArg(cmdOpts.duration, "duration", true),
    ]),
  );

// ==================== PAGE INFO ====================

program
  .command("get <what> [selector]")
  .description(
    "Get page info: url, title, text, html, value, box, visible, checked",
  )
  .action((what: string, selector?: string) =>
    runAndPrint("get", () => [what, selector]),
  );

// ==================== SCREENSHOT ====================

program
  .command("screenshot [path]")
  .description("Take screenshot")
  .option("-f, --full-page", "Full page screenshot")
  .option("-t, --type <type>", "Image type: png, jpeg", "png")
  .option("-q, --quality <n>", "JPEG quality (0-100)")
  .option("--clip <json>", "Clip region as JSON")
  .option("--no-animations", "Disable animations")
  .option("--hide-caret", "Hide text caret")
  .action((filePath: string | undefined, cmdOpts) =>
    runAndPrint("screenshot", () => [
      {
        path: filePath,
        fullPage: cmdOpts.fullPage,
        type: cmdOpts.type,
        quality: cmdOpts.quality
          ? parseNumberArg(cmdOpts.quality, "quality", true)
          : undefined,
        clip: cmdOpts.clip ? JSON.parse(cmdOpts.clip) : undefined,
        animations: cmdOpts.animations === false ? "disabled" : "allow",
        caret: cmdOpts.hideCaret ? "hide" : "initial",
      },
    ]),
  );
false); } } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); // ==================== VIEWPORT ==================== program .command("viewport ") .description("Set viewport size") .option("-s, --scale ", "Device scale factor", "1") .action(async (w: string, h: string, cmdOpts) => { const opts = program.opts(); try { const result = await runCommand("viewport", [ parseInt(w), parseInt(h), parseFloat(cmdOpts.scale), ]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); // ==================== EVAL ==================== program .command("eval ") .description("Evaluate JavaScript in page") .action(async (expr: string) => { const opts = program.opts(); try { const result = await runCommand("eval", [expr]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); // ==================== WAIT ==================== program .command("wait [arg]") .description("Wait for: load, selector, timeout") .option("-t, --timeout ", "Timeout", "30000") .option( "-s, --state ", "Element state: visible, hidden, attached, detached", "visible", ) .action(async (type: string, arg: string | undefined, cmdOpts) => { const opts = program.opts(); try { const result = await runCommand("wait", [ type, arg, { timeout: parseInt(cmdOpts.timeout), state: cmdOpts.state }, ]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); // ==================== ELEMENT STATE CHECKS ==================== program .command("is ") .description("Check element state: visible, checked") .action(async (check: string, selector: string) => { const opts = program.opts(); try { const result = await runCommand("is", [check, selector]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? 
e.message : e); process.exit(1); } }); // ==================== CURSOR ==================== program .command("cursor") .description("Enable visual cursor overlay") .action(async () => { const opts = program.opts(); try { const result = await runCommand("cursor", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); // ==================== MULTI-PAGE ==================== program .command("pages") .description("List all open pages") .action(async () => { const opts = program.opts(); try { const result = await runCommand("pages", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); program .command("newpage [url]") .description("Create a new page/tab") .action(async (url?: string) => { const opts = program.opts(); try { const result = await runCommand("newpage", [url]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); program .command("tab_switch ") .alias("switch") .description("Switch to tab by index") .action(async (index: string) => { const opts = program.opts(); try { const result = await runCommand("tab_switch", [parseInt(index)]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); program .command("tab_close [index]") .alias("close") .description("Close tab by index (defaults to last tab)") .action(async (index?: string) => { const opts = program.opts(); try { const result = await runCommand("tab_close", [ index ? parseInt(index) : undefined, ]); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? 
e.message : e); process.exit(1); } }); // ==================== NETWORK CAPTURE ==================== const networkCmd = program .command("network") .description( "Network capture commands (writes to filesystem for agent inspection)", ); networkCmd .command("on") .description("Enable network capture (creates temp directory for requests)") .action(async () => { const opts = program.opts(); try { const result = await runCommand("network_enable", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); networkCmd .command("off") .description("Disable network capture") .action(async () => { const opts = program.opts(); try { const result = await runCommand("network_disable", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); networkCmd .command("path") .description("Get network capture directory path") .action(async () => { const opts = program.opts(); try { const result = await runCommand("network_path", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? e.message : e); process.exit(1); } }); networkCmd .command("clear") .description("Clear all captured requests") .action(async () => { const opts = program.opts(); try { const result = await runCommand("network_clear", []); output(result, opts.json ?? false); } catch (e) { console.error("Error:", e instanceof Error ? 
e.message : e);
      process.exit(1);
    }
  });

// ==================== RUN ====================

program.parse();

================================================
FILE: packages/cli/tests/cli.test.ts
================================================
/**
 * Browse CLI Tests
 *
 * Comprehensive test suite covering:
 * - Daemon lifecycle
 * - Navigation commands
 * - Actions (click, type, fill)
 * - Information retrieval (snapshot, screenshot, get)
 * - Multi-tab operations
 * - Network capture
 * - Error handling
 */

import { describe, it, expect, beforeAll, afterAll, afterEach } from "vitest";
import { exec } from "child_process";
import * as fs from "fs/promises";
import * as path from "path";
import * as os from "os";

// CLI executable path - use the built dist for testing (daemon spawns via process.argv[0])
const CLI_PATH = path.join(__dirname, "../dist/index.js");

// Test session name to avoid conflicts
const TEST_SESSION = `test-${Date.now()}`;

/**
 * Runs the built CLI in a child process against a headless test session.
 *
 * Never rejects: failures are reported through `exitCode` / `stderr` so tests
 * can assert on them.
 *
 * @param args - Raw argument string appended after the global flags.
 * @param options - `timeout` in ms (default 30000) and `session` name
 *   (default `TEST_SESSION`).
 * @returns Trimmed stdout/stderr and the exit code. A child that did not exit
 *   with a numeric status (killed by the timeout, spawn failure) is reported
 *   as exit code 1 rather than 0.
 */
async function browse(
  args: string,
  options: { timeout?: number; session?: string } = {},
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  const session = options.session ?? TEST_SESSION;
  const timeout = options.timeout ?? 30000;

  return new Promise((resolve) => {
    const fullArgs = `node ${CLI_PATH} --headless --session ${session} ${args}`;
    exec(fullArgs, { timeout }, (error, stdout, stderr) => {
      // `error.code` is the numeric exit status only when the child exited on
      // its own. When it is killed by a signal (e.g. the `timeout` option)
      // the code is null, which `?? 0` used to turn into a false "success".
      let exitCode = 0;
      if (error) {
        exitCode = typeof error.code === "number" ? error.code : 1;
      }
      resolve({
        stdout: stdout.trim(),
        stderr: stderr.trim(),
        exitCode,
      });
    });
  });
}

/**
 * Parses CLI stdout as JSON.
 *
 * @param output - Raw stdout text.
 * @returns The parsed value, typed as `T` (not validated at runtime).
 * @throws Error that includes the raw output when it is not valid JSON.
 */
function parseJson<T = Record<string, unknown>>(output: string): T {
  try {
    return JSON.parse(output) as T;
  } catch {
    throw new Error(`Failed to parse JSON: ${output}`);
  }
}

/**
 * Best-effort removal of the temp files and network capture directory that
 * belong to `session`; missing entries are ignored on purpose.
 */
async function cleanupSession(session: string): Promise<void> {
  const tmpDir = os.tmpdir();
  const patterns = [
    `browse-${session}.sock`,
    `browse-${session}.pid`,
    `browse-${session}.ws`,
    `browse-${session}.chrome.pid`,
    `browse-${session}.mode`,
    `browse-${session}.mode-override`,
  ];

  for (const pattern of patterns) {
    try {
      await fs.unlink(path.join(tmpDir, pattern));
    } catch {}
  }

  // Clean network dir
  try {
    await fs.rm(path.join(tmpDir, `browse-${session}-network`), {
      recursive: true,
    });
  } catch {}
}

describe("Browse CLI", () => {
  // Cleanup before and after all tests
  beforeAll(async () => {
    await cleanupSession(TEST_SESSION);
  });

  afterAll(async () => {
    // Stop daemon if running
    await browse("stop --force");
    await cleanupSession(TEST_SESSION);
  });

  describe("Daemon Lifecycle", () => {
    afterEach(async () => {
      await browse("stop --force");
    });

    it("should start daemon on first command", async () => {
      const result = await browse("status");
      const data = parseJson(result.stdout);

      // Initially not running
      expect(data.running).toBe(false);

      // Start via command
      const startResult = await browse("start");
      expect(startResult.stdout).toContain("started");

      // Now should be running
      const statusResult = await browse("status");
      const statusData = parseJson(statusResult.stdout);
      expect(statusData.running).toBe(true);
    });

    it("should stop daemon gracefully", async () => {
      await browse("start");

      const stopResult = await browse("stop");
      const data = parseJson(stopResult.stdout);
      expect(data.status).toBe("stopped");

      // Verify stopped
      const statusResult = await browse("status");
      const statusData = parseJson(statusResult.stdout);
      expect(statusData.running).toBe(false);
    });

    it("should force stop unresponsive daemon", async () => {
      await browse("start");

      const result = await browse("stop
--force"); const data = parseJson(result.stdout); expect(["stopped", "force stopped", "not running"]).toContain( data.status, ); }); it("should support multiple sessions", async () => { const session1 = `${TEST_SESSION}-1`; const session2 = `${TEST_SESSION}-2`; try { // Start both sessions await browse("start", { session: session1 }); await browse("start", { session: session2 }); // Both should be running const status1 = parseJson( (await browse("status", { session: session1 })).stdout, ); const status2 = parseJson( (await browse("status", { session: session2 })).stdout, ); expect(status1.running).toBe(true); expect(status2.running).toBe(true); } finally { await browse("stop --force", { session: session1 }); await browse("stop --force", { session: session2 }); await cleanupSession(session1); await cleanupSession(session2); } }); }); describe("Navigation", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should navigate to URL", async () => { const result = await browse("open https://example.com"); const data = parseJson(result.stdout); expect(data.url).toContain("example.com"); }); it("should get current URL", async () => { await browse("open https://example.com"); const result = await browse("get url"); const data = parseJson(result.stdout); expect(data.url).toContain("example.com"); }); it("should get page title", async () => { await browse("open https://example.com"); const result = await browse("get title"); const data = parseJson(result.stdout); expect(data.title).toBeTruthy(); }); it("should reload page", async () => { await browse("open https://example.com"); const result = await browse("reload"); const data = parseJson(result.stdout); expect(data.url).toContain("example.com"); }); }); describe("Snapshot", () => { beforeAll(async () => { await browse("start"); await browse("open https://example.com"); }); afterAll(async () => { await browse("stop --force"); }); it("should take snapshot 
with refs", async () => { const result = await browse("snapshot"); const data = parseJson(result.stdout); expect(data.tree).toBeTruthy(); expect(data.xpathMap).toBeTruthy(); expect(typeof data.xpathMap).toBe("object"); }); it("should take compact snapshot", async () => { const result = await browse("snapshot -c"); // Compact mode outputs tree directly (not JSON when not --json) expect(result.stdout).toContain("RootWebArea"); }); it("should populate refs for subsequent commands", async () => { await browse("snapshot"); const refsResult = await browse("refs"); const data = parseJson(refsResult.stdout); expect(data.count).toBeGreaterThan(0); expect(data.xpathMap).toBeTruthy(); }); }); describe("Screenshot", () => { const screenshotPath = path.join( os.tmpdir(), `browse-test-${Date.now()}.png`, ); beforeAll(async () => { await browse("start"); await browse("open https://example.com"); }); afterAll(async () => { await browse("stop --force"); try { await fs.unlink(screenshotPath); } catch {} }); it("should take screenshot and return base64", async () => { const result = await browse("screenshot"); const data = parseJson<{ base64: string }>(result.stdout); expect(data.base64).toBeTruthy(); expect(data.base64.length).toBeGreaterThan(100); }); it("should save screenshot to file", async () => { const result = await browse(`screenshot ${screenshotPath}`); const data = parseJson(result.stdout); expect(data.saved).toBe(screenshotPath); // Verify file exists const stat = await fs.stat(screenshotPath); expect(stat.size).toBeGreaterThan(0); }); }); describe("Actions", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should click by coordinates", async () => { await browse("open https://example.com"); const result = await browse("click_xy 100 100"); const data = parseJson(result.stdout); expect(data.clicked).toBe(true); }); it("should click by ref after snapshot", async () => { await browse("open 
https://example.com"); await browse("snapshot"); // Find a clickable ref const refsResult = await browse("refs"); const refs = parseJson<{ count: number; xpathMap: Record; }>(refsResult.stdout); if (refs.count > 0) { const firstRef = Object.keys(refs.xpathMap)[0]; const result = await browse(`click @${firstRef}`); const data = parseJson(result.stdout); expect(data.clicked).toBe(true); } }); it("should type text", async () => { await browse("open https://example.com"); const result = await browse('type "Hello World"'); const data = parseJson(result.stdout); expect(data.typed).toBe(true); }); it("should press keys", async () => { await browse("open https://example.com"); const result = await browse("press Tab"); const data = parseJson(result.stdout); expect(data.pressed).toBe("Tab"); }); it("should hover at coordinates", async () => { await browse("open https://example.com"); const result = await browse("hover 200 200"); const data = parseJson(result.stdout); expect(data.hovered).toBe(true); }); it("should scroll", async () => { await browse("open https://example.com"); const result = await browse("scroll 400 400 0 100"); const data = parseJson(result.stdout); expect(data.scrolled).toBe(true); }); it("should drag and drop between coordinates", async () => { const html = `
Not dropped
`; const dataUrl = `data:text/html,${encodeURIComponent(html)}`; await browse(`open "${dataUrl}"`); const dragResult = await browse("drag 80 80 310 100 --steps 8 --xpath"); const dragData = parseJson(dragResult.stdout); expect(dragData.dragged).toBe(true); expect(typeof dragData.fromXpath).toBe("string"); expect(typeof dragData.toXpath).toBe("string"); const statusResult = await browse( 'eval "document.getElementById(\\"status\\").textContent"', ); const statusData = parseJson(statusResult.stdout); expect(statusData.result).toBe("Dropped"); }); }); describe("Multi-Tab", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should list pages", async () => { await browse("open https://example.com"); const result = await browse("pages"); const data = parseJson<{ pages: { index: number; url: string }[] }>( result.stdout, ); expect(data.pages).toBeInstanceOf(Array); expect(data.pages.length).toBeGreaterThan(0); expect(data.pages[0]).toHaveProperty("index"); expect(data.pages[0]).toHaveProperty("url"); }); it("should create new page", async () => { const beforeResult = await browse("pages"); const beforeData = parseJson<{ pages: unknown[] }>(beforeResult.stdout); const beforeCount = beforeData.pages.length; const newResult = await browse("newpage https://github.com"); const newData = parseJson(newResult.stdout); expect(newData.created).toBe(true); const afterResult = await browse("pages"); const afterData = parseJson<{ pages: unknown[] }>(afterResult.stdout); expect(afterData.pages.length).toBe(beforeCount + 1); }); it("should switch tabs", async () => { await browse("open https://example.com"); await browse("newpage https://github.com"); const result = await browse("tab_switch 0"); const data = parseJson(result.stdout); expect(data.switched).toBe(true); expect(data.index).toBe(0); }); it("should close tab", async () => { await browse("open https://example.com"); await browse("newpage https://github.com"); 
const beforeResult = await browse("pages"); const beforeCount = parseJson<{ pages: unknown[] }>(beforeResult.stdout) .pages.length; const closeResult = await browse("tab_close"); const closeData = parseJson(closeResult.stdout); expect(closeData.closed).toBe(true); const afterResult = await browse("pages"); const afterCount = parseJson<{ pages: unknown[] }>(afterResult.stdout) .pages.length; expect(afterCount).toBe(beforeCount - 1); }); }); describe("Waiting", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should wait for timeout", async () => { await browse("open https://example.com"); const start = Date.now(); const result = await browse("wait timeout 500"); const elapsed = Date.now() - start; const data = parseJson(result.stdout); expect(data.waited).toBe(true); expect(elapsed).toBeGreaterThanOrEqual(450); }); it("should wait for load state", async () => { await browse("open https://example.com"); const result = await browse("wait load"); const data = parseJson(result.stdout); expect(data.waited).toBe(true); }); }); describe("Network Capture", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should enable network capture", async () => { const result = await browse("network on"); const data = parseJson(result.stdout); expect(data.enabled).toBe(true); expect(data.path).toBeTruthy(); }); it("should return network path", async () => { await browse("network on"); const result = await browse("network path"); const data = parseJson(result.stdout); expect(data.path).toBeTruthy(); expect(data.enabled).toBe(true); }); it("should capture requests to filesystem", async () => { await browse("network on"); const pathResult = await browse("network path"); const networkDir = parseJson<{ path: string }>(pathResult.stdout).path; // Navigate to trigger requests await browse("open https://example.com"); // Wait for requests to be written 
await browse("wait timeout 1000"); // Check if directory has content try { const entries = await fs.readdir(networkDir); // May or may not have captured requests depending on timing expect(Array.isArray(entries)).toBe(true); } catch { // Directory may not exist if no requests captured } }); it("should disable network capture", async () => { await browse("network on"); const result = await browse("network off"); const data = parseJson(result.stdout); expect(data.enabled).toBe(false); }); it("should clear network captures", async () => { await browse("network on"); await browse("open https://example.com"); await browse("wait timeout 500"); const result = await browse("network clear"); const data = parseJson(result.stdout); expect(data.cleared).toBe(true); }); }); describe("Viewport", () => { beforeAll(async () => { await browse("start"); await browse("open https://example.com"); }); afterAll(async () => { await browse("stop --force"); }); it("should set viewport size", async () => { const result = await browse("viewport 1920 1080"); const data = parseJson<{ viewport: { width: number; height: number } }>( result.stdout, ); expect(data.viewport.width).toBe(1920); expect(data.viewport.height).toBe(1080); }); }); describe("Eval", () => { beforeAll(async () => { await browse("start"); await browse("open https://example.com"); }); afterAll(async () => { await browse("stop --force"); }); it("should evaluate JavaScript", async () => { const result = await browse('eval "document.title"'); const data = parseJson(result.stdout); expect(data.result).toBeTruthy(); }); it("should return computed values", async () => { const result = await browse('eval "1 + 1"'); const data = parseJson(result.stdout); expect(data.result).toBe(2); }); }); describe("Error Handling", () => { beforeAll(async () => { await browse("start"); }); afterAll(async () => { await browse("stop --force"); }); it("should error on invalid ref", async () => { await browse("open https://example.com"); // Don't run 
snapshot, so refs are empty const result = await browse("click @99-99"); expect(result.stderr).toContain("Error"); }); it("should error on unknown command", async () => { const result = await browse("nonexistent"); expect(result.exitCode).not.toBe(0); }); }); }); ================================================ FILE: packages/cli/tests/mode.test.ts ================================================ import { describe, it, expect, afterEach } from "vitest"; import { exec } from "child_process"; import { promises as fs } from "fs"; import * as path from "path"; import * as os from "os"; const CLI_PATH = path.join(__dirname, "../dist/index.js"); const TEST_SESSION = `env-test-${Date.now()}`; async function browse( args: string, options: { timeout?: number; env?: NodeJS.ProcessEnv } = {}, ): Promise<{ stdout: string; stderr: string; exitCode: number }> { const timeout = options.timeout ?? 30000; const env = { ...process.env, ...options.env }; return new Promise((resolve) => { const fullArgs = `node ${CLI_PATH} --headless --session ${TEST_SESSION} ${args}`; exec(fullArgs, { timeout, env }, (error, stdout, stderr) => { resolve({ stdout: stdout.trim(), stderr: stderr.trim(), exitCode: error?.code ?? 0, }); }); }); } function parseJson>(output: string): T { try { return JSON.parse(output) as T; } catch { throw new Error(`Failed to parse JSON: ${output}`); } } async function cleanupSession(session: string): Promise { const tmpDir = os.tmpdir(); const patterns = [ `browse-${session}.sock`, `browse-${session}.pid`, `browse-${session}.ws`, `browse-${session}.chrome.pid`, `browse-${session}.mode`, `browse-${session}.mode-override`, ]; for (const pattern of patterns) { try { await fs.unlink(path.join(tmpDir, pattern)); } catch { // Ignore missing files. } } try { await fs.rm(path.join(tmpDir, `browse-${session}-network`), { recursive: true, }); } catch { // Ignore missing directory. 
} } describe("Browse CLI env command", () => { afterEach(async () => { await browse("stop --force"); await cleanupSession(TEST_SESSION); }); it("shows desired env even when daemon is not running", async () => { const result = await browse("env"); expect(result.exitCode).toBe(0); const data = parseJson(result.stdout); expect(data.mode).toBe("not running"); expect(["local", "remote"]).toContain(data.desired); }); it("rejects unsupported env target", async () => { const result = await browse("env invalid-target"); expect(result.exitCode).not.toBe(0); expect(result.stderr).toContain("Usage: browse env [local|remote]"); }); it("rejects remote env without Browserbase credentials", async () => { const result = await browse("env remote", { env: { ...process.env, BROWSERBASE_API_KEY: "", }, }); expect(result.exitCode).not.toBe(0); expect(result.stderr).toContain("Remote mode requires BROWSERBASE_API_KEY"); }); }); ================================================ FILE: packages/cli/tsconfig.json ================================================ { "extends": "../../tsconfig.base.json", "compilerOptions": { "moduleResolution": "bundler", "strict": true, "outDir": "./dist", "rootDir": ".", "types": ["node"] }, "include": ["src/**/*", "tests/**/*"], "exclude": ["node_modules", "dist"] } ================================================ FILE: packages/cli/tsup.config.ts ================================================ import { defineConfig } from "tsup"; export default defineConfig({ entry: ["src/index.ts"], format: ["cjs"], target: "node20", clean: true, shims: true, banner: { js: "#!/usr/bin/env node", }, // Bundle everything possible, only externalize what truly can't be bundled noExternal: [/@browserbasehq\/stagehand/], external: [ // Browser automation - user must install playwright to use the CLI "playwright", "playwright-core", // CJS packages with dynamic requires that break in ESM bundles "pino", "pino-pretty", "ws", "dotenv", ], }); 
================================================
FILE: packages/cli/vitest.config.ts
================================================
import { defineConfig } from "vitest/config";

// Vitest configuration for the CLI package's end-to-end tests.
export default defineConfig({
  test: {
    globals: true,
    // Generous test and hook timeouts: the suites drive a real CLI process
    // (see tests/*.test.ts).
    testTimeout: 60000,
    hookTimeout: 60000,
    // Only pick up test files under tests/.
    include: ["tests/**/*.test.ts"],
    // Run tests sequentially since they share browser state
    pool: "forks",
    poolOptions: {
      forks: {
        singleFork: true,
      },
    },
  },
});

================================================ FILE: packages/core/CHANGELOG.md ================================================ # @browserbasehq/stagehand ## 3.2.0 ### Minor Changes - [#1779](https://github.com/browserbase/stagehand/pull/1779) [`2f43ffa`](https://github.com/browserbase/stagehand/commit/2f43ffac11778152d17e4c44405770cc32c3ec8c) Thanks [@shrey150](https://github.com/shrey150)! - feat: add `cdpHeaders` option to `localBrowserLaunchOptions` for passing custom HTTP headers when connecting to an existing browser via CDP URL - [#1834](https://github.com/browserbase/stagehand/pull/1834) [`63ee247`](https://github.com/browserbase/stagehand/commit/63ee247ac6bf2992046d4f6b2759f46b15643e36) Thanks [@tkattkat](https://github.com/tkattkat)! - Update stagehand agents search tool - [#1774](https://github.com/browserbase/stagehand/pull/1774) [`521a10e`](https://github.com/browserbase/stagehand/commit/521a10e3698fc5631e219947bc90dad0f8bddaa8) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add new page.setExtraHTTPHeaders() method ### Patch Changes - [#1759](https://github.com/browserbase/stagehand/pull/1759) [`505e8c6`](https://github.com/browserbase/stagehand/commit/505e8c6736f3706328dbc8df670c49a018058388) Thanks [@shrey150](https://github.com/shrey150)! - Add bedrock to the provider enum in model configuration schemas and regenerate OpenAPI spec.
- [#1814](https://github.com/browserbase/stagehand/pull/1814) [`7dc35f5`](https://github.com/browserbase/stagehand/commit/7dc35f5e25689e6518d68b25ef71536d2781c8aa) Thanks [@tkattkat](https://github.com/tkattkat)! - Change usage of openai provider in agent to default to store:false - [#1846](https://github.com/browserbase/stagehand/pull/1846) [`335cf47`](https://github.com/browserbase/stagehand/commit/335cf4730e73bce33e92331d04bda4b0fd42685d) Thanks [@aq17](https://github.com/aq17)! - Fix streaming finished event being silently dropped. The final SSE event containing the result payload (success status, message, actions, usage, and messages) was previously discarded instead of being yielded to the caller. - [#1764](https://github.com/browserbase/stagehand/pull/1764) [`6ba0a1d`](https://github.com/browserbase/stagehand/commit/6ba0a1db7fc2d5d5a2f8927b1417d8f1d15eda10) Thanks [@shrey150](https://github.com/shrey150)! - Expose `headers` in `GoogleVertexProviderSettings` so model configs can pass custom provider headers (for example `X-Goog-Priority`) without TypeScript errors. - [#1847](https://github.com/browserbase/stagehand/pull/1847) [`4ff3bb8`](https://github.com/browserbase/stagehand/commit/4ff3bb831a6ef6e2d57148e7afb68ea8d23e395d) Thanks [@miguelg719](https://github.com/miguelg719)! - Enable FlowLogger on BROWSERBASE_FLOW_LOGS=1 - [#1752](https://github.com/browserbase/stagehand/pull/1752) [`c27054b`](https://github.com/browserbase/stagehand/commit/c27054bbd0508431ade91d655f89efc87bbf5867) Thanks [@derekmeegan](https://github.com/derekmeegan)! - fix: pause Browserbase agents while captcha solving is active and improve CUA recovery after the solve completes - [#1800](https://github.com/browserbase/stagehand/pull/1800) [`2abf5b9`](https://github.com/browserbase/stagehand/commit/2abf5b90f1e2bb1442509ef3a686b6128c9cdcf6) Thanks [@shrey150](https://github.com/shrey150)! 
- Make projectId optional for Browserbase sessions — only BROWSERBASE_API_KEY is required - [#1766](https://github.com/browserbase/stagehand/pull/1766) [`7817fcc`](https://github.com/browserbase/stagehand/commit/7817fcc315eee4455ce04567cf56c9ec801caf0b) Thanks [@tkattkat](https://github.com/tkattkat)! - Add configurable timeout to tools in agent - [#1749](https://github.com/browserbase/stagehand/pull/1749) [`7390508`](https://github.com/browserbase/stagehand/commit/73905088c5ed5923d276da9cce2efd0a0a3a46eb) Thanks [@pirate](https://github.com/pirate)! - When connecting to a browser session that has zero open tabs, Stagehand now automatically creates an initial `about:blank` tab so the connection can continue. - [#1761](https://github.com/browserbase/stagehand/pull/1761) [`611f43a`](https://github.com/browserbase/stagehand/commit/611f43ac8d4c580216d55d2b217c14a9a9c11013) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where handlePossibleNavigation was producing unnecessary error logs on clicks that trigger page close - [#1817](https://github.com/browserbase/stagehand/pull/1817) [`2402a3c`](https://github.com/browserbase/stagehand/commit/2402a3c4d50270391b3e6440f4385cdcf5e1eb64) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for passing custom headers in clientOptions ## 3.1.0 ### Minor Changes - [#1681](https://github.com/browserbase/stagehand/pull/1681) [`e3db9aa`](https://github.com/browserbase/stagehand/commit/e3db9aa863f44270792215801fe6e3a02a1321aa) Thanks [@tkattkat](https://github.com/tkattkat)! - Add cookie management APIs: `context.addCookies()`, `context.clearCookies()`, & `context.cookies()` - [#1672](https://github.com/browserbase/stagehand/pull/1672) [`b65756e`](https://github.com/browserbase/stagehand/commit/b65756e9e85643055446aa4a51956f7d6627c89f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- add boolean keepAlive parameter to allow for configuring whether the browser should be closed when stagehand.close() is called. - [#1708](https://github.com/browserbase/stagehand/pull/1708) [`176d420`](https://github.com/browserbase/stagehand/commit/176d42002cc0a2c7d13b4c0ffbbd56b70fdc49e8) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add context.setExtraHTTPHeaders() - [#1611](https://github.com/browserbase/stagehand/pull/1611) [`8a3c066`](https://github.com/browserbase/stagehand/commit/8a3c06600a9ba98485db7e9ed5c3cc43ea180334) Thanks [@monadoid](https://github.com/monadoid)! - Using `mode` enum instead of old `cua` boolean in openapi spec ### Patch Changes - [#1683](https://github.com/browserbase/stagehand/pull/1683) [`7584f3e`](https://github.com/browserbase/stagehand/commit/7584f3e92e60a557d2b3e0e0d2a2af04c3527523) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: include shadow DOM in .count() & .nth() & support xpath predicates - [#1644](https://github.com/browserbase/stagehand/pull/1644) [`1e1c9c1`](https://github.com/browserbase/stagehand/commit/1e1c9c15773e49d5c3cd36021dbc1d23495c1bce) Thanks [@monadoid](https://github.com/monadoid)! - Fix unhandled CDP detaches by returning the original sendCDP promise - [#1729](https://github.com/browserbase/stagehand/pull/1729) [`6bef890`](https://github.com/browserbase/stagehand/commit/6bef89090ebd231e77d8092b2c32a0f06303d5a9) Thanks [@shrey150](https://github.com/shrey150)! - fix: support Claude 4.6 (Opus and Sonnet) in CUA mode by using the correct `computer_20251124` tool version and `computer-use-2025-11-24` beta header - [#1647](https://github.com/browserbase/stagehand/pull/1647) [`ffd4b33`](https://github.com/browserbase/stagehand/commit/ffd4b335a873d0f4dcd76ea22d44f47919bf8e49) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Fix [Agent] - Address bug causing issues with continuing a conversation from past messages in dom mode - [#1614](https://github.com/browserbase/stagehand/pull/1614) [`677bff5`](https://github.com/browserbase/stagehand/commit/677bff5834c879a2d95f7dbff918b8e1510516b3) Thanks [@miguelg719](https://github.com/miguelg719)! - Enforce - regex validation on act/observe for elementId - [#1580](https://github.com/browserbase/stagehand/pull/1580) [`65ff464`](https://github.com/browserbase/stagehand/commit/65ff464bc13388eb109eba0a2cf533c1cc202854) Thanks [@tkattkat](https://github.com/tkattkat)! - Add unified variables support across act and agent with a single VariableValue type - [#1666](https://github.com/browserbase/stagehand/pull/1666) [`101bcf2`](https://github.com/browserbase/stagehand/commit/101bcf2da8b527fd6ace6aa291ada5d0f2d90344) Thanks [@Kylejeong2](https://github.com/Kylejeong2)! - add support for codex models - [#1728](https://github.com/browserbase/stagehand/pull/1728) [`0a94301`](https://github.com/browserbase/stagehand/commit/0a94301caa991d1aa4cdade6e28a065b1aefb3e2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - handle potential race condition on `.close()` when using the Stagehand API - [#1664](https://github.com/browserbase/stagehand/pull/1664) [`b27c04d`](https://github.com/browserbase/stagehand/commit/b27c04d278c290364347acd0c354a878ea9b7c2d) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fixes issue with context.addInitScript() where scripts were not being applied to out of process iframes (OOPIFs), and popup pages with same process iframes (SPIFs) - [#1632](https://github.com/browserbase/stagehand/pull/1632) [`afbd08b`](https://github.com/browserbase/stagehand/commit/afbd08bb6367a9c9f65f67e453667987e4659918) Thanks [@pirate](https://github.com/pirate)! - Remove automatic `.env` loading via `dotenv`. 
If your app relies on `.env` files, install `dotenv` and load it explicitly in your code: ```ts import dotenv from "dotenv"; dotenv.config({ path: ".env" }); ``` - [#1624](https://github.com/browserbase/stagehand/pull/1624) [`0e8d569`](https://github.com/browserbase/stagehand/commit/0e8d5695f662040f7384e64f46301152802e3c62) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where screenshot masks were not being applied to dialog elements - [#1596](https://github.com/browserbase/stagehand/pull/1596) [`ff0f979`](https://github.com/browserbase/stagehand/commit/ff0f9795f3b2c1cf4f2610a80ebcb3341a24f987) Thanks [@tkattkat](https://github.com/tkattkat)! - Update usage/metrics handling in agent - [#1631](https://github.com/browserbase/stagehand/pull/1631) [`2d89d2b`](https://github.com/browserbase/stagehand/commit/2d89d2b35ce812431956b28e0c8b52d32ddc7a27) Thanks [@miguelg719](https://github.com/miguelg719)! - Add right and middle click support to act and observe - [#1697](https://github.com/browserbase/stagehand/pull/1697) [`aac9a19`](https://github.com/browserbase/stagehand/commit/aac9a19bdfbe62e4508631337ab0bfbcf8ae62b2) Thanks [@shrey150](https://github.com/shrey150)! - fix: support `<frame>` elements in XPath frame boundary detection so `act()` works on legacy `<frameset>` pages - [#1692](https://github.com/browserbase/stagehand/pull/1692) [`06de50f`](https://github.com/browserbase/stagehand/commit/06de50ff377fd31f1b0fcf79adb996d04562d2c0) Thanks [@shrey150](https://github.com/shrey150)! - fix: skip piercer injection for chrome-extension:// and other non-HTML targets - [#1613](https://github.com/browserbase/stagehand/pull/1613) [`aa4d981`](https://github.com/browserbase/stagehand/commit/aa4d981e440bdd0e3d3f42ccc310d5958aa25cc6) Thanks [@miguelg719](https://github.com/miguelg719)!
- SupportedUnderstudyAction Enum validation for 'method' on act/observe inference - [#1652](https://github.com/browserbase/stagehand/pull/1652) [`18b1e3b`](https://github.com/browserbase/stagehand/commit/18b1e3bd2b16b721845d52fcf1a45c6158e2403f) Thanks [@miguelg719](https://github.com/miguelg719)! - Add support for gemini 3 flash and pro in hybrid/cua agent - [#1706](https://github.com/browserbase/stagehand/pull/1706) [`957d82b`](https://github.com/browserbase/stagehand/commit/957d82b9845b4413b123539e81a2e4a490e74a8a) Thanks [@chrisreadsf](https://github.com/chrisreadsf)! - Add GLM to prompt-based JSON fallback for models without native structured output support - [#1633](https://github.com/browserbase/stagehand/pull/1633) [`22e371a`](https://github.com/browserbase/stagehand/commit/22e371ae4c25deb6350328fe02832bf2b2197b94) Thanks [@tkattkat](https://github.com/tkattkat)! - Add warning when incorrect models are used with agents hybrid mode - [#1673](https://github.com/browserbase/stagehand/pull/1673) [`d29b91f`](https://github.com/browserbase/stagehand/commit/d29b91fa506636ca36f724fcf106320de54ec3f3) Thanks [@miguelg719](https://github.com/miguelg719)! - Add multi-region support for Stagehand API with region-specific endpoints - [#1695](https://github.com/browserbase/stagehand/pull/1695) [`7b4f817`](https://github.com/browserbase/stagehand/commit/7b4f817cafb9829ac81c4b5890c318c7f9521fe4) Thanks [@tkattkat](https://github.com/tkattkat)! - Fix: zod bug when pinning zod to v3 and using structured output in agent - [#1609](https://github.com/browserbase/stagehand/pull/1609) [`3f9ca4d`](https://github.com/browserbase/stagehand/commit/3f9ca4d9acc109101357378d29cf969168991608) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Add SupportedUnderstudyActions to observe system prompt - [#1581](https://github.com/browserbase/stagehand/pull/1581) [`49ead1e`](https://github.com/browserbase/stagehand/commit/49ead1e1e8678a8da0f87ad2042491dacc6b01d7) Thanks [@sameelarif](https://github.com/sameelarif)! - **Server-side caching is now available.** When running `env: "BROWSERBASE"`, Stagehand automatically caches `act()`, `extract()`, and `observe()` results server-side — repeated calls with the same inputs return instantly without consuming LLM tokens. Caching is enabled by default and can be disabled via `serverCache: false` on the Stagehand instance or per individual call. Check out the [browserbase blog](https://www.browserbase.com/blog/stagehand-caching) for more details. - [#1642](https://github.com/browserbase/stagehand/pull/1642) [`3673369`](https://github.com/browserbase/stagehand/commit/36733691f90c15386cf2a7b47d04ef429b7195ae) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where scripts added via context.addInitScripts() were not being injected into new pages that were opened via popups (eg, clicking a link that opens a new page) and/or calling context.newPage(url) - [#1735](https://github.com/browserbase/stagehand/pull/1735) [`c465e87`](https://github.com/browserbase/stagehand/commit/c465e87ab41942435132c76338518fb3fa8e7896) Thanks [@monadoid](https://github.com/monadoid)! - Supports request header authentication with connectToMCPServer - [#1705](https://github.com/browserbase/stagehand/pull/1705) [`ae533e4`](https://github.com/browserbase/stagehand/commit/ae533e40195181b53833f8055b1259fb360a927b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - include error cause in UnderstudyCommandException - [#1636](https://github.com/browserbase/stagehand/pull/1636) [`ea33052`](https://github.com/browserbase/stagehand/commit/ea330520a325583b71b87d85beb740df4bdb9b2d) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Include executionModel on the AgentConfigSchema - [#1679](https://github.com/browserbase/stagehand/pull/1679) [`5764ede`](https://github.com/browserbase/stagehand/commit/5764edee7aab00ef1aafafb68fc56eb26c0a70b2) Thanks [@shrey150](https://github.com/shrey150)! - fix issue where locator.count() was not working with xpaths that have attribute predicates - [#1646](https://github.com/browserbase/stagehand/pull/1646) [`f09b184`](https://github.com/browserbase/stagehand/commit/f09b184cc5e774736280ae8c94ba3f4f13adda80) Thanks [@miguelg719](https://github.com/miguelg719)! - Add user-agent to CDP connections - [#1637](https://github.com/browserbase/stagehand/pull/1637) [`a7d29de`](https://github.com/browserbase/stagehand/commit/a7d29decee0f7d12e2437267b9eef1795d3b4e3a) Thanks [@miguelg719](https://github.com/miguelg719)! - Improve error and warning message for legacy model format - [#1685](https://github.com/browserbase/stagehand/pull/1685) [`d334399`](https://github.com/browserbase/stagehand/commit/d3343990041bf9cd5613569840afb0c17131e33c) Thanks [@tkattkat](https://github.com/tkattkat)! - Bump ai sdk & google provider version - [#1662](https://github.com/browserbase/stagehand/pull/1662) [`44416da`](https://github.com/browserbase/stagehand/commit/44416da7ff33301bb32d3811e6c3be8782a7d168) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where locator.fill() was not working on elements that require direct value setting - [#1612](https://github.com/browserbase/stagehand/pull/1612) [`bdd8b4e`](https://github.com/browserbase/stagehand/commit/bdd8b4ee3c697a02728375510ab7fae764990576) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where screenshot mask was only being applied to the first element that the locator resolved to. masks now apply to all matching elements. 
## 3.0.8 ### Patch Changes - [#1514](https://github.com/browserbase/stagehand/pull/1514) [`40ce5cc`](https://github.com/browserbase/stagehand/commit/40ce5cc83ec758f4e8c37132a7f4ac8eeea7ca34) Thanks [@tkattkat](https://github.com/tkattkat)! - Rename the close tool in agent to "done" - [#1574](https://github.com/browserbase/stagehand/pull/1574) [`5506f41`](https://github.com/browserbase/stagehand/commit/5506f416d2609d112b553263984e21d7a30e32b1) Thanks [@tkattkat](https://github.com/tkattkat)! - fix(server): pass cdpUrl to localBrowserLaunchOptions when launchOptions absent - [#1521](https://github.com/browserbase/stagehand/pull/1521) [`84c05ca`](https://github.com/browserbase/stagehand/commit/84c05ca8de4587181faf128e5c7464fd960caacc) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: get agent cache working in API mode - [#1486](https://github.com/browserbase/stagehand/pull/1486) [`692ffa0`](https://github.com/browserbase/stagehand/commit/692ffa0346ad3d121686aba503c0a22844293efa) Thanks [@tkattkat](https://github.com/tkattkat)! - improve logging in agent - [#1551](https://github.com/browserbase/stagehand/pull/1551) [`1ef8901`](https://github.com/browserbase/stagehand/commit/1ef8901e1314e90f43b36be20192e652d3b5598f) Thanks [@miguelg719](https://github.com/miguelg719)! - move extract handler response log to after URL injection - [#1495](https://github.com/browserbase/stagehand/pull/1495) [`72ac775`](https://github.com/browserbase/stagehand/commit/72ac775a831d6f0f376ceda4426525f93cc21452) Thanks [@tkattkat](https://github.com/tkattkat)! - export tool function & type to simplify defining custom tools - [#1481](https://github.com/browserbase/stagehand/pull/1481) [`3d5af07`](https://github.com/browserbase/stagehand/commit/3d5af07f66d6d26d1f5ac4bd9be7183c3381dd92) Thanks [@tkattkat](https://github.com/tkattkat)! 
- add waitForTimeout to page - [#1423](https://github.com/browserbase/stagehand/pull/1423) [`40e1d80`](https://github.com/browserbase/stagehand/commit/40e1d80776b9216422a25a81070ccb3105e56ec2) Thanks [@miguelg719](https://github.com/miguelg719)! - Improve benchmark handling and add metadata - [#1588](https://github.com/browserbase/stagehand/pull/1588) [`56c0d24`](https://github.com/browserbase/stagehand/commit/56c0d244f9b2431218bfa832ddfc0587930ae038) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add SnapshotOptions to page.snapshot() - [#1483](https://github.com/browserbase/stagehand/pull/1483) [`16d72fb`](https://github.com/browserbase/stagehand/commit/16d72fb4c4081dd33bf45605d75c27644ea4c00e) Thanks [@tkattkat](https://github.com/tkattkat)! - Optimize screenshot handling in agent hybrid mode - [#1498](https://github.com/browserbase/stagehand/pull/1498) [`088c4cc`](https://github.com/browserbase/stagehand/commit/088c4cc31dc924bb232a9d5a09ab42cd961c2d36) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: replaying cached actions (for agent & act) now uses the originally defined model, (instead of default model) when action fails and rerunning inference is needed - [#1575](https://github.com/browserbase/stagehand/pull/1575) [`4276f4a`](https://github.com/browserbase/stagehand/commit/4276f4abc8bbde215faac6c0321bf243484c376b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - expose port param in localBrowserLaunchOptions - [#1544](https://github.com/browserbase/stagehand/pull/1544) [`6005786`](https://github.com/browserbase/stagehand/commit/600578637e65f6fd18b0cdb322b9e0b857708b2f) Thanks [@tkattkat](https://github.com/tkattkat)! - Recommend hybrid mode over DOM mode in agent, which is now considered legacy - [#1505](https://github.com/browserbase/stagehand/pull/1505) [`6fbf5fc`](https://github.com/browserbase/stagehand/commit/6fbf5fc811e5e5d9d22f10c5309fbd336892263a) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Add structured output to agent result + ensure close tool is always called - [#1511](https://github.com/browserbase/stagehand/pull/1511) [`704cf18`](https://github.com/browserbase/stagehand/commit/704cf18cb2bdd187ba06c35f05ccb47317a7668c) Thanks [@shrey150](https://github.com/shrey150)! - Fix ControlOrMeta keypress event - [#1480](https://github.com/browserbase/stagehand/pull/1480) [`091296e`](https://github.com/browserbase/stagehand/commit/091296e438bb2374c8bb10ef6c08283978145ebf) Thanks [@tkattkat](https://github.com/tkattkat)! - Update agent to only calculate xpath when caching is enabled - [#1509](https://github.com/browserbase/stagehand/pull/1509) [`e56c6eb`](https://github.com/browserbase/stagehand/commit/e56c6eb139bf3aad37e98b16626fff13a6c671d0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for page.waitForSelector() - [#1478](https://github.com/browserbase/stagehand/pull/1478) [`2cb78d0`](https://github.com/browserbase/stagehand/commit/2cb78d0f5ddef9f7337a9a2fe3137f1421df700a) Thanks [@tkattkat](https://github.com/tkattkat)! - update agent message handling - [#1518](https://github.com/browserbase/stagehand/pull/1518) [`5dad639`](https://github.com/browserbase/stagehand/commit/5dad63938f08d968d434bb1ee2804f1e54fb836a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add page.snapshot() for capturing a stringified DOM snapshot of the page, including an xpath map & url map - [#1576](https://github.com/browserbase/stagehand/pull/1576) [`b7c2571`](https://github.com/browserbase/stagehand/commit/b7c2571ad4ac563f3ca0518e1f29a40da93e33bc) Thanks [@tkattkat](https://github.com/tkattkat)! - utilize waitForSelector when running agent cache - [#1560](https://github.com/browserbase/stagehand/pull/1560) [`4c69117`](https://github.com/browserbase/stagehand/commit/4c6911748953199dc9aad3eabe98bcf325f871e4) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Update coordinate handling in cua and hybrid ## 3.0.7 ### Patch Changes - [#1461](https://github.com/browserbase/stagehand/pull/1461) [`0f3991e`](https://github.com/browserbase/stagehand/commit/0f3991eedc0aaff72ef718dda3ddb0839cf4a464) Thanks [@tkattkat](https://github.com/tkattkat)! - Move hybrid mode out of experimental - [#1433](https://github.com/browserbase/stagehand/pull/1433) [`e0e22e0`](https://github.com/browserbase/stagehand/commit/e0e22e06bc752a8ffde30f3dbfa58d91e24e6c09) Thanks [@tkattkat](https://github.com/tkattkat)! - Put hybrid mode behind experimental - [#1456](https://github.com/browserbase/stagehand/pull/1456) [`f261051`](https://github.com/browserbase/stagehand/commit/f2610517d74774374de9ee93191e663439ef55e5) Thanks [@shrey150](https://github.com/shrey150)! - Invoke page.hover for agent move action - [#1473](https://github.com/browserbase/stagehand/pull/1473) [`e021674`](https://github.com/browserbase/stagehand/commit/e021674f9641c1c5f9d0c1817c3fdf599eea124d) Thanks [@shrey150](https://github.com/shrey150)! - Add safety confirmation support for OpenAI + Google CUA - [#1399](https://github.com/browserbase/stagehand/pull/1399) [`6a5496f`](https://github.com/browserbase/stagehand/commit/6a5496f17dbb716be1ee1aaa4e5ba9d8c723b30b) Thanks [@tkattkat](https://github.com/tkattkat)! - Ensure cua agent is killed when stagehand.close is called - [#1436](https://github.com/browserbase/stagehand/pull/1436) [`fea1700`](https://github.com/browserbase/stagehand/commit/fea1700552af3319052f463685752501c8e71de3) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix auto-load key for act/extract/observe parametrized models on api - [#1439](https://github.com/browserbase/stagehand/pull/1439) [`5b288d9`](https://github.com/browserbase/stagehand/commit/5b288d9ac37406ff22460ac8050bea26b87a378e) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Remove base64 from agent actions array ( still present in messages object ) - [#1408](https://github.com/browserbase/stagehand/pull/1408) [`e822f5a`](https://github.com/browserbase/stagehand/commit/e822f5a8898df9eb48ca32c321025f0c74b638f0) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow for act() cache hit when variable values change - [#1472](https://github.com/browserbase/stagehand/pull/1472) [`638efc7`](https://github.com/browserbase/stagehand/commit/638efc7fea401bc43dd05dceedf4c13a3495a728) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: agent cache not refreshed on action failure - [#1424](https://github.com/browserbase/stagehand/pull/1424) [`a890f16`](https://github.com/browserbase/stagehand/commit/a890f16fa3a752f308f858e5ab9c9a0faf6b3b34) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: "Error: -32000 Failed to convert response to JSON: CBOR: stack limit exceeded" - [#1418](https://github.com/browserbase/stagehand/pull/1418) [`934f492`](https://github.com/browserbase/stagehand/commit/934f492ec587bef81f0ce75b45a35b44ab545712) Thanks [@miguelg719](https://github.com/miguelg719)! - Cleanup handlers and bus listeners on close - [#1430](https://github.com/browserbase/stagehand/pull/1430) [`bd2db92`](https://github.com/browserbase/stagehand/commit/bd2db925f66a826d61d58be1611d55646cbdb560) Thanks [@shrey150](https://github.com/shrey150)! - Fix CUA model coordinate translation - [#1465](https://github.com/browserbase/stagehand/pull/1465) [`51e0170`](https://github.com/browserbase/stagehand/commit/51e01709ce1c947c1947b4e2cb0b1f4f97b77182) Thanks [@miguelg719](https://github.com/miguelg719)! - Add media resolution high provider option to gemini 3 hybrid agent - [#1431](https://github.com/browserbase/stagehand/pull/1431) [`05f5580`](https://github.com/browserbase/stagehand/commit/05f5580937c3c157550e3c25ae6671f44f562211) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Update the cache handling for agent - [#1432](https://github.com/browserbase/stagehand/pull/1432) [`f56a9c2`](https://github.com/browserbase/stagehand/commit/f56a9c296d4ddce25a405358c66837f8ce4d679f) Thanks [@tkattkat](https://github.com/tkattkat)! - Deprecate cua: true in favor of mode: "cua" - [#1406](https://github.com/browserbase/stagehand/pull/1406) [`b40ae11`](https://github.com/browserbase/stagehand/commit/b40ae11391af49c3581fce27faa1b7483fc4a169) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for hovering with coordinates ( page.hover ) - [#1407](https://github.com/browserbase/stagehand/pull/1407) [`0d2b398`](https://github.com/browserbase/stagehand/commit/0d2b398cd40b32a9ecaf28ede70853036b7c91bd) Thanks [@tkattkat](https://github.com/tkattkat)! - Clean up page methods - [#1412](https://github.com/browserbase/stagehand/pull/1412) [`cd01f29`](https://github.com/browserbase/stagehand/commit/cd01f290578eac703521f801ba3712f5332918f3) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: load GOOGLE_API_KEY from .env - [#1462](https://github.com/browserbase/stagehand/pull/1462) [`a734fca`](https://github.com/browserbase/stagehand/commit/a734fca0b4573753767d3ebc48ec414baf4f23e1) Thanks [@shrey150](https://github.com/shrey150)! - fix: correctly pass userDataDir to chrome launcher - [#1466](https://github.com/browserbase/stagehand/pull/1466) [`b342acf`](https://github.com/browserbase/stagehand/commit/b342acfaae058127fb57664644c5fd965db02bf2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - move playwright to optional dependencies - [#1440](https://github.com/browserbase/stagehand/pull/1440) [`2987cd1`](https://github.com/browserbase/stagehand/commit/2987cd1e5ffabefa9411936609635d4a638faed5) Thanks [@tkattkat](https://github.com/tkattkat)! 
- [Feature] support excluding tools from agent - [#1455](https://github.com/browserbase/stagehand/pull/1455) [`dfab1d5`](https://github.com/browserbase/stagehand/commit/dfab1d566299c8c5a63f20565a6da07dc8f61ccd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - update aisdk client to better enforce structured output with deepseek models - [#1428](https://github.com/browserbase/stagehand/pull/1428) [`4d71162`](https://github.com/browserbase/stagehand/commit/4d71162beb119635b69b17637564a2bbd0e373e7) Thanks [@tkattkat](https://github.com/tkattkat)! - Add "hybrid" mode to stagehand agent ## 3.0.6 ### Patch Changes - [#1388](https://github.com/browserbase/stagehand/pull/1388) [`605ed6b`](https://github.com/browserbase/stagehand/commit/605ed6b81a3ff8f25d4022f1e5fce6b42aecfc19) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix multiple click event dispatches on CDP and Anthropic CUA handling (double clicks) - [#1400](https://github.com/browserbase/stagehand/pull/1400) [`34e7e5b`](https://github.com/browserbase/stagehand/commit/34e7e5b292f5e6af6efc0da60118663310c5f718) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't write base64 encoded screenshots to disk when caching agent actions - [#1345](https://github.com/browserbase/stagehand/pull/1345) [`943d2d7`](https://github.com/browserbase/stagehand/commit/943d2d79d0f289ac41c9164578f2f1dd876058f2) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for aborting / stopping an agent run & continuing an agent run using messages from prior runs - [#1334](https://github.com/browserbase/stagehand/pull/1334) [`0e95cd2`](https://github.com/browserbase/stagehand/commit/0e95cd2f67672f64f0017024fd47d8b3aef59a95) Thanks [@tkattkat](https://github.com/tkattkat)! 
- Add support for google vertex provider - [#1410](https://github.com/browserbase/stagehand/pull/1410) [`d4237e4`](https://github.com/browserbase/stagehand/commit/d4237e40951ecd10abfdbe766672d498f8806484) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: include extract in stagehand.history() - [#1315](https://github.com/browserbase/stagehand/pull/1315) [`86975e7`](https://github.com/browserbase/stagehand/commit/86975e795db7505804949a267b20509bd16b5256) Thanks [@tkattkat](https://github.com/tkattkat)! - Add streaming support to agent through stream:true in the agent config - [#1304](https://github.com/browserbase/stagehand/pull/1304) [`d5e119b`](https://github.com/browserbase/stagehand/commit/d5e119be5eec84915a79f8d611b6ba0546f48c99) Thanks [@miguelg719](https://github.com/miguelg719)! - Add support for Microsoft's Fara-7B - [#1346](https://github.com/browserbase/stagehand/pull/1346) [`4e051b2`](https://github.com/browserbase/stagehand/commit/4e051b23add7ae276b0dbead38b4587838cfc1c1) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: don't attach to targets twice - [#1327](https://github.com/browserbase/stagehand/pull/1327) [`6b5a3c9`](https://github.com/browserbase/stagehand/commit/6b5a3c9035654caaed2da375085b465edda97de4) Thanks [@miguelg719](https://github.com/miguelg719)! - Informed error parsing from api - [#1335](https://github.com/browserbase/stagehand/pull/1335) [`bb85ad9`](https://github.com/browserbase/stagehand/commit/bb85ad912738623a7a866f0cb6e8d5807c6c2738) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for page.addInitScript() - [#1331](https://github.com/browserbase/stagehand/pull/1331) [`88d28cc`](https://github.com/browserbase/stagehand/commit/88d28cc6f31058d1cf6ec6dc948a4ae77a926b3c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix: page.evaluate() now works with scripts injected via context.addInitScript() - [#1316](https://github.com/browserbase/stagehand/pull/1316) [`45bcef0`](https://github.com/browserbase/stagehand/commit/45bcef0e5788b083f9e38dfd7c3bc63afcd4b6dd) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for callbacks in stagehand agent - [#1374](https://github.com/browserbase/stagehand/pull/1374) [`6aa9d45`](https://github.com/browserbase/stagehand/commit/6aa9d455aa5836ec2ee8ab2e8b9df3fb218e5381) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix key action mapping in Anthropic CUA - [#1330](https://github.com/browserbase/stagehand/pull/1330) [`d382084`](https://github.com/browserbase/stagehand/commit/d382084745fff98c3e71413371466394a2625429) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: make act, extract, and observe respect user defined timeout param - [#1336](https://github.com/browserbase/stagehand/pull/1336) [`1df08cc`](https://github.com/browserbase/stagehand/commit/1df08ccb0a2cf73b5c37a91c129721114ff6371c) Thanks [@tkattkat](https://github.com/tkattkat)! - Patch agent on api - [#1358](https://github.com/browserbase/stagehand/pull/1358) [`2b56600`](https://github.com/browserbase/stagehand/commit/2b566009606fcbba987260f21b075b318690ce99) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for 4.5 opus in cua agent ## 3.0.4 ### Patch Changes - [#1281](https://github.com/browserbase/stagehand/pull/1281) [`fa18cfd`](https://github.com/browserbase/stagehand/commit/fa18cfdc45f28e35e6566587b54612396e6ece45) Thanks [@monadoid](https://github.com/monadoid)! - Add Browserbase session URL and debug URL accessors - [#1264](https://github.com/browserbase/stagehand/pull/1264) [`767d168`](https://github.com/browserbase/stagehand/commit/767d1686285cf9c57675595f553f8a891f13c63b) Thanks [@Kylejeong2](https://github.com/Kylejeong2)! 
- feat: adding gpt 5.1 to stagehand - [#1282](https://github.com/browserbase/stagehand/pull/1282) [`f27a99c`](https://github.com/browserbase/stagehand/commit/f27a99c11b020b33736fe67af8f7f0e663c6f45f) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for zod 4, while maintaining backwards compatibility for zod 3 - [#1295](https://github.com/browserbase/stagehand/pull/1295) [`91a1ca0`](https://github.com/browserbase/stagehand/commit/91a1ca07d9178c46269bfb951abb20a215eb7c29) Thanks [@tkattkat](https://github.com/tkattkat)! - Patch zod handling of non objects in extract - [#1298](https://github.com/browserbase/stagehand/pull/1298) [`1dd7d43`](https://github.com/browserbase/stagehand/commit/1dd7d4330de9022dc6cd45a8b5c86cb9e1b575ec) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - log Browserbase session status when websocket is closed due to session timeout - [#1284](https://github.com/browserbase/stagehand/pull/1284) [`c0f3b98`](https://github.com/browserbase/stagehand/commit/c0f3b98277c15c77b2b4c3f55503e61ef3d27cf3) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: waitForDomNetworkQuiet() causing `act()` to hang indefinitely - [#1246](https://github.com/browserbase/stagehand/pull/1246) [`44bb4f5`](https://github.com/browserbase/stagehand/commit/44bb4f51dcccbdca8df07e4d7f8d28a7e6e793ec) Thanks [@filip-michalsky](https://github.com/filip-michalsky)! - make ci faster - [#1300](https://github.com/browserbase/stagehand/pull/1300) [`2b70347`](https://github.com/browserbase/stagehand/commit/2b7034771bc6d6b1fabb13deaa56c299881b3728) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for context.addInitScript() ## 3.0.3 ### Patch Changes - [#1273](https://github.com/browserbase/stagehand/pull/1273) [`ab51232`](https://github.com/browserbase/stagehand/commit/ab51232db428be048957c0f5d67f2176eb7a5194) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix: trigger shadow root rerender in OOPIFs by cloning & replacing instead of reloading - [#1268](https://github.com/browserbase/stagehand/pull/1268) [`c76ade0`](https://github.com/browserbase/stagehand/commit/c76ade009ef81208accae6475ec4707d3906e566) Thanks [@tkattkat](https://github.com/tkattkat)! - Expose reasoning, and cached input tokens in stagehand metrics - [#1267](https://github.com/browserbase/stagehand/pull/1267) [`ffb5e5d`](https://github.com/browserbase/stagehand/commit/ffb5e5d2ab49adcb2efdfc9e5c76e8c96268b5b3) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: file uploads failing on Browserbase - [#1269](https://github.com/browserbase/stagehand/pull/1269) [`772e735`](https://github.com/browserbase/stagehand/commit/772e73543e45106d7fa0fafd95ade46ae11023bc) Thanks [@tkattkat](https://github.com/tkattkat)! - Add example using playwright screen recording ## 3.0.2 ### Patch Changes - [#1245](https://github.com/browserbase/stagehand/pull/1245) [`a224b33`](https://github.com/browserbase/stagehand/commit/a224b3371b6c1470baf342742fb745c7192b52c6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow act() to call hover() - [#1234](https://github.com/browserbase/stagehand/pull/1234) [`6fc9de2`](https://github.com/browserbase/stagehand/commit/6fc9de2a1079e4f2fb0b1633d8df0bb7a9f7f89f) Thanks [@miguelg719](https://github.com/miguelg719)! - Add a page.sendCDP method - [#1233](https://github.com/browserbase/stagehand/pull/1233) [`4935be7`](https://github.com/browserbase/stagehand/commit/4935be788b3431527f3d110864c0fd7060cfaf7c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - extend page.screenshot() options to mirror playwright - [#1232](https://github.com/browserbase/stagehand/pull/1232) [`bdd76fc`](https://github.com/browserbase/stagehand/commit/bdd76fcd1e48079fc5ab8cf040ebb5997dfc6c99) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- export Page type - [#1229](https://github.com/browserbase/stagehand/pull/1229) [`7ea18a4`](https://github.com/browserbase/stagehand/commit/7ea18a420fc033d1b72556db83a1f41735e5a022) Thanks [@tkattkat](https://github.com/tkattkat)! - Adjust extract tool + expose extract response in agent result - [#1239](https://github.com/browserbase/stagehand/pull/1239) [`d4de014`](https://github.com/browserbase/stagehand/commit/d4de014235a18f9e1089240bc72e28cbfe77ca1c) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix stagehand.metrics on api mode - [#1241](https://github.com/browserbase/stagehand/pull/1241) [`2d1b573`](https://github.com/browserbase/stagehand/commit/2d1b5732dc441a3331f5743cdfed3e1037d8b3b5) Thanks [@miguelg719](https://github.com/miguelg719)! - Return response on page.goto api mode - [#1253](https://github.com/browserbase/stagehand/pull/1253) [`5556041`](https://github.com/browserbase/stagehand/commit/5556041e2deaed5012363303fd7a8ac00e3242cd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix missing page issue when connecting to existing browser - [#1235](https://github.com/browserbase/stagehand/pull/1235) [`7e4b43e`](https://github.com/browserbase/stagehand/commit/7e4b43ed46fbdd2074827e87d9a245e2dc96456b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - make page.goto() return a Response object - [#1254](https://github.com/browserbase/stagehand/pull/1254) [`7e72adf`](https://github.com/browserbase/stagehand/commit/7e72adfd7e4af5ec49ac2f552e7f1f57c1acc554) Thanks [@sameelarif](https://github.com/sameelarif)! - Added custom error types to allow for a smoother debugging experience. - [#1227](https://github.com/browserbase/stagehand/pull/1227) [`9bf09d0`](https://github.com/browserbase/stagehand/commit/9bf09d041111870d71cb9ffcb3ac5fa2c4b1399d) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix readme's media links and add instructions for installing from a branch - [#1257](https://github.com/browserbase/stagehand/pull/1257) [`92d32ea`](https://github.com/browserbase/stagehand/commit/92d32eafe91a4241615cc65501b8461c6074a02b) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for a custom baseUrl with google cua client - [#1230](https://github.com/browserbase/stagehand/pull/1230) [`ebcf3a1`](https://github.com/browserbase/stagehand/commit/ebcf3a1ffa859374d71de4931c6a9b982a565e46) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add stagehand.browserbaseSessionID getter - [#1262](https://github.com/browserbase/stagehand/pull/1262) [`c29a4f2`](https://github.com/browserbase/stagehand/commit/c29a4f2eca91ae2902ed9d48b2385b4436f7b664) Thanks [@miguelg719](https://github.com/miguelg719)! - Remove error throwing when api and experimental are both set - [#1223](https://github.com/browserbase/stagehand/pull/1223) [`6d21efa`](https://github.com/browserbase/stagehand/commit/6d21efa8b30317aa3ce3e37ac6c2222af3b967b5) Thanks [@miguelg719](https://github.com/miguelg719)! - Disable api mode when using custom LLM clients - [#1228](https://github.com/browserbase/stagehand/pull/1228) [`525ef0c`](https://github.com/browserbase/stagehand/commit/525ef0c1243aaf3452ee7e4ea81b4208f4c2efd1) Thanks [@Kylejeong2](https://github.com/Kylejeong2)! - update slack link in docs - [#1226](https://github.com/browserbase/stagehand/pull/1226) [`9ddb872`](https://github.com/browserbase/stagehand/commit/9ddb872e350358214e12a91cf6a614fd2ec1f74c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for page.on('console') events ## 3.0.1 ### Patch Changes - [#1207](https://github.com/browserbase/stagehand/pull/1207) [`55da8c6`](https://github.com/browserbase/stagehand/commit/55da8c6e9575cbad3246c55b17650cf6b293ddbe) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix broken links to quickstart docs - [#1200](https://github.com/browserbase/stagehand/pull/1200) [`0a5ee63`](https://github.com/browserbase/stagehand/commit/0a5ee638bde051d109eb2266e665934a12f3dc31) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - log info when scope narrowing selector fails - [#1205](https://github.com/browserbase/stagehand/pull/1205) [`ee76881`](https://github.com/browserbase/stagehand/commit/ee7688156cb67a9f0f90dfe0dbab77423693a332) Thanks [@miguelg719](https://github.com/miguelg719)! - Update README.md, add Changelog for v3 - [#1209](https://github.com/browserbase/stagehand/pull/1209) [`9e95add`](https://github.com/browserbase/stagehand/commit/9e95add37eb30db4f85e73df7760c7e63fb4131e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix circular import in exported aisdk example client - [#1211](https://github.com/browserbase/stagehand/pull/1211) [`98e212b`](https://github.com/browserbase/stagehand/commit/98e212b27887241879608c6c1b6c2524477a40d7) Thanks [@miguelg719](https://github.com/miguelg719)! - Add an example for passing custom tools to agent - [#1206](https://github.com/browserbase/stagehand/pull/1206) [`d5ecbfc`](https://github.com/browserbase/stagehand/commit/d5ecbfc8e419a59b91c2115fd7f984378381d3d0) Thanks [@miguelg719](https://github.com/miguelg719)! - Export example AISdkClient properly from the stagehand package ================================================ FILE: packages/core/README.md ================================================

The AI Browser Automation Framework
Read the Docs

MIT License Discord Community

browserbase%2Fstagehand | Trendshift

Ask DeepWiki

If you're looking for the Python implementation, you can find it [here](https://github.com/browserbase/stagehand-python).

Vibe code Stagehand with Director
## What is Stagehand? Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. ## Why Stagehand? Most existing browser automation tools either require you to write low-level code in a framework like Selenium, Playwright, or Puppeteer, or use high-level agents that can be unpredictable in production. By letting developers choose what to write in code vs. natural language (and bridging the gap between the two) Stagehand is the natural choice for browser automations in production. 1. **Choose when to write code vs. natural language**: use AI when you want to navigate unfamiliar pages, and use code when you know exactly what you want to do. 2. **Go from AI-driven to repeatable workflows**: Stagehand lets you preview AI actions before running them, and also helps you easily cache repeatable actions to save time and tokens. 3. **Write once, run forever**: Stagehand's auto-caching combined with self-healing remembers previous actions, runs without LLM inference, and knows when to involve AI whenever the website changes and your automation breaks. 
## Getting Started Get started with Stagehand with one line of code, or check out our [Quickstart Guide](https://docs.stagehand.dev/v3/first-steps/quickstart) for more information: ```bash npx create-browser-app ``` ## Example Here's how to build a sample browser automation with Stagehand: ```typescript // Stagehand's CDP engine provides an optimized, low-level interface to the browser built for automation const page = stagehand.context.pages()[0]; await page.goto("https://github.com/browserbase"); // Use act() to execute individual actions await stagehand.act("click on the stagehand repo"); // Use agent() for multi-step tasks const agent = stagehand.agent(); await agent.execute("Get to the latest PR"); // Use extract() to get structured data from the page const { author, title } = await stagehand.extract( "extract the author and title of the PR", z.object({ author: z.string().describe("The username of the PR author"), title: z.string().describe("The title of the PR"), }), ); ``` ## Documentation Visit [docs.stagehand.dev](https://docs.stagehand.dev) to view the full documentation. ### Build and Run from Source ```bash git clone https://github.com/browserbase/stagehand.git cd stagehand pnpm install pnpm run build pnpm run example # run the blank script at ./examples/example.ts ``` Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run: ```bash cp .env.example .env nano .env # Edit the .env file to add API keys ``` ### Installing from a branch You can install and build Stagehand directly from a GitHub branch using [gitpkg](https://github.com/EqualMa/gitpkg). In your project's `package.json` set: ```json "@browserbasehq/stagehand": "https://gitpkg.now.sh/browserbase/stagehand/packages/core?<branch-name>", ``` ## Contributing > [!NOTE] > We highly value contributions to Stagehand! For questions or support, please join our [Discord community](https://stagehand.dev/discord).
At a high level, we're focused on improving reliability, extensibility, speed, and cost in that order of priority. If you're interested in contributing, **bug fixes and small improvements are the best way to get started**. For more involved features, we strongly recommend reaching out to [Miguel Gonzalez](https://x.com/miguel_gonzf) or [Paul Klein](https://x.com/pk_iv) in our [Discord community](https://stagehand.dev/discord) before starting to ensure that your contribution aligns with our goals. ## Acknowledgements We'd like to thank the following people for their major contributions to Stagehand: - [Paul Klein](https://github.com/pkiv) - [Sean McGuire](https://github.com/seanmcguire12) - [Miguel Gonzalez](https://github.com/miguelg719) - [Sameel Arif](https://github.com/sameelarif) - [Thomas Katwan](https://github.com/tkattkat) - [Filip Michalsky](https://github.com/filip-michalsky) - [Anirudh Kamath](https://github.com/kamath) - [Jeremy Press](https://x.com/jeremypress) - [Navid Pour](https://github.com/navidpour) ## License Licensed under the MIT License. Copyright 2025 Browserbase, Inc. ================================================ FILE: packages/core/examples/2048.ts ================================================ import { Stagehand } from "../lib/v3/index.js"; import { z } from "zod"; async function example() { console.log("🎮 Starting 2048 bot..."); const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, }); console.log("🌟 Initializing Stagehand..."); await stagehand.init(); const page = stagehand.context.pages()[0]; try { console.log("🌐 Navigating to 2048..."); await page.goto("https://ovolve.github.io/2048-AI/"); // Main game loop while (true) { console.log("🔄 Game loop iteration..."); // Add a small delay for UI updates await new Promise((resolve) => setTimeout(resolve, 300)); // Get current game state const gameState = await stagehand.extract( `Extract the current game state: 1. Score from the score counter 2. 
All tile values in the 4x4 grid (empty spaces as 0) 3. Highest tile value present`, z.object({ score: z.number(), highestTile: z.number(), grid: z.array(z.array(z.number())), }), ); const transposedGrid = gameState.grid[0].map((_, colIndex) => gameState.grid.map((row) => row[colIndex]), ); const grid = transposedGrid.map((row, rowIndex) => ({ [`row${rowIndex + 1}`]: row, })); console.log("Game State:", { score: gameState.score, highestTile: gameState.highestTile, grid: grid, }); // Analyze board and decide next move const analysis = await stagehand.extract( `Based on the current game state: - Score: ${gameState.score} - Highest tile: ${gameState.highestTile} - Grid: This is a 4x4 matrix ordered by row (top to bottom) and column (left to right). The rows are stacked vertically, and tiles can move vertically between rows or horizontally between columns:\n${grid .map((row) => { const rowName = Object.keys(row)[0]; return ` ${rowName}: ${row[rowName].join(", ")}`; }) .join("\n")} What is the best move (up/down/left/right)? Consider: 1. Keeping high value tiles in corners (bottom left, bottom right, top left, top right) 2. Maintaining a clear path to merge tiles 3. Avoiding moves that could block merges 4. Only adjacent tiles of the same value can merge 5. Making a move will move all tiles in that direction until they hit a tile of a different value or the edge of the board 6. Tiles cannot move past the edge of the board 7. 
Each move must move at least one tile`, z.object({ move: z.enum(["up", "down", "left", "right"]), confidence: z.number(), reasoning: z.string(), }), ); console.log("Move Analysis:", analysis); const moveKey = { up: "ArrowUp", down: "ArrowDown", left: "ArrowLeft", right: "ArrowRight", }[analysis.move]; await page.keyPress(moveKey); console.log("🎯 Executed move:", analysis.move); } } catch (error) { console.error("❌ Error in game loop:", error); const isGameOver = await page.evaluate(() => { return document.querySelector(".game-over") !== null; }); if (isGameOver) { console.log("🏁 Game Over!"); return; } throw error; // Re-throw non-game-over errors } } (async () => { await example(); })(); ================================================ FILE: packages/core/examples/CHANGELOG.md ================================================ # @browserbasehq/stagehand-examples ## 1.0.9 ### Patch Changes - Updated dependencies [[`09b5e1e`](https://github.com/browserbase/stagehand/commit/09b5e1e9c23c845903686db6665cc968ac34efbb), [`e3734b9`](https://github.com/browserbase/stagehand/commit/e3734b9c98352d5f0a4eca49791b0bbf2130ab41), [`8244ab2`](https://github.com/browserbase/stagehand/commit/8244ab247cd679962685ae2f7c54e874ce1fa614), [`be85b19`](https://github.com/browserbase/stagehand/commit/be85b19679a826f19702e00f0aae72fce1118ec8), [`88d1565`](https://github.com/browserbase/stagehand/commit/88d1565c65bb65a104fea2d5f5e862bbbda69677), [`ab5d6ed`](https://github.com/browserbase/stagehand/commit/ab5d6ede19aabc059badc4247f1cb2c6c9e71bae)]: - @browserbasehq/stagehand@2.5.0 ## 1.0.8 ### Patch Changes - Updated dependencies [[`9e8c173`](https://github.com/browserbase/stagehand/commit/9e8c17374fdc8fbe7f26e6cf802c36bd14f11039)]: - @browserbasehq/stagehand@2.4.4 ## 1.0.7 ### Patch Changes - Updated dependencies [[`f45afdc`](https://github.com/browserbase/stagehand/commit/f45afdccc8680650755fee66ffbeac32b41e075d), 
[`261bba4`](https://github.com/browserbase/stagehand/commit/261bba43fa79ac3af95328e673ef3e9fced3279b), [`8de7bd8`](https://github.com/browserbase/stagehand/commit/8de7bd8635c2051cd8025e365c6c8aa83d81c7e7), [`3d80421`](https://github.com/browserbase/stagehand/commit/3d804210a106a6828c7fa50f8b765b10afd4cc6a), [`0ead63d`](https://github.com/browserbase/stagehand/commit/0ead63d6526f6c286362b74b6407c8bebc900e69), [`8422828`](https://github.com/browserbase/stagehand/commit/8422828c4cd5fd5ebcf348cfbdb40c768bb76dd9), [`b769206`](https://github.com/browserbase/stagehand/commit/b7692060f98a2f49aeeefb90d8789ed034b08ec2), [`72d2683`](https://github.com/browserbase/stagehand/commit/72d2683202af7e578d98367893964b33e0828de5)]: - @browserbasehq/stagehand@2.4.3 ## 1.0.6 ### Patch Changes - Updated dependencies [[`6b4e6e3`](https://github.com/browserbase/stagehand/commit/6b4e6e3f31d5496cf15728e9018eddeb04839542), [`e77d018`](https://github.com/browserbase/stagehand/commit/e77d0188683ebf596dfb78dfafbbca1dc32993f0), [`c20adb9`](https://github.com/browserbase/stagehand/commit/c20adb95539fed8c56a4aa413262a9c65a8e6474), [`b86df93`](https://github.com/browserbase/stagehand/commit/b86df93b9136aae96292121a29c25f3d74d84bf7), [`023c2c2`](https://github.com/browserbase/stagehand/commit/023c2c273b46d3792d7e5d3c902089487b16b531), [`8c28647`](https://github.com/browserbase/stagehand/commit/8c2864755ecd05c8f7de235d4198deec0dd5f78e), [`87e09c6`](https://github.com/browserbase/stagehand/commit/87e09c618940f364ec8af00455a19a17ec63cbd3), [`a611115`](https://github.com/browserbase/stagehand/commit/a61111525d70b450bdfc43f112380f44899c9e97), [`69913fe`](https://github.com/browserbase/stagehand/commit/69913fe1dfb8201ae2aeffa5f049fb46ab02cbc2), [`b1b83a1`](https://github.com/browserbase/stagehand/commit/b1b83a1d334fe76e5f5f9dd32dc92c16b7d40ce6), [`be8497c`](https://github.com/browserbase/stagehand/commit/be8497cb6b142cc893cea9692b8c47bd19514c60), 
[`98704c9`](https://github.com/browserbase/stagehand/commit/98704c9ed225ca25bbde4bb3dc286936e9c54471), [`04978bd`](https://github.com/browserbase/stagehand/commit/04978bdd30d2edcbc69eb9fd91358a16975ea2eb)]: - @browserbasehq/stagehand@2.4.2 ## 1.0.5 ### Patch Changes - Updated dependencies [[`8a43c5a`](https://github.com/browserbase/stagehand/commit/8a43c5a86d4da40cfaedd9cf2e42186928bdf946), [`890ffcc`](https://github.com/browserbase/stagehand/commit/890ffccac5e0a60ade64a46eb550c981ffb3e84a), [`64c1072`](https://github.com/browserbase/stagehand/commit/64c10727bda50470483a3eb175c02842db0923a1), [`b077d3f`](https://github.com/browserbase/stagehand/commit/b077d3f48a97f47a71ccc79ae39b41e7f07f9c04), [`8bcb5d7`](https://github.com/browserbase/stagehand/commit/8bcb5d77debf6bf7601fd5c090efd7fde75c5d5e), [`7bf10c5`](https://github.com/browserbase/stagehand/commit/7bf10c55b267078fe847c1d7f7a60d604f9c7c94)]: - @browserbasehq/stagehand@2.4.1 ## 1.0.4 ### Patch Changes - Updated dependencies [[`124e0d3`](https://github.com/browserbase/stagehand/commit/124e0d3bb54ddb6738ede6d7aa99a945ef1cacd1), [`6a18c1e`](https://github.com/browserbase/stagehand/commit/6a18c1ee1e46d55c6e90c4d5572e17ed8daa140c), [`1660751`](https://github.com/browserbase/stagehand/commit/1660751cd14cb5b27d44f8167216afb8d1c3c45c), [`cadac9d`](https://github.com/browserbase/stagehand/commit/cadac9da09123d12e5d496a0e8b12660964c1b33), [`759da55`](https://github.com/browserbase/stagehand/commit/759da55775eb2df81d56ae18c0f386fd9b02a9f0), [`a175a51`](https://github.com/browserbase/stagehand/commit/a175a519b8c14300db6f1ed30709e113d18e99db), [`8527a80`](https://github.com/browserbase/stagehand/commit/8527a80522c3eedb9516a6caa1a0e4e4be981a3d), [`55fca2f`](https://github.com/browserbase/stagehand/commit/55fca2f7da63cc0ef6e27b45a33f63c666cdce7e)]: - @browserbasehq/stagehand@2.4.0 ## 1.0.3 ### Patch Changes - Updated dependencies 
[[`12a99b3`](https://github.com/browserbase/stagehand/commit/12a99b398d8a4c3eea3ca69a3cf793faaaf4aea3), [`2451797`](https://github.com/browserbase/stagehand/commit/2451797f64c0efa4a72fd70265110003c8d0a6cd), [`1d631a5`](https://github.com/browserbase/stagehand/commit/1d631a57a197390f672b718ae5199991ab27cfb1), [`9c398bb`](https://github.com/browserbase/stagehand/commit/9c398bb9ec2d10bdb53ad5aa7e3b58cce24fdb2b), [`c19ad7f`](https://github.com/browserbase/stagehand/commit/c19ad7f1e082e91fdeaa9c2ef63767a5a2b3a195)]: - @browserbasehq/stagehand@2.3.1 ## 1.0.2 ### Patch Changes - Updated dependencies [[`5680d25`](https://github.com/browserbase/stagehand/commit/5680d2509352c383ad502c9f4fabde01fa638833), [`4de92a8`](https://github.com/browserbase/stagehand/commit/4de92a8af461fc95063faf39feee1d49259f58ba), [`6ef6073`](https://github.com/browserbase/stagehand/commit/6ef60730cab0ad9025f44b6eeb2c83751d1dcd35)]: - @browserbasehq/stagehand@2.3.0 ## 1.0.1 ### Patch Changes - Updated dependencies [[`be8652e`](https://github.com/browserbase/stagehand/commit/be8652e770b57fdb3299fa0b2efa4eb0e816434e), [`6b413b7`](https://github.com/browserbase/stagehand/commit/6b413b7ad00b13ca0bd53ee2e7393023821408b6), [`7eafbd9`](https://github.com/browserbase/stagehand/commit/7eafbd9b1a73b37effa444929767df7c592caf02), [`1b50aa6`](https://github.com/browserbase/stagehand/commit/1b50aa61cf0a429dd6cb2760a08f7f698a50454b), [`f2b7f1f`](https://github.com/browserbase/stagehand/commit/f2b7f1f284eef1f96753319b66c7d0b273a6f8cd), [`c8d672f`](https://github.com/browserbase/stagehand/commit/c8d672f7c410c256defbc2e87ead99239837aa28), [`bebf204`](https://github.com/browserbase/stagehand/commit/bebf2044502333c694743078c5b0c9deae11fb79), [`37d6810`](https://github.com/browserbase/stagehand/commit/37d6810a704773d0383a86f98f5f17c7d5b21975)]: - @browserbasehq/stagehand@2.2.1 ================================================ FILE: packages/core/examples/actionable_observe_example.ts 
================================================ /** * This example shows how to use actionable observe() * * You can use observe to get a cache-able Playwright action as JSON, then pass that JSON to act() to perform the action. * * This is useful for: * - Previewing actions before running them * - Saving actions to a file and replaying them later * - Hiding sensitive information from LLMs * * For more on caching, see: https://docs.stagehand.dev/examples/caching * Also check out the form_filling_sensible.ts example for a more complex example of using observe() to fill out a form. */ import { Action, Stagehand } from "../lib/v3/index.js"; async function example() { const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://www.apartments.com/san-francisco-ca/"); let observation: Action; await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe("find the 'all filters' button"); await stagehand.act(observation); await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe( "find the '1+' button in the 'beds' section", ); await stagehand.act(observation); await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe( "find the 'apartments' button in the 'home type' section", ); await stagehand.act(observation); await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe( "find the pet policy dropdown to click on.", ); await stagehand.act(observation); await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe( "find the 'Dog Friendly' option to click on", ); await stagehand.act(observation); await new Promise((resolve) => setTimeout(resolve, 3000)); [observation] = await stagehand.observe("find the 'see results' section"); await stagehand.act(observation); const currentUrl = 
page.url(); await stagehand.close(); if ( currentUrl.includes( "https://www.apartments.com/apartments/san-francisco-ca/min-1-bedrooms-pet-friendly-dog/", ) ) { console.log("✅ Success! we made it to the correct page"); } else { console.log( "❌ Whoops, looks like we didn't make it to the correct page. " + "\nThanks for testing out this new Stagehand feature!" + "\nReach us on Discord if you have any feedback/questions/suggestions!", ); } } (async () => { await example(); })(); ================================================ FILE: packages/core/examples/agent-custom-tools.ts ================================================ /** * This example shows how to pass custom tools to stagehand agent (both CUA and non-CUA) */ import { z } from "zod"; import { tool } from "ai"; import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; // Mock weather API, replace with your own API/tool logic // eslint-disable-next-line @typescript-eslint/no-unused-vars const fetchWeatherAPI = async (location: string) => { return { temp: 70, conditions: "sunny", }; }; // Define the tool in an AI SDK format const getWeather = tool({ description: "Get the current weather in a location", inputSchema: z.object({ location: z.string().describe("The location to get weather for"), }), execute: async ({ location }) => { // Your custom logic here const weather = await fetchWeatherAPI(location); return { location, temperature: weather.temp, conditions: weather.conditions, }; }, }); async function main() { console.log( `\n${chalk.bold("Stagehand 🤘 Computer Use Agent (CUA) Demo")}\n`, ); // Initialize Stagehand const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, experimental: true, // You must enable experimental mode to use custom tools / MCP integrations model: "anthropic/claude-sonnet-4-5", }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; // Create a computer use agent const agent = stagehand.agent({ mode: "cua", model: { modelName: 
"anthropic/claude-sonnet-4-5-20250929", apiKey: process.env.ANTHROPIC_API_KEY, }, systemPrompt: `You are a helpful assistant that can use a web browser. You are currently on the following page: ${page.url()}. Do not ask follow up questions, the user will trust your judgement. Today's date is ${new Date().toLocaleDateString()}.`, tools: { getWeather, // Pass the tools to the agent }, }); // const agent = stagehand.agent({ // systemPrompt: `You are a helpful assistant that can use a web browser. // You are currently on the following page: ${page.url()}. // Do not ask follow up questions, the user will trust your judgement. Today's date is ${new Date().toLocaleDateString()}.`, // // Pass the tools to the agent // tools: { // getWeather: getWeather, // }, // }); // Navigate to the Browserbase careers page await page.goto("https://www.google.com"); // Define the instruction for the CUA const instruction = "What's the weather in San Francisco?"; console.log(`Instruction: ${chalk.white(instruction)}`); // Execute the instruction const result = await agent.execute({ instruction, maxSteps: 20, }); console.log(`${chalk.green("✓")} Execution complete`); console.log(`${chalk.yellow("⤷")} Result:`); console.log(chalk.white(JSON.stringify(result, null, 2))); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); if (error instanceof Error && error.stack) { console.log(chalk.dim(error.stack.split("\n").slice(1).join("\n"))); } } finally { // Close the browser await stagehand.close(); } } main().catch((error) => { console.log(`${chalk.red("✗")} Unhandled error in main function`); console.log(chalk.red(error)); }); ================================================ FILE: packages/core/examples/agent_stream_example.ts ================================================ import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; // Load environment variables async function main() { console.log(`\n${chalk.bold("Stagehand 🤘 Agent Streaming Example")}\n`); // 
Initialize Stagehand const stagehand = new Stagehand({ env: "LOCAL", verbose: 0, cacheDir: "stagehand-agent-cache", logInferenceToFile: false, experimental: true, }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; await page.goto("https://amazon.com"); // Create a streaming agent with stream: true in the config const agent = stagehand.agent({ model: "anthropic/claude-sonnet-4-5-20250929", stream: true, // This makes execute() return AgentStreamResult }); const agentRun = await agent.execute({ instruction: "go to amazon, and search for shampoo, stop after searching", maxSteps: 20, }); // stream the text for await (const delta of agentRun.textStream) { process.stdout.write(delta); } // stream everything ( toolcalls, messages, etc.) // for await (const delta of result.fullStream) { // console.log(delta); // } const finalResult = await agentRun.result; console.log("Final Result:", finalResult); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); } } main(); ================================================ FILE: packages/core/examples/cua-example.ts ================================================ /** * This example shows how to use a computer use agent (CUA) to navigate a web page and extract data. * * To learn more about the CUA, see: https://docs.stagehand.dev/examples/computer_use * * NOTE: YOU MUST CONFIGURE BROWSER DIMENSIONS TO USE COMPUTER USE! * Check out stagehand.config.ts for more information. */ import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; async function main() { console.log( `\n${chalk.bold("Stagehand 🤘 Computer Use Agent (CUA) Demo")}\n`, ); // Initialize Stagehand const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; // Create a computer use agent const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-3-flash-preview", apiKey: process.env.GEMINI_API_KEY ?? 
process.env.GOOGLE_API_KEY, }, systemPrompt: `You are a helpful assistant that can use a web browser. You are currently on the following page: ${page.url()}. Do not ask follow up questions, the user will trust your judgement. Today's date is ${new Date().toLocaleDateString()}.`, }); // Navigate to the Browserbase careers page await page.goto("https://www.browserbase.com/careers"); // Define the instruction for the CUA const instruction = "Apply for the first engineer position with mock data. Don't submit the form. You're on the right page"; console.log(`Instruction: ${chalk.white(instruction)}`); // Execute the instruction const result = await agent.execute({ instruction, maxSteps: 20, }); await new Promise((resolve) => setTimeout(resolve, 30000)); console.log(`${chalk.green("✓")} Execution complete`); console.log(`${chalk.yellow("⤷")} Result:`); console.log(chalk.white(JSON.stringify(result, null, 2))); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); if (error instanceof Error && error.stack) { console.log(chalk.dim(error.stack.split("\n").slice(1).join("\n"))); } } finally { // Close the browser await stagehand.close(); } } main().catch((error) => { console.log(`${chalk.red("✗")} Unhandled error in main function`); console.log(chalk.red(error)); }); ================================================ FILE: packages/core/examples/custom_client_aisdk.ts ================================================ /** * This example shows how to use the Vercel AI SDK to power the Stagehand LLM Client. 
*
 * You will need to reference the AI SDK Client in /external_clients/aisdk.ts
 *
 * To learn more about the Vercel AI SDK, see: https://sdk.vercel.ai/docs
 */
import { Stagehand } from "../lib/v3/index.js";
import { AISdkClient } from "./external_clients/aisdk.js";
import { z } from "zod";
import { openai } from "@ai-sdk/openai";

/**
 * Runs Stagehand on Browserbase with an AI SDK-backed LLM client: navigates to
 * news.ycombinator.com, extracts the top story title, logs it, then asks
 * Stagehand to click the first story.
 *
 * The session is closed in a `finally` block so that a failure in any step
 * (init, navigation, extract, act) does not leave it open.
 */
async function example() {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
    llmClient: new AISdkClient({
      model: openai("gpt-4o"),
    }),
  });

  try {
    await stagehand.init();

    const page = stagehand.context.pages()[0];
    await page.goto("https://news.ycombinator.com");

    const { story } = await stagehand.extract(
      "extract the title of the top story on the page",
      z.object({
        story: z.string().describe("the top story on the page"),
      }),
    );
    console.log("The top story is:", story);

    await stagehand.act("click the first story");
  } finally {
    // Same shape as examples/example.ts: always release the session, even on error.
    await stagehand.close();
  }
}

(async () => {
  await example();
})();

================================================
FILE: packages/core/examples/custom_client_langchain.ts
================================================
/**
 * This example shows how to use the Langchain client with Stagehand.
*
 * You will need to reference the Langchain Client in /external_clients/langchain.ts
 */
import { z } from "zod";
import { Stagehand } from "../lib/v3/index.js";
import { LangchainClient } from "./external_clients/langchain.js";
import { ChatOpenAI } from "@langchain/openai";

/**
 * Runs Stagehand on Browserbase with a LangChain `ChatOpenAI` model wrapped in
 * `LangchainClient`: navigates to news.ycombinator.com, extracts the top story
 * title, logs it, then asks Stagehand to click the first story.
 *
 * The session is closed in a `finally` block so that a failure in any step
 * (init, navigation, extract, act) does not leave it open.
 */
async function example() {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
    llmClient: new LangchainClient(
      new ChatOpenAI({
        model: "gpt-4o",
      }),
    ),
  });

  try {
    await stagehand.init();

    const page = stagehand.context.pages()[0];
    await page.goto("https://news.ycombinator.com");

    const { story } = await stagehand.extract(
      "extract the title of the top story on the page",
      z.object({
        story: z.string().describe("the top story on the page"),
      }),
    );
    console.log("The top story is:", story);

    await stagehand.act("click the first story");
  } finally {
    // Same shape as examples/example.ts: always release the session, even on error.
    await stagehand.close();
  }
}

(async () => {
  await example();
})();

================================================
FILE: packages/core/examples/custom_client_openai.ts
================================================
/**
 * This example shows how to use a custom OpenAI client with Stagehand.
 *
 * The OpenAI API provides a simple, type-safe, and composable way to build AI applications.
*
 * You will need to reference the Custom OpenAI Client in /external_clients/customOpenAI.ts
 */
import { Stagehand } from "../lib/v3/index.js";
import { z } from "zod";
import { CustomOpenAIClient } from "./external_clients/customOpenAI.js";
import OpenAI from "openai";

/**
 * Runs Stagehand on Browserbase with a `CustomOpenAIClient` built around the
 * official `openai` SDK: navigates to news.ycombinator.com, clicks the 'new'
 * link, extracts three stories (title, url, points) and logs them.
 *
 * The session is closed in a `finally` block so that a failure in any step
 * (init, navigation, act, extract) does not leave it open.
 */
async function example() {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
    llmClient: new CustomOpenAIClient({
      modelName: "gpt-4o-mini",
      client: new OpenAI({
        apiKey: process.env.OPENAI_API_KEY,
      }),
    }),
  });

  try {
    await stagehand.init();

    const page = stagehand.context.pages()[0];
    await page.goto("https://news.ycombinator.com");
    await stagehand.act("click on the 'new' link");

    const headlines = await stagehand.extract(
      "Extract the top 3 stories from the Hacker News homepage.",
      z.object({
        stories: z.array(
          z.object({
            title: z.string(),
            url: z.string(),
            points: z.number(),
          }),
        ),
      }),
    );
    console.log(headlines);
  } finally {
    // Same shape as examples/example.ts: always release the session, even on error.
    await stagehand.close();
  }
}

(async () => {
  await example();
})();

================================================
FILE: packages/core/examples/example.ts
================================================
import { Stagehand } from "../lib/v3/index.js";

async function example(stagehand: Stagehand) {
  /**
   * Add your code here!
*/ const page = stagehand.context.pages()[0]; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-hn/", ); const { extraction } = await stagehand.extract( "grab the the first title from inside the iframe", ); console.log(extraction); const page2 = await stagehand.context.newPage(); await page2.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-same-proc/", ); await stagehand.extract( "extract the placeholder text on the your name field", { page: page2 }, ); await stagehand.act("fill the your name field with the text 'John Doe'", { page: page2, }); const action2 = await stagehand.observe( "select blue as the favorite color on the dropdown", { page: page2 }, ); for (const action of action2) { await stagehand.act(action, { page: page2, timeout: 30_000 }); } } (async () => { const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, model: { modelName: "openai/gpt-5", apiKey: process.env.MODEL_API_KEY, }, verbose: 2, }); try { await stagehand.init(); await example(stagehand); } finally { await stagehand.close(); } })(); ================================================ FILE: packages/core/examples/external_clients/aisdk.ts ================================================ export { AISdkClient } from "../../lib/v3/external_clients/aisdk.js"; ================================================ FILE: packages/core/examples/external_clients/customOpenAI.ts ================================================ export { CustomOpenAIClient } from "../../lib/v3/external_clients/customOpenAI.js"; ================================================ FILE: packages/core/examples/external_clients/langchain.ts ================================================ import { BaseChatModel } from "@langchain/core/language_models/chat_models"; import { CreateChatCompletionOptions, LLMClient, AvailableModel, } from "../../lib/v3/index.js"; import { AIMessage, BaseMessageLike, 
HumanMessage, SystemMessage, } from "@langchain/core/messages"; import { ChatCompletion } from "openai/resources"; import { toJsonSchema } from "../../lib/v3/zodCompat.js"; export class LangchainClient extends LLMClient { public type = "langchainClient" as const; private model: BaseChatModel; constructor(model: BaseChatModel) { super(model.name as AvailableModel); this.model = model; } async createChatCompletion({ options, }: CreateChatCompletionOptions): Promise { const formattedMessages: BaseMessageLike[] = options.messages.map( (message) => { if (Array.isArray(message.content)) { if (message.role === "system") { return new SystemMessage( message.content .map((c) => ("text" in c ? c.text : "")) .join("\n"), ); } const content = message.content.map((content) => "image_url" in content ? { type: "image", image: content.image_url.url } : { type: "text", text: content.text }, ); if (message.role === "user") return new HumanMessage({ content }); const textOnlyParts = content.map((part) => ({ type: "text" as const, text: part.type === "image" ? 
"[Image]" : part.text, })); return new AIMessage({ content: textOnlyParts }); } return { role: message.role, content: message.content, }; }, ); if (options.response_model) { //ref string no longer needed, this is now default behavior const responseSchema = toJsonSchema(options.response_model.schema); const structuredModel = this.model.withStructuredOutput(responseSchema); const response = await structuredModel.invoke(formattedMessages); return { data: response, usage: { prompt_tokens: 0, // Langchain doesn't provide token counts by default completion_tokens: 0, total_tokens: 0, }, } as T; } const modelWithTools = this.model.bindTools(options.tools); const response = await modelWithTools.invoke(formattedMessages); return { data: response, usage: { prompt_tokens: 0, // Langchain doesn't provide token counts by default completion_tokens: 0, total_tokens: 0, }, } as T; } } ================================================ FILE: packages/core/examples/form_filling_sensible.ts ================================================ /** * This example shows you how to use observe() to get a cacheable Playwright action as JSON, then pass that JSON to act() to perform the action. * * In this specific example, we use observe() to get multiple actions, then iterate through each action to fill the form with sensitive data at lightning speed. */ import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; async function formFillingSensible() { const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, }); await stagehand.init(); const page = stagehand.context.pages()[0]; // Go to the website and wait for it to load await page.goto("https://file.1040.com/estimate/", { waitUntil: "networkidle", timeoutMs: 30000, }); // Observe the form fields with suggested actions const observed = await stagehand.observe( "fill all the form fields in the page with mock data. 
In the description include the field name", ); // Uncomment the following snippet to see the stagehand candidate suggestions (initial) console.log( `${chalk.green("Observe:")} Form fields found:\n${observed .map((r) => `${chalk.yellow(r.description)} -> ${chalk.gray(r.selector)}`) .join("\n")}`, ); // Create a mapping of 1+ keywords in the form fields to standardize field names const mapping = (description: string): string | null => { const keywords: { [key: string]: string[] } = { age: ["old"], dependentsUnder17: ["under age 17", "child", "minor"], dependents17to23: ["17-23", "school", "student"], wages: ["wages", "W-2 Box 1"], federalTax: ["federal tax", "Box 2"], stateTax: ["state tax", "Box 17"], }; for (const [key, terms] of Object.entries(keywords)) { if (terms.some((term) => description.toLowerCase().includes(term))) { return key; } } return null; }; // Fill the form fields with sensible data. This data will only be used in your session and not be shared with LLM providers/external APIs. 
const userInputs: { [key: string]: string } = { age: "26", dependentsUnder17: "1", wages: "54321", federalTax: "8345", stateTax: "2222", }; const updatedFields = observed.map((candidate) => { const key = mapping(candidate.description); if (key && userInputs[key]) { candidate.arguments = [userInputs[key]]; } return candidate; }); // List of sensible-data candidates console.log( `\n${chalk.green("Sensible Data form inputs:")} Form fields to be filled:\n${updatedFields .map( (r) => `${chalk.yellow(r.description)} -> ${chalk.blue(r.arguments?.[0] || "no value")}`, ) .join("\n")}`, ); // Fill all the form fields with the sensible candidates for (const candidate of updatedFields) { await stagehand.act(candidate); } } (async () => { await formFillingSensible(); })(); ================================================ FILE: packages/core/examples/google_enter.ts ================================================ /** * This example shows how to use the Stagehand agent to navigate to Google and search for "Browserbase". * * It's mainly meant to sanity check using page.act() to press enter, since some LLMs have issues with it. */ import { Stagehand } from "../lib/v3/index.js"; async function example() { const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://google.com"); await stagehand.act("type in 'Browserbase'"); await stagehand.act("press enter"); await stagehand.close(); } (async () => { await example(); })(); ================================================ FILE: packages/core/examples/instructions.ts ================================================ /** * This example shows how to use custom system prompts with Stagehand. */ import { Stagehand } from "../lib/v3/index.js"; async function example() { const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, systemPrompt: "if the users says `secret12345`, click on the 'getting started' tab. 
additionally, if the user says to type something, translate their input into french and type it.", }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://docs.browserbase.com/"); await stagehand.act("secret12345"); await stagehand.act("search for 'how to use browserbase'"); await stagehand.close(); } (async () => { await example(); })(); ================================================ FILE: packages/core/examples/integrations/exa.ts ================================================ import { Stagehand } from "../../lib/v3/index.js"; async function example(stagehand: Stagehand) { const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); const agent = stagehand.agent({ integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, ], // Optional: Add custom instructions systemPrompt: `You are a helpful assistant that can use a browser as well as external tools such as web search. You have access to the Exa search tool to find information on the web. When looking for products to buy, make sure to search for current and reliable information. Be thorough in your research before making purchase decisions.`, }); const result = await agent.execute( "Use one of the tools from Exa to search for the top headphones of 2025. 
After doing so, use the browser and go through the checkout flow for the best one.", ); console.log(result); } (async () => { const stagehand = new Stagehand({ env: "LOCAL", model: "openai/gpt-4.1", verbose: 1, logInferenceToFile: true, experimental: true, }); try { await stagehand.init(); await example(stagehand); } catch (error) { console.error("Error running example:", error); } finally { await stagehand.close(); } })(); ================================================ FILE: packages/core/examples/integrations/supabase.ts ================================================ import { connectToMCPServer, Stagehand } from "../../lib/v3/index.js"; async function example(stagehand: Stagehand) { const page = stagehand.context.pages()[0]; await page.goto("https://www.opentable.com/"); const supabaseClient = await connectToMCPServer( `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}`, ); const agent = stagehand.agent({ model: "openai/computer-use-preview", integrations: [supabaseClient], }); const result = await agent.execute( "Search for restaurants in New Brunswick, NJ. 
Then, use the Supabase tools to insert the name of the first result of the search into a table called 'restaurants'.", ); console.log(result); } (async () => { const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, }); try { await stagehand.init(); await example(stagehand); } catch (error) { console.error("Error running example:", error); } finally { await stagehand.close(); } })(); ================================================ FILE: packages/core/examples/mcp.ts ================================================ // import { Stagehand } from "../lib/v3"; // import StagehandConfig from "@/stagehand.config"; // import chalk from "chalk"; // import { connectToMCPServer } from "../lib/mcp/connection"; // async function main() { // console.log(`\n${chalk.bold("Stagehand 🤘 MCP Demo")}\n`); // console.log(process.env.NOTION_TOKEN); // // Initialize Stagehand // const stagehand = new Stagehand({ // ...StagehandConfig, // env: "LOCAL", // experimental: true, // }); // await stagehand.init(); // const notionClient = await connectToMCPServer({ // command: "npx", // args: ["-y", "@notionhq/notion-mcp-server"], // env: { // NOTION_TOKEN: process.env.NOTION_TOKEN, // }, // }); // try { // const page = stagehand.page; // // Create a computer use agent // const agent = stagehand.agent({ // provider: "anthropic", // // For Anthropic, use claude-sonnet-4-6 or claude-sonnet-4-5-20250929 // model: "claude-sonnet-4-6", // instructions: `You are a helpful assistant that can use a web browser. // You are currently on the following page: ${page.url()}. // Do not ask follow up questions, the user will trust your judgement. 
// You have access to the Notion MCP.`, // options: { // apiKey: process.env.ANTHROPIC_API_KEY, // }, // integrations: [notionClient], // }); // // Navigate to the Browserbase careers page // await page.goto("https://www.google.com"); // // Define the instruction for the CUA // const instruction = // "Check the Agent Tasks page in notion, read your tasks, perform them and update the notion page with the results."; // console.log(`Instruction: ${chalk.white(instruction)}`); // // Execute the instruction // const result = await agent.execute({ // instruction, // maxSteps: 50, // }); // console.log(`${chalk.green("✓")} Execution complete`); // console.log(`${chalk.yellow("⤷")} Result:`); // console.log(chalk.white(JSON.stringify(result, null, 2))); // } catch (error) { // console.log(`${chalk.red("✗")} Error: ${error}`); // if (error instanceof Error && error.stack) { // console.log(chalk.dim(error.stack.split("\n").slice(1).join("\n"))); // } // } finally { // // Close the browser // await stagehand.close(); // } // } // main().catch((error) => { // console.log(`${chalk.red("✗")} Unhandled error in main function`); // console.log(chalk.red(error)); // }); ================================================ FILE: packages/core/examples/operator-example.ts ================================================ /** * This example shows how to use the Stagehand operator to do simple autonomous tasks. 
* * This is built off of our open source project, Open Operator: https://operator.browserbase.com * * To learn more about Stagehand Agents, see: https://docs.stagehand.dev/concepts/agent */ import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; // Load environment variables async function main() { console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`); // Initialize Stagehand const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, cacheDir: "stagehand-agent-cache", logInferenceToFile: false, }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/shadow-dom/", ); const agent = stagehand.agent(); const result = await agent.execute({ instruction: "click the button", maxSteps: 20, }); console.log(`${chalk.green("✓")} Execution complete`); console.log(`${chalk.yellow("⤷")} Result:`); console.log(JSON.stringify(result, null, 2)); console.log(chalk.white(result.message)); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); } finally { // await stagehand.close(); } } main(); ================================================ FILE: packages/core/examples/oss-cua-example.ts ================================================ /** * This example shows how to use a computer use agent (CUA) to navigate a web page and extract data. * * To learn more about the CUA, see: https://docs.stagehand.dev/examples/computer_use * * NOTE: YOU MUST CONFIGURE BROWSER DIMENSIONS TO USE COMPUTER USE! * Check out stagehand.config.ts for more information. 
*/ import { Stagehand } from "../lib/v3/index.js"; import chalk from "chalk"; async function main() { console.log( `\n${chalk.bold("Stagehand 🤘 Computer Use Agent (CUA) Demo")}\n`, ); // Initialize Stagehand const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, localBrowserLaunchOptions: { viewport: { width: 1288, height: 711, }, deviceScaleFactor: 1, }, }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; // Create a computer use agent const agent = stagehand.agent({ mode: "cua", model: { modelName: "microsoft/fara-7b", apiKey: process.env.AZURE_API_KEY, baseURL: process.env.AZURE_ENDPOINT, /** Alternative model configuration for Fireworks Deployments */ // modelName: "accounts/...", // apiKey: process.env.FIREWORKS_API_KEY, // baseURL: "https://api.fireworks.ai/inference/v1", // provider: "microsoft", // Important: this routes to the MicrosoftCUAClient }, systemPrompt: `You are a helpful assistant that can use a web browser. You are currently on the following page: ${page.url()}. Do not ask follow up questions, the user will trust your judgement. Today's date is ${new Date().toLocaleDateString()}. Remember apply buttons are there for a reason.`, }); // Navigate to the Browserbase careers page await page.goto("https://www.browserbase.com/careers"); // Define the instruction for the CUA const instruction = `Apply for the first engineer position with mock data on the ${page.url()} page. 
Don't submit the form.`; console.log(`Instruction: ${chalk.white(instruction)}`); // Execute the instruction const result = await agent.execute({ instruction, maxSteps: 20, }); await new Promise((resolve) => setTimeout(resolve, 30000)); console.log(`${chalk.green("✓")} Execution complete`); console.log(`${chalk.yellow("⤷")} Result:`); console.log(chalk.white(JSON.stringify(result, null, 2))); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); if (error instanceof Error && error.stack) { console.log(chalk.dim(error.stack.split("\n").slice(1).join("\n"))); } } finally { // Close the browser await stagehand.close(); } } main().catch((error) => { console.log(`${chalk.red("✗")} Unhandled error in main function`); console.log(chalk.red(error)); }); ================================================ FILE: packages/core/examples/parameterizeApiKey.ts ================================================ import { Stagehand } from "../lib/v3/index.js"; import { z } from "zod"; /** * This example shows how to parameterize the API key for the LLM provider. * * In order to best demonstrate, unset the OPENAI_API_KEY environment variable and * set the USE_OPENAI_API_KEY environment variable to your OpenAI API key. 
* * export USE_OPENAI_API_KEY=$OPENAI_API_KEY * unset OPENAI_API_KEY */ async function example() { const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, model: { modelName: "gpt-4o", apiKey: process.env.USE_OPENAI_API_KEY, }, }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://github.com/browserbase/stagehand"); await stagehand.act("click on the contributors"); const contributor = await stagehand.extract( "extract the top contributor", z.object({ username: z.string(), url: z.string(), }), ); console.log(`Our favorite contributor is ${contributor.username}`); } (async () => { await example(); })(); ================================================ FILE: packages/core/examples/persist_logs_example.ts ================================================ /** * Example: Run a Stagehand agent and persist structured logging events to a user-specified dir. */ import path from "node:path"; import { Stagehand } from "../lib/v3/index.js"; async function main() { const logsRoot = path.resolve(process.cwd(), "examples", "logs"); process.env.BROWSERBASE_CONFIG_DIR = logsRoot; const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, }); await stagehand.init(); try { const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); const agent = stagehand.agent(); await agent.execute({ instruction: "Search for Browserbase and stop after the results are visible.", maxSteps: 10, }); } finally { // All logs can be found at logs/sessions/$SESSION_ID/session.json, or agent_events.log etc await stagehand.close(); } } main().catch((error) => { console.error(error); process.exitCode = 1; }); ================================================ FILE: packages/core/examples/tsconfig.json ================================================ { "extends": "../tsconfig.json", "include": ["*.ts"], "exclude": ["node_modules"] } ================================================ FILE: packages/core/examples/v3/cuaReplay.ts 
================================================ import { Stagehand } from "../../lib/v3/index.js"; import { v3Logger } from "../../lib/v3/logger.js"; async function runDemo(runNumber: number) { const startTime = Date.now(); v3Logger({ level: 1, category: "demo", message: `RUN ${runNumber}: ${runNumber === 1 ? "BUILDING CACHE" : "USING CACHE"}`, }); const stagehand = new Stagehand({ env: "BROWSERBASE", disableAPI: false, verbose: 1, cacheDir: "cua-agent-cache", }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://v0-modern-login-flow.vercel.app/", { waitUntil: "networkidle", }); const agent = stagehand.agent({ mode: "cua", model: { modelName: "anthropic/claude-sonnet-4-20250514", apiKey: process.env.ANTHROPIC_API_KEY!, }, }); const result = await agent.execute({ instruction: `Sign in with the email address 'test@browserbaser.com' and the password 'stagehand=goated'`, maxSteps: 20, }); const endTime = Date.now(); const duration = (endTime - startTime) / 1000; await stagehand.close(); return { duration, success: result.success, result, }; } async function main() { const metrics1 = await runDemo(1); v3Logger({ level: 1, category: "demo", message: "⏳ Waiting 2 seconds before cached run...", }); await new Promise((resolve) => setTimeout(resolve, 2000)); v3Logger({ level: 1, category: "demo", message: "Starting second run with cache...", }); const metrics2 = await runDemo(2); const duration1 = `${metrics1.duration.toFixed(2)}s`; const duration2 = `${metrics2.duration.toFixed(2)}s`; v3Logger({ level: 1, category: "demo", message: ` ╔════════════════════════════════════════════════════════════╗ ║ 📊 PERFORMANCE COMPARISON ║ ╚════════════════════════════════════════════════════════════╝ ┌─────────────────────┬──────────────────┬──────────────────┐ │ Metric │ Run 1 (Cold) │ Run 2 (Cached) │ ├─────────────────────┼──────────────────┼──────────────────┤ │ Duration │ ${duration1.padEnd(16)} │ ${duration2.padEnd(16)} │ 
└─────────────────────┴──────────────────┴──────────────────┘ Performance Comparison: • Speed: ${((1 - metrics2.duration / metrics1.duration) * 100).toFixed(1)}% faster with cache • Time saved: ${(metrics1.duration - metrics2.duration).toFixed(2)} seconds Insights: • First run establishes the CUA action cache • Second run reuses cached actions for instant execution • Zero LLM tokens used on cached run`, }); } main().catch(console.error); ================================================ FILE: packages/core/examples/v3/deepLocator.ts ================================================ import { Stagehand } from "../../lib/v3/index.js"; async function example(stagehand: Stagehand) { const page = stagehand.context.pages()[0]; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-closed-shadow-dom/", ); // crossing OOPIF & shadow root boundaries with deep locator await page .deepLocator( "/html/body/shadow-host//section/iframe/html/body/main/section[1]/form/div/div[1]/input", ) .fill("nunya"); await page .deepLocator( "/html/body/shadow-host//section/iframe/html/body/main/section[1]/form/div/div[2]/input", ) .fill("business"); } (async () => { const stagehand = new Stagehand({ env: "LOCAL", verbose: 0, model: "openai/gpt-4.1", }); await stagehand.init(); await example(stagehand); })(); ================================================ FILE: packages/core/examples/v3/dropdown.ts ================================================ import { Stagehand } from "../../lib/v3/index.js"; async function example(stagehand: Stagehand) { const page = stagehand.context.pages()[0]; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/scroll-dropdown/", ); const actResult = await stagehand.act( "choose 'Peach' from the favorite colour dropdown", ); const numSteps = actResult.actions.length; console.log( `\n\nThis act() call took ${numSteps} steps. 
Here are the actions:`, ); for (const action of actResult.actions) { console.log(`\naction: `, action); } } (async () => { const stagehand = new Stagehand({ env: "LOCAL", verbose: 0, model: "google/gemini-2.5-flash", }); await stagehand.init(); await example(stagehand); })(); ================================================ FILE: packages/core/examples/v3/highlight.ts ================================================ import { Stagehand } from "../../lib/v3/index.js"; async function example(stagehand: Stagehand) { const page = stagehand.context.pages()[0]; await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/closed-shadow-root-in-oopif/", ); await page .deepLocator( "xpath=/html/body/main/section/iframe/html/body/shadow-demo//div/button", ) .highlight({ durationMs: 20000, contentColor: { r: 255, g: 0, b: 0 }, }); } (async () => { const stagehand = new Stagehand({ env: "LOCAL", verbose: 0, model: "google/gemini-2.5-flash", }); await stagehand.init(); await example(stagehand); })(); ================================================ FILE: packages/core/examples/v3/patchright.ts ================================================ import { Stagehand } from "../../lib/v3/index.js"; import { chromium } from "patchright-core"; import { z } from "zod"; async function example(stagehand: Stagehand) { const browser = await chromium.connectOverCDP({ wsEndpoint: stagehand.connectURL(), }); const prContext = browser.contexts()[0]; const prPage = prContext.pages()[0]; await prPage.goto("https://github.com/microsoft/playwright/issues/30261"); await stagehand.act("scroll to the bottom of the page", { page: prPage }); const reason = await stagehand.extract( "extract the reason why playwright doesn't expose frame IDs", z.string(), // page arg not required ); console.log(reason); } (async () => { const stagehand = new Stagehand({ env: "LOCAL", verbose: 0, model: "openai/gpt-4.1", }); await stagehand.init(); await example(stagehand); })(); 
================================================
FILE: packages/core/examples/v3/playwright.ts
================================================
import { Stagehand } from "../../lib/v3/index.js";
import { chromium } from "playwright-core";
import { z } from "zod";

/**
 * Attaches Playwright to the browser Stagehand is driving (over CDP) and runs
 * two `stagehand.extract()` calls concurrently, each targeted at a different
 * Playwright page through the `page` option.
 *
 * @param stagehand - An initialized Stagehand instance; its `connectURL()` is
 *   used as the CDP endpoint for Playwright.
 */
async function example(stagehand: Stagehand) {
  const browser = await chromium.connectOverCDP({
    wsEndpoint: stagehand.connectURL(),
  });

  // Reuse the context and first page that already exist in the connected browser.
  const pwContext = browser.contexts()[0];
  const pwPage1 = pwContext.pages()[0];
  await pwPage1.goto("https://docs.stagehand.dev/first-steps/introduction");

  // Open a second tab in the same context so both extractions can run at once.
  const pwPage2 = await pwContext.newPage();
  await pwPage2.goto("https://docs.stagehand.dev/configuration/observability");

  const [page1Extraction, page2Extraction] = await Promise.all([
    stagehand.extract(
      "extract the names of the four stagehand primitives",
      z.array(z.string()),
      { page: pwPage1 },
    ),
    stagehand.extract(
      "extract the list of session dashboard features",
      z.array(z.string()),
      { page: pwPage2 },
    ),
  ]);

  console.log(page1Extraction);
  console.log(page2Extraction);
}

(async () => {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
    model: "openai/gpt-4.1",
  });

  try {
    await stagehand.init();
    await example(stagehand);
  } catch (error) {
    console.error("Error running example:", error);
    // Keep a non-zero exit status now that the rejection is handled here.
    process.exitCode = 1;
  } finally {
    // Always release the Browserbase session, even when the example fails.
    await stagehand.close();
  }
})();

================================================
FILE: packages/core/examples/v3/puppeteer.ts
================================================
import { Stagehand } from "../../lib/v3/index.js";
import puppeteer from "puppeteer-core";

/**
 * Attaches Puppeteer to the browser Stagehand is driving, asks
 * `stagehand.observe()` for the "next page" button on a Puppeteer page, and
 * replays the first suggested action with `stagehand.act()`.
 *
 * @param stagehand - An initialized Stagehand instance; its `connectURL()` is
 *   used as the browser WebSocket endpoint for Puppeteer.
 * @throws Error when `observe()` returns no candidate action.
 */
async function example(stagehand: Stagehand) {
  const browser = await puppeteer.connect({
    browserWSEndpoint: stagehand.connectURL(),
    defaultViewport: null,
  });

  const ppPages = await browser.pages();
  const ppPage = ppPages[0];
  await ppPage.goto("https://www.browserbase.com/blog");

  const actions = await stagehand.observe("find the next page button", {
    page: ppPage,
  });

  // observe() may find nothing; fail with a clear message instead of handing
  // `undefined` to act().
  const [nextPageAction] = actions;
  if (!nextPageAction) {
    throw new Error(
      "observe() returned no action for the instruction 'find the next page button'",
    );
  }

  // Run the action on the same page it was observed on.
  await stagehand.act(nextPageAction, { page: ppPage });
}

(async () => {
  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 0,
    model: "openai/gpt-4.1",
  });

  try {
    await stagehand.init();
    await example(stagehand);
  } catch (error) {
    console.error("Error running example:", error);
    // Keep a non-zero exit status now that the rejection is handled here.
    process.exitCode = 1;
  } finally {
    // Always shut the local browser down, even when the example fails.
    await stagehand.close();
  }
})();
================================================
FILE: packages/core/examples/v3/recordVideo.ts
================================================
import path from "node:path";
import { mkdir } from "node:fs/promises";
import { Stagehand } from "../../lib/v3/index.js";
import { chromium } from "playwright-core";
import { z } from "zod";

/**
 * Records a Playwright video of a short Stagehand session.
 *
 * Connects Playwright over CDP to `stagehand.connectURL()`, creates a context
 * with `recordVideo` enabled (output under `<cwd>/artifacts/stagehand-videos`),
 * runs one `act()` and one `extract()` against the recorded page, closes the
 * context and logs where the video was written.
 */
async function recordPlaywrightVideo(stagehand: Stagehand): Promise<void> {
  const browser = await chromium.connectOverCDP({
    wsEndpoint: stagehand.connectURL(),
  });

  const videoDir = path.resolve(process.cwd(), "artifacts", "stagehand-videos");
  await mkdir(videoDir, { recursive: true });

  const context = await browser.newContext({
    recordVideo: {
      dir: videoDir,
      size: { width: 1280, height: 720 },
    },
  });
  const page = await context.newPage();

  // Capture the handle before closing the context so we can read the video path afterwards.
  const video = page.video();

  try {
    await page.goto("https://docs.stagehand.dev/first-steps/quickstart", {
      waitUntil: "domcontentloaded",
    });

    // Route the action to the recorded page explicitly, like extract() below,
    // instead of relying on whichever page Stagehand considers active.
    await stagehand.act(
      "click the introduction div in the first steps section",
      { page },
    );

    const { primitives } = await stagehand.extract(
      "list the four Stagehand primitives that are described on the page",
      z.object({
        primitives: z.array(z.string()),
      }),
      { page },
    );

    console.log("Stagehand primitives:", primitives.join(", "));
  } finally {
    // Close the context even when a step above throws; the recording is only
    // written out once the context is closed.
    await context.close();
  }

  if (video) {
    const videoPath = await video.path();
    console.log(`Playwright saved the video to ${videoPath}`);
  } else {
    console.log("Video recording was not enabled for this context.");
  }
}

(async () => {
  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 1,
    model: "google/gemini-2.5-flash",
  });

  try {
    await stagehand.init();
    await recordPlaywrightVideo(stagehand);
  } finally {
    await stagehand.close().catch(() => {});
  }
})();

================================================
FILE: packages/core/examples/v3/returnXpath.ts
================================================
import { Stagehand } from "../../lib/v3/index.js";

/**
 * Clicks at fixed viewport coordinates with `returnXpath: true`, then fills
 * the element addressed by the returned XPath through `deepLocator`.
 */
async function example(stagehand: Stagehand) {
  const page = stagehand.context.pages()[0];
  await page.goto(
    "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-closed-shadow-dom/",
  );

  const xpath = await page.click(286, 628, { returnXpath: true });

  // use the xpath that was returned from our coordinate click
  await page.deepLocator(xpath).fill("hellooooooooo");
}

(async () => {
  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 0,
    model: "openai/gpt-4.1",
  });

  try {
    await stagehand.init();
    await example(stagehand);
  } finally {
    // Always shut the browser down; ignore close() errors so they cannot
    // mask a failure from the example itself.
    await stagehand.close().catch(() => {});
  }
})();

================================================
FILE: packages/core/examples/v3/shadowRoot.ts
================================================
import { Stagehand } from "../../lib/v3/index.js";

/**
 * Clicks the same button inside a closed shadow root twice: first through an
 * XPath locator, then (after a reload) through a CSS selector.
 */
async function example(stagehand: Stagehand) {
  const page = stagehand.context.pages()[0];
  await page.goto(
    "https://browserbase.github.io/stagehand-eval-sites/sites/shadow-dom-closed/",
  );

  // clicking in closed mode shadow root with an xpath
  await page.locator("/html/body/shadow-demo//div/button").click();

  // Fixed 3s pauses around the reload, kept from the original example.
  await new Promise((resolve) => setTimeout(resolve, 3000));
  await page.reload();
  await new Promise((resolve) => setTimeout(resolve, 3000));

  // clicking in closed mode shadow root with css selector
  await page.locator("div > button").click();
}

(async () => {
  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 0,
    model: "openai/gpt-4.1",
  });

  try {
    await stagehand.init();
    await example(stagehand);
  } finally {
    // Always shut the browser down; ignore close() errors so they cannot
    // mask a failure from the example itself.
    await stagehand.close().catch(() => {});
  }
})();

================================================
FILE: packages/core/examples/v3/targetedExtract.ts
================================================
import { Stagehand } from "../../lib/v3/index.js";
import { z } from "zod";

/**
 * Highlights a paragraph inside an iframe via `deepLocator`, then runs an
 * `extract()` that is scoped with the `selector` option to an XPath that
 * crosses the iframe boundary.
 */
async function example(stagehand: Stagehand) {
  const page = stagehand.context.pages()[0];
  await page.goto(
    "https://ambarc.github.io/web-element-test/stagehand-breaking-test.html",
  );

  await page
    .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p")
    .highlight({
      durationMs: 5000,
      contentColor: { r: 255, g: 0, b: 0 },
    });

  const reason = await stagehand.extract(
    "extract the reason why script injection fails",
    z.string(),
    // selector: "// body > div.test-container > div:nth-child(3) > iframe >> body > p:nth-child(3)",
    { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" },
  );
  console.log(reason);
}

(async () => {
  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 0,
    model: "openai/gpt-4.1",
    logInferenceToFile: true,
  });

  try {
    await stagehand.init();
    await example(stagehand);
  } finally {
    // Always shut the browser down; ignore close() errors so they cannot
    // mask a failure from the example itself.
    await stagehand.close().catch(() => {});
  }
})();

================================================
FILE: packages/core/examples/v3/v3_agent.ts
================================================
import chalk from "chalk";
import { V3 } from "../../lib/v3/index.js";

const INSTRUCTION = "scroll down and click on the last hn story";

/**
 * Starts a local V3 instance, opens the iframe-hn eval site and lets a
 * non-CUA agent execute `INSTRUCTION` (at most 20 steps), printing the result.
 * Errors raised after `init()` are logged rather than rethrown.
 */
async function main() {
  console.log(`\n${chalk.bold("Stagehand V3 🤘 Operator Example")}\n`);

  // Initialize Stagehand
  const v3 = new V3({
    env: "LOCAL",
    verbose: 2,
  });

  await v3.init();

  try {
    const startPage = v3.context.pages()[0];
    await startPage.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-hn/",
    );

    const agent = v3.agent({
      cua: false,
      model: "google/gemini-2.0-flash",
      executionModel: "google/gemini-2.0-flash",
    });

    // {
    //   model: "computer-use-preview-2025-03-11",
    //   provider: "openai",
    // }

    // Execute the agent
    console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`);

    const result = await agent.execute({
      instruction: INSTRUCTION,
      maxSteps: 20,
    });

    console.log(`${chalk.green("✓")} Execution complete`);
    console.log(`${chalk.yellow("⤷")} Result:`);
    console.log(JSON.stringify(result, null, 2));
    console.log(chalk.white(result.message));
  } catch (error) {
    console.log(`${chalk.red("✗")} Error: ${error}`);
  } finally {
    // NOTE(review): close() is intentionally left disabled here, presumably to
    // keep the browser open for inspection after the run — confirm.
    // await v3.close();
  }
}

main();

================================================
FILE: packages/core/examples/v3_example.ts
================================================
import { V3 } from "../lib/v3/index.js";
import { z } from "zod";

/**
 * Loads an apartments.com search page and extracts every listing's price and
 * address into a typed array, then logs the listings and their count.
 */
async function example(v3: V3) {
  const page = v3.context.pages()[0];
  await page.goto("https://www.apartments.com/san-francisco-ca/2-bedrooms/", {
    waitUntil: "load",
  });

  const apartment_listings = await v3.extract(
    "Extract all the apartment listings with their prices and their addresses.",
    z.object({
      listings: z.array(
        z.object({
          price: z.string().describe("The price of the listing"),
          address: z.string().describe("The address of the listing"),
        }),
      ),
    }),
  );

  const listings = apartment_listings.listings;
  console.log(listings);
  console.log(`found ${listings.length} listings`);
}

(async () => {
  const v3 = new V3({
    env: "LOCAL",
    verbose: 2,
    logInferenceToFile: false,
    model: "google/gemini-2.0-flash",
    cacheDir: "stagehand-extract-cache",
  });

  try {
    await v3.init();
    await example(v3);
  } finally {
    // Always shut the browser down; ignore close() errors so they cannot
    // mask a failure from the example itself.
    await v3.close().catch(() => {});
  }
})();

================================================
FILE: packages/core/examples/wordle.ts
================================================
import { Stagehand } from "../lib/v3/index.js";

/**
 * Opens NYT Wordle on Browserbase, dismisses the intro dialogs with `act()`,
 * types the guess "WORDS" one letter at a time and submits it.
 */
async function example() {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
  });

  try {
    await stagehand.init();
    const page = stagehand.context.pages()[0];
    await page.goto("https://www.nytimes.com/games/wordle/index.html");

    await stagehand.act("click 'Continue'");
    await stagehand.act("click 'Play'");
    await stagehand.act("click cross sign on top right of 'How To Play' card");

    const word = "WORDS";
    for (const letter of word) {
      await stagehand.act(`press ${letter}`);
    }
    await stagehand.act("press enter");
  } finally {
    // Release the session even when one of the steps above throws; ignore
    // close() errors so they cannot mask the original failure.
    await stagehand.close().catch(() => {});
  }
}

(async () => {
  await example();
})();

================================================ FILE: packages/core/lib/CHANGELOG.md ================================================ # @browserbasehq/stagehand-lib ## 2.4.1 ### Patch Changes - [#1027](https://github.com/browserbase/stagehand/pull/1027) [`455b61f`](https://github.com/browserbase/stagehand/commit/455b61fb6f7a34ae50d7e7c76c1d639241e213d6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - Fixed small issue with module-level state guard for the Playwright selectors.register call ## 2.4.0 ### Minor Changes - [#778](https://github.com/browserbase/stagehand/pull/778) [`df570b6`](https://github.com/browserbase/stagehand/commit/df570b67e46febcaf7282ffb65dd5707e2808152) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - iframe support ### Patch Changes - [#809](https://github.com/browserbase/stagehand/pull/809) [`03ebebc`](https://github.com/browserbase/stagehand/commit/03ebebc0317f92d8de77285cc2e66dc0131fe9fe) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - log NoObjectGenerated error details - [#801](https://github.com/browserbase/stagehand/pull/801) [`1d4f0ab`](https://github.com/browserbase/stagehand/commit/1d4f0abca47bf47ae8b7aeb53f3cd1155a7e5448) Thanks [@miguelg719](https://github.com/miguelg719)! - Default use API to true - [#798](https://github.com/browserbase/stagehand/pull/798) [`d86200b`](https://github.com/browserbase/stagehand/commit/d86200bd5bde4c5ba113ca89e28ab86c14a8304e) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix pino logging memory leak by reusing worker ## 2.3.0 ### Minor Changes - [#731](https://github.com/browserbase/stagehand/pull/731) [`393c8e0`](https://github.com/browserbase/stagehand/commit/393c8e05d016086e481c0043ee6b084c61886cad) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - make extract() with no arguments return the hybrid tree instead of text-rendered webpage - [#737](https://github.com/browserbase/stagehand/pull/737) [`6ef6073`](https://github.com/browserbase/stagehand/commit/6ef60730cab0ad9025f44b6eeb2c83751d1dcd35) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - deprecate useTextExtract and remove functionality ### Patch Changes - [#741](https://github.com/browserbase/stagehand/pull/741) [`5680d25`](https://github.com/browserbase/stagehand/commit/5680d2509352c383ad502c9f4fabde01fa638833) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use safeparse for zod validation - [#740](https://github.com/browserbase/stagehand/pull/740) [`28840a7`](https://github.com/browserbase/stagehand/commit/28840a7d3fec89a490984582fb37fa3d007c0349) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - dont log deprecation warning when onlyVisible is undefined - [#755](https://github.com/browserbase/stagehand/pull/755) [`ba687ab`](https://github.com/browserbase/stagehand/commit/ba687abdfb598f839ddfec0442d3d7b6b696b0a3) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix context init error on undefined context - [#789](https://github.com/browserbase/stagehand/pull/789) [`c5ff8ce`](https://github.com/browserbase/stagehand/commit/c5ff8ce2d7467b70a450ca52bc3e03b15280ce1b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix noisy useTextExtract deprecation log - [#757](https://github.com/browserbase/stagehand/pull/757) [`628e534`](https://github.com/browserbase/stagehand/commit/628e534ea6d7ca081bad6c32167c7d53d4772eed) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- optimize CDP calls when building hybrid tree - [#772](https://github.com/browserbase/stagehand/pull/772) [`64d331d`](https://github.com/browserbase/stagehand/commit/64d331dc2eba86675a8b148d361897f55f170703) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixes an issue with the new tab intercepts for invalid urls - [#770](https://github.com/browserbase/stagehand/pull/770) [`d312a43`](https://github.com/browserbase/stagehand/commit/d312a43672fe2865abcf184a712a759a12f5b9d1) Thanks [@miguelg719](https://github.com/miguelg719)! - Removed default chromium flags that delay browser launching - [#753](https://github.com/browserbase/stagehand/pull/753) [`fbca400`](https://github.com/browserbase/stagehand/commit/fbca4003a547dc5eee0c0be5edc5e98c1f4d8c22) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix `stagehand.history` - [#745](https://github.com/browserbase/stagehand/pull/745) [`c54afab`](https://github.com/browserbase/stagehand/commit/c54afab0e43a2144eecbc56df7f33c5e444ceed5) Thanks [@miguelg719](https://github.com/miguelg719)! - Add an identifier for client language/runtime - [#768](https://github.com/browserbase/stagehand/pull/768) [`58b06eb`](https://github.com/browserbase/stagehand/commit/58b06eb2fdfb1a9cd84c03f46655ab0ea00ee07f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: page.evaluate: Execution context was destroyed, most likely because of a navigation - [#758](https://github.com/browserbase/stagehand/pull/758) [`98e1356`](https://github.com/browserbase/stagehand/commit/98e13566846a547003e4c9aebbe4f95eff653bba) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm unused functions - [#781](https://github.com/browserbase/stagehand/pull/781) [`8d239ce`](https://github.com/browserbase/stagehand/commit/8d239cec7a835d35243b2b00c3c00c1b66c05b5e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- fix variable parsing issue with gpt-4.1 - [#761](https://github.com/browserbase/stagehand/pull/761) [`e1f7074`](https://github.com/browserbase/stagehand/commit/e1f7074be23c82ae897386d5e5e132ff8cb4120a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - build xpaths on node side instead of using injected JS ## 2.2.1 ### Patch Changes - [#729](https://github.com/browserbase/stagehand/pull/729) [`fc24f84`](https://github.com/browserbase/stagehand/commit/fc24f848ee0f300182e88993dfe8d68025d69fcb) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix "failed to inject helper scripts" log on stagehand.close()

================================================
FILE: packages/core/lib/inference.ts
================================================
import { z } from "zod";
import { LogLine } from "./v3/types/public/logs.js";
import { ChatMessage, LLMClient } from "./v3/llm/LLMClient.js";
import { getEnvTimeoutMs, withTimeout } from "./v3/timeoutConfig.js";
import {
  buildActSystemPrompt,
  buildExtractSystemPrompt,
  buildExtractUserPrompt,
  buildMetadataPrompt,
  buildMetadataSystemPrompt,
  buildObserveSystemPrompt,
  buildObserveUserMessage,
} from "./prompt.js";
import { appendSummary, writeTimestampedTxtFile } from "./inferenceLogUtils.js";
import type {
  InferStagehandSchema,
  StagehandZodObject,
} from "./v3/zodCompat.js";
import { SupportedUnderstudyAction } from "./v3/types/private/handlers.js";

// Re-export for backward compatibility
export type { LLMParsedResponse, LLMUsage } from "./v3/llm/LLMClient.js";

/**
 * Passes `promise` through `withTimeout`, using the `LLM_MAX_MS` value from
 * `getEnvTimeoutMs` and the label `LLM <operation>`.
 *
 * NOTE(review): the exact timeout semantics live in `timeoutConfig` —
 * presumably the returned promise rejects once the limit is exceeded; confirm.
 *
 * @param promise   The pending LLM call.
 * @param operation Short name of the call, used in the timeout label.
 */
function withLlmTimeout<T>(promise: Promise<T>, operation: string): Promise<T> {
  return withTimeout(
    promise,
    getEnvTimeoutMs("LLM_MAX_MS"),
    `LLM ${operation}`,
  );
}

/**
 * Runs a schema-driven extraction followed by a metadata ("are we done?") call.
 *
 * Two LLM requests are made in sequence, each guarded by `withLlmTimeout`:
 *   1. "Extraction" — returns data shaped by `schema`.
 *   2. "Metadata"   — given the instruction and the extracted data, returns
 *      `{ progress, completed }`.
 *
 * When `logInferenceToFile` is true, each request and response is written via
 * `writeTimestampedTxtFile` and a row is appended to the extract summary.
 *
 * @returns The extracted data spread at the top level, plus `metadata` and the
 *          token counts / inference time summed over both calls.
 */
export async function extract<T extends StagehandZodObject>({
  instruction,
  domElements,
  schema,
  llmClient,
  logger,
  userProvidedInstructions,
  logInferenceToFile = false,
}: {
  instruction: string;
  domElements: string;
  schema: T;
  llmClient: LLMClient;
  userProvidedInstructions?: string;
  logger: (message: LogLine) => void;
  logInferenceToFile?: boolean;
}) {
  const metadataSchema = z.object({
    progress: z
      .string()
      .describe(
        "progress of what has been extracted so far, as concise as possible",
      ),
    completed: z
      .boolean()
      .describe(
        "true if the goal is now accomplished. Use this conservatively, only when sure that the goal has been completed.",
      ),
  });

  type ExtractionResponse = InferStagehandSchema<T>;
  type MetadataResponse = z.infer<typeof metadataSchema>;

  const isUsingAnthropic = llmClient.type === "anthropic";
  const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options

  // ---- 1. extraction call -------------------------------------------------
  const extractCallMessages: ChatMessage[] = [
    buildExtractSystemPrompt(isUsingAnthropic, userProvidedInstructions),
    buildExtractUserPrompt(instruction, domElements, isUsingAnthropic),
  ];

  let extractCallFile = "";
  let extractCallTimestamp = "";
  if (logInferenceToFile) {
    const { fileName, timestamp } = writeTimestampedTxtFile(
      "extract_summary",
      "extract_call",
      {
        modelCall: "extract",
        messages: extractCallMessages,
      },
    );
    extractCallFile = fileName;
    extractCallTimestamp = timestamp;
  }

  const extractStartTime = Date.now();
  const extractionResponse = await withLlmTimeout(
    llmClient.createChatCompletion<ExtractionResponse>({
      options: {
        messages: extractCallMessages,
        response_model: {
          schema,
          name: "Extraction",
        },
        temperature: isGPT5 ? 1 : 0.1,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0,
      },
      logger,
    }),
    "extract",
  );
  const extractEndTime = Date.now();

  const { data: extractedData, usage: extractUsage } = extractionResponse;

  if (logInferenceToFile) {
    const { fileName: extractResponseFile } = writeTimestampedTxtFile(
      "extract_summary",
      "extract_response",
      {
        modelResponse: "extract",
        rawResponse: extractedData,
      },
    );

    appendSummary("extract", {
      extract_inference_type: "extract",
      timestamp: extractCallTimestamp,
      LLM_input_file: extractCallFile,
      LLM_output_file: extractResponseFile,
      prompt_tokens: extractUsage?.prompt_tokens ?? 0,
      completion_tokens: extractUsage?.completion_tokens ?? 0,
      reasoning_tokens: extractUsage?.reasoning_tokens ?? 0,
      cached_input_tokens: extractUsage?.cached_input_tokens ?? 0,
      inference_time_ms: extractEndTime - extractStartTime,
    });
  }

  // ---- 2. metadata call ---------------------------------------------------
  const metadataCallMessages: ChatMessage[] = [
    buildMetadataSystemPrompt(),
    buildMetadataPrompt(instruction, extractedData),
  ];

  let metadataCallFile = "";
  let metadataCallTimestamp = "";
  if (logInferenceToFile) {
    const { fileName, timestamp } = writeTimestampedTxtFile(
      "extract_summary",
      "metadata_call",
      {
        modelCall: "metadata",
        messages: metadataCallMessages,
      },
    );
    metadataCallFile = fileName;
    metadataCallTimestamp = timestamp;
  }

  const metadataStartTime = Date.now();
  const metadataResponse = await withLlmTimeout(
    llmClient.createChatCompletion<MetadataResponse>({
      options: {
        messages: metadataCallMessages,
        response_model: {
          name: "Metadata",
          schema: metadataSchema,
        },
        temperature: isGPT5 ? 1 : 0.1,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0,
      },
      logger,
    }),
    "extract metadata",
  );
  const metadataEndTime = Date.now();

  const {
    data: {
      completed: metadataResponseCompleted,
      progress: metadataResponseProgress,
    },
    usage: metadataResponseUsage,
  } = metadataResponse;

  if (logInferenceToFile) {
    const { fileName: metadataResponseFile } = writeTimestampedTxtFile(
      "extract_summary",
      "metadata_response",
      {
        modelResponse: "metadata",
        completed: metadataResponseCompleted,
        progress: metadataResponseProgress,
      },
    );

    appendSummary("extract", {
      extract_inference_type: "metadata",
      timestamp: metadataCallTimestamp,
      LLM_input_file: metadataCallFile,
      LLM_output_file: metadataResponseFile,
      prompt_tokens: metadataResponseUsage?.prompt_tokens ?? 0,
      completion_tokens: metadataResponseUsage?.completion_tokens ?? 0,
      reasoning_tokens: metadataResponseUsage?.reasoning_tokens ?? 0,
      cached_input_tokens: metadataResponseUsage?.cached_input_tokens ?? 0,
      inference_time_ms: metadataEndTime - metadataStartTime,
    });
  }

  // ---- totals over both calls --------------------------------------------
  const totalPromptTokens =
    (extractUsage?.prompt_tokens ?? 0) +
    (metadataResponseUsage?.prompt_tokens ?? 0);

  const totalCompletionTokens =
    (extractUsage?.completion_tokens ?? 0) +
    (metadataResponseUsage?.completion_tokens ?? 0);

  const totalInferenceTimeMs =
    extractEndTime - extractStartTime + (metadataEndTime - metadataStartTime);

  const totalReasoningTokens =
    (extractUsage?.reasoning_tokens ?? 0) +
    (metadataResponseUsage?.reasoning_tokens ?? 0);

  const totalCachedInputTokens =
    (extractUsage?.cached_input_tokens ?? 0) +
    (metadataResponseUsage?.cached_input_tokens ?? 0);

  return {
    ...extractedData,
    metadata: {
      completed: metadataResponseCompleted,
      progress: metadataResponseProgress,
    },
    prompt_tokens: totalPromptTokens,
    completion_tokens: totalCompletionTokens,
    reasoning_tokens: totalReasoningTokens,
    cached_input_tokens: totalCachedInputTokens,
    inference_time_ms: totalInferenceTimeMs,
  };
}

/**
 * Asks the LLM for the elements in `domElements` that match `instruction`.
 *
 * The response is validated against a schema requiring, per element, an
 * `elementId` of the form `number-number`, a description, a method taken from
 * `SupportedUnderstudyAction` and a list of string arguments. The request is
 * guarded by `withLlmTimeout`, like the calls in `extract`.
 *
 * When `logInferenceToFile` is true the request/response are written via
 * `writeTimestampedTxtFile` and a row is appended to the observe summary.
 *
 * @returns `{ elements, prompt_tokens, completion_tokens, reasoning_tokens,
 *          cached_input_tokens, inference_time_ms }`; `elements` is `[]` when
 *          the response carries none.
 */
export async function observe({
  instruction,
  domElements,
  llmClient,
  userProvidedInstructions,
  logger,
  logInferenceToFile = false,
  supportedActions,
}: {
  instruction: string;
  domElements: string;
  llmClient: LLMClient;
  userProvidedInstructions?: string;
  logger: (message: LogLine) => void;
  logInferenceToFile?: boolean;
  supportedActions?: string[];
}) {
  const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options

  const observeSchema = z.object({
    elements: z
      .array(
        z.object({
          elementId: z
            .string()
            .regex(/^\d+-\d+$/)
            .describe(
              "the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'.",
            ),
          description: z
            .string()
            .describe(
              "a description of the accessible element and its purpose",
            ),
          method: z
            .enum(
              // Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
              Object.values(SupportedUnderstudyAction) as unknown as readonly [
                string,
                ...string[],
              ],
            )
            .describe(
              "the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.",
            ),
          arguments: z.array(
            z
              .string()
              .describe(
                "the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
              ),
          ),
        }),
      )
      .describe("an array of accessible elements that match the instruction"),
  });

  type ObserveResponse = z.infer<typeof observeSchema>;

  const messages: ChatMessage[] = [
    buildObserveSystemPrompt(userProvidedInstructions, supportedActions),
    buildObserveUserMessage(instruction, domElements),
  ];

  let callTimestamp = "";
  let callFile = "";
  if (logInferenceToFile) {
    const { fileName, timestamp } = writeTimestampedTxtFile(
      "observe_summary",
      "observe_call",
      {
        modelCall: "observe",
        messages,
      },
    );
    callFile = fileName;
    callTimestamp = timestamp;
  }

  const start = Date.now();
  // Guarded by the same LLM_MAX_MS timeout as the calls in extract().
  const rawResponse = await withLlmTimeout(
    llmClient.createChatCompletion<ObserveResponse>({
      options: {
        messages,
        response_model: {
          schema: observeSchema,
          name: "Observation",
        },
        temperature: isGPT5 ? 1 : 0.1,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0,
      },
      logger,
    }),
    "observe",
  );
  const end = Date.now();
  const usageTimeMs = end - start;

  const { data: observeData, usage: observeUsage } = rawResponse;
  const promptTokens = observeUsage?.prompt_tokens ?? 0;
  const completionTokens = observeUsage?.completion_tokens ?? 0;
  const reasoningTokens = observeUsage?.reasoning_tokens ?? 0;
  const cachedInputTokens = observeUsage?.cached_input_tokens ?? 0;

  if (logInferenceToFile) {
    const { fileName: responseFile } = writeTimestampedTxtFile(
      "observe_summary",
      "observe_response",
      {
        modelResponse: "observe",
        rawResponse: observeData,
      },
    );

    appendSummary("observe", {
      observe_inference_type: "observe",
      timestamp: callTimestamp,
      LLM_input_file: callFile,
      LLM_output_file: responseFile,
      prompt_tokens: promptTokens,
      completion_tokens: completionTokens,
      reasoning_tokens: reasoningTokens,
      cached_input_tokens: cachedInputTokens,
      inference_time_ms: usageTimeMs,
    });
  }

  // Coerce description/method to plain strings; default to no elements.
  const parsedElements =
    observeData.elements?.map((el) => ({
      elementId: el.elementId,
      description: String(el.description),
      method: String(el.method),
      arguments: el.arguments,
    })) ?? [];

  return {
    elements: parsedElements,
    prompt_tokens: promptTokens,
    completion_tokens: completionTokens,
    reasoning_tokens: reasoningTokens,
    cached_input_tokens: cachedInputTokens,
    inference_time_ms: usageTimeMs,
  };
}

/**
 * Asks the LLM for the single element + method that carries out `instruction`.
 *
 * Uses the act system prompt together with the observe user message. The
 * response schema matches one `observe` element plus a `twoStep` boolean. The
 * request is guarded by `withLlmTimeout`, like the calls in `extract`.
 *
 * When `logInferenceToFile` is true the request/response are written via
 * `writeTimestampedTxtFile` and a row is appended to the act summary.
 *
 * @returns `{ element, twoStep, prompt_tokens, completion_tokens,
 *          reasoning_tokens, cached_input_tokens, inference_time_ms }`.
 */
export async function act({
  instruction,
  domElements,
  llmClient,
  userProvidedInstructions,
  logger,
  logInferenceToFile = false,
}: {
  instruction: string;
  domElements: string;
  llmClient: LLMClient;
  userProvidedInstructions?: string;
  logger: (message: LogLine) => void;
  logInferenceToFile?: boolean;
}) {
  const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options

  const actSchema = z.object({
    elementId: z
      .string()
      .regex(/^\d+-\d+$/)
      .describe(
        "the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'.",
      ),
    description: z
      .string()
      .describe("a description of the accessible element and its purpose"),
    method: z
      .enum(
        // Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
        Object.values(SupportedUnderstudyAction) as unknown as readonly [
          string,
          ...string[],
        ],
      )
      .describe(
        "the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.",
      ),
    arguments: z.array(
      z
        .string()
        .describe(
          "the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
        ),
    ),
    twoStep: z.boolean(),
  });

  type ActResponse = z.infer<typeof actSchema>;

  const messages: ChatMessage[] = [
    buildActSystemPrompt(userProvidedInstructions),
    buildObserveUserMessage(instruction, domElements),
  ];

  let callTimestamp = "";
  let callFile = "";
  if (logInferenceToFile) {
    const { fileName, timestamp } = writeTimestampedTxtFile(
      "act_summary",
      "act_call",
      {
        modelCall: "act",
        messages,
      },
    );
    callFile = fileName;
    callTimestamp = timestamp;
  }

  const start = Date.now();
  // Guarded by the same LLM_MAX_MS timeout as the calls in extract().
  const rawResponse = await withLlmTimeout(
    llmClient.createChatCompletion<ActResponse>({
      options: {
        messages,
        response_model: {
          schema: actSchema,
          name: "act",
        },
        temperature: isGPT5 ? 1 : 0.1,
        top_p: 1,
        frequency_penalty: 0,
        presence_penalty: 0,
      },
      logger,
    }),
    "act",
  );
  const end = Date.now();
  const usageTimeMs = end - start;

  const { data: actData, usage: actUsage } = rawResponse;
  const promptTokens = actUsage?.prompt_tokens ?? 0;
  const completionTokens = actUsage?.completion_tokens ?? 0;
  const reasoningTokens = actUsage?.reasoning_tokens ?? 0;
  const cachedInputTokens = actUsage?.cached_input_tokens ?? 0;

  if (logInferenceToFile) {
    const { fileName: responseFile } = writeTimestampedTxtFile(
      "act_summary",
      "act_response",
      {
        modelResponse: "act",
        rawResponse: actData,
      },
    );

    appendSummary("act", {
      act_inference_type: "act",
      timestamp: callTimestamp,
      LLM_input_file: callFile,
      LLM_output_file: responseFile,
      prompt_tokens: promptTokens,
      completion_tokens: completionTokens,
      reasoning_tokens: reasoningTokens,
      cached_input_tokens: cachedInputTokens,
      inference_time_ms: usageTimeMs,
    });
  }

  // Coerce description/method to plain strings, as observe() does.
  const parsedElement = {
    elementId: actData.elementId,
    description: String(actData.description),
    method: String(actData.method),
    arguments: actData.arguments,
  };

  return {
    element: parsedElement,
    prompt_tokens: promptTokens,
    completion_tokens: completionTokens,
    reasoning_tokens: reasoningTokens,
    cached_input_tokens: cachedInputTokens,
    inference_time_ms: usageTimeMs,
    twoStep: actData.twoStep,
  };
}

================================================
FILE: packages/core/lib/inferenceLogUtils.ts
================================================
import fs from "fs";
import path from "path";

/**
 * Create (or ensure) a parent directory named "inference_summary" under the
 * current working directory and return its path.
 */
function ensureInferenceSummaryDir(): string {
  const inferenceDir = path.join(process.cwd(), "inference_summary");
  // With `recursive: true`, mkdirSync does nothing when the directory already
  // exists, so no separate existence check is needed.
  fs.mkdirSync(inferenceDir, { recursive: true });
  return inferenceDir;
}

/**
 * Appends `entry` to the `<inferenceType>_summary` array stored in that
 * inference type's summary JSON file (e.g. `act_summary.json` for "act"),
 * then writes the whole file back out.
 *
 * @param inferenceType Summary family, e.g. "act", "observe" or "extract".
 * @param entry         The row to append.
 */
export function appendSummary<T>(inferenceType: string, entry: T) {
  const summaryPath = getSummaryJsonPath(inferenceType);
  const arrayKey = `${inferenceType}_summary`;

  const existingData = readSummaryFile(inferenceType);
  existingData[arrayKey].push(entry);

  fs.writeFileSync(summaryPath, JSON.stringify(existingData, null, 2));
}

/** A simple timestamp utility for filenames. 
*/
function getTimestamp(): string {
  // e.g. "2024-01-02T03:04:05.678Z" -> "20240102_030405678"
  return new Date()
    .toISOString()
    .replace(/[^0-9T]/g, "")
    .replace("T", "_");
}

/**
 * Writes `data` as JSON into a file in `directory`, using a prefix plus timestamp.
 * Returns both the file name and the timestamp used, so you can log them.
 *
 * @param directory Sub-directory of "inference_summary" to write into.
 * @param prefix    Suffix of the file name: `<timestamp>_<prefix>.txt`.
 * @param data      Value to serialise with `JSON.stringify`.
 */
export function writeTimestampedTxtFile(
  directory: string,
  prefix: string,
  data: unknown,
): { fileName: string; timestamp: string } {
  const subDir = path.join(ensureInferenceSummaryDir(), directory);
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(subDir, { recursive: true });

  const timestamp = getTimestamp();
  const fileName = `${timestamp}_${prefix}.txt`;
  const filePath = path.join(subDir, fileName);

  fs.writeFileSync(
    filePath,
    // Turn escaped "\n" sequences into real newlines so multi-line strings
    // (e.g. prompts) are readable in the .txt output.
    JSON.stringify(data, null, 2).replace(/\\n/g, "\n"),
  );

  return { fileName, timestamp };
}

/**
 * Returns the path to the `<inferenceType>_summary.json` file, creating its
 * directory when needed.
 *
 * For example, if `inferenceType = "act"`, this will be:
 *   `./inference_summary/act_summary/act_summary.json`
 */
function getSummaryJsonPath(inferenceType: string): string {
  const subDir = path.join(
    ensureInferenceSummaryDir(),
    `${inferenceType}_summary`,
  );
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(subDir, { recursive: true });
  return path.join(subDir, `${inferenceType}_summary.json`);
}

/**
 * Reads the `<inferenceType>_summary.json` file, returning an object
 * with the top-level array named `<inferenceType>_summary`, if it exists.
 *
 * E.g. if inferenceType is "act", we expect a shape like:
 * {
 *   "act_summary": [ ... ]
 * }
 *
 * If the file is missing or unparsable, or the array is missing, returns
 * { "<inferenceType>_summary": [] }.
 */
function readSummaryFile<T = unknown>(
  inferenceType: string,
): Record<string, T[]> {
  const summaryPath = getSummaryJsonPath(inferenceType);

  // The top-level array key, e.g. "act_summary", "observe_summary", "extract_summary"
  const arrayKey = `${inferenceType}_summary`;

  if (!fs.existsSync(summaryPath)) {
    return { [arrayKey]: [] };
  }

  try {
    const raw = fs.readFileSync(summaryPath, "utf8");
    const parsed = JSON.parse(raw);
    if (
      parsed &&
      typeof parsed === "object" &&
      Array.isArray(parsed[arrayKey])
    ) {
      return parsed;
    }
  } catch {
    // If we fail to parse for any reason, fall back to empty array
  }
  return { [arrayKey]: [] };
}

================================================
FILE: packages/core/lib/logger.ts
================================================
import pino from "pino";
import { LogLine } from "./v3/types/public/logs.js";

// Map our existing levels to Pino's standard levels
const levelMapping: Record<number, pino.Level> = {
  0: "error", // Critical/important messages
  1: "info", // Standard information
  2: "debug", // Detailed debugging information
};

// Define configuration options
export interface LoggerOptions {
  pretty?: boolean;
  level?: pino.Level;
  destination?: pino.DestinationStream;
  usePino?: boolean; // Whether to use pino (default: true)
}

/**
 * Creates a configured Pino logger instance.
 *
 * When `options.pretty` is set and we are not in a test environment, the
 * logger is built with the `pino-pretty` transport. If constructing that
 * logger throws (for instance because `pino-pretty` cannot be resolved), a
 * warning is printed and a logger without a transport is returned instead.
 *
 * @param options Level, pretty-printing flag and optional destination stream.
 * @returns The configured Pino logger.
 */
export function createLogger(options: LoggerOptions = {}) {
  const loggerConfig: pino.LoggerOptions = {
    level: options.level || "info",
    base: undefined, // Don't include pid and hostname
    browser: {
      asObject: true,
    },
    // Disable worker threads to avoid issues in tests
    transport: undefined,
  };

  // Add pretty printing for dev environments only if explicitly requested
  // and not in a test environment
  if (options.pretty && !isTestEnvironment()) {
    try {
      // The pino() call itself must sit inside the try: that is where an
      // unavailable transport target surfaces, not when the config object
      // is assembled.
      return pino(
        {
          ...loggerConfig,
          transport: {
            target: "pino-pretty",
            options: {
              colorize: true,
              translateTime: "SYS:standard",
              ignore: "pid,hostname",
            },
          },
        },
        options.destination,
      );
    } catch {
      console.warn(
        "pino-pretty not available, falling back to standard logging",
      );
    }
  }

  return pino(loggerConfig, options.destination);
}

/**
 * Check if we're running in a test environment
 */
function isTestEnvironment(): boolean {
  const env = process.env;
  return (
    env.NODE_ENV === "test" ||
    env.JEST_WORKER_ID !== undefined ||
    env.PLAYWRIGHT_TEST_BASE_DIR !== undefined ||
    // CI runs are treated as test environments as well
    env.CI === "true"
  );
}

/**
 * StagehandLogger class that wraps Pino for our specific needs
 *
 * LOGGING PRECEDENCE:
 *
 * Test environments:
 * - External logger provided -> external logger only.
 * - No external logger -> console fallback only (Pino disabled).
 *
 * Non-test environments:
 * - usePino === true -> emit via Pino and also call the external logger when present.
 * - usePino === false -> disable Pino; use the external logger when present, otherwise console fallback.
 * - usePino === undefined -> prefer the external logger when present; otherwise use Pino.
 *
 * SHARED PINO OPTIMIZATION:
 * We maintain a single shared Pino instance when `usePino` is enabled.
 * This prevents spawning a new worker thread for every Stagehand instance
 * (which happens when `pino-pretty` transport is used), eliminating the
 * memory/RSS growth observed when many Stagehand objects are created and
 * disposed within the same process (e.g. a request-per-instance API).
 *
 * Each StagehandLogger logs through its own child of the shared instance.
 * A child writes to the same destination/transport but carries its own
 * `level`, so `setVerbosity` on one StagehandLogger cannot silence or
 * un-silence another one.
 */
export class StagehandLogger {
  /**
   * Shared Pino logger instance across all StagehandLogger instances.
   * First instance to enable Pino creates it, subsequent instances reuse it.
   */
  private static sharedPinoLogger: pino.Logger | null = null;

  private logger?: pino.Logger;
  private verbose: 0 | 1 | 2;
  private externalLogger?: (logLine: LogLine) => void;
  private usePino: boolean;
  private isTest: boolean;

  /**
   * @param options - Logger options; only used to create the shared Pino
   *   instance the first time Pino is enabled.
   * @param externalLogger - Optional callback that receives every emitted LogLine.
   */
  constructor(
    options: LoggerOptions = {},
    externalLogger?: (logLine: LogLine) => void,
  ) {
    this.isTest = isTestEnvironment();
    this.externalLogger = externalLogger;

    if (this.isTest) {
      this.usePino = false;
    } else if (typeof options.usePino === "boolean") {
      this.usePino = options.usePino;
    } else {
      // No explicit preference: Pino only when nobody supplied a logger.
      this.usePino = typeof externalLogger !== "function";
    }

    if (this.usePino) {
      // Re-use (or create) a single shared Pino logger instance
      if (!StagehandLogger.sharedPinoLogger) {
        StagehandLogger.sharedPinoLogger = createLogger(options);
      }
      // Per-instance child: same output stream, independent level.
      this.logger = StagehandLogger.sharedPinoLogger.child({});
    }

    this.verbose = 1; // Default verbosity level
  }

  /**
   * Set the verbosity level (0 = errors only, 1 = info, 2 = debug) for this
   * instance, and mirror it onto this instance's Pino child logger.
   */
  setVerbosity(level: 0 | 1 | 2) {
    this.verbose = level;

    if (this.usePino && this.logger) {
      // Map our verbosity levels to Pino log levels
      this.logger.level = levelMapping[level];
    }
  }

  /**
   * Log a message using our LogLine format
   */
  log(logLine: LogLine): void {
    const level = logLine.level ?? 1;

    // Skip logs above verbosity level
    if (level > this.verbose) {
      return;
    }

    // For test environments WITHOUT an external logger OR for cases where Pino
    // is disabled and no external logger is provided, fall back to console.* so
    // users still see logs (non-colourised).
    const shouldFallbackToConsole =
      !this.externalLogger && (!this.usePino || this.isTest);

    if (shouldFallbackToConsole) {
      this.logToConsole(logLine);
      return; // already handled via console output, avoid duplicate logging
    }

    if (this.usePino && this.logger) {
      // levelMapping only yields "error" | "info" | "debug"
      const pinoLevel: pino.Level = levelMapping[level] ?? "info";

      // Structure the log data
      const logData = {
        category: logLine.category,
        timestamp: logLine.timestamp || new Date().toISOString(),
        ...this.formatAuxiliaryData(logLine.auxiliary),
      };

      this.logger[pinoLevel](logData, logLine.message);
    }

    // IMPORTANT: External logger receives logs ALWAYS when provided (takes precedence)
    // This ensures user-provided loggers (e.g., EvalLogger, custom loggers) capture all logs
    // regardless of Pino configuration. Pino is used for console output, external logger
    // is used for programmatic log capture.
    if (this.externalLogger) {
      this.externalLogger(logLine);
    }
  }

  /**
   * Console fallback used when neither Pino nor an external logger handles
   * the line. Format: `[timestamp] LEVEL: message`, followed by one line per
   * auxiliary entry.
   */
  private logToConsole(logLine: LogLine): void {
    const level = logLine.level ?? 1;
    const ts = logLine.timestamp ?? new Date().toISOString();
    const levelStr = level === 0 ? "ERROR" : level === 2 ? "DEBUG" : "INFO";

    const lines = [`[${ts}] ${levelStr}: ${logLine.message}`];

    // Add auxiliary data on separate indented lines (like Pino pretty format)
    if (logLine.auxiliary) {
      const formattedData = this.formatAuxiliaryData(logLine.auxiliary);
      for (const [key, value] of Object.entries(formattedData)) {
        const rendered =
          typeof value === "object" && value !== null
            ? // Pretty print objects with indentation
              JSON.stringify(value, null, 2)
                .split("\n")
                .map((line, i) => (i === 0 ? line : ` ${line}`))
                .join("\n")
            : String(value);
        lines.push(` ${key}: ${rendered}`);
      }
    }

    const output = lines.join("\n");
    switch (level) {
      case 0:
        console.error(output);
        break;
      case 1:
        console.log(output);
        break;
      case 2:
        console.debug(output);
        break;
    }
  }

  /**
   * Helper to format auxiliary data for structured logging.
   * Converts each `{ value, type }` entry back to a typed value and drops
   * undefined values and empty objects/arrays.
   */
  private formatAuxiliaryData(auxiliary?: LogLine["auxiliary"]) {
    if (!auxiliary) return {};

    const formattedData: Record<string, unknown> = {};

    for (const [key, { value, type }] of Object.entries(auxiliary)) {
      let formattedValue: unknown;

      // Convert values based on their type
      switch (type) {
        case "integer":
          formattedValue = parseInt(value, 10);
          break;
        case "float":
          formattedValue = parseFloat(value);
          break;
        case "boolean":
          formattedValue = value === "true";
          break;
        case "object":
          try {
            formattedValue = JSON.parse(value);
          } catch {
            // Not valid JSON: keep the raw string
            formattedValue = value;
          }
          break;
        default:
          formattedValue = value;
      }

      // Skip undefined values and empty objects/arrays
      if (formattedValue === undefined) continue;
      if (typeof formattedValue === "object" && formattedValue !== null) {
        const isEmpty = Array.isArray(formattedValue)
          ? formattedValue.length === 0
          : Object.keys(formattedValue).length === 0;
        if (isEmpty) continue;
      }

      formattedData[key] = formattedValue;
    }

    return formattedData;
  }

  /**
   * Convenience methods for different log levels
   */
  error(message: string, data?: Record<string, unknown>): void {
    this.log({
      message,
      level: 0,
      auxiliary: this.convertToAuxiliary(data),
    });
  }

  /** Warnings are emitted at level 1 with category "warning". */
  warn(message: string, data?: Record<string, unknown>): void {
    this.log({
      message,
      level: 1,
      category: "warning",
      auxiliary: this.convertToAuxiliary(data),
    });
  }

  info(message: string, data?: Record<string, unknown>): void {
    this.log({
      message,
      level: 1,
      auxiliary: this.convertToAuxiliary(data),
    });
  }

  debug(message: string, data?: Record<string, unknown>): void {
    this.log({
      message,
      level: 2,
      auxiliary: this.convertToAuxiliary(data),
    });
  }

  /**
   * Convert a plain object to our auxiliary format.
   * Entries whose value is `undefined` are skipped. Objects are stored as
   * JSON; a value that cannot be serialised (e.g. a circular structure) is
   * stored via `String(value)` so that logging never throws.
   */
  private convertToAuxiliary(
    data?: Record<string, unknown>,
  ): LogLine["auxiliary"] {
    if (!data) return undefined;

    const auxiliary: LogLine["auxiliary"] = {};

    for (const [key, value] of Object.entries(data)) {
      if (value === undefined) continue;

      const type = typeof value;

      let serialized: string;
      if (type === "object") {
        try {
          serialized = JSON.stringify(value) ?? String(value);
        } catch {
          serialized = String(value);
        }
      } else {
        serialized = String(value);
      }

      auxiliary[key] = {
        value: serialized,
        type:
          type === "number"
            ? Number.isInteger(value)
              ? "integer"
              : "float"
            : type === "boolean"
              ? "boolean"
              : type === "object"
                ? "object"
                : "string",
      };
    }

    return auxiliary;
  }
}

================================================
FILE: packages/core/lib/modelUtils.ts
================================================
import { ClientOptions, ModelConfiguration } from "./v3/types/public/model.js";
import {
  AVAILABLE_CUA_MODELS,
  AvailableCuaModel,
} from "./v3/types/public/agent.js";

/**
 * Returns the model name from either accepted format: the string itself, or
 * the `modelName` property of a configuration object. Returns `undefined`
 * when `model` is missing (or an empty string).
 */
export function extractModelName(
  model?: string | { modelName: string; [key: string]: unknown },
): string | undefined {
  if (!model) return undefined;
  return typeof model === "string" ? model : model.modelName;
}

/**
 * Splits `"provider/model"` at the FIRST slash, so the model part may itself
 * contain slashes. When `model` contains no slash, `provider` is the empty
 * string and `modelName` is the whole input.
 */
export function splitModelName(model: string): {
  provider: string;
  modelName: string;
} {
  const firstSlashIndex = model.indexOf("/");
  return {
    provider: model.substring(0, firstSlashIndex),
    modelName: model.substring(firstSlashIndex + 1),
  };
}

/**
 * Resolves a model given as a string or configuration object into provider,
 * model name, remaining client options and a CUA flag.
 *
 * - With an explicit `provider` in the configuration, the model name is passed
 *   through unsplit and `isCua` is true.
 * - Otherwise the model string is split with `splitModelName`, and `isCua` is
 *   true only when the full string is listed in `AVAILABLE_CUA_MODELS`.
 */
export function resolveModel(model: string | ModelConfiguration): {
  provider: string;
  modelName: string;
  clientOptions: ClientOptions;
  isCua: boolean;
} {
  const modelString = extractModelName(model)!;
  const clientOptions =
    typeof model === "string"
      ? {}
      : (() => {
          // eslint-disable-next-line @typescript-eslint/no-unused-vars
          const { modelName: _, ...rest } = model;
          return rest;
        })();

  // Check if provider is explicitly set in clientOptions
  const hasExplicitProvider = clientOptions.provider !== undefined;

  // If provider is explicitly set, don't split the model name - pass it through as-is
  let provider: string;
  let parsedModelName: string;
  if (hasExplicitProvider) {
    provider = clientOptions.provider as string;
    parsedModelName = modelString; // Keep the full model name
  } else {
    // Parse the model string normally
    const split = splitModelName(modelString);
    provider = split.provider;
    parsedModelName = split.modelName;
  }

  // Check if it's a CUA model
  const isCua =
    hasExplicitProvider ||
    AVAILABLE_CUA_MODELS.includes(modelString as AvailableCuaModel);

  return {
    provider,
    modelName: parsedModelName,
    clientOptions,
    isCua,
  };
}

================================================
FILE: packages/core/lib/prompt.ts
================================================
import { ChatMessage } from "./v3/llm/LLMClient.js";
import type { Variables } from "./v3/types/public/agent.js";

/**
 * Renders the "Custom Instructions" section appended to system prompts.
 * Returns an empty string when no instructions were provided.
 */
export function buildUserInstructionsString(
  userProvidedInstructions?: string,
): string {
  if (!userProvidedInstructions) {
    return "";
  }

  return `\n\n# Custom Instructions Provided by the User Please keep the user's instructions in mind when performing actions. If the user's instructions are not relevant to the current task, ignore them. User Instructions: ${userProvidedInstructions}`;
}

// extract
export function buildExtractSystemPrompt(
  isUsingPrintExtractedDataTool: boolean = false,
  userProvidedInstructions?: string,
): ChatMessage {
  const baseContent = `You are extracting content on behalf of a user. If a user asks you to extract a 'list' of information, or 'all' information, YOU MUST EXTRACT ALL OF THE INFORMATION THAT THE USER REQUESTS. You will be given: 1. An instruction 2. 
`;
  const contentDetail = `A list of DOM elements to extract from.`;

  const instructions = ` Print the exact text from the DOM elements with all symbols, characters, and endlines as is. Print null or an empty string if no new information is found. `.trim();

  // Only present when the caller uses the print_extracted_data tool.
  const toolInstructions = isUsingPrintExtractedDataTool
    ? ` ONLY print the content using the print_extracted_data tool provided. ONLY print the content using the print_extracted_data tool provided. `.trim()
    : "";

  const additionalInstructions =
    "If a user is attempting to extract links or URLs, you MUST respond with ONLY the IDs of the link elements. \n" +
    "Do not attempt to extract links directly from the text unless absolutely necessary. ";

  const userInstructions = buildUserInstructionsString(
    userProvidedInstructions,
  );

  // All sections are concatenated and every whitespace run (including
  // newlines) is collapsed to a single space.
  const content =
    `${baseContent}${contentDetail}\n\n${instructions}\n${toolInstructions}${
      additionalInstructions ? `\n\n${additionalInstructions}` : ""
    }${userInstructions ? `\n\n${userInstructions}` : ""}`.replace(/\s+/g, " ");

  return {
    role: "system",
    content,
  };
}

/**
 * Builds the user message for an extract call: the instruction followed by
 * the DOM text. When `isUsingPrintExtractedDataTool` is true, a reminder to
 * answer through the print_extracted_data tool is appended.
 */
export function buildExtractUserPrompt(
  instruction: string,
  domElements: string,
  isUsingPrintExtractedDataTool: boolean = false,
): ChatMessage {
  let content = `Instruction: ${instruction} DOM: ${domElements}`;

  if (isUsingPrintExtractedDataTool) {
    content += ` ONLY print the content using the print_extracted_data tool provided. ONLY print the content using the print_extracted_data tool provided.`;
  }

  return {
    role: "user",
    content,
  };
}

// System prompt text returned verbatim by buildMetadataSystemPrompt().
const metadataSystemPrompt = `You are an AI assistant tasked with evaluating the progress and completion status of an extraction task. Analyze the extraction response and determine if the task is completed or if more information is needed. Strictly abide by the following criteria: 1. Once the instruction has been satisfied by the current extraction response, ALWAYS set completion status to true and stop processing, regardless of remaining chunks. 2. 
Only set completion status to false if BOTH of these conditions are true: - The instruction has not been satisfied yet - There are still chunks left to process (chunksTotal > chunksSeen)`;

/** Wraps `metadataSystemPrompt` in a system-role chat message. */
export function buildMetadataSystemPrompt(): ChatMessage {
  return {
    role: "system",
    content: metadataSystemPrompt,
  };
}

/**
 * Builds the user message for the metadata (completion-status) call: the
 * instruction plus the extraction response pretty-printed as JSON.
 */
export function buildMetadataPrompt(
  instruction: string,
  extractionResponse: object,
): ChatMessage {
  return {
    role: "user",
    content: `Instruction: ${instruction} Extracted content: ${JSON.stringify(extractionResponse, null, 2)}`,
  };
}

// observe
/**
 * Builds the system message for observe.
 *
 * @param userProvidedInstructions - Optional custom instructions; appended
 *   after the base prompt, separated by a blank line, when non-empty.
 * @param supportedActions - When non-empty, listed in the prompt as
 *   "Supported actions: a, b, c".
 */
export function buildObserveSystemPrompt(
  userProvidedInstructions?: string,
  supportedActions?: string[],
): ChatMessage {
  const actionsString = supportedActions?.length
    ? `\n\nSupported actions: ${supportedActions.join(", ")}`
    : "";

  const observeSystemPrompt = ` You are helping the user automate the browser by finding elements based on what the user wants to observe in the page. You will be given: 1. a instruction of elements to observe 2. a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree. Return an array of elements that match the instruction if they exist, otherwise return an empty array. When returning elements, include the appropriate method from the supported actions list.${actionsString}. 
When choosing non-left click actions, provide right or middle as the argument.`;
  // Collapse every whitespace run (including the "\n\n" in actionsString) to one space.
  const content = observeSystemPrompt.replace(/\s+/g, " ");

  return {
    role: "system",
    // filter(Boolean) drops the empty string returned when there are no user instructions.
    content: [content, buildUserInstructionsString(userProvidedInstructions)]
      .filter(Boolean)
      .join("\n\n"),
  };
}

/** Builds the user message for observe: the instruction and the accessibility tree text. */
export function buildObserveUserMessage(
  instruction: string,
  domElements: string,
): ChatMessage {
  return {
    role: "user",
    content: `instruction: ${instruction} Accessibility Tree: \n${domElements}\n`,
  };
}

/**
 * Builds the system message for act. The base prompt has its whitespace
 * collapsed; custom user instructions, when given, are appended after a
 * blank line.
 */
export function buildActSystemPrompt(
  userProvidedInstructions?: string,
): ChatMessage {
  const actSystemPrompt = ` You are helping the user automate the browser by finding elements based on what action the user wants to take on the page You will be given: 1. a user defined instruction about what action to take 2. a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree. Return the element that matches the instruction if it exists. Otherwise, return an empty object.`;
  const content = actSystemPrompt.replace(/\s+/g, " ");

  return {
    role: "system",
    content: [content, buildUserInstructionsString(userProvidedInstructions)]
      .filter(Boolean)
      .join("\n\n"),
  };
}

/**
 * Builds the instruction text for an act call.
 *
 * @param action - The user's action description, interpolated into the prompt.
 * @param supportedActions - Method names offered to the model, joined with ", ".
 * @param variables - Optional variables; only their NAMES (as `%name%`) are
 *   added to the prompt, never their values.
 * @returns The instruction string (not a ChatMessage).
 */
export function buildActPrompt(
  action: string,
  supportedActions: string[],
  variables?: Variables,
): string {
  // Base instruction
  let instruction = `Find the most relevant element to perform an action on given the following action: ${action}. IF AND ONLY IF the action EXPLICITLY includes the word 'dropdown' and implies choosing/selecting an option from a dropdown, ignore the 'General Instructions' section, and follow the 'Dropdown Specific Instructions' section carefully. General Instructions: Provide an action for this element such as ${supportedActions.join(", ")}. Remember that to users, buttons and links look the same in most cases. 
When choosing non-left click actions, provide right or middle as the argument If the action is completely unrelated to a potential action to be taken on the page, return an empty object. ONLY return one action. If multiple actions are relevant, return the most relevant one. If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc. If the user is asking to scroll to the next chunk/previous chunk, choose the nextChunk/prevChunk method. No arguments are required here. If the action implies a key press, e.g., 'press enter', 'press a', 'press space', etc., always choose the press method with the appropriate key as argument — e.g. 'a', 'Enter', 'Space'. Do not choose a click action on an on-screen keyboard. Capitalize the first character like 'Enter', 'Tab', 'Escape' only for special keys. Dropdown Specific Instructions: For interacting with dropdowns, there are two specific cases that you need to handle. CASE 1: the element is a 'select' element. - choose the selectOptionFromDropdown method, - set the argument to the exact text of the option that should be selected, - set twoStep to false. CASE 2: the element is NOT a 'select' element: - do not attempt to directly choose the element from the dropdown. You will need to click to expand the dropdown first. You will achieve this by following these instructions: - choose the node that most closely corresponds to the given instruction EVEN if it is a 'StaticText' element, or otherwise does not appear to be interactable. - choose the 'click' method - set twoStep to true. `;

  // Add variable names (not values) to the instruction if any
  if (variables && Object.keys(variables).length > 0) {
    const variableNames = Object.keys(variables)
      .map((key) => `%${key}%`)
      .join(", ");
    const variablesPrompt = `The following variables are available to use in the action: ${variableNames}. 
Fill the argument variables with the variable name.`;
    instruction += ` ${variablesPrompt}`;
  }

  return instruction;
}

/**
 * Builds the instruction text for the second step of a two-step action.
 *
 * @param originalUserAction - The action the user originally asked for.
 * @param previousAction - Description of the step-1 action already taken.
 * @param supportedActions - Method names offered to the model, joined with ", ".
 * @param variables - Optional variables; only their names (as `%name%`) are listed.
 * @returns The instruction string.
 */
export function buildStepTwoPrompt(
  originalUserAction: string,
  previousAction: string,
  supportedActions: string[],
  variables?: Variables,
): string {
  // Base instruction
  let instruction = ` The original user action was: ${originalUserAction}. You have just taken the following action which completed step 1 of 2: ${previousAction}. Now, you must find the most relevant element to perform an action on in order to complete step 2 of 2. General Instructions: Provide an action for this element such as ${supportedActions.join(", ")}. Remember that to users, buttons and links look the same in most cases. If the action is completely unrelated to a potential action to be taken on the page, return an empty object. ONLY return one action. If multiple actions are relevant, return the most relevant one. If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc. If the user is asking to scroll to the next chunk/previous chunk, choose the nextChunk/prevChunk method. No arguments are required here. If the action implies a key press, e.g., 'press enter', 'press a', 'press space', etc., always choose the press method with the appropriate key as argument — e.g. 'a', 'Enter', 'Space'. Do not choose a click action on an on-screen keyboard. Capitalize the first character like 'Enter', 'Tab', 'Escape' only for special keys. `;

  // Add variable names (not values) to the instruction if any
  if (variables && Object.keys(variables).length > 0) {
    const variableNames = Object.keys(variables)
      .map((key) => `%${key}%`)
      .join(", ");
    const variablesPrompt = `The following variables are available to use in the action: ${variableNames}. 
Fill the argument variables with the variable name.`;
    instruction += ` ${variablesPrompt}`;
  }

  return instruction;
}

/**
 * Builds the system message for the operator agent, embedding `goal` under
 * the "Your current goal" heading and listing the tools the agent may call.
 */
export function buildOperatorSystemPrompt(goal: string): ChatMessage {
  return {
    role: "system",
    content: `You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page. You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken. # Your current goal ${goal} # CRITICAL: You MUST use the provided tools to take actions. Do not just describe what you want to do - actually call the appropriate tools. # Available tools and when to use them: - \`act\`: Use this to interact with the page (click, type, navigate, etc.) - \`extract\`: Use this to get information from the page - \`goto\`: Use this to navigate to a specific URL - \`wait\`: Use this to wait for a period of time - \`navback\`: Use this to go back to the previous page - \`refresh\`: Use this to refresh the current page - \`close\`: Use this ONLY when the task is complete or cannot be achieved - External tools: Use any additional tools (like search tools) as needed for your goal # Important guidelines 1. ALWAYS use tools - never just provide text responses about what you plan to do 2. Break down complex actions into individual atomic steps 3. For \`act\` commands, use only one action at a time, such as: - Single click on a specific element - Type into a single input field - Select a single option 4. Avoid combining multiple actions in one instruction 5. If multiple actions are needed, they should be separate steps 6. Only use \`close\` when the task is genuinely complete or impossible to achieve`,
  };
}

// Returns the default CUA system prompt as a plain string (not a ChatMessage).
export function buildCuaDefaultSystemPrompt(): string {
  return `You are a helpful assistant that can use a web browser.\nDo not ask follow up questions, the user will trust your judgement. 
Today's date is ${new Date().toISOString().split("T")[0]}.`;
}

/**
 * Builds the system message for the Google CUA agent. The current date
 * (UTC, `YYYY-MM-DD`, taken from `toISOString()`) is embedded in the prompt.
 */
export function buildGoogleCUASystemPrompt(): ChatMessage {
  return {
    role: "system",
    content: `You are a general-purpose browser agent whose job is to accomplish the user's goal. Today's date is ${new Date().toISOString().split("T")[0]}. You have access to a search tool; however, in most cases you should operate within the page/url the user has provided. ONLY use the search tool if you're stuck or the task is impossible to complete within the current page. You will be given a goal and a list of steps that have been taken so far. Avoid requesting the user for input as much as possible. Good luck! `,
  };
}

================================================
FILE: packages/core/lib/utils.ts
================================================
import { ZodSchemaValidationError } from "./v3/types/public/sdkErrors.js";
import { Schema, Type } from "@google/genai";
import { z, ZodTypeAny } from "zod";
import z3 from "zod/v3";
import { LogLine } from "./v3/types/public/logs.js";
import { ModelProvider } from "./v3/types/public/model.js";
import { ZodPathSegments } from "./v3/types/private/internal.js";
import type { StagehandZodSchema } from "./v3/zodCompat.js";
import { isZod4Schema } from "./v3/zodCompat.js";

const ID_PATTERN = /^\d+-\d+$/;

// Schema factories per Zod major version; the v3 namespace is cast so both
// can be used through the same `typeof z` surface.
const zFactories = {
  v4: z,
  v3: z3 as unknown as typeof z,
};

/** Returns the Zod factory (v4 or v3) matching the version `schema` was built with. */
export function getZFactory(schema: StagehandZodSchema): typeof z {
  return isZod4Schema(schema) ? zFactories.v4 : zFactories.v3;
}

// Maps both Zod v3 `typeName` values ("ZodString") and Zod v4 `type` values
// ("string") onto one lowercase vocabulary.
// NOTE(review): type arguments on `Record` throughout this section were
// missing in the reviewed text and are restored from usage — confirm.
const TYPE_NAME_MAP: Record<string, string> = {
  ZodString: "string",
  string: "string",
  ZodNumber: "number",
  number: "number",
  ZodBoolean: "boolean",
  boolean: "boolean",
  ZodObject: "object",
  object: "object",
  ZodArray: "array",
  array: "array",
  ZodUnion: "union",
  union: "union",
  ZodIntersection: "intersection",
  intersection: "intersection",
  ZodOptional: "optional",
  optional: "optional",
  ZodNullable: "nullable",
  nullable: "nullable",
  ZodLiteral: "literal",
  literal: "literal",
  ZodEnum: "enum",
  enum: "enum",
  ZodDefault: "default",
  default: "default",
  ZodEffects: "effects",
  effects: "effects",
  pipe: "pipe",
};

/** Reads `schema._zod.def` (where Zod v4 keeps its definition), if present. */
function getZ4Def(schema: StagehandZodSchema) {
  return (schema as SchemaInternals)._zod?.def as
    | Record<string, unknown>
    | undefined;
}

/** Reads `schema._zod.bag`, if present. */
function getZ4Bag(schema: StagehandZodSchema) {
  return (schema as SchemaInternals)._zod?.bag as
    | Record<string, unknown>
    | undefined;
}

/** Reads `schema._def` (where Zod v3 keeps its definition), if present. */
function getZ3Def(schema: StagehandZodSchema) {
  return (schema as SchemaInternals)._def as
    | Record<string, unknown>
    | undefined;
}

/**
 * Returns the field map of an object schema, or `undefined` when none is
 * found. The v4 definition is preferred; a v3 `shape` may be a thunk and is
 * invoked in that case.
 */
function getObjectShape(
  schema: StagehandZodSchema,
): Record<string, StagehandZodSchema> | undefined {
  const z4Shape = getZ4Def(schema)?.shape as
    | Record<string, StagehandZodSchema>
    | undefined;
  if (z4Shape) {
    return z4Shape;
  }

  const z3Shape = getZ3Def(schema)?.shape;
  if (!z3Shape) {
    return undefined;
  }
  if (typeof z3Shape === "function") {
    return (z3Shape as () => Record<string, StagehandZodSchema>)();
  }
  return z3Shape as Record<string, StagehandZodSchema>;
}

/** Element schema of an array schema: v4 `def.element`, else v3 `_def.type`. */
function getArrayElement(
  schema: StagehandZodSchema,
): StagehandZodSchema | undefined {
  return (getZ4Def(schema)?.element ?? getZ3Def(schema)?.type) as
    | StagehandZodSchema
    | undefined;
}

/** Wrapped schema (`innerType`) from the v4 or v3 definition. */
function getInnerType(
  schema: StagehandZodSchema,
): StagehandZodSchema | undefined {
  return (getZ4Def(schema)?.innerType ?? getZ3Def(schema)?.innerType) as
    | StagehandZodSchema
    | undefined;
}

/** Member schemas of a union, or `undefined` when no `options` array exists. */
function getUnionOptions(
  schema: StagehandZodSchema,
): StagehandZodSchema[] | undefined {
  const z4Options = getZ4Def(schema)?.options;
  if (Array.isArray(z4Options)) {
    return z4Options as StagehandZodSchema[];
  }
  const z3Options = getZ3Def(schema)?.options;
  return Array.isArray(z3Options)
    ? (z3Options as StagehandZodSchema[])
    : undefined;
}

/**
 * Both sides of an intersection. The v4 definition is used when it has either
 * side; otherwise the v3 definition is read (either side may be undefined).
 */
function getIntersectionSides(schema: StagehandZodSchema): {
  left?: StagehandZodSchema;
  right?: StagehandZodSchema;
} {
  const z4Def = getZ4Def(schema);
  if (z4Def?.left || z4Def?.right) {
    return {
      left: z4Def?.left as StagehandZodSchema | undefined,
      right: z4Def?.right as StagehandZodSchema | undefined,
    };
  }
  const z3Def = getZ3Def(schema);
  return {
    left: z3Def?.left as StagehandZodSchema | undefined,
    right: z3Def?.right as StagehandZodSchema | undefined,
  };
}

/** Enum members: values of v4 `def.entries`, else the v3 `_def.values` array. */
function getEnumValues(schema: StagehandZodSchema): string[] | undefined {
  const z4Entries = getZ4Def(schema)?.entries;
  if (z4Entries && typeof z4Entries === "object") {
    return Object.values(z4Entries as Record<string, string>);
  }
  const z3Values = getZ3Def(schema)?.values;
  return Array.isArray(z3Values) ? (z3Values as string[]) : undefined;
}

/**
 * Literal values: the v4 `def.values` array, else the single v3 `_def.value`
 * wrapped in an array (empty when undefined).
 */
function getLiteralValues(schema: StagehandZodSchema): unknown[] {
  const z4Values = getZ4Def(schema)?.values;
  if (Array.isArray(z4Values)) {
    return z4Values as unknown[];
  }
  const value = getZ3Def(schema)?.value;
  return typeof value !== "undefined" ? [value] : [];
}

/** The `checks` array from the v4 or v3 definition; empty when absent. */
function getStringChecks(schema: StagehandZodSchema): unknown[] {
  const z4Checks = getZ4Def(schema)?.checks;
  if (Array.isArray(z4Checks)) {
    return z4Checks;
  }
  const z3Checks = getZ3Def(schema)?.checks;
  return Array.isArray(z3Checks) ? z3Checks : [];
}

/**
 * String format, looked up in order: v4 bag `format`, v4 def `format`,
 * v3 def `format`. Returns `undefined` when none is a string.
 */
function getStringFormat(schema: StagehandZodSchema): string | undefined {
  const candidates = [
    getZ4Bag(schema)?.format,
    getZ4Def(schema)?.format,
    getZ3Def(schema)?.format,
  ];
  for (const candidate of candidates) {
    if (typeof candidate === "string") {
      return candidate;
    }
  }
  return undefined;
}

/** `in`/`out` schemas of a v4 pipe; an empty object when the def has neither. */
function getPipeEndpoints(schema: StagehandZodSchema): {
  in?: StagehandZodSchema;
  out?: StagehandZodSchema;
} {
  const z4Def = getZ4Def(schema);
  if (z4Def?.in || z4Def?.out) {
    return {
      in: z4Def?.in as StagehandZodSchema | undefined,
      out: z4Def?.out as StagehandZodSchema | undefined,
    };
  }
  return {};
}

/** Base schema stored under the v3 definition's `schema` key. */
function getEffectsBaseSchema(
  schema: StagehandZodSchema,
): StagehandZodSchema | undefined {
  return getZ3Def(schema)?.schema as StagehandZodSchema | undefined;
}

// Structural view of the internals read by the accessors above.
type SchemaInternals = {
  _zod?: { def?: Record<string, unknown>; bag?: Record<string, unknown> };
  _def?: Record<string, unknown>;
};

/**
 * Validates `data` against `schema`.
 *
 * @returns `true` when validation succeeds.
 * @throws ZodSchemaValidationError carrying the data and the formatted Zod
 *   error when validation fails.
 */
export function validateZodSchema(schema: StagehandZodSchema, data: unknown) {
  const result = schema.safeParse(data);

  if (result.success) {
    return true;
  }
  throw new ZodSchemaValidationError(data, result.error.format());
}

/**
 * Detects if the code is running in the Bun runtime environment.
 * @returns {boolean} True if running in Bun, false otherwise.
*/
export function isRunningInBun(): boolean {
  // Guard each step so this is safe where `process` or `process.versions` is absent.
  return (
    typeof process !== "undefined" &&
    typeof process.versions !== "undefined" &&
    "bun" in process.versions
  );
}

/*
 * Helper functions for converting between Gemini and Zod schemas
 */

/**
 * Copies optionality and description from `zodSchema` onto `geminiSchema`.
 * Mutates and returns `geminiSchema`; an already-set `nullable` is kept.
 */
function decorateGeminiSchema(
  geminiSchema: Schema,
  zodSchema: z.ZodTypeAny,
): Schema {
  if (geminiSchema.nullable === undefined) {
    geminiSchema.nullable = zodSchema.isOptional();
  }

  if (zodSchema.description) {
    geminiSchema.description = zodSchema.description;
  }

  return geminiSchema;
}

/**
 * Converts a Zod schema (v3 or v4) into a Gemini `Schema`.
 *
 * - arrays/objects are converted recursively; object fields whose type is
 *   not "optional" are listed in `required`;
 * - `default`, `nullable` and `optional` wrappers become the inner schema
 *   with `nullable: true`;
 * - enums and literals become STRING schemas with an `enum` list;
 * - a pipe uses its input schema when available;
 * - anything else (including standalone transforms) becomes a STRING schema.
 */
export function toGeminiSchema(zodSchema: StagehandZodSchema): Schema {
  const normalizedSchema = zodSchema as z.ZodTypeAny;
  const decorate = (geminiSchema: Schema): Schema =>
    decorateGeminiSchema(geminiSchema, normalizedSchema);

  switch (getZodType(zodSchema)) {
    case "array": {
      const element = getArrayElement(zodSchema) ?? z.any();
      return decorate({
        type: Type.ARRAY,
        items: toGeminiSchema(element),
      });
    }
    case "object": {
      const properties: Record<string, Schema> = {};
      const required: string[] = [];

      const shape = getObjectShape(zodSchema);
      if (shape) {
        for (const [key, value] of Object.entries(shape)) {
          properties[key] = toGeminiSchema(value);
          if (getZodType(value) !== "optional") {
            required.push(key);
          }
        }
      }

      return decorate({
        type: Type.OBJECT,
        properties,
        required: required.length > 0 ? required : undefined,
      });
    }
    case "string":
      return decorate({ type: Type.STRING });
    case "number":
      return decorate({ type: Type.NUMBER });
    case "boolean":
      return decorate({ type: Type.BOOLEAN });
    case "enum":
      return decorate({
        type: Type.STRING,
        enum: getEnumValues(zodSchema),
      });
    case "default":
    case "nullable":
    case "optional": {
      const innerType = getInnerType(zodSchema) ?? z.any();
      return decorate({
        ...toGeminiSchema(innerType),
        nullable: true,
      });
    }
    case "literal":
      return decorate({
        type: Type.STRING,
        enum: getLiteralValues(zodSchema) as string[],
      });
    case "pipe": {
      const endpoints = getPipeEndpoints(zodSchema);
      if (endpoints.in) {
        // The input side is converted on its own; the pipe wrapper is not decorated.
        return toGeminiSchema(endpoints.in);
      }
      return decorate({ type: Type.STRING });
    }
    // Standalone transforms and any unknown types fall through to default
    default:
      return decorate({ type: Type.STRING });
  }
}

/**
 * Helper function to check the type of Zod schema.
 * Looks at v4 `_zod.def.type`, then v3 `_def.typeName`, then `_def.type`, and
 * normalises the result through TYPE_NAME_MAP. Returns "unknown" when no type
 * marker is found, or the raw marker when it has no mapping.
 */
export function getZodType(schema: StagehandZodSchema): string {
  const schemaWithDef = schema as SchemaInternals & {
    _zod?: { def?: { type?: string } };
  };
  const rawType =
    (schemaWithDef._zod?.def?.type as string | undefined) ??
    (schemaWithDef._def?.typeName as string | undefined) ??
    (schemaWithDef._def?.type as string | undefined);

  if (!rawType) {
    return "unknown";
  }

  return TYPE_NAME_MAP[rawType] ?? rawType;
}

/**
 * Recursively traverses a given Zod schema, scanning for any fields of type `z.string().url()`.
 * For each such field, it replaces the `z.string().url()` with the schema produced by
 * `makeIdStringSchema` (an ID string rather than the URL itself).
 *
 * This function is used internally by higher-level utilities (e.g., transforming entire object schemas)
 * and handles nested objects, arrays, unions, intersections, optionals and nullables.
 *
 * @param schema - The Zod schema to transform.
 * @param currentPath - An array of string/number keys representing the current schema path (used internally for recursion).
 * @returns A two-element tuple:
 * 1. The updated Zod schema, with any `.url()` fields replaced by the ID string schema.
 * 2. An array of {@link ZodPathSegments} objects representing each replaced field, including the path segments.
/**
 * Recursively walks a Zod schema and replaces every URL-validated string
 * field (e.g. `z.string().url()`) with the element-ID string schema built by
 * `makeIdStringSchema`, recording where each replacement happened.
 *
 * Supported containers: objects, arrays, unions, intersections, optional and
 * nullable wrappers, Zod 4 pipes, and effects. Subtrees that contain no URL
 * field are returned by identity, so callers can detect changes with `!==`.
 *
 * Path conventions in the returned segments:
 * - an object property contributes its key;
 * - an array element contributes `"*"`;
 * - union, intersection, optional, nullable, pipe and effects wrappers
 *   contribute no segment of their own.
 *
 * Note on effects: the transformed *base* schema is returned, i.e. the effects
 * wrapper itself is not re-applied.
 *
 * @param schema - The Zod schema to transform.
 * @param currentPath - Path of `schema` inside the root schema. It is only
 *   threaded through the recursion; the returned segments are built from the
 *   child results, not from this argument.
 * @returns A two-element tuple:
 *   1. The (possibly new) schema with URL strings replaced by ID strings.
 *   2. One {@link ZodPathSegments} entry per replaced field, relative to `schema`.
 */
export function transformSchema(
  schema: StagehandZodSchema,
  currentPath: Array<string | number>,
): [StagehandZodSchema, ZodPathSegments[]] {
  if (isKind(schema, "string")) {
    const checks = getStringChecks(schema);
    const format = getStringFormat(schema);
    // A URL constraint can surface in several shapes depending on the Zod version.
    const hasUrlCheck =
      checks.some((check) => {
        const candidate = check as {
          kind?: string;
          format?: string;
          _zod?: { def?: { check?: string; format?: string } };
        };
        return (
          candidate.kind === "url" ||
          candidate.format === "url" ||
          candidate._zod?.def?.check === "url" ||
          candidate._zod?.def?.format === "url"
        );
      }) || format === "url";

    if (hasUrlCheck) {
      // Empty segments: the replaced field is this very schema.
      return [makeIdStringSchema(schema), [{ segments: [] }]];
    }
    return [schema, []];
  }

  if (isKind(schema, "object")) {
    const shape = getObjectShape(schema);
    if (!shape) {
      return [schema, []];
    }

    const newShape: Record<string, StagehandZodSchema> = {};
    const urlPaths: ZodPathSegments[] = [];
    let changed = false;

    for (const key of Object.keys(shape)) {
      const child = shape[key];
      const [transformedChild, childPaths] = transformSchema(child, [
        ...currentPath,
        key,
      ]);
      if (transformedChild !== child) {
        changed = true;
      }
      newShape[key] = transformedChild;
      for (const cp of childPaths) {
        urlPaths.push({ segments: [key, ...cp.segments] });
      }
    }

    if (changed) {
      const factory = getZFactory(schema);
      return [
        factory.object(newShape as Record<string, z.ZodTypeAny>),
        urlPaths,
      ];
    }
    return [schema, urlPaths];
  }

  if (isKind(schema, "array")) {
    const itemType = getArrayElement(schema);
    if (!itemType) {
      return [schema, []];
    }

    const [transformedItem, childPaths] = transformSchema(itemType, [
      ...currentPath,
      "*",
    ]);
    const arrayPaths: ZodPathSegments[] = childPaths.map((cp) => ({
      segments: ["*", ...cp.segments],
    }));

    if (transformedItem !== itemType) {
      const factory = getZFactory(schema);
      return [
        factory.array(transformedItem as unknown as z.ZodTypeAny),
        arrayPaths,
      ];
    }
    return [schema, arrayPaths];
  }

  if (isKind(schema, "union")) {
    const unionOptions = getUnionOptions(schema);
    if (!unionOptions || unionOptions.length === 0) {
      return [schema, []];
    }

    const newOptions: StagehandZodSchema[] = [];
    const allPaths: ZodPathSegments[] = [];
    let changed = false;

    unionOptions.forEach((option, idx) => {
      const [newOption, childPaths] = transformSchema(option, [
        ...currentPath,
        `union_${idx}`,
      ]);
      if (newOption !== option) {
        changed = true;
      }
      newOptions.push(newOption);
      // Append in place instead of re-spreading the accumulator per option.
      allPaths.push(...childPaths);
    });

    if (changed) {
      const factory = getZFactory(schema);
      return [
        factory.union(
          newOptions as unknown as [
            z.ZodTypeAny,
            z.ZodTypeAny,
            ...z.ZodTypeAny[],
          ],
        ),
        allPaths,
      ];
    }
    return [schema, allPaths];
  }

  if (isKind(schema, "intersection")) {
    const { left, right } = getIntersectionSides(schema);
    if (!left || !right) {
      return [schema, []];
    }

    const [newLeft, leftPaths] = transformSchema(left, [
      ...currentPath,
      "intersection_left",
    ]);
    const [newRight, rightPaths] = transformSchema(right, [
      ...currentPath,
      "intersection_right",
    ]);
    const changed = newLeft !== left || newRight !== right;
    const allPaths = [...leftPaths, ...rightPaths];

    if (changed) {
      const factory = getZFactory(schema);
      return [
        factory.intersection(
          newLeft as unknown as z.ZodTypeAny,
          newRight as unknown as z.ZodTypeAny,
        ),
        allPaths,
      ];
    }
    return [schema, allPaths];
  }

  if (isKind(schema, "optional")) {
    const innerType = getInnerType(schema);
    if (!innerType) {
      return [schema, []];
    }
    const [inner, innerPaths] = transformSchema(innerType, currentPath);
    if (inner !== innerType) {
      return [
        (inner as z.ZodTypeAny).optional() as unknown as StagehandZodSchema,
        innerPaths,
      ];
    }
    return [schema, innerPaths];
  }

  if (isKind(schema, "nullable")) {
    const innerType = getInnerType(schema);
    if (!innerType) {
      return [schema, []];
    }
    const [inner, innerPaths] = transformSchema(innerType, currentPath);
    if (inner !== innerType) {
      return [
        (inner as z.ZodTypeAny).nullable() as unknown as StagehandZodSchema,
        innerPaths,
      ];
    }
    return [schema, innerPaths];
  }

  if (isKind(schema, "pipe") && isZod4Schema(schema)) {
    const { in: inSchema, out: outSchema } = getPipeEndpoints(schema);
    if (!inSchema || !outSchema) {
      return [schema, []];
    }

    const [newIn, inPaths] = transformSchema(inSchema, currentPath);
    const [newOut, outPaths] = transformSchema(outSchema, currentPath);
    const allPaths = [...inPaths, ...outPaths];

    if (newIn !== inSchema || newOut !== outSchema) {
      const result = z.pipe(
        newIn as unknown as z.ZodTypeAny,
        newOut as unknown as z.ZodTypeAny,
      ) as StagehandZodSchema;
      return [result, allPaths];
    }
    return [schema, allPaths];
  }

  if (isKind(schema, "effects")) {
    const baseSchema = getEffectsBaseSchema(schema);
    if (!baseSchema) {
      return [schema, []];
    }
    return transformSchema(baseSchema, currentPath);
  }

  // Any other kind cannot contain a URL string we know how to rewrite.
  return [schema, []];
}
""; } } else { injectUrls(record[key], rest, idToUrlMapping); } } } // Helper to check if a schema is of a specific type function isKind(s: StagehandZodSchema, kind: string): boolean { try { return getZodType(s) === kind; } catch { return false; } } function makeIdStringSchema(orig: StagehandZodSchema): StagehandZodSchema { const userDesc = (orig as unknown as { description?: string }).description ?? ""; const base = "This field must be the element-ID in the form 'frameId-backendId' " + '(e.g. "0-432").'; const composed = userDesc.trim().length > 0 ? `${base} that follows this user-defined description: ${userDesc}` : base; const factory = getZFactory(orig); return factory.string().regex(ID_PATTERN).describe(composed); } /** * Mapping from LLM provider names to their corresponding environment variable names for API keys. */ export const providerEnvVarMap: Partial< Record> > = { openai: "OPENAI_API_KEY", anthropic: "ANTHROPIC_API_KEY", google: ["GEMINI_API_KEY", "GOOGLE_GENERATIVE_AI_API_KEY", "GOOGLE_API_KEY"], vertex: "GOOGLE_VERTEX_AI_API_KEY", groq: "GROQ_API_KEY", cerebras: "CEREBRAS_API_KEY", togetherai: "TOGETHER_AI_API_KEY", mistral: "MISTRAL_API_KEY", deepseek: "DEEPSEEK_API_KEY", perplexity: "PERPLEXITY_API_KEY", azure: "AZURE_API_KEY", xai: "XAI_API_KEY", google_legacy: "GOOGLE_API_KEY", }; const providersWithoutApiKey = new Set(["bedrock", "ollama"]); /** * Loads an API key for a provider, checking environment variables. 
/**
 * Looks up a provider's API key in the process environment.
 *
 * @param provider The name of the provider (e.g., 'openai', 'anthropic')
 * @param logger Receives a level-0 line when the provider has no known
 *   environment variable and is not listed in `providersWithoutApiKey`
 * @returns The first non-empty value among the provider's environment
 *   variables, or undefined when there is none
 */
export function loadApiKeyFromEnv(
  provider: string | undefined,
  logger: (logLine: LogLine) => void,
): string | undefined {
  if (!provider) {
    return undefined;
  }

  const envVarName = providerEnvVarMap[provider];
  if (!envVarName) {
    const keyIsExpected = !providersWithoutApiKey.has(provider);
    if (keyIsExpected) {
      logger({
        category: "init",
        message: `No known environment variable for provider '${provider}'`,
        level: 0,
      });
    }
    return undefined;
  }

  // Treat the single-name and multi-name cases uniformly.
  const candidateNames = Array.isArray(envVarName)
    ? envVarName
    : [envVarName as string];
  for (const name of candidateNames) {
    const value = process.env[name];
    if (typeof value === "string" && value.length > 0) {
      return value;
    }
  }

  // Don't log - this is expected when llmClient is provided or API key will be set later
  return undefined;
}

/**
 * Removes a trailing `/text()` step (optionally indexed, e.g. `/text()[2]`,
 * matched case-insensitively) from a path string.
 *
 * @returns The trimmed path, or undefined when no path was given
 */
export function trimTrailingTextNode(
  path: string | undefined,
): string | undefined {
  if (path == null) {
    return undefined;
  }
  return path.replace(/\/text\(\)(\[\d+\])?$/iu, "");
}

/**
 * Upper-cases the first character of every run that starts with a word
 * character and extends to the next whitespace. The rest of each run is left
 * as it is.
 */
export function toTitleCase(str: string): string {
  const capitalize = (word: string): string =>
    `${word.charAt(0).toUpperCase()}${word.substring(1)}`;
  return str.replace(/\w\S*/g, capitalize);
}
/**
 * Converts a JSON Schema object to a Zod schema.
 *
 * Supported `type` values: `object`, `array`, `string`, `number`, `integer`
 * and `boolean`; anything else becomes `z.any()`.
 *
 * Object handling:
 * - without `properties`, an empty object schema is produced;
 * - without a `required` array, every declared property stays required;
 * - with a `required` array, all properties become optional except the listed
 *   ones. Names listed in `required` that are not declared in `properties`
 *   are ignored rather than forwarded to Zod.
 *
 * String handling: an `enum` is enforced through a refinement on `z.string()`;
 * `format` values `uri`/`url`, `email` and `uuid` map to the matching Zod
 * string checks.
 *
 * `description`, when present, is attached to the produced schema.
 *
 * @param schema The JSON Schema object to convert
 * @returns A Zod schema equivalent to the input JSON Schema
 */
export function jsonSchemaToZod(schema: JsonSchema): ZodTypeAny {
  // Attaches the JSON Schema description, if any, to the produced schema.
  const withDescription = <T extends ZodTypeAny>(zodSchema: T): T =>
    schema.description
      ? (zodSchema.describe(schema.description) as T)
      : zodSchema;

  switch (schema.type) {
    case "object": {
      if (!schema.properties) {
        return withDescription(z.object({}));
      }

      const shape: Record<string, ZodTypeAny> = {};
      for (const [key, property] of Object.entries(schema.properties)) {
        shape[key] = jsonSchemaToZod(property);
      }

      let zodObject = z.object(shape);
      if (Array.isArray(schema.required)) {
        // Only keys that exist in the shape may appear in the mask.
        const requiredFields: Record<string, true> = {};
        for (const field of schema.required) {
          if (Object.prototype.hasOwnProperty.call(shape, field)) {
            requiredFields[field] = true;
          }
        }
        zodObject = zodObject.partial().required(requiredFields);
      }
      return withDescription(zodObject);
    }

    case "array": {
      const element = schema.items ? jsonSchemaToZod(schema.items) : z.any();
      return withDescription(z.array(element));
    }

    case "string": {
      const allowed = schema.enum;
      if (allowed) {
        return withDescription(
          z.string().refine((val) => allowed.includes(val)),
        );
      }

      let zodString = z.string();
      // Handle JSON Schema format field
      if (schema.format === "uri" || schema.format === "url") {
        zodString = zodString.url();
      } else if (schema.format === "email") {
        zodString = zodString.email();
      } else if (schema.format === "uuid") {
        zodString = zodString.uuid();
      }
      // Add more format handlers as needed
      return withDescription(zodString);
    }

    case "number":
    case "integer": {
      let zodNumber = z.number();
      if (schema.type === "integer") {
        zodNumber = zodNumber.int();
      }
      if (schema.minimum !== undefined) {
        zodNumber = zodNumber.min(schema.minimum);
      }
      if (schema.maximum !== undefined) {
        zodNumber = zodNumber.max(schema.maximum);
      }
      return withDescription(zodNumber);
    }

    case "boolean":
      return withDescription(z.boolean());

    default:
      return z.any();
  }
}
// Map model names to their provider types
export const modelToAgentProviderMap: Record<string, AgentProviderType> = {
  "computer-use-preview": "openai",
  "computer-use-preview-2025-03-11": "openai",
  "claude-sonnet-4-20250514": "anthropic",
  "claude-sonnet-4-5-20250929": "anthropic",
  "claude-opus-4-5-20251101": "anthropic",
  "claude-opus-4-6": "anthropic",
  "claude-sonnet-4-6": "anthropic",
  "claude-haiku-4-5-20251001": "anthropic",
  "gemini-2.5-computer-use-preview-10-2025": "google",
  "gemini-3-flash-preview": "google",
  "gemini-3-pro-preview": "google",
  "fara-7b": "microsoft",
};

/**
 * Provider for agent clients
 * This class is responsible for creating the appropriate agent client
 * based on the provider type
 */
export class AgentProvider {
  private logger: (message: LogLine) => void;

  /**
   * Create a new agent provider
   * @param logger Sink for the log lines emitted while creating clients
   */
  constructor(logger: (message: LogLine) => void) {
    this.logger = logger;
  }

  /**
   * Creates the agent client for a model.
   *
   * The provider is taken from `clientOptions.provider` when set; otherwise it
   * is derived from the model name via {@link AgentProvider.getAgentProvider}.
   *
   * @param modelName Model identifier, optionally prefixed as `provider/model`
   * @param clientOptions Options forwarded to the client constructor
   * @param userProvidedInstructions Instructions forwarded to the client
   * @param tools Custom tools forwarded to the OpenAI, Anthropic and Google
   *   clients (the Microsoft client is constructed without them)
   * @throws UnsupportedModelError when no provider is given and the model is
   *   not in `modelToAgentProviderMap`
   * @throws UnsupportedModelProviderError when the provider is not one of
   *   openai, anthropic, google, microsoft
   */
  getClient(
    modelName: string,
    clientOptions?: ClientOptions,
    userProvidedInstructions?: string,
    tools?: ToolSet,
  ): AgentClient {
    // Check if provider is explicitly set in clientOptions
    const explicitProvider = clientOptions?.provider as
      | AgentProviderType
      | undefined;
    const type = explicitProvider || AgentProvider.getAgentProvider(modelName);

    this.logger({
      category: "agent",
      message: `Getting agent client for type: ${type}, model: ${modelName}${explicitProvider ? " (explicit provider)" : ""}`,
      level: 2,
    });

    try {
      switch (type) {
        case "openai":
          return new OpenAICUAClient(
            type,
            modelName,
            userProvidedInstructions,
            clientOptions,
            tools,
          );
        case "anthropic":
          return new AnthropicCUAClient(
            type,
            modelName,
            userProvidedInstructions,
            clientOptions,
            tools,
          );
        case "google":
          return new GoogleCUAClient(
            type,
            modelName,
            userProvidedInstructions,
            clientOptions,
            tools,
          );
        case "microsoft":
          return new MicrosoftCUAClient(
            type,
            modelName,
            userProvidedInstructions,
            clientOptions,
          );
        default:
          throw new UnsupportedModelProviderError(
            ["openai", "anthropic", "google", "microsoft"],
            "Computer Use Agent",
          );
      }
    } catch (error) {
      // Log for visibility, then let the caller handle the failure.
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      this.logger({
        category: "agent",
        message: `Error creating agent client: ${errorMessage}`,
        level: 0,
      });
      throw error;
    }
  }

  /**
   * Resolves the provider for a model name.
   *
   * For a name containing `/`, the segment after the first `/` is looked up.
   *
   * @throws UnsupportedModelError when the name is not a key of
   *   `modelToAgentProviderMap`
   */
  static getAgentProvider(modelName: string): AgentProviderType {
    const normalized = modelName.includes("/")
      ? modelName.split("/")[1]
      : modelName;

    // Own-property check: `in` would also match inherited members such as
    // "toString" or "constructor" and return a non-provider value.
    if (
      Object.prototype.hasOwnProperty.call(modelToAgentProviderMap, normalized)
    ) {
      return modelToAgentProviderMap[normalized];
    }

    throw new UnsupportedModelError(
      Object.keys(modelToAgentProviderMap),
      "Computer Use Agent",
    );
  }
}
/**
 * Execute a task with the Anthropic CUA
 * This is the main entry point for the agent
 *
 * Runs `executeStep` repeatedly until a step reports completion or
 * `maxSteps` (default 10) steps have run. Before each step the optional
 * pre-step hook is awaited. Token counts and inference time are summed over
 * all steps.
 *
 * Errors thrown by a step are not propagated: they are logged and turned
 * into a result with `success: false` that still carries the actions and
 * usage collected so far.
 *
 * @returns The collected actions, the message of the last step that produced
 *   one, the completion flag, and the accumulated usage
 * @implements AgentClient.execute
 */
async execute(executionOptions: AgentExecutionOptions): Promise<AgentResult> {
  const { options, logger } = executionOptions;
  const { instruction } = options;
  const maxSteps = options.maxSteps || 10;

  let currentStep = 0;
  let completed = false;
  const actions: AgentAction[] = [];
  let finalMessage = "";

  // Start with the initial instruction
  let inputItems: ResponseInputItem[] =
    this.createInitialInputItems(instruction);

  logger({
    category: "agent",
    message: `Starting Anthropic agent execution with instruction: ${instruction}`,
    level: 1,
  });

  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  let totalInferenceTime = 0;

  // Snapshot of the running totals, shared by the success and failure results.
  const usageSoFar = () => ({
    input_tokens: totalInputTokens,
    output_tokens: totalOutputTokens,
    inference_time_ms: totalInferenceTime,
  });

  try {
    // Execute steps until completion or max steps reached
    while (!completed && currentStep < maxSteps) {
      await this.preStepHook?.();

      logger({
        category: "agent",
        message: `Executing step ${currentStep + 1}/${maxSteps}`,
        level: 1,
      });

      const result = await this.executeStep(inputItems, logger);

      totalInputTokens += result.usage.input_tokens;
      totalOutputTokens += result.usage.output_tokens;
      totalInferenceTime += result.usage.inference_time_ms;

      if (result.actions.length > 0) {
        logger({
          category: "agent",
          message: `Step ${currentStep + 1} performed ${result.actions.length} actions`,
          level: 2,
        });
        actions.push(...result.actions);
      }

      completed = result.completed;

      // Carry the conversation forward only when another step will run
      if (!completed) {
        inputItems = result.nextInputItems;
      }

      // Keep the most recent non-empty message as the final one
      if (result.message) {
        finalMessage = result.message;
      }

      currentStep++;
    }

    logger({
      category: "agent",
      message: `Anthropic agent execution completed: ${completed}, with ${actions.length} total actions performed`,
      level: 1,
    });

    return {
      success: completed,
      actions,
      message: finalMessage,
      completed,
      usage: usageSoFar(),
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);

    logger({
      category: "agent",
      message: `Error executing agent task: ${errorMessage}`,
      level: 0,
    });

    return {
      success: false,
      actions,
      message: `Failed to execute task: ${errorMessage}`,
      completed: false,
      usage: usageSoFar(),
    };
  }
}
${toolUseItem.name}, action: ${action.type}`, level: 2, }); stepActions.push(action); } else if (this.tools && toolUseItem.name in this.tools) { stepActions.push({ type: "custom_tool", tool: toolUseItem.name, input: toolUseItem.input, } as AgentAction); } } else if (block.type === "text") { // Safe to cast here since we've verified it's a text block const textBlock = block as unknown as AnthropicTextBlock; message += textBlock.text + "\n"; logger({ category: "agent", message: `Found text block: ${textBlock.text}`, level: 2, }); } else { logger({ category: "agent", message: `Found unknown block type: ${block.type}`, level: 2, }); } } // Execute actions if an action handler is provided if (this.actionHandler && stepActions.length > 0) { for (const action of stepActions) { try { logger({ category: "agent", message: `Executing action: ${action.type}`, level: 1, }); await this.actionHandler(action); } catch (error) { if (error instanceof StagehandClosedError) { throw error; } const errorMessage = error instanceof Error ? 
error.message : String(error); logger({ category: "agent", message: `Error executing action ${action.type}: ${errorMessage}`, level: 0, }); } } } // Create the assistant response message with all content blocks const assistantMessage: AnthropicMessage = { role: "assistant", content: content as unknown as AnthropicContentBlock[], }; // Keep track of the conversation history by preserving all previous messages // and adding new messages at the end const nextInputItems: ResponseInputItem[] = [...inputItems]; // Add the assistant message with tool_use blocks to the history compressConversationImages(nextInputItems); nextInputItems.push(assistantMessage); // Generate tool results and add them as a user message if (toolUseItems.length > 0) { const toolResults = await this.takeAction(toolUseItems, logger); if (toolResults.length > 0) { // Tool results are AnthropicToolResult[] which are compatible with AnthropicContentBlock[] const userToolResultsMessage: AnthropicMessage = { role: "user", content: toolResults as unknown as AnthropicContentBlock[], }; nextInputItems.push(userToolResultsMessage); } } // The step is completed only if there were no tool_use items const completed = toolUseItems.length === 0; logger({ category: "agent", message: `Step processed ${toolUseItems.length} tool use items, completed: ${completed}`, level: 2, }); return { actions: stepActions, message: message.trim(), completed, nextInputItems, usage: usage, }; } catch (error) { const errorMessage = error instanceof Error ? 
/**
 * Sends the conversation to the Anthropic beta Messages API and returns the
 * model's content blocks.
 *
 * - Items with role `system` are dropped from `messages`; the user-provided
 *   instructions are sent through the top-level `system` parameter instead.
 * - The computer tool version and beta flag depend on the model name (with any
 *   `provider/` prefix removed).
 * - Custom tools are appended with an input schema derived from their Zod
 *   schema.
 * - When a thinking budget is configured, `max_tokens` is raised as needed so
 *   it stays above the budget.
 *
 * Side effects: logs the request and response through `FlowLogger` and stores
 * the response id in `lastMessageId`.
 *
 * @param inputItems Conversation so far
 * @returns The response content, its id, and usage (input/output tokens plus
 *   the measured request duration in milliseconds)
 * @throws Rethrows any error from the API call after writing it to
 *   `console.error`
 */
async getAction(inputItems: ResponseInputItem[]): Promise<{
  content: AnthropicContentBlock[];
  id: string;
  usage: Record<string, number>;
}> {
  try {
    // For the API request, we use the inputItems directly
    // These should already be properly formatted as a sequence of user/assistant messages
    const messages: AnthropicMessage[] = [];

    for (const item of inputItems) {
      if ("role" in item) {
        // Skip system messages as Anthropic requires system as a top-level parameter
        if (item.role !== "system") {
          messages.push(item);
        }
      }
      // Note: We don't need special handling for tool_result items here anymore
      // as they should already be properly wrapped in user messages
    }

    // Configure thinking capability if available
    const thinking = this.thinkingBudget
      ? { type: "enabled" as const, budget_tokens: this.thinkingBudget }
      : undefined;

    // The thinking budget has to be smaller than max_tokens. Keep the default
    // output allowance and add the budget on top when it would not fit.
    const defaultMaxTokens = 4096;
    const maxTokens =
      thinking && thinking.budget_tokens >= defaultMaxTokens
        ? thinking.budget_tokens + defaultMaxTokens
        : defaultMaxTokens;

    // Claude 4.6+ models require the newer computer_20251124 tool version
    const modelBase = this.modelName.includes("/")
      ? this.modelName.split("/")[1]
      : this.modelName;
    const shouldUseNewToolVersion = [
      "claude-opus-4-6",
      "claude-sonnet-4-6",
      "claude-opus-4-5-20251101",
    ].includes(modelBase);
    const computerToolType = shouldUseNewToolVersion
      ? "computer_20251124"
      : "computer_20250124";
    const betaFlag = shouldUseNewToolVersion
      ? "computer-use-2025-11-24"
      : "computer-use-2025-01-24";

    // Create the request parameters
    const requestParams: Record<string, unknown> = {
      model: this.modelName,
      max_tokens: maxTokens,
      messages: messages,
      tools: [
        {
          type: computerToolType,
          name: "computer",
          display_width_px: this.currentViewport.width,
          display_height_px: this.currentViewport.height,
          display_number: 1,
        },
      ],
      betas: [betaFlag],
    };

    // Add custom tools if available
    if (this.tools && Object.keys(this.tools).length > 0) {
      const customTools = Object.entries(this.tools).map(([name, tool]) => {
        const schema = tool.inputSchema as StagehandZodSchema;

        // Convert Zod schema to proper JSON schema format for Anthropic
        const jsonSchema = toJsonSchema(schema) as {
          properties?: Record<string, unknown>;
          required?: string[];
        };

        const inputSchema = {
          type: "object",
          properties: jsonSchema.properties || {},
          required: jsonSchema.required || [],
        };

        return {
          name,
          description: tool.description,
          input_schema: inputSchema,
        };
      });

      requestParams.tools = [
        ...(requestParams.tools as Record<string, unknown>[]),
        ...customTools,
      ];
    }

    // Add system parameter if provided
    if (this.userProvidedInstructions) {
      requestParams.system = this.userProvidedInstructions;
    }

    // Add thinking parameter if available
    if (thinking) {
      requestParams.thinking = thinking;
    }

    // Log LLM request
    const llmRequestId = uuidv7();
    FlowLogger.logLlmRequest({
      requestId: llmRequestId,
      model: this.modelName,
      prompt: extractLlmCuaPromptSummary(messages),
    });

    const startTime = Date.now();
    // Create the message using the Anthropic Messages API
    // @ts-expect-error - The Anthropic SDK types are stricter than what we need
    const response = await this.client.beta.messages.create(requestParams);
    const endTime = Date.now();
    const elapsedMs = endTime - startTime;
    const usage = {
      input_tokens: response.usage.input_tokens,
      output_tokens: response.usage.output_tokens,
      inference_time_ms: elapsedMs,
    };

    // Log LLM response
    FlowLogger.logLlmResponse({
      requestId: llmRequestId,
      model: this.modelName,
      output: extractLlmCuaResponseSummary(response.content),
      inputTokens: response.usage.input_tokens,
      outputTokens: response.usage.output_tokens,
    });

    // Store the message ID for future use
    this.lastMessageId = response.id;

    // Return the content and message ID
    return {
      // Cast the response content to our internal type
      content: response.content as unknown as AnthropicContentBlock[],
      id: response.id,
      usage,
    };
  } catch (error) {
    console.error("Error getting action from Anthropic:", error);
    throw error;
  }
}
message: `Added computer tool result for tool_use_id: ${item.id}`, level: 2, }); } else { // Handle custom tools let toolResult = "Tool executed successfully"; if (this.tools && item.name in this.tools) { try { const tool = this.tools[item.name]; logger({ category: "agent", message: `Executing tool call: ${item.name} with args: ${JSON.stringify(item.input)}`, level: 1, }); const result = await tool.execute(item.input, { toolCallId: item.id, messages: [], }); toolResult = JSON.stringify(result); logger({ category: "agent", message: `Tool ${item.name} completed successfully. Result: ${toolResult}`, level: 1, }); } catch (toolError) { const errorMessage = toolError instanceof Error ? toolError.message : String(toolError); toolResult = `Error executing tool: ${errorMessage}`; logger({ category: "agent", message: `Error executing tool ${item.name}: ${errorMessage}`, level: 0, }); } } toolResults.push({ type: "tool_result", tool_use_id: item.id, content: [ { type: "text", text: toolResult, }, ], }); logger({ category: "agent", message: `Added custom tool result for tool ${item.name}, tool_use_id: ${item.id}`, level: 2, }); } } catch (error) { const errorMessage = error instanceof Error ? 
/**
 * Converts a tool_use item from the model into an `AgentAction`.
 *
 * For the `computer` tool the action is chosen from `input.action`:
 * - `screenshot`, `type`, `wait`: passed through under the same type;
 * - `click`, `left_click`: become `click` (button defaults to "left");
 * - `key`, `keypress`: become `keypress` with `keys: [input.text]`;
 * - `double_click`, `doubleClick`: become `doubleClick`;
 * - `scroll`: `scroll_direction` and `scroll_amount` (default 5, 100 px per
 *   unit) are turned into `scroll_x` / `scroll_y`; without a direction the
 *   given `scroll_x` / `scroll_y` are used;
 * - `move`: becomes `move`;
 * - `drag`, `left_click_drag`: become `drag`, with `path` taken from
 *   `input.path` or built from `start_coordinate` -> `coordinate` (empty when
 *   either is missing);
 * - anything else: `{ type: action, ...input }`.
 *
 * Pointer positions come from `input.x` / `input.y`, falling back to
 * `input.coordinate` and then to 0. In every branch the raw `input` is spread
 * last, so keys present on `input` win over the computed ones.
 *
 * `str_replace_editor` and `bash` become `{ type: name, params: input }`.
 *
 * @returns The action, or null when the computer action is missing, the item
 *   names a custom tool from `this.tools`, the tool name is unknown, or the
 *   conversion throws
 */
private convertToolUseToAction(item: ToolUseItem): AgentAction | null {
  try {
    const { name, input } = item;

    if (name === "computer") {
      const action = input.action as string;

      if (!action) {
        console.warn("Missing action in tool use item:", item);
        return null;
      }

      // Reads an `[x, y]` pair; anything else counts as "no coordinate".
      const toPair = (value: unknown): [number, number] | undefined =>
        Array.isArray(value) && value.length >= 2
          ? [value[0] as number, value[1] as number]
          : undefined;

      const target = toPair(input.coordinate);
      const pointX = (input.x as number) || (target ? target[0] : 0);
      const pointY = (input.y as number) || (target ? target[1] : 0);

      switch (action) {
        case "screenshot":
          return { type: "screenshot", ...input };

        case "click":
          return {
            type: "click",
            x: pointX,
            y: pointY,
            button: (input.button as string) || "left",
            ...input,
          };

        case "left_click":
          return {
            type: "click",
            x: pointX,
            y: pointY,
            button: "left",
            ...input,
          };

        case "type":
          return { type: "type", text: input.text as string, ...input };

        case "keypress":
        case "key":
          return { type: "keypress", keys: [input.text as string], ...input };

        case "double_click":
        case "doubleClick":
          return { type: "doubleClick", x: pointX, y: pointY, ...input };

        case "scroll": {
          // Convert Anthropic's scroll_amount and scroll_direction into scroll_x and scroll_y
          let scroll_x = 0;
          let scroll_y = 0;
          const scrollAmount = (input.scroll_amount as number) || 5;
          const scrollMultiplier = 100; // Pixels per unit of scroll_amount

          if (input.scroll_direction) {
            const direction = input.scroll_direction as string;
            if (direction === "down") {
              scroll_y = scrollAmount * scrollMultiplier;
            } else if (direction === "up") {
              scroll_y = -scrollAmount * scrollMultiplier;
            } else if (direction === "right") {
              scroll_x = scrollAmount * scrollMultiplier;
            } else if (direction === "left") {
              scroll_x = -scrollAmount * scrollMultiplier;
            }
          } else {
            // Use direct scroll_x and scroll_y if provided
            scroll_x = (input.scroll_x as number) || 0;
            scroll_y = (input.scroll_y as number) || 0;
          }

          return {
            type: "scroll",
            x: pointX,
            y: pointY,
            scroll_x: scroll_x,
            scroll_y: scroll_y,
            ...input,
          };
        }

        case "move":
          return { type: "move", x: pointX, y: pointY, ...input };

        case "drag":
        case "left_click_drag": {
          const start = toPair(input.start_coordinate);
          // Build the two-point path only when both ends are known.
          const derivedPath =
            start && target
              ? [
                  { x: start[0], y: start[1] },
                  { x: target[0], y: target[1] },
                ]
              : [];
          const path =
            (input.path as { x: number; y: number }[]) || derivedPath;

          return { type: "drag", path: path, ...input };
        }

        case "wait":
          return { type: "wait", ...input };

        default:
          // For other computer actions, use the action type directly
          return { type: action, ...input };
      }
    } else if (name === "str_replace_editor" || name === "bash") {
      // For editor or bash tools
      return {
        type: name,
        params: input,
      };
    } else if (this.tools && name in this.tools) {
      // Custom tools are not converted here.
      return null;
    }

    console.warn(`Unknown tool name: ${name}`);
    return null;
  } catch (error) {
    console.error("Error converting tool use to action:", error);
    return null;
  }
}
not been set. " + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", ); } } ================================================ FILE: packages/core/lib/v3/agent/GoogleCUAClient.ts ================================================ import { GoogleGenAI, Content, Part, GenerateContentResponse, FunctionCall, GenerateContentConfig, Tool, GoogleGenAIOptions, } from "@google/genai"; import { LogLine } from "../types/public/logs.js"; import { AgentAction, AgentResult, AgentType, AgentExecutionOptions, SafetyCheck, SafetyConfirmationHandler, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; import { AgentScreenshotProviderError, LLMResponseError, StagehandClosedError, } from "../types/public/sdkErrors.js"; import { buildGoogleCUASystemPrompt } from "../../prompt.js"; import { compressGoogleConversationImages } from "./utils/imageCompression.js"; import { mapKeyToPlaywright } from "./utils/cuaKeyMapping.js"; import { executeGoogleCustomTool, isCustomTool, convertToolSetToFunctionDeclarations, } from "./utils/googleCustomToolHandler.js"; import { ToolSet } from "ai"; import { FlowLogger, extractLlmCuaPromptSummary, extractLlmCuaResponseSummary, } from "../flowlogger/FlowLogger.js"; import { v7 as uuidv7 } from "uuid"; /** * Client for Google's Computer Use Assistant API * This implementation uses the Google Generative AI SDK for Computer Use */ export class GoogleCUAClient extends AgentClient { private apiKey: string; private client: GoogleGenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; private history: Content[] = []; private environment: "ENVIRONMENT_BROWSER" | "ENVIRONMENT_DESKTOP" = "ENVIRONMENT_BROWSER"; private generateContentConfig: GenerateContentConfig; private tools?: ToolSet; private 
baseURL?: string; private safetyConfirmationHandler?: SafetyConfirmationHandler; constructor( type: AgentType, modelName: string, userProvidedInstructions?: string, clientOptions?: ClientOptions, tools?: ToolSet, ) { super(type, modelName, userProvidedInstructions); this.tools = tools; // Process client options this.apiKey = (clientOptions?.apiKey as string) || process.env.GEMINI_API_KEY || process.env.GOOGLE_GENERATIVE_AI_API_KEY || process.env.GOOGLE_API_KEY || ""; this.baseURL = clientOptions?.baseURL as string | undefined; // Initialize the Google Generative AI client const genAIOptions: GoogleGenAIOptions = { apiKey: this.apiKey, ...(this.baseURL ? { httpOptions: { baseUrl: this.baseURL } } : {}), }; this.client = new GoogleGenAI(genAIOptions); // Get environment if specified if ( clientOptions?.environment && typeof clientOptions.environment === "string" ) { this.environment = clientOptions.environment as typeof this.environment; } this.generateContentConfig = { temperature: 1, topP: 0.95, topK: 40, maxOutputTokens: 8192, // systemInstruction: this.userProvidedInstructions // ? { parts: [{ text: this.userProvidedInstructions }] } // : { parts: [{ text: buildGoogleCUASystemPrompt() }] }, tools: [ { computerUse: { environment: this.environment, }, } as Tool, ], }; // Store client options for reference this.clientOptions = { apiKey: this.apiKey, ...(this.baseURL ? 
/**
 * Resolve a Google `safety_decision` attached to a function call.
 *
 * When no safety confirmation handler is registered the decision is
 * acknowledged automatically. Otherwise the decision is wrapped in a single
 * `SafetyCheck` and handed to the handler.
 *
 * @param safetyDecision - Raw `safety_decision` value from the function call args.
 * @param logger - Sink for agent log lines.
 * @returns `"true"` when acknowledged (by the handler or automatically),
 *   `undefined` when the handler rejects the decision.
 */
private async handleSafetyConfirmation(
  safetyDecision: unknown,
  logger: (message: LogLine) => void,
): Promise<string | undefined> {
  const confirm = this.safetyConfirmationHandler;

  // No handler registered: acknowledge on the caller's behalf.
  if (!confirm) {
    logger({
      category: "agent",
      message: `Auto-acknowledging Google safety decision`,
      level: 2,
    });
    return "true";
  }

  // Objects are pretty-printed; everything else is stringified as-is.
  let safetyMessage: string;
  if (typeof safetyDecision === "object") {
    safetyMessage = JSON.stringify(safetyDecision, null, 2);
  } else {
    safetyMessage = String(safetyDecision);
  }

  logger({
    category: "agent",
    message: `Requesting safety confirmation for Google safety decision: ${safetyMessage}`,
    level: 1,
  });

  const pendingChecks: SafetyCheck[] = [
    {
      id: "google-safety-decision",
      code: "safety_decision",
      message: safetyMessage,
    },
  ];
  const verdict = await confirm(pendingChecks);

  if (!verdict.acknowledged) {
    logger({
      category: "agent",
      message: `Safety decision rejected by user`,
      level: 1,
    });
    return undefined;
  }

  logger({
    category: "agent",
    message: `Safety decision acknowledged by user`,
    level: 1,
  });
  return "true";
}
convertToolSetToFunctionDeclarations(this.tools) : []; this.generateContentConfig = { ...this.generateContentConfig, tools: [ { computerUse: { environment: this.environment, }, ...(functionDeclarations.length > 0 ? { functionDeclarations } : {}), } as Tool, ], }; } /** * Execute a task with the Google CUA * This is the main entry point for the agent * @implements AgentClient.execute */ async execute(executionOptions: AgentExecutionOptions): Promise { const { options, logger } = executionOptions; const { instruction } = options; const maxSteps = options.maxSteps || 10; let currentStep = 0; let completed = false; const actions: AgentAction[] = []; const messageList: string[] = []; let finalMessage = ""; this.history = []; // Clear history for new execution // Start with the initial instruction await this.initializeHistory(instruction); let totalInputTokens = 0; let totalOutputTokens = 0; let totalInferenceTime = 0; try { // Execute steps until completion or max steps reached while (!completed && currentStep < maxSteps) { await this.preStepHook?.(); logger({ category: "agent", message: `Executing step ${currentStep + 1}/${maxSteps}`, level: 1, }); const result = await this.executeStep(logger); totalInputTokens += result.usage.input_tokens; totalOutputTokens += result.usage.output_tokens; totalInferenceTime += result.usage.inference_time_ms; // Add actions to the list actions.push(...result.actions); // Update completion status completed = result.completed; // Record any message for this step if (result.message) { messageList.push(result.message); finalMessage = result.message; } // Increment step counter currentStep++; } // Return the final result return { success: completed, actions, message: finalMessage, completed, usage: { input_tokens: totalInputTokens, output_tokens: totalOutputTokens, inference_time_ms: totalInferenceTime, }, }; } catch (error) { const errorMessage = error instanceof Error ? 
/**
 * Execute a single step of the agent.
 *
 * Sends the (image-compressed) history to the model with retries, converts
 * the response into actions via `processResponse`, appends the model turn to
 * `this.history`, runs the actions, and appends one user turn containing the
 * function responses (custom tool results plus a screenshot-backed response
 * for each computer-use function call).
 *
 * @param logger - Sink for agent log lines.
 * @returns The actions taken, the model's text message, whether the task is
 *   complete, and token/latency usage for this step.
 * @throws Rethrows any error after logging it, including the last API error
 *   once all retries are exhausted and `StagehandClosedError` from the
 *   action handler.
 */
async executeStep(logger: (message: LogLine) => void): Promise<{
  actions: AgentAction[];
  message: string;
  completed: boolean;
  usage: {
    input_tokens: number;
    output_tokens: number;
    inference_time_ms: number;
  };
}> {
  try {
    const startTime = Date.now();

    // Compress images in conversation history before sending to the model
    const compressedResult = compressGoogleConversationImages(
      this.history,
      2,
    );
    const compressedHistory = compressedResult.items;

    // Use the SDK's generateContent method with retry logic (matching Python's get_model_response)
    const maxRetries = 5;
    const baseDelayS = 1;
    let lastError: Error | null = null;
    let response: GenerateContentResponse | null = null;

    // Log LLM request
    const llmRequestId = uuidv7();
    FlowLogger.logLlmRequest({
      requestId: llmRequestId,
      model: this.modelName,
      prompt: extractLlmCuaPromptSummary(compressedHistory),
    });

    for (let attempt = 0; attempt < maxRetries; attempt++) {
      try {
        // Add exponential backoff delay for retries
        if (attempt > 0) {
          const delay = baseDelayS * Math.pow(2, attempt) * 1000; // Convert to ms
          // `attempt` is zero-based and already points at the retry about to
          // start, so the attempt that just failed is number `attempt`.
          logger({
            category: "agent",
            message: `Generating content failed on attempt ${attempt}. Retrying in ${delay / 1000} seconds...`,
            level: 2,
          });
          await new Promise((resolve) => setTimeout(resolve, delay));
        }

        // Use the SDK's generateContent method - following Python SDK pattern
        response = await this.client.models.generateContent({
          model: this.modelName,
          contents: compressedHistory,
          config: this.generateContentConfig,
        });

        // Check if we have valid response content
        if (!response.candidates || response.candidates.length === 0) {
          throw new LLMResponseError("agent", "Response has no candidates!");
        }

        const candidate = response.candidates[0];
        if (!candidate.content || !candidate.content.parts) {
          const reason = candidate.finishReason || "unknown";
          throw new LLMResponseError(
            "agent",
            `Response has no content (finish reason: ${reason})`,
          );
        }

        // Success - we have a valid response
        break;
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));
        logger({
          category: "agent",
          message: `API call error: ${lastError.message}`,
          level: 2,
        });

        // If this was the last attempt, throw the error
        if (attempt === maxRetries - 1) {
          logger({
            category: "agent",
            message: `Generating content failed after ${maxRetries} attempts.`,
            level: 0,
          });
          throw lastError;
        }
      }
    }

    if (!response) {
      throw (
        lastError || new Error("Failed to get response after all retries")
      );
    }

    const endTime = Date.now();
    const elapsedMs = endTime - startTime;
    const { usageMetadata } = response;

    // Log LLM response
    FlowLogger.logLlmResponse({
      requestId: llmRequestId,
      model: this.modelName,
      output: extractLlmCuaResponseSummary(response),
      inputTokens: usageMetadata?.promptTokenCount,
      outputTokens: usageMetadata?.candidatesTokenCount,
    });

    // Process the response
    const result = await this.processResponse(response, logger);

    // Add model response to history
    if (response.candidates && response.candidates[0]) {
      // Sanitize any out-of-range coordinates in function calls before adding
      // to history. Work on a deep copy so the live response is untouched.
      const sanitizedContent = JSON.parse(
        JSON.stringify(response.candidates[0].content),
      );
      if (sanitizedContent.parts) {
        // drag_and_drop carries a second coordinate pair, so clamp it too.
        const coordinateKeys = ["x", "y", "destination_x", "destination_y"];
        for (const part of sanitizedContent.parts) {
          const callArgs = part.functionCall?.args;
          if (!callArgs) {
            continue;
          }
          for (const key of coordinateKeys) {
            if (typeof callArgs[key] === "number" && callArgs[key] > 999) {
              callArgs[key] = 999;
            }
          }
        }
      }
      this.history.push(sanitizedContent);
    }

    // Execute actions and collect function responses
    const functionResponses: Part[] = [];

    if (result.actions.length > 0) {
      let hasError = false;
      // Function calls already paired with a custom_tool action, so that two
      // calls to the same tool in one response are not both answered with the
      // first call.
      const consumedFunctionCalls = new Set<FunctionCall>();

      // Execute all actions
      for (let i = 0; i < result.actions.length; i++) {
        const action = result.actions[i];
        logger({
          category: "agent",
          message: `Executing action ${i + 1}/${result.actions.length}: ${action.type}`,
          level: 2,
        });

        // Special handling for open_web_browser - don't execute it
        if (action.type === "open_web_browser") {
          // Set pageUrl for open_web_browser since it doesn't go through action handler
          action.pageUrl = this.currentUrl;
          logger({
            category: "agent",
            message: "Skipping open_web_browser action",
            level: 2,
          });
        } else if (action.type === "custom_tool") {
          const toolName = action.name as string;
          const toolArgs = action.arguments as Record<string, unknown>;

          if (this.tools && toolName in this.tools) {
            const correspondingFunctionCall = result.functionCalls.find(
              (fc) => fc.name === toolName && !consumedFunctionCalls.has(fc),
            );

            if (correspondingFunctionCall) {
              consumedFunctionCalls.add(correspondingFunctionCall);
              const executionResult = await executeGoogleCustomTool(
                toolName,
                toolArgs,
                this.tools,
                correspondingFunctionCall,
                logger,
              );
              functionResponses.push(executionResult.functionResponse);
              if (!executionResult.success) {
                hasError = true;
              }
            }
          }
        } else if (this.actionHandler) {
          try {
            await this.actionHandler(action);

            // Add a delay between actions to ensure they complete properly
            // Longer delay for typing actions to ensure fields are ready
            if (i < result.actions.length - 1) {
              const nextAction = result.actions[i + 1];
              const isTypingAction =
                action.type === "type" || nextAction.type === "type";
              const delay = isTypingAction ? 500 : 200;
              await new Promise((resolve) => setTimeout(resolve, delay));
            }
          } catch (actionError) {
            // A closed Stagehand instance cannot recover; abort the step.
            if (actionError instanceof StagehandClosedError) {
              throw actionError;
            }
            logger({
              category: "agent",
              message: `Error executing action ${action.type}: ${actionError}`,
              level: 0,
            });
            hasError = true;
            // Continue processing other actions even if one fails
          }
        }
      }

      // Create function responses for computer use actions (non-custom tools)
      // We need exactly one response per function call, regardless of how many actions were generated
      if (result.functionCalls.length > 0 || hasError) {
        // Filter out custom tool function calls as they've already been handled
        const computerUseFunctionCalls = result.functionCalls.filter(
          (fc) => !isCustomTool(fc, this.tools),
        );

        if (computerUseFunctionCalls.length > 0) {
          try {
            logger({
              category: "agent",
              message: `Taking screenshot after executing ${result.actions.length} actions${hasError ? " (with errors)" : ""}`,
              level: 2,
            });

            const screenshot = await this.captureScreenshot();
            const base64Data = screenshot.replace(
              /^data:image\/png;base64,/,
              "",
            );

            // Create one function response for each computer use function call
            // Following Python SDK pattern: FunctionResponse with parts containing inline_data
            // NOTE(review): the safety confirmation below runs after the
            // actions above have already been executed - confirm this
            // ordering is intended.
            for (const functionCall of computerUseFunctionCalls) {
              let safetyAcknowledgement: string | undefined;
              if (functionCall.args?.safety_decision) {
                safetyAcknowledgement = await this.handleSafetyConfirmation(
                  functionCall.args.safety_decision,
                  logger,
                );
              }

              const functionResponsePart: Part = {
                functionResponse: {
                  name: functionCall.name,
                  response: {
                    url: this.currentUrl || "",
                    ...(safetyAcknowledgement !== undefined
                      ? {
                          safety_acknowledgement: safetyAcknowledgement,
                        }
                      : {}),
                  },
                  parts: [
                    {
                      inlineData: {
                        mimeType: "image/png",
                        data: base64Data,
                      },
                    },
                  ],
                },
              };
              functionResponses.push(functionResponsePart);
            }
          } catch (error) {
            logger({
              category: "agent",
              message: `Error capturing screenshot: ${error}`,
              level: 0,
            });
          }
        }
      }

      // Add all function responses to history in a single user message
      if (functionResponses.length > 0) {
        logger({
          category: "agent",
          message: `Adding ${functionResponses.length} function responses to history`,
          level: 2,
        });
        this.history.push({
          role: "user",
          parts: functionResponses,
        });
      }
    }

    return {
      actions: result.actions,
      message: result.message,
      completed: result.completed,
      usage: {
        input_tokens: usageMetadata?.promptTokenCount || 0,
        output_tokens: usageMetadata?.candidatesTokenCount || 0,
        inference_time_ms: elapsedMs,
      },
    };
  } catch (error) {
    const errorMessage =
      error instanceof Error ? error.message : String(error);
    logger({
      category: "agent",
      message: `Error executing step: ${errorMessage}`,
      level: 0,
    });
    throw error;
  }
}
/**
 * Convert Google function call to Stagehand action.
 *
 * Coordinates arriving from the model are mapped to viewport pixels through
 * `normalizeCoordinates`. Functions that take no parameters (and custom
 * tools) are still converted when the call carries no `args` object.
 *
 * @param functionCall - Function call part emitted by the model.
 * @returns The equivalent action, or `null` when the call has no name, is
 *   missing arguments it requires, or names an unsupported function.
 */
private convertFunctionCallToAction(
  functionCall: FunctionCall,
): AgentAction | null {
  const { name } = functionCall;
  if (!name) {
    return null;
  }

  // These predefined functions take no parameters, so a missing `args`
  // object must not cause them to be dropped.
  const takesNoArgs =
    name === "open_web_browser" ||
    name === "go_back" ||
    name === "go_forward" ||
    name === "wait_5_seconds" ||
    name === "search";

  if (
    !functionCall.args &&
    !takesNoArgs &&
    !isCustomTool(functionCall, this.tools)
  ) {
    return null;
  }
  const args = functionCall.args ?? {};

  switch (name) {
    case "open_web_browser":
      return {
        type: "open_web_browser",
        timestamp: Date.now(),
      };

    case "click_at": {
      const { x, y } = this.normalizeCoordinates(
        args.x as number,
        args.y as number,
      );
      return {
        type: "click",
        x,
        y,
        button: args.button || "left",
      };
    }

    case "type_text_at": {
      const { x, y } = this.normalizeCoordinates(
        args.x as number,
        args.y as number,
      );
      // Google's type_text_at includes press_enter and clear_before_typing parameters
      const pressEnter = (args.press_enter as boolean) ?? false;
      const clearBeforeTyping = (args.clear_before_typing as boolean) ?? true;

      // For type_text_at, we need to click first then type
      // This matches the behavior expected by Google's CUA
      // The caller expands this into click / clear / type / enter actions
      return {
        type: "type",
        text: args.text as string,
        x,
        y,
        pressEnter,
        clearBeforeTyping,
      };
    }

    case "key_combination": {
      // Accept either a "Control+A" style string or an array of key names;
      // anything else cannot be converted.
      const rawKeys = args.keys;
      let keyNames: string[];
      if (typeof rawKeys === "string") {
        keyNames = rawKeys.split("+");
      } else if (Array.isArray(rawKeys)) {
        keyNames = rawKeys.map((key) => String(key));
      } else {
        console.warn("Missing keys in Google CUA key_combination call");
        return null;
      }
      const keys = keyNames
        .map((key: string) => key.trim())
        .map((key: string) => mapKeyToPlaywright(key));
      return {
        type: "keypress",
        keys,
      };
    }

    case "scroll_document": {
      // Default to scrolling down when no direction is given, as scroll_at does.
      const direction = ((args.direction as string) || "down").toLowerCase();

      if (direction === "left" || direction === "right") {
        // Page keys cannot scroll sideways: emit a wheel scroll of half the
        // viewport width anchored at the viewport centre instead.
        const halfWidth = Math.floor(this.currentViewport.width / 2);
        return {
          type: "scroll",
          x: halfWidth,
          y: Math.floor(this.currentViewport.height / 2),
          scroll_x: direction === "left" ? -halfWidth : halfWidth,
          scroll_y: 0,
        };
      }

      return {
        type: "keypress",
        keys: [direction === "up" ? "PageUp" : "PageDown"],
      };
    }

    case "scroll_at": {
      const { x, y } = this.normalizeCoordinates(
        args.x as number,
        args.y as number,
      );
      const direction = ((args.direction as string) || "down").toLowerCase();
      const magnitude =
        typeof args.magnitude === "number" ? (args.magnitude as number) : 800;

      let scroll_x = 0;
      let scroll_y = 0;
      if (direction === "up") {
        scroll_y = -magnitude;
      } else if (direction === "down") {
        scroll_y = magnitude;
      } else if (direction === "left") {
        scroll_x = -magnitude;
      } else if (direction === "right") {
        scroll_x = magnitude;
      } else {
        // Default to down if unknown direction
        scroll_y = magnitude;
      }

      return {
        type: "scroll",
        x,
        y,
        scroll_x,
        scroll_y,
      };
    }

    case "navigate":
      return {
        type: "goto",
        url: args.url as string,
      };

    case "go_back":
      return {
        type: "back",
      };

    case "go_forward":
      return {
        type: "forward",
      };

    case "wait_5_seconds":
      return {
        type: "wait",
        timeMs: 5000, // Google CUA waits for 5 seconds
      };

    case "hover_at": {
      const { x, y } = this.normalizeCoordinates(
        args.x as number,
        args.y as number,
      );
      return {
        type: "move",
        x,
        y,
      };
    }

    case "search":
      return {
        type: "goto",
        url: "https://www.google.com",
      };

    case "drag_and_drop": {
      const startPoint = this.normalizeCoordinates(
        args.x as number,
        args.y as number,
      );
      const endPoint = this.normalizeCoordinates(
        args.destination_x as number,
        args.destination_y as number,
      );
      return {
        type: "drag",
        path: [
          { x: startPoint.x, y: startPoint.y },
          { x: endPoint.x, y: endPoint.y },
        ],
      };
    }

    default:
      if (isCustomTool(functionCall, this.tools)) {
        return {
          type: "custom_tool",
          name,
          arguments: args,
          timestamp: Date.now(),
          pageUrl: this.currentUrl,
        };
      }
      console.warn(`Unsupported Google CUA function: ${name}`);
      return null;
  }
}
this.currentUrl = options.currentUrl; } // Use provided options if available if (options?.base64Image) { return `data:image/png;base64,${options.base64Image}`; } // Use the screenshot provider if available if (this.screenshotProvider) { try { const base64Image = await this.screenshotProvider(); return `data:image/png;base64,${base64Image}`; } catch (error) { console.error("Error capturing screenshot:", error); throw error; } } throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. " + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", ); } } ================================================ FILE: packages/core/lib/v3/agent/MicrosoftCUAClient.ts ================================================ import OpenAI from "openai"; import { LogLine } from "../types/public/logs.js"; import { AgentAction, AgentResult, AgentType, AgentExecutionOptions, } from "../types/public/agent.js"; import { ClientOptions } from "../types/public/model.js"; import { AgentClient } from "./AgentClient.js"; import { AgentScreenshotProviderError } from "../types/public/sdkErrors.js"; import { mapKeyToPlaywright } from "./utils/cuaKeyMapping.js"; import { ChatCompletionMessageParam } from "openai/resources/chat/completions"; /** * Message types for FARA agent */ interface FaraMessage { role: "system" | "user" | "assistant"; content: string | FaraMessageContent[]; } interface FaraMessageContent { type: "text" | "image_url"; text?: string; image_url?: { url: string; // data:image/png;base64,... 
}; } /** * FARA function call structure (parsed from XML tags) */ interface FaraFunctionCall { name: string; // Always "computer_use" arguments: { action: string; thoughts?: string; [key: string]: unknown; }; } /** * Client for FARA (Function-based Autonomous Research Agent) by Microsoft * This implementation uses OpenAI-compatible API with XML-based tool calling */ export class MicrosoftCUAClient extends AgentClient { private apiKey: string; private baseURL: string; private client: OpenAI; private currentViewport = { width: 1288, height: 711 }; private currentUrl?: string; private screenshotProvider?: () => Promise; private actionHandler?: (action: AgentAction) => Promise; // Dual history system private conversationHistory: FaraMessage[] = []; // Conceptual flow private actionHistory: FaraMessage[] = []; // Raw model responses private maxImages: number = 3; private temperature: number = 0; private facts: string[] = []; // FARA-specific MLM processor config private readonly MLM_PROCESSOR_IM_CFG = { min_pixels: 3136, max_pixels: 12845056, patch_size: 14, merge_size: 2, }; // Resized dimensions for model input private resizedViewport = { width: 1288, height: 711 }; constructor( type: AgentType, modelName: string, userProvidedInstructions?: string, clientOptions?: ClientOptions, ) { super(type, modelName || "fara-7b", userProvidedInstructions); // Process client options this.apiKey = (clientOptions?.apiKey as string) || process.env.AZURE_API_KEY || process.env.FIREWORKS_API_KEY || ""; this.baseURL = (clientOptions?.baseURL as string) || process.env.AZURE_ENDPOINT || process.env.FIREWORKS_ENDPOINT || ""; // Store client options for reference this.clientOptions = { apiKey: this.apiKey, baseURL: this.baseURL, }; // Validate API key if (!this.apiKey || this.apiKey === "") { throw new Error( "API key is required. 
/**
 * Smart resize algorithm from FARA.
 *
 * Rounds both dimensions to a multiple of `patch_size * merge_size`, then
 * rescales (preserving the aspect ratio as closely as the rounding allows)
 * when the pixel count falls outside `[min_pixels, max_pixels]`. Neither
 * returned dimension is ever smaller than one factor.
 *
 * @param width - Source width in pixels; expected to be a positive finite number.
 * @param height - Source height in pixels; expected to be a positive finite number.
 * @returns The resized dimensions, each a multiple of the factor.
 */
private smartResize(
  width: number,
  height: number,
): { width: number; height: number } {
  const { patch_size, merge_size, min_pixels, max_pixels } =
    this.MLM_PROCESSOR_IM_CFG;
  const factor = patch_size * merge_size;

  const roundByFactor = (num: number) => Math.round(num / factor) * factor;
  const ceilByFactor = (num: number) => Math.ceil(num / factor) * factor;
  const floorByFactor = (num: number) => Math.floor(num / factor) * factor;

  let h_bar = Math.max(factor, roundByFactor(height));
  let w_bar = Math.max(factor, roundByFactor(width));

  if (h_bar * w_bar > max_pixels) {
    const beta = Math.sqrt((height * width) / max_pixels);
    // Flooring can reach 0 for extreme aspect ratios; keep at least one
    // factor so the result is never a zero-sized dimension.
    h_bar = Math.max(factor, floorByFactor(height / beta));
    w_bar = Math.max(factor, floorByFactor(width / beta));
  } else if (h_bar * w_bar < min_pixels) {
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = ceilByFactor(height * beta);
    w_bar = ceilByFactor(width * beta);
  }

  return { width: w_bar, height: h_bar };
}
* On some search bars, when you type(), you may need to press_enter=False and instead separately call left_click() on the search button to submit the search query. This is especially true of search bars that have auto-suggest popups for e.g. locations * For calendar widgets, you usually need to left_click() on arrows to move between months and left_click() on dates to select them; type() is not typically used to input dates there.`; // Tool parameters description const actionsDescription = `The action to perform. The available actions are: * \`key\`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. Includes "Enter", "Alt", "Shift", "Tab", "Control", "Backspace", "Delete", "Escape", "ArrowUp", "ArrowDown", "ArrowLeft", "ArrowRight", "PageDown", "PageUp", "Shift", etc. * \`type\`: Type a string of text on the keyboard. * \`mouse_move\`: Move the cursor to a specified (x, y) pixel coordinate on the screen. * \`left_click\`: Click the left mouse button. * \`scroll\`: Performs a scroll of the mouse scroll wheel. * \`history_back\`: Go back to the previous page in the browser history. * \`pause_and_memorize_fact\`: Pause and memorize a fact for future reference. * \`visit_url\`: Visit a specified URL. * \`web_search\`: Perform a web search with a specified query. * \`wait\`: Wait specified seconds for the change to happen. 
* \`terminate\`: Terminate the current task and report its completion status.`; // Tool JSON schema const toolSchema = { name: "computer_use", description: toolDescription, parameters: { type: "object", required: ["action"], properties: { action: { type: "string", description: actionsDescription, enum: [ "key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate", ], }, keys: { type: "array", description: "Required only by `action=key`.", }, text: { type: "string", description: "Required only by `action=type`.", }, press_enter: { type: "boolean", description: "Whether to press the Enter key after typing. Required only by `action=type`.", }, delete_existing_text: { type: "boolean", description: "Whether to delete existing text before typing. Required only by `action=type`.", }, coordinate: { type: "array", description: "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", }, pixels: { type: "number", description: "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", }, fact: { type: "string", description: "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", }, time: { type: "number", description: "The seconds to wait. Required only by `action=wait`.", }, status: { type: "string", description: "The status of the task. 
Required only by `action=terminate`.", enum: ["success", "failure"], }, }, }, }; // Format as FARA function calling template (FN_CALL_TEMPLATE format) const toolDescs = JSON.stringify(toolSchema, null, 2); const functionCallTemplate = ` You are provided with function signatures within XML tags: ${toolDescs} For each function call, return a json object with function name and arguments within XML tags: {{"name": , "arguments": }} `; return `${basePrompt}\n\n${functionCallTemplate}`; } /** * Parse thoughts and action from model response * FARA uses XML-based tool calling: \n{...}\n */ private parseThoughtsAndAction(response: string): { thoughts: string; functionCall: FaraFunctionCall; } { try { const parts = response.split("\n"); const thoughts = parts[0].trim(); const actionText = parts[1].split("\n")[0].trim(); let parsedAction; try { parsedAction = JSON.parse(actionText); } catch (jsonError) { // Fix common malformed JSON: double opening brackets {{"name": ...}} // This happens when the model adds an extra opening brace if (actionText.startsWith("{{") && actionText.endsWith("}")) { // Remove the extra opening brace const fixedText = actionText.slice(1); try { parsedAction = JSON.parse(fixedText); } catch (retryError) { throw new Error( `Failed to parse action text even after fixing double brackets. Original: ${actionText}. Fixed: ${fixedText}. Error: ${retryError}`, { cause: retryError }, ); } } else { throw new Error( `Failed to parse action text as JSON: ${actionText}. Error: ${jsonError}`, { cause: jsonError }, ); } } return { thoughts, functionCall: { name: parsedAction.name || "computer_use", arguments: { ...parsedAction.arguments, thoughts, }, }, }; } catch (error) { throw new Error( `Failed to parse FARA tool call from response: ${response}. 
Error: ${error}`, { cause: error }, ); } } /** * Convert FARA function call to Stagehand AgentAction */ private convertFunctionCallToAction( functionCall: FaraFunctionCall, ): AgentAction { const args = functionCall.arguments; const action = args.action as string; // Transform coordinates from resized to original viewport const transformCoordinate = (coord: number[]): number[] => { if (!coord || coord.length !== 2) return coord; const [x, y] = coord; const scaleX = this.currentViewport.width / this.resizedViewport.width; const scaleY = this.currentViewport.height / this.resizedViewport.height; return [Math.round(x * scaleX), Math.round(y * scaleY)]; }; const baseAction = { type: action, reasoning: args.thoughts as string, }; switch (action) { case "left_click": { const clickCoord = transformCoordinate(args.coordinate as number[]); return { ...baseAction, type: "click", x: clickCoord[0], y: clickCoord[1], button: "left" as const, }; } case "mouse_move": { const moveCoord = transformCoordinate(args.coordinate as number[]); return { ...baseAction, type: "move", coordinate: moveCoord, }; } case "type": { const typeCoord = args.coordinate ? transformCoordinate(args.coordinate as number[]) : undefined; return { ...baseAction, text: args.text as string, ...(typeCoord && { x: typeCoord[0], y: typeCoord[1] }), press_enter: args.press_enter !== undefined ? 
(args.press_enter as boolean) : true, ...(args.delete_existing_text !== undefined && { delete_existing_text: args.delete_existing_text as boolean, }), }; } case "key": case "keypress": { const keys = (args.keys as string[]) || []; // Normalize keys to Playwright format const normalizedKeys = keys.map((k) => mapKeyToPlaywright(k)); return { ...baseAction, type: "keypress", keys: normalizedKeys, }; } case "scroll": { const pixels = (args.pixels as number) || 0; // FARA: positive = scroll up, negative = scroll down // Convert to scroll_x/scroll_y return { ...baseAction, scroll_x: 0, scroll_y: -pixels, // Invert: negative pixels = scroll down }; } case "visit_url": { let url = args.url as string; // Enhanced URL processing like Python if ( !url.startsWith("https://") && !url.startsWith("http://") && !url.startsWith("file://") && !url.startsWith("about:") ) { // If URL contains space, treat as search query if (url.includes(" ")) { url = `https://www.bing.com/search?q=${encodeURIComponent(url)}&FORM=QBLH`; } else { // Otherwise prefix with https:// url = "https://" + url; } } return { ...baseAction, type: "goto", url, }; } case "web_search": { // Convert web search to visit_url with Bing search const query = args.query as string; const searchUrl = `https://www.bing.com/search?q=${encodeURIComponent(query)}&FORM=QBLH`; return { ...baseAction, type: "goto", url: searchUrl, }; } case "history_back": return { ...baseAction, type: "back", }; case "wait": { // Support both 'time' and 'duration' parameters with default (matches Python) const durationSeconds = (args.time as number) || (args.duration as number) || 3.0; return { ...baseAction, timeMs: durationSeconds * 1000, // Convert seconds to ms }; } case "pause_and_memorize_fact": { // Store the fact for future reference (matches Python) const fact = args.fact as string; this.facts.push(fact); return { ...baseAction, fact, }; } case "terminate": return { ...baseAction, status: args.status as string, }; default: return { 
...baseAction, ...args, }; } } /** * Capture a screenshot and return as base64 data URL */ async captureScreenshot(): Promise { if (!this.screenshotProvider) { throw new AgentScreenshotProviderError("Screenshot provider not set"); } const base64Screenshot = await this.screenshotProvider(); return `data:image/png;base64,${base64Screenshot}`; } /** * Remove old screenshots from history * Matches Python's maybe_remove_old_screenshots */ private maybeRemoveOldScreenshots( history: FaraMessage[], includesCurrent: boolean = false, ): FaraMessage[] { if (this.maxImages <= 0) { return history; } const maxImages = includesCurrent ? this.maxImages : this.maxImages - 1; const newHistory: FaraMessage[] = []; let nImages = 0; // Iterate backwards for (let i = history.length - 1; i >= 0; i--) { const msg = history[i]; // Check if message has image let hasImage = false; if (Array.isArray(msg.content)) { hasImage = msg.content.some((c) => c.type === "image_url"); } if (i === 0 && nImages >= maxImages) { // First message (task) - preserve text, remove image if (Array.isArray(msg.content)) { const newContent = msg.content.filter((c) => c.type !== "image_url"); // If no content left, skip (unless it's the only message, but Python logic says continue) if (newContent.length === 0) { continue; } newHistory.push({ ...msg, content: newContent }); } else { newHistory.push(msg); } continue; } if (hasImage) { if (nImages < maxImages) { newHistory.push(msg); nImages++; } else { // Remove image, keep text if (Array.isArray(msg.content)) { const newContent = msg.content.filter( (c) => c.type !== "image_url", ); // If content becomes empty, we can skip this message entirely (unless it's meaningful text) // Python logic: if msg is None continue. 
if (newContent.length > 0) { newHistory.push({ ...msg, content: newContent }); } } else { newHistory.push(msg); } } } else { newHistory.push(msg); } } return newHistory.reverse(); } /** * Reconstruct history for API call * Merges conceptual chat history with raw action history */ private reconstructHistory(): FaraMessage[] { const history: FaraMessage[] = []; let actionTurn = 0; for (let i = 0; i < this.conversationHistory.length; i++) { const m = this.conversationHistory[i]; if (m.role === "assistant") { if (actionTurn >= this.actionHistory.length) { // Should not happen if synced correctly console.warn("OUT OF SYNC: Action history shorter than chat history"); history.push(m); } else { history.push(this.actionHistory[actionTurn]); actionTurn++; } } else { history.push(m); } } return this.maybeRemoveOldScreenshots(history); } /** * Execute a single step */ private async executeStep( logger: (message: LogLine) => void, isFirstRound: boolean = false, ): Promise<{ actions: AgentAction[]; completed: boolean; usage: { input_tokens: number; output_tokens: number; inference_time_ms: number; }; }> { // Capture screenshot const screenshotDataUrl = await this.captureScreenshot(); // Update conversation history with new screenshot/message if (isFirstRound) { // First round: modify the last message (initial user instruction) to include screenshot const lastMessage = this.conversationHistory[this.conversationHistory.length - 1]; if (lastMessage && lastMessage.role === "user") { const originalContent = typeof lastMessage.content === "string" ? lastMessage.content : (lastMessage.content.find((c) => c.type === "text")?.text ?? 
"Start task"); lastMessage.content = [ { type: "image_url", image_url: { url: screenshotDataUrl }, }, { type: "text", text: originalContent, }, ]; } } else { // Subsequent rounds: add new user message with screenshot const userContent: FaraMessageContent[] = [ { type: "image_url", image_url: { url: screenshotDataUrl }, }, ]; // Add current URL if available let textPrompt = "Here is the next screenshot. Think about what to do next."; if (this.currentUrl) { const trimmedUrl = this.currentUrl.length > 100 ? this.currentUrl.slice(0, 100) + "..." : this.currentUrl; textPrompt = `Current URL: ${trimmedUrl}\n${textPrompt}`; } userContent.push({ type: "text", text: textPrompt, }); this.conversationHistory.push({ role: "user", content: userContent, }); } // Reconstruct history for model call let history = this.reconstructHistory(); // Prepend system prompt (generated fresh) const systemMessage: FaraMessage = { role: "system", content: this.generateSystemPrompt(), }; history = [systemMessage, ...history]; // Make API call logger({ category: "agent", message: `Making API call to FARA model with ${history.length} messages`, level: 2, }); const startTime = Date.now(); let response; try { response = await this.client.chat.completions.create({ model: this.modelName, messages: history as unknown as ChatCompletionMessageParam[], temperature: this.temperature, }); } catch (apiError) { logger({ category: "agent", message: `API call failed: ${apiError instanceof Error ? 
apiError.message : String(apiError)}`, level: 0, }); throw apiError; } const inferenceTime = Date.now() - startTime; logger({ category: "agent", message: `API call completed in ${inferenceTime}ms`, level: 2, }); const content = response.choices[0].message.content || ""; const usage = response.usage || { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0, }; // Add assistant response to both histories const assistantMsg: FaraMessage = { role: "assistant", content, }; this.conversationHistory.push(assistantMsg); this.actionHistory.push(assistantMsg); logger({ category: "agent", message: `Model response: ${content}`, level: 2, }); // Parse tool call const { thoughts, functionCall } = this.parseThoughtsAndAction(content); logger({ category: "agent", message: `Thoughts: ${thoughts}`, level: 2, }); logger({ category: "agent", message: `Action: ${JSON.stringify(functionCall.arguments)}`, level: 2, }); // Convert to AgentAction const agentAction = this.convertFunctionCallToAction(functionCall); // Expand type action into multiple actions if it has coordinates const actions: AgentAction[] = []; if ( agentAction.type === "type" && typeof agentAction.x === "number" && typeof agentAction.y === "number" ) { // First, click at the coordinates to focus the field actions.push({ type: "click", x: agentAction.x, y: agentAction.y, button: "left", }); // If delete_existing_text is true, clear the field first if (agentAction.delete_existing_text) { actions.push({ type: "keypress", keys: ["Command+A"], }); actions.push({ type: "keypress", keys: ["Backspace"], }); } // Add the type action (without coordinates since we already clicked) actions.push({ type: "type", text: agentAction.text, }); // If press_enter is true (default), press Enter after typing if (agentAction.press_enter !== false) { actions.push({ type: "keypress", keys: ["Enter"], }); } } else { // For all other actions, just add as-is actions.push(agentAction); } // Execute all actions if handler is available if 
(this.actionHandler && agentAction.type !== "terminate") {
      for (const action of actions) {
        await this.actionHandler(action);
      }
    }

    // Check if completed
    const completed = functionCall.arguments.action === "terminate";

    return {
      actions,
      completed,
      usage: {
        input_tokens: usage.prompt_tokens,
        output_tokens: usage.completion_tokens,
        inference_time_ms: inferenceTime,
      },
    };
  }

  /**
   * Execute a task with the FARA CUA
   * This is the main entry point for the agent
   *
   * Runs executeStep until the model terminates or maxSteps (default 10) is
   * reached, accumulating actions, per-step reasoning and token usage.
   * Errors are logged and rethrown rather than converted into a result.
   * @implements AgentClient.execute
   */
  async execute(executionOptions: AgentExecutionOptions): Promise<AgentResult> {
    const { options, logger } = executionOptions;
    const { instruction } = options;
    const maxSteps = options.maxSteps || 10;

    let currentStep = 0;
    let completed = false;
    const actions: AgentAction[] = [];
    const messageList: string[] = [];
    let finalMessage: string;

    let totalInputTokens = 0;
    let totalOutputTokens = 0;
    let totalInferenceTime = 0;

    // Initialize conversation with user instruction
    // System prompt is NOT added here, it's added dynamically in executeStep
    this.conversationHistory = [
      {
        role: "user",
        content: instruction,
      },
    ];
    this.actionHistory = [];

    try {
      // Execute steps until completion or max steps reached
      while (!completed && currentStep < maxSteps) {
        await this.preStepHook?.();
        logger({
          category: "agent",
          message: `Executing step ${currentStep + 1}/${maxSteps}`,
          level: 1,
        });

        const isFirstRound = currentStep === 0;
        const result = await this.executeStep(logger, isFirstRound);

        totalInputTokens += result.usage.input_tokens;
        totalOutputTokens += result.usage.output_tokens;
        totalInferenceTime += result.usage.inference_time_ms;

        // Add actions to the list
        actions.push(...result.actions);

        // Update completion status
        completed = result.completed;
        currentStep++;

        // Record message for this step
        const lastAction = result.actions[result.actions.length - 1];
        if (lastAction?.reasoning) {
          messageList.push(lastAction.reasoning);
        }
      }

      // Generate final message
      if (completed) {
        const lastAction = actions[actions.length - 1];
        finalMessage =
          (lastAction as { status?: string })?.status === "success"
            ? "Task completed successfully."
            : "Task completed with failures.";
      } else {
        finalMessage = `Reached maximum steps (${maxSteps}) without completion.`;
      }

      if (messageList.length > 0) {
        finalMessage = `${messageList.join("\n\n")}\n\n${finalMessage}`;
      }

      return {
        success: completed,
        completed,
        message: finalMessage,
        actions,
        usage: {
          input_tokens: totalInputTokens,
          output_tokens: totalOutputTokens,
          inference_time_ms: totalInferenceTime,
        },
      };
    } catch (error) {
      logger({
        category: "agent",
        message: `Error during execution: ${error}`,
        level: 0,
      });
      // Rethrow to allow eval runner's retry logic to handle transient errors
      throw error;
    }
  }
}


================================================
FILE: packages/core/lib/v3/agent/OpenAICUAClient.ts
================================================
import OpenAI from "openai";
import { LogLine } from "../types/public/logs.js";
import {
  AgentAction,
  AgentResult,
  AgentType,
  AgentExecutionOptions,
  ResponseInputItem,
  ResponseItem,
  ComputerCallItem,
  FunctionCallItem,
  SafetyCheck,
  SafetyConfirmationHandler,
} from "../types/public/agent.js";
import { ClientOptions } from "../types/public/model.js";
import { AgentClient } from "./AgentClient.js";
import {
  AgentScreenshotProviderError,
  StagehandClosedError,
} from "../types/public/sdkErrors.js";
import { ToolSet } from "ai";
import {
  FlowLogger,
  extractLlmCuaPromptSummary,
  extractLlmCuaResponseSummary,
} from "../flowlogger/FlowLogger.js";
import { v7 as uuidv7 } from "uuid";

/**
 * Client for OpenAI's Computer Use Assistant API
 * This implementation uses the official OpenAI Responses API for Computer Use
 */
const CAPTCHA_PROCEED_TOOL = "captchaSolvedProceed";

export class OpenAICUAClient extends AgentClient {
  private pendingContextNotes: string[] = [];
  private captchaSolvedToolActive = false;
  private apiKey: string;
  private organization?: string;
  // Optional: left undefined when no baseURL is configured.
  private baseURL?: string;
  private client: OpenAI;
  public lastResponseId?: string;
  private currentViewport = { width: 1288, height: 711 };
  private currentUrl?: string;
  private screenshotProvider?: () => Promise<string>;
  private actionHandler?: (action: AgentAction) => Promise<void>;
  // Reasoning items from model output, keyed by item id.
  private reasoningItems: Map<string, ResponseItem> = new Map();
  private environment: string = "browser"; // "browser", "mac", "windows", or "ubuntu"
  private tools?: ToolSet;
  private safetyConfirmationHandler?: SafetyConfirmationHandler;

  /**
   * @param clientOptions Optional apiKey / baseURL / organization /
   *   environment overrides. apiKey falls back to OPENAI_API_KEY and
   *   organization to OPENAI_ORG.
   * @param tools Optional custom tools exposed to the model as functions.
   */
  constructor(
    type: AgentType,
    modelName: string,
    userProvidedInstructions?: string,
    clientOptions?: ClientOptions,
    tools?: ToolSet,
  ) {
    super(type, modelName, userProvidedInstructions);

    // Process client options
    this.apiKey =
      (clientOptions?.apiKey as string) || process.env.OPENAI_API_KEY || "";
    this.baseURL = (clientOptions?.baseURL as string) || undefined;
    this.organization =
      (clientOptions?.organization as string) || process.env.OPENAI_ORG;

    // Get environment if specified
    if (
      clientOptions?.environment &&
      typeof clientOptions.environment === "string"
    ) {
      this.environment = clientOptions.environment;
    }

    // Store client options for reference
    this.clientOptions = {
      apiKey: this.apiKey,
    };

    if (this.baseURL) {
      this.clientOptions.baseURL = this.baseURL;
    }

    // Forward the organization; otherwise the value resolved above is never
    // sent to the API.
    if (this.organization) {
      this.clientOptions.organization = this.organization;
    }

    // Initialize the OpenAI client
    this.client = new OpenAI(this.clientOptions);

    this.tools = tools;
  }

  /** Set the display size advertised to the computer-use tool. */
  setViewport(width: number, height: number): void {
    this.currentViewport = { width, height };
  }

  /** Remember the page URL so it can be attached to computer call outputs. */
  setCurrentUrl(url: string): void {
    this.currentUrl = url;
  }

  /** Register the callback that supplies screenshots. */
  setScreenshotProvider(provider: () => Promise<string>): void {
    this.screenshotProvider = provider;
  }

  /** Register the callback that performs an action in the browser. */
  setActionHandler(handler: (action: AgentAction) => Promise<void>): void {
    this.actionHandler = handler;
  }

  /** Replace the set of custom tools offered to the model. */
  setTools(tools: ToolSet): void {
    this.tools = tools;
  }

  /** Install (or clear) the handler consulted for pending safety checks. */
  setSafetyConfirmationHandler(handler?: SafetyConfirmationHandler): void {
    this.safetyConfirmationHandler = handler;
  }

  /**
   * Queue a note to be sent to the model as an extra user message after the
   * current step.
   */
  addContextNote(note: string): void {
    this.pendingContextNotes.push(note);
    // When a captcha-related note arrives, expose a tool that the model can
    // call instead of asking the user for confirmation. This replaces
    // fragile English-phrase parsing with a structured tool call.
    if (note.toLowerCase().includes("captcha")) {
      this.captchaSolvedToolActive = true;
    }
  }

  /**
   * Execute a task with the OpenAI CUA
   * This is the main entry point for the agent
   *
   * Unlike a thrown error, a failure here is logged and reported as a result
   * with success: false.
   * @implements AgentClient.execute
   */
  async execute(executionOptions: AgentExecutionOptions): Promise<AgentResult> {
    const { options, logger } = executionOptions;
    const { instruction } = options;
    const maxSteps = options.maxSteps || 10;

    let currentStep = 0;
    let completed = false;
    const actions: AgentAction[] = [];
    const messageList: string[] = [];
    let finalMessage = "";
    this.reasoningItems.clear(); // Clear any previous reasoning items

    // Start with the initial instruction
    let inputItems = this.createInitialInputItems(instruction);
    let previousResponseId: string | undefined = undefined;
    let totalInputTokens = 0;
    let totalOutputTokens = 0;
    let totalInferenceTime = 0;

    try {
      // Execute steps until completion or max steps reached
      while (!completed && currentStep < maxSteps) {
        await this.preStepHook?.();
        logger({
          category: "agent",
          message: `Executing step ${currentStep + 1}/${maxSteps}`,
          level: 1,
        });

        const result = await this.executeStep(
          inputItems,
          previousResponseId,
          logger,
        );

        totalInputTokens += result.usage.input_tokens;
        totalOutputTokens += result.usage.output_tokens;
        totalInferenceTime += result.usage.inference_time_ms;

        // Add actions to the list
        actions.push(...result.actions);

        // Update completion status
        completed = result.completed;

        // Store the previous response ID for the next request
        previousResponseId = result.responseId;

        // Update the input items for the next step if we're continuing
        if (!completed) {
          inputItems = result.nextInputItems;
          const contextNotes = this.drainContextNotes();
          if (contextNotes.length > 0) {
            inputItems = [
              ...inputItems,
              ...contextNotes.map((note) => ({
                role: "user" as const,
                content: note,
              })),
            ];
          }
        }

        // Record any message for this step
        if (result.message) {
          messageList.push(result.message);
finalMessage = result.message; } // Increment step counter currentStep++; } // Return the final result return { success: completed, actions, message: finalMessage, completed, usage: { input_tokens: totalInputTokens, output_tokens: totalOutputTokens, inference_time_ms: totalInferenceTime, }, }; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); logger({ category: "agent", message: `Error executing agent task: ${errorMessage}`, level: 0, }); return { success: false, actions, message: `Failed to execute task: ${errorMessage}`, completed: false, usage: { input_tokens: totalInputTokens, output_tokens: totalOutputTokens, inference_time_ms: totalInferenceTime, }, }; } } /** * Execute a single step of the agent * This coordinates the flow: Request → Get Action → Execute Action */ async executeStep( inputItems: ResponseInputItem[], previousResponseId: string | undefined, logger: (message: LogLine) => void, ): Promise<{ actions: AgentAction[]; message: string; completed: boolean; nextInputItems: ResponseInputItem[]; responseId: string; usage: { input_tokens: number; output_tokens: number; inference_time_ms: number; }; }> { try { // Get response from the model const result = await this.getAction(inputItems, previousResponseId); const output = result.output; const responseId = result.responseId; const usage = { input_tokens: result.usage.input_tokens, output_tokens: result.usage.output_tokens, inference_time_ms: result.usage.inference_time_ms, }; // Add any reasoning items to our map for (const item of output) { if (item.type === "reasoning") { this.reasoningItems.set(item.id, item); logger({ category: "agent", message: `Reasoning: ${String(item.content || "")}`, level: 1, }); } } // Extract actions from the output const stepActions: AgentAction[] = []; for (const item of output) { if (item.type === "computer_call" && this.isComputerCallItem(item)) { logger({ category: "agent", message: `Found computer_call: ${item.action.type}, 
payload: ${JSON.stringify(item.action)}, call_id: ${item.call_id}`, level: 2, }); const action = this.convertComputerCallToAction(item); if (action) { stepActions.push(action); logger({ category: "agent", message: `Converted computer_call to action: ${action.type}`, level: 2, }); } } else if ( item.type === "function_call" && this.isFunctionCallItem(item) ) { logger({ category: "agent", message: `Found function_call: ${item.name}, call_id: ${item.call_id}`, level: 2, }); const action = this.convertFunctionCallToAction(item); if (action) { stepActions.push(action); logger({ category: "agent", message: `Converted function_call to action: ${action.type}`, level: 2, }); } } } // Extract message text let message = ""; for (const item of output) { if (item.type === "message") { logger({ category: "agent", message: `Found message block`, level: 2, }); if (item.content && Array.isArray(item.content)) { for (const content of item.content) { if (content.type === "output_text" && content.text) { message += content.text + "\n"; logger({ category: "agent", message: `Message text: ${String(content.text || "")}`, level: 1, }); } } } } } // Take actions and get results const nextInputItems = await this.takeAction(output, logger); // Check if completed const completed = output.length === 0 || output.every( (item) => item.type === "message" || item.type === "reasoning", ); return { actions: stepActions, message: message.trim(), completed, nextInputItems, responseId, usage: usage, }; } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error);
      logger({
        category: "agent",
        message: `Error executing step: ${errorMessage}`,
        level: 0,
      });
      throw error;
    }
  }

  /** Type guard: a computer_call item carrying a call_id and an action object. */
  private isComputerCallItem(item: ResponseItem): item is ComputerCallItem {
    return (
      item.type === "computer_call" &&
      "call_id" in item &&
      "action" in item &&
      typeof item.action === "object"
    );
  }

  /**
   * Decide whether pending safety checks may be acknowledged.
   *
   * With a handler installed, the handler's answer is used. Without one the
   * checks are acknowledged automatically.
   *
   * @returns The checks to acknowledge, or undefined when the handler
   *          rejected them.
   */
  private async handleSafetyConfirmation(
    pendingSafetyChecks: SafetyCheck[],
    logger: (message: LogLine) => void,
  ): Promise<SafetyCheck[] | undefined> {
    if (this.safetyConfirmationHandler) {
      logger({
        category: "agent",
        message: `Requesting safety confirmation for ${pendingSafetyChecks.length} check(s): ${pendingSafetyChecks.map((c) => c.code).join(", ")}`,
        level: 1,
      });
      const response =
        await this.safetyConfirmationHandler(pendingSafetyChecks);
      if (response.acknowledged) {
        logger({
          category: "agent",
          message: `Safety checks acknowledged by user`,
          level: 1,
        });
        return pendingSafetyChecks;
      } else {
        logger({
          category: "agent",
          message: `Safety checks rejected by user`,
          level: 1,
        });
        return undefined;
      }
    }

    logger({
      category: "agent",
      message: `Auto-acknowledging ${pendingSafetyChecks.length} safety check(s)`,
      level: 2,
    });
    return pendingSafetyChecks;
  }

  /** Type guard: a function_call item carrying call_id, name and arguments. */
  private isFunctionCallItem(item: ResponseItem): item is FunctionCallItem {
    return (
      item.type === "function_call" &&
      "call_id" in item &&
      "name" in item &&
      "arguments" in item
    );
  }

  /**
   * Build the input for the first request: the user-provided instructions as
   * a system item (only when some were given) followed by the user's task.
   */
  private createInitialInputItems(instruction: string): ResponseInputItem[] {
    const items: ResponseInputItem[] = [];
    // userProvidedInstructions is optional; skip the system item rather than
    // sending one whose content is undefined.
    if (this.userProvidedInstructions) {
      items.push({
        role: "system",
        content: this.userProvidedInstructions,
      });
    }
    items.push({
      role: "user",
      content: instruction,
    });
    return items;
  }

  /**
   * Send one request to the Responses API and return the model output.
   *
   * The request always carries the computer_use_preview tool; custom tools
   * and the captcha-proceed tool are appended when present/active.
   *
   * @param inputItems Items to send as the request input.
   * @param previousResponseId Response to continue from, if any.
   * @returns Output items, the response id, and token/latency usage.
   */
  async getAction(
    inputItems: ResponseInputItem[],
    previousResponseId?: string,
  ): Promise<{
    output: ResponseItem[];
    responseId: string;
    usage: Record<string, number>;
  }> {
    try {
      // Create the request parameters
      const requestParams: Record<string, unknown> = {
        model: this.modelName,
        tools: [
          {
            type: "computer_use_preview",
            display_width: this.currentViewport.width,
            display_height: this.currentViewport.height,
            environment: this.environment,
          },
        ],
        input: inputItems,
        truncation: "auto",
      };

      // Add custom tools if available
      if (this.tools && Object.keys(this.tools).length > 0) {
        const customTools = Object.entries(this.tools).map(([name, tool]) => ({
          type: "function" as const,
          name,
          function: {
            name,
            description: tool.description,
            parameters: tool.inputSchema,
          },
        }));

        requestParams.tools = [
          ...(requestParams.tools as Record<string, unknown>[]),
          ...customTools,
        ];
      }

      // When a captcha was just solved, expose a tool the model can call
      // to confirm it should proceed. This avoids fragile English-phrase
      // parsing and works regardless of the model's output language.
      if (this.captchaSolvedToolActive) {
        requestParams.tools = [
          ...(requestParams.tools as Record<string, unknown>[]),
          {
            type: "function" as const,
            name: CAPTCHA_PROCEED_TOOL,
            function: {
              name: CAPTCHA_PROCEED_TOOL,
              description:
                "The captcha on this page was solved automatically. " +
                "Call this tool to confirm and continue with your task " +
                "instead of asking the user for permission.",
              parameters: { type: "object", properties: {}, required: [] },
            },
          },
        ];
      }

      // Add previous_response_id if available
      if (previousResponseId) {
        requestParams.previous_response_id = previousResponseId;
      }

      // Log LLM request
      const llmRequestId = uuidv7();
      FlowLogger.logLlmRequest({
        requestId: llmRequestId,
        model: this.modelName,
        prompt: extractLlmCuaPromptSummary(inputItems),
      });

      const startTime = Date.now();
      // Create the response using the OpenAI Responses API
      // @ts-expect-error - Force type to match what the OpenAI SDK expects
      const response = await this.client.responses.create(requestParams);
      const endTime = Date.now();
      const elapsedMs = endTime - startTime;

      // Extract only the input_tokens and output_tokens.
      // Default to 0 so a response without usage does not throw here.
      const inputTokens = response.usage?.input_tokens ?? 0;
      const outputTokens = response.usage?.output_tokens ?? 0;
      const usage = {
        input_tokens: inputTokens,
        output_tokens: outputTokens,
        inference_time_ms: elapsedMs,
      };

      // Log LLM response
      FlowLogger.logLlmResponse({
        requestId: llmRequestId,
        model: this.modelName,
        output: extractLlmCuaResponseSummary(response.output),
        inputTokens,
        outputTokens,
      });

      // Store the response ID for future use
      this.lastResponseId = response.id;

      // Return the output and response ID
      return {
        output: response.output as unknown as ResponseItem[],
        responseId: response.id,
        usage,
      };
    } catch (error) {
      console.error("Error getting action from OpenAI:", error);
      throw error;
    }
  }

  async takeAction(
    output: ResponseItem[],
    logger: (message: LogLine) => void,
  ): Promise<ResponseInputItem[]> {
    const nextInputItems: ResponseInputItem[] = [];

    // Process each output item
    for (const item of output) {
      if (item.type === "computer_call" && this.isComputerCallItem(item)) {
        // Handle computer calls
        try {
          const action = this.convertComputerCallToAction(item);

          if (action && this.actionHandler) {
            logger({
              category: "agent",
              message: `Executing computer action: ${action.type}`,
              level: 1,
            });
            await this.actionHandler(action);
          }

          // Capture a screenshot
          const screenshot = await this.captureScreenshot();

          // Create a computer_call_output for the next request
          const outputItem = {
            type: "computer_call_output" as const,
            call_id: item.call_id,
            output: {
              type: "input_image" as const,
              image_url: screenshot,
            },
          } as ResponseInputItem;

          logger({
            category: "agent",
            message: `Added computer_call_output for call_id: ${item.call_id}`,
            level: 2,
          });

          // Add current URL if available
          if (this.currentUrl) {
            const computerCallOutput = outputItem as {
              type: "computer_call_output";
              call_id: string;
              output: {
                type: "input_image";
                image_url: string;
                current_url?: string;
              };
              acknowledged_safety_checks?: SafetyCheck[];
            };
            computerCallOutput.output.current_url = this.currentUrl;
          }

          if (
            item.pending_safety_checks &&
            item.pending_safety_checks.length > 0
          ) {
            const acknowledgedChecks = await this.handleSafetyConfirmation(
              item.pending_safety_checks,
              logger,
            );

            if (acknowledgedChecks) {
              const computerCallOutput = outputItem as {
                type: "computer_call_output";
                call_id: string;
output: { type: "input_image"; image_url: string; }; acknowledged_safety_checks?: SafetyCheck[]; }; computerCallOutput.acknowledged_safety_checks = acknowledgedChecks; } } nextInputItems.push(outputItem); } catch (error) { if (error instanceof StagehandClosedError) { throw error; } const errorMessage = error instanceof Error ? error.message : String(error); logger({ category: "agent", message: `Error executing computer call: ${errorMessage}`, level: 0, }); try { // Capture a screenshot even on error const screenshot = await this.captureScreenshot(); const errorOutputItem = { type: "computer_call_output" as const, call_id: item.call_id, output: { type: "input_image" as const, image_url: screenshot, error: errorMessage, }, } as ResponseInputItem; // Add current URL if available if (this.currentUrl) { const computerCallOutput = errorOutputItem as { type: "computer_call_output"; call_id: string; output: { type: "input_image"; image_url: string; current_url?: string; }; acknowledged_safety_checks?: SafetyCheck[]; }; computerCallOutput.output.current_url = this.currentUrl; } if ( item.pending_safety_checks && item.pending_safety_checks.length > 0 ) { const acknowledgedChecks = await this.handleSafetyConfirmation( item.pending_safety_checks, logger, ); if (acknowledgedChecks) { const computerCallOutput = errorOutputItem as { type: "computer_call_output"; call_id: string; output: { type: "input_image"; image_url: string; }; acknowledged_safety_checks?: SafetyCheck[]; }; computerCallOutput.acknowledged_safety_checks = acknowledgedChecks; } } nextInputItems.push(errorOutputItem); } catch (screenshotError) { if (screenshotError instanceof StagehandClosedError) { throw screenshotError; } // If we can't capture a screenshot, just send the error logger({ category: "agent", message: `Error capturing screenshot: ${String(screenshotError)}`, level: 0, }); // For error cases without a screenshot, we need to use a string output nextInputItems.push({ type: "computer_call_output", 
call_id: item.call_id, output: `Error: ${errorMessage}`, } as ResponseInputItem); } } } else if ( item.type === "function_call" && this.isFunctionCallItem(item) ) { // Handle the captcha-proceed tool — just return a confirmation and // deactivate the tool so it doesn't appear on subsequent steps. if (item.name === CAPTCHA_PROCEED_TOOL) { this.captchaSolvedToolActive = false; nextInputItems.push({ type: "function_call_output", call_id: item.call_id, output: "Confirmed. The captcha is solved. Continue completing the original task autonomously without asking for further confirmation.", } as ResponseInputItem); continue; } // Handle function calls (tool calls) try { const action = this.convertFunctionCallToAction(item); if (action && this.actionHandler) { await this.actionHandler(action); } // Execute the tool if available let toolResult = "Tool executed successfully"; if (this.tools && item.name in this.tools) { try { const tool = this.tools[item.name]; const args = JSON.parse(item.arguments); logger({ category: "agent", message: `Executing tool call: ${item.name} with args: ${item.arguments}`, level: 1, }); const result = await tool.execute(args, { toolCallId: item.call_id, messages: [], }); toolResult = JSON.stringify(result); logger({ category: "agent", message: `Tool ${item.name} completed successfully. Result: ${toolResult}`, level: 1, }); } catch (toolError) { const errorMessage = toolError instanceof Error ? toolError.message : String(toolError); toolResult = `Error executing tool: ${errorMessage}`; logger({ category: "agent", message: `Error executing tool ${item.name}: ${errorMessage}`, level: 0, }); } } // Create a function_call_output for the next request const outputItem: ResponseInputItem = { type: "function_call_output", call_id: item.call_id, output: toolResult, }; nextInputItems.push(outputItem); } catch (error) { if (error instanceof StagehandClosedError) { throw error; } const errorMessage = error instanceof Error ? 
error.message : String(error); logger({ category: "agent", message: `Error executing function call: ${errorMessage}`, level: 0, }); // Send error result back const errorOutputItem: ResponseInputItem = { type: "function_call_output", call_id: item.call_id, output: `Error: ${errorMessage}`, }; nextInputItems.push(errorOutputItem); } } } return nextInputItems; } private convertComputerCallToAction( call: ComputerCallItem, ): AgentAction | null { const { action } = call; // Instead of wrapping the action in a params object, spread the action properties directly // This ensures properties like x, y, button, etc. are directly accessible on the AgentAction return { type: action.type as string, ...action, // Spread all properties from the action }; } private drainContextNotes(): string[] { if (this.pendingContextNotes.length === 0) { return []; } const notes = [...this.pendingContextNotes]; this.pendingContextNotes = []; return notes; } private convertFunctionCallToAction( call: FunctionCallItem, ): AgentAction | null { try { const args = JSON.parse(call.arguments); return { type: call.name, params: args, }; } catch (error) { console.error("Error parsing function call arguments:", error); return null; } } async captureScreenshot(options?: { base64Image?: string; currentUrl?: string; }): Promise { // Use provided options if available if (options?.base64Image) { return `data:image/png;base64,${options.base64Image}`; } // Use the screenshot provider if available if (this.screenshotProvider) { try { const base64Image = await this.screenshotProvider(); return `data:image/png;base64,${base64Image}`; } catch (error) { console.error("Error capturing screenshot:", error); throw error; } } throw new AgentScreenshotProviderError( "`screenshotProvider` has not been set. 
" + "Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image", ); } } ================================================ FILE: packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts ================================================ import type { AgentToolMode, Variables } from "../../types/public/agent.js"; import { CAPTCHA_SYSTEM_PROMPT_NOTE } from "../utils/captchaSolver.js"; export interface AgentSystemPromptOptions { url: string; executionInstruction: string; mode: AgentToolMode; systemInstructions?: string; /** Whether captchas are automatically solved by the browser environment */ captchasAutoSolve?: boolean; /** Tools to exclude from the system prompt */ excludeTools?: string[]; /** Variables available to the agent for use in act/type tools */ variables?: Variables; /** Whether the search tool is enabled for this execution */ useSearch?: boolean; } /** * Builds the system prompt for the agent based on the tool mode. * * @param options - The prompt configuration options * @returns The formatted system prompt string */ interface ToolDefinition { name: string; description: string; } function buildToolsSection( isHybridMode: boolean, hasSearch: boolean, excludeTools?: string[], ): string { const excludeSet = new Set(excludeTools ?? []); const hybridTools: ToolDefinition[] = [ { name: "screenshot", description: "Take a compressed JPEG screenshot for quick visual context", }, { name: "ariaTree", description: "Get an accessibility (ARIA) hybrid tree for full page context", }, { name: "click", description: "Click on an element (PREFERRED - more reliable when element is visible in viewport)", }, { name: "type", description: "Type text into an element (PREFERRED - more reliable when element is visible in viewport)", }, { name: "act", description: "Perform a specific atomic action (click, type, etc.) - ONLY use when element is in ariaTree but NOT visible in screenshot. 
Less reliable but can interact with out-of-viewport elements.", }, { name: "dragAndDrop", description: "Drag and drop an element" }, { name: "clickAndHold", description: "Click and hold on an element" }, { name: "keys", description: "Press a keyboard key" }, { name: "fillFormVision", description: "Fill out a form using coordinates", }, { name: "think", description: "Think about the task" }, { name: "extract", description: "Extract structured data" }, { name: "goto", description: "Navigate to a URL" }, { name: "wait", description: "Wait for a specified time" }, { name: "navback", description: "Navigate back in browser history" }, { name: "scroll", description: "Scroll the page x pixels up or down" }, ]; const domTools: ToolDefinition[] = [ { name: "screenshot", description: "Take a compressed JPEG screenshot for quick visual context", }, { name: "ariaTree", description: "Get an accessibility (ARIA) hybrid tree for full page context", }, { name: "act", description: "Perform a specific atomic action (click, type)", }, { name: "keys", description: "Press a keyboard key" }, { name: "fillForm", description: "Fill out a form" }, { name: "think", description: "Think about the task" }, { name: "extract", description: "Extract structured data" }, { name: "goto", description: "Navigate to a URL" }, { name: "wait", description: "Wait for a specified time" }, { name: "navback", description: "Navigate back in browser history" }, { name: "scroll", description: "Scroll the page x pixels up or down" }, ]; const baseTools = isHybridMode ? hybridTools : domTools; if (hasSearch) { baseTools.push({ name: "search", description: "Perform a web search and return results. 
Prefer this over navigating to Google and searching within the page for reliability and efficiency.", }); } const filteredTools = baseTools.filter((tool) => !excludeSet.has(tool.name)); const toolLines = filteredTools .map((tool) => ` ${tool.description}`) .join("\n"); return `\n${toolLines}\n `; } export function buildAgentSystemPrompt( options: AgentSystemPromptOptions, ): string { const { url, executionInstruction, mode, systemInstructions, captchasAutoSolve = false, excludeTools, variables, useSearch = false, } = options; const localeDate = new Date().toLocaleDateString(); const isoDate = new Date().toISOString(); const cdata = (text: string) => ``; const isHybridMode = mode === "hybrid"; const hasSearch = useSearch || Boolean(process.env.BRAVE_API_KEY); // Tools section differs based on mode and excluded tools const toolsSection = buildToolsSection(isHybridMode, hasSearch, excludeTools); // Strategy differs based on mode const strategyItems = isHybridMode ? [ `Tool selection priority: Use specific tools (click, type) when elements are visible in viewport for maximum reliability.`, `Always use screenshot to get proper grounding of the coordinates you want to type/click into.`, `When interacting with an input, always use the type tool to type into the input, over clicking and then typing into it.`, `Use ariaTree as a secondary tool when elements aren't visible in screenshot or to get full page context.`, `Only use act when element is in ariaTree but NOT visible in screenshot.`, ] : [ `Tool selection priority: Use act tool for all clicking and typing on a page.`, `Always check ariaTree first to understand full page content without scrolling - it shows all elements including those below the fold.`, `When interacting with an input, always use the act tool to type into the input, over clicking and then typing.`, `If an element is present in the ariaTree, use act to interact with it directly - this eliminates the need to scroll.`, `Use screenshot for visual 
confirmation when needed, but rely primarily on ariaTree for element detection.`, ]; const strategySection = strategyItems.join("\n "); const commonStrategyItems = ` CRITICAL: Use extract ONLY when the task explicitly requires structured data output (e.g., "get job listings", "extract product details"). For reading page content or understanding elements, always use ${isHybridMode ? "screenshot or ariaTree" : "ariaTree"} instead - it's faster and more reliable. Keep actions atomic and verify outcomes before proceeding. For each action, provide clear reasoning about why you're taking that step. When you need to input text that could be entered character-by-character or through multiple separate inputs, prefer using the keys tool to type the entire sequence at once. This is more efficient for scenarios like verification codes split across multiple fields, or when virtual keyboards are present but direct typing would be faster. `; // Page understanding protocol differs based on mode const pageUnderstandingProtocol = isHybridMode ? ` UNDERSTAND THE PAGE screenshot Visual confirmation when needed. Ideally after navigating to a new page. ariaTree Get complete page context before taking actions Eliminates the need to scroll and provides full accessible content ` : ` UNDERSTAND THE PAGE ariaTree Get complete page context before taking actions Eliminates the need to scroll and provides full accessible content screenshot Visual confirmation when needed. Ideally after navigating to a new page. `; // Roadblocks section only shown when captchas are auto-solved const roadblocksSection = captchasAutoSolve ? ` ${CAPTCHA_SYSTEM_PROMPT_NOTE} ` : ""; // Build customInstructions block only if provided const customInstructionsBlock = systemInstructions ? `${cdata(systemInstructions)}\n ` : ""; // Build variables section only if variables are provided const hasVariables = variables && Object.keys(variables).length > 0; const variableToolsNote = isHybridMode ? 
"Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields." : "Use %variableName% syntax in the act or fillForm tool's action fields."; const variablesSection = hasVariables ? ` You have access to the following variables. Use %variableName% syntax to substitute variable values. This is especially important for sensitive data like passwords. ${variableToolsNote} To type a password, use: type %password% into the password field ${Object.entries(variables) .map(([name, v]) => { const description = typeof v === "object" && v !== null && "value" in v ? v.description : undefined; return description ? `${description}` : ``; }) .join("\n ")} ` : ""; return ` You are a web automation assistant using browser automation tools to accomplish the user's goal. ${customInstructionsBlock} ${cdata(executionInstruction)} ${localeDate} You may think the date is different due to knowledge cutoff, but this is the actual date. you are starting your task on this url: ${url} Be very intentional about your action. The initial instruction is very important, and slight variations of the actual goal can lead to failures. If something fails to meet a single condition of the task, move on from it rather than seeing if it meets other criteria. We only care that it meets all of it When the task is complete, do not seek more information; you have completed the task. Always start by understanding the current page state Use the screenshot tool to verify page state when needed Use appropriate tools for each action ${pageUnderstandingProtocol} If you are confident in the URL, navigate directly to it. ${hasSearch ? `If you are not confident in the URL, use the search tool to find it.` : ``} ${toolsSection} ${strategySection} ${commonStrategyItems} ${roadblocksSection} ${variablesSection} When you complete the task, explain any information that was found that was relevant to the original task. If you were asked for specific flights, list the flights you found. 
/**
 * Creates the agent's `act` tool, which forwards a natural-language action
 * to `v3.act` and records the outcome as an agent replay step.
 *
 * @param v3 - Instance used for logging, acting and replay recording.
 * @param executionModel - Optional model passed to `v3.act` as `model`.
 * @param variables - Optional variables passed to `v3.act`; their names are
 *                    listed in the tool's input description.
 * @param toolTimeout - Optional timeout passed to `v3.act` as `timeout`.
 * @returns The tool definition built with `tool(...)`.
 */
export const actTool = (
  v3: V3,
  executionModel?: string | AgentModelConfig,
  variables?: Variables,
  toolTimeout?: number,
) => {
  const variableNames = variables ? Object.keys(variables) : [];

  // The input hint advertises variable substitution only when variables exist.
  let actionDescription: string;
  if (variableNames.length > 0) {
    actionDescription = `Describe what to click or type, e.g. "click the Login button" or "type %variableName% into the input". Available variables: ${variableNames.join(", ")}`;
  } else {
    actionDescription =
      'Describe what to click or type, e.g. "click the Login button" or "type "John" into the first name input"';
  }

  return tool({
    description:
      "Perform an action on the page (click, type). Provide a short, specific phrase that mentions the element type.",
    inputSchema: z.object({
      action: z.string().describe(actionDescription),
    }),
    execute: async ({ action }) => {
      try {
        v3.logger({
          category: "agent",
          message: `Agent calling tool: act`,
          level: 1,
          auxiliary: {
            arguments: {
              value: action,
              type: "string",
            },
          },
        });

        // `model` is only supplied when an execution model was configured.
        const sharedOptions = { variables, timeout: toolTimeout };
        const options = executionModel
          ? { model: executionModel, ...sharedOptions }
          : sharedOptions;

        const result = await v3.act(action, options);
        const performed = (result.actions as Action[] | undefined) ?? [];

        v3.recordAgentReplayStep({
          type: "act",
          instruction: action,
          actions: performed,
          actionDescription: result.actionDescription,
          message: result.message,
        });

        const response: {
          success: boolean;
          action: string;
          playwrightArguments?: Action;
        } = {
          success: result.success ?? true,
          action: result?.actionDescription ?? action,
        };

        // Only include playwrightArguments when actions exist
        // (undefined is not valid JSON and breaks AI SDK validation)
        if (performed.length > 0) {
          response.playwrightArguments = performed[0];
        }

        return response;
      } catch (error) {
        // Timeouts propagate to the caller; everything else is reported
        // back as a failed tool result.
        if (error instanceof TimeoutError) {
          throw error;
        }
        return {
          success: false,
          error: error?.message ?? String(error),
        };
      }
    },
  });
};
/**
 * Runs a query against the Brave web search endpoint.
 *
 * Only the first five raw results are considered, and of those only entries
 * that carry both a title and a URL are returned.
 *
 * Never throws: every failure (missing API key, non-OK HTTP status, network
 * or parsing error) is reported through the `error` field together with an
 * empty result list.
 *
 * @param query - Search text; it is URL-encoded before being sent.
 * @returns `{ data: { results } }` on success, or `{ error, data }` with an
 *          empty `results` array on failure.
 */
async function performBraveSearch(query: string): Promise<SearchResponse> {
  // Fail fast instead of sending a request with an "undefined" token.
  const apiKey = process.env.BRAVE_API_KEY;
  if (!apiKey) {
    return {
      error: "Brave API error: BRAVE_API_KEY is not set",
      data: { results: [] },
    };
  }

  try {
    const encodedQuery = encodeURIComponent(query);

    const response = await fetch(
      `https://api.search.brave.com/res/v1/web/search?q=${encodedQuery}`,
      {
        method: "GET",
        headers: {
          Accept: "application/json",
          "Accept-Encoding": "gzip",
          "X-Subscription-Token": apiKey,
        },
      },
    );

    if (!response.ok) {
      return {
        error: `Brave API error: ${response.status} ${response.statusText}`,
        data: { results: [] },
      };
    }

    const data = (await response.json()) as BraveApiResponse;
    const rawResults = data?.web?.results;
    const results: BraveSearchResult[] = [];

    if (Array.isArray(rawResults)) {
      for (const item of rawResults.slice(0, 5)) {
        // Entries without a title or URL are of no use to the agent.
        if (item.title && item.url) {
          results.push({
            title: item.title,
            url: item.url,
            description: item.description,
          });
        }
      }
    }

    return { data: { results } };
  } catch (error: unknown) {
    console.error("Search error", error);
    // A thrown value is not guaranteed to be an Error; reading `.message`
    // blindly would produce "undefined" in the reported text.
    const message = error instanceof Error ? error.message : String(error);
    return {
      error: `Error performing search: ${message}`,
      data: { results: [] },
    };
  }
}
/**
 * Runs a query against the Browserbase search endpoint.
 *
 * Entries that lack a title or a URL are dropped so that every returned item
 * satisfies `SearchResult`; `publishedDate` is included only when present.
 *
 * Never throws: a non-OK HTTP status or any thrown error is reported through
 * the `error` field together with an empty result list. Thrown errors are
 * additionally logged through `v3.logger` at level 0.
 *
 * @param v3 - Instance whose logger receives error log lines.
 * @param query - Search text sent in the request body.
 * @param apiKey - Value sent in the `x-bb-api-key` header.
 * @param numResults - Number of results requested from the API (default 5).
 * @returns The usable results, plus `error` when the search failed.
 */
async function performBrowserbaseSearch(
  v3: V3,
  query: string,
  apiKey: string,
  numResults: number = 5,
): Promise<{ results: SearchResult[]; error?: string }> {
  try {
    const response = await fetch("https://api.browserbase.com/v1/search", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-bb-api-key": apiKey,
      },
      body: JSON.stringify({ query, numResults }),
    });

    if (!response.ok) {
      return {
        results: [],
        error: `Browserbase Search API error: ${response.status} ${response.statusText}`,
      };
    }

    const data = (await response.json()) as BrowserbaseApiResponse;
    const rawResults = data?.results;
    const results: SearchResult[] = [];

    if (Array.isArray(rawResults)) {
      for (const { title, url, publishedDate } of rawResults) {
        // The raw fields are optional, but SearchResult requires both.
        if (!title || !url) {
          continue;
        }
        results.push({
          title,
          url,
          ...(publishedDate ? { publishedDate } : {}),
        });
      }
    }

    return { results };
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error; reading `.message`
    // blindly would produce "undefined" in the log and the reported text.
    const message = error instanceof Error ? error.message : String(error);
    v3.logger({
      category: "agent",
      message: `Search error: ${message}`,
      level: 0,
    });
    return {
      results: [],
      error: `Error performing search: ${message}`,
    };
  }
}
/**
 * Creates the agent's coordinate-based `click` tool.
 *
 * The tool processes the model-supplied coordinates with
 * `processCoordinates`, clicks the active page at the resulting point and
 * captures a screenshot afterwards. While agent replay is active the clicked
 * element's XPath is requested and, when one is obtained, the click is
 * recorded as an "act" replay step.
 *
 * Failures are never thrown to the caller; they are returned as
 * `{ success: false, error }`.
 *
 * @param v3 - Instance providing the active page, logging and replay state.
 * @param provider - Optional provider name forwarded to `processCoordinates`.
 * @returns The tool definition built with `tool(...)`.
 */
export const clickTool = (v3: V3, provider?: string) =>
  tool({
    description:
      "Click on an element using its coordinates (this is the most reliable way to click on an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
    inputSchema: z.object({
      describe: z
        .string()
        .describe(
          "Describe the element to click on in a short, specific phrase that mentions the element type and a good visual description",
        ),
      coordinates: z
        .array(z.number())
        .describe("The (x, y) coordinates to click on"),
    }),
    execute: async ({ describe, coordinates }): Promise<ClickToolResult> => {
      try {
        // The schema accepts any number array; reject calls that do not
        // carry both components instead of clicking at an undefined point.
        const [rawX, rawY] = coordinates;
        if (typeof rawX !== "number" || typeof rawY !== "number") {
          throw new Error("coordinates must contain both x and y values");
        }

        const page = await v3.context.awaitActivePage();
        const processed = processCoordinates(rawX, rawY, provider, v3);

        v3.logger({
          category: "agent",
          message: `Agent calling tool: click`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify({ describe }),
              type: "object",
            },
          },
        });

        // Only request XPath when caching is enabled to avoid unnecessary computation
        const shouldCollectXpath = v3.isAgentReplayActive();
        const xpath = await page.click(processed.x, processed.y, {
          returnXpath: shouldCollectXpath,
        });

        const screenshotBase64 = await waitAndCaptureScreenshot(page);

        // Record as an "act" step with proper Action for deterministic replay (only when caching)
        if (shouldCollectXpath) {
          const normalizedXpath = ensureXPath(xpath);
          if (normalizedXpath) {
            const action: Action = {
              selector: normalizedXpath,
              description: describe,
              method: "click",
              arguments: [],
            };
            v3.recordAgentReplayStep({
              type: "act",
              instruction: describe,
              actions: [action],
              actionDescription: describe,
            });
          }
        }

        return {
          success: true,
          describe,
          coordinates: [processed.x, processed.y],
          screenshotBase64,
        };
      } catch (error: unknown) {
        // A thrown value is not guaranteed to be an Error; reading
        // `.message` blindly would report "Error clicking: undefined".
        const message = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          error: `Error clicking: ${message}`,
        };
      }
    },
    toModelOutput: (result) => {
      // Failures are passed to the model as plain JSON text.
      if (result.success === false || result.error !== undefined) {
        return {
          type: "content",
          value: [{ type: "text", text: JSON.stringify(result) }],
        };
      }

      // The screenshot is sent as a separate media item, so it is left out
      // of the JSON summary.
      const content: ModelOutputContentItem[] = [
        {
          type: "text",
          text: JSON.stringify({
            success: result.success,
            describe: result.describe,
            coordinates: result.coordinates,
          }),
        },
      ];
      if (result.screenshotBase64) {
        content.push({
          type: "media",
          mediaType: "image/png",
          data: result.screenshotBase64,
        });
      }
      return { type: "content", value: content };
    },
  });
/**
 * Creates the coordinate-based `clickAndHold` agent tool.
 *
 * A click-and-hold is simulated by dragging from a point to the same point
 * with `delay` set to the requested hold duration. While agent replay
 * recording is active the held element's XPath is collected and the action is
 * recorded as a replayable "act" step.
 *
 * @param v3 - Stagehand instance providing the page context, logger and replay recorder.
 * @param provider - Optional model provider name, forwarded to `processCoordinates`.
 * @returns An AI SDK tool. `execute` never throws: failures are reported as
 *   `{ success: false, error }`.
 */
export const clickAndHoldTool = (v3: V3, provider?: string) =>
  tool({
    description: "Click and hold on an element using its coordinates",
    inputSchema: z.object({
      describe: z
        .string()
        .describe(
          "Describe the element to click on in a short, specific phrase that mentions the element type and a good visual description",
        ),
      duration: z
        .number()
        .describe("The duration to hold the element in milliseconds"),
      coordinates: z
        .array(z.number())
        .describe("The (x, y) coordinates to click on"),
    }),
    execute: async ({ describe, coordinates, duration }) => {
      try {
        // The schema only guarantees an array of numbers, not its length.
        if (coordinates.length < 2) {
          return {
            success: false,
            error: `Error clicking and holding: expected (x, y) coordinates but received ${coordinates.length} value(s)`,
          };
        }

        const page = await v3.context.awaitActivePage();
        const processed = processCoordinates(
          coordinates[0],
          coordinates[1],
          provider,
          v3,
        );

        v3.logger({
          category: "agent",
          message: `Agent calling tool: clickAndHold`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify({
                describe,
                duration,
              }),
              type: "object",
            },
          },
        });

        // Only request XPath when caching is enabled to avoid unnecessary computation
        const shouldCollectXpath = v3.isAgentReplayActive();

        // Use dragAndDrop from same point to same point with delay to simulate click and hold
        const [xpath] = await page.dragAndDrop(
          processed.x,
          processed.y,
          processed.x,
          processed.y,
          { delay: duration, returnXpath: shouldCollectXpath },
        );

        // Record as "act" step with proper Action for deterministic replay (only when caching)
        if (shouldCollectXpath) {
          const normalizedXpath = ensureXPath(xpath);
          if (normalizedXpath) {
            const action: Action = {
              selector: normalizedXpath,
              description: describe,
              method: "clickAndHold",
              arguments: [String(duration)],
            };
            v3.recordAgentReplayStep({
              type: "act",
              instruction: describe,
              actions: [action],
              actionDescription: describe,
            });
          }
        }

        return { success: true, describe };
      } catch (error) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          error: `Error clicking and holding: ${reason}`,
        };
      }
    },
  });
/**
 * Creates the coordinate-based `dragAndDrop` agent tool.
 *
 * The tool drags from `startCoordinates` to `endCoordinates` on the active
 * page and captures a follow-up screenshot. While agent replay recording is
 * active, the XPaths of the source and target elements are collected and the
 * drag is recorded as a replayable "act" step (only when both resolve).
 *
 * @param v3 - Stagehand instance providing the page context, logger and replay recorder.
 * @param provider - Optional model provider name, forwarded to `processCoordinates`.
 * @returns An AI SDK tool. `execute` never throws: failures are reported as
 *   `{ success: false, error }`.
 */
export const dragAndDropTool = (v3: V3, provider?: string) =>
  tool({
    description:
      "Drag and drop an element using its coordinates (this is the most reliable way to drag and drop an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
    inputSchema: z.object({
      describe: z.string().describe("Describe the element to drag and drop"),
      startCoordinates: z
        .array(z.number())
        .describe("The (x, y) coordinates to start the drag and drop from"),
      endCoordinates: z
        .array(z.number())
        .describe("The (x, y) coordinates to end the drag and drop at"),
    }),
    execute: async ({
      describe,
      startCoordinates,
      endCoordinates,
    }): Promise<DragAndDropToolResult> => {
      try {
        // The schema only guarantees arrays of numbers, not their length.
        if (startCoordinates.length < 2 || endCoordinates.length < 2) {
          return {
            success: false,
            error:
              "Error dragging: startCoordinates and endCoordinates must each contain (x, y)",
          };
        }

        const page = await v3.context.awaitActivePage();
        const processedStart = processCoordinates(
          startCoordinates[0],
          startCoordinates[1],
          provider,
          v3,
        );
        const processedEnd = processCoordinates(
          endCoordinates[0],
          endCoordinates[1],
          provider,
          v3,
        );

        v3.logger({
          category: "agent",
          message: `Agent calling tool: dragAndDrop`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify({
                describe,
              }),
              type: "object",
            },
          },
        });

        // Only request XPath when caching is enabled to avoid unnecessary computation
        const shouldCollectXpath = v3.isAgentReplayActive();

        const [fromXpath, toXpath] = await page.dragAndDrop(
          processedStart.x,
          processedStart.y,
          processedEnd.x,
          processedEnd.y,
          { returnXpath: shouldCollectXpath },
        );

        const screenshotBase64 = await waitAndCaptureScreenshot(page);

        // Record as "act" step with proper Action for deterministic replay (only when caching)
        if (shouldCollectXpath) {
          const normalizedFrom = ensureXPath(fromXpath);
          const normalizedTo = ensureXPath(toXpath);
          if (normalizedFrom && normalizedTo) {
            const action: Action = {
              selector: normalizedFrom,
              description: describe,
              method: "dragAndDrop",
              arguments: [normalizedTo],
            };
            v3.recordAgentReplayStep({
              type: "act",
              instruction: describe,
              actions: [action],
              actionDescription: describe,
            });
          }
        }

        return {
          success: true,
          describe,
          screenshotBase64,
        };
      } catch (error) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          error: `Error dragging: ${reason}`,
        };
      }
    },
    // Failures are sent as plain JSON text; successes send a JSON summary
    // followed by the post-drag screenshot when one was captured.
    toModelOutput: (result) => {
      if (result.success === false || result.error !== undefined) {
        return {
          type: "content",
          value: [{ type: "text", text: JSON.stringify(result) }],
        };
      }

      const content: ModelOutputContentItem[] = [
        {
          type: "text",
          text: JSON.stringify({
            success: result.success,
            describe: result.describe,
          }),
        },
      ];
      if (result.screenshotBase64) {
        content.push({
          type: "media",
          mediaType: "image/png",
          data: result.screenshotBase64,
        });
      }
      return { type: "content", value: content };
    },
  });
/**
 * Creates the `extract` agent tool, which pulls structured data out of the
 * current page via `v3.extract`.
 *
 * The model supplies a plain JSON-Schema-like object; it is converted to a Zod
 * schema with `jsonSchemaToZod` before being handed to `v3.extract`. When no
 * schema is supplied, `undefined` is passed through.
 *
 * @param v3 - Stagehand instance whose `extract` method performs the work.
 * @param executionModel - Optional model override; only added to the extract
 *   options when truthy.
 * @param toolTimeout - Optional timeout (ms) forwarded as the `timeout` option.
 * @returns An AI SDK tool. `TimeoutError`s are rethrown; any other failure is
 *   reported as `{ success: false, error }`.
 */
export const extractTool = (
  v3: V3,
  executionModel?: string | AgentModelConfig,
  toolTimeout?: number,
) => {
  // Shape of the JSON-Schema-like object the model is allowed to send.
  const schemaInput = z
    .object({
      type: z.string().optional(),
      properties: z.record(z.string(), z.unknown()).optional(),
      items: z.unknown().optional(),
      enum: z.array(z.string()).optional(),
      format: z.enum(["url", "email", "uuid"]).optional(),
    })
    .passthrough()
    .optional()
    .describe("JSON Schema object describing the structure to extract");

  return tool({
    description: `Extract structured data from the current page based on a provided schema. USAGE GUIDELINES: - Keep schemas MINIMAL - only include fields essential for the task - IMPORTANT: only use this if explicitly asked for structured output. In most scenarios, you should use the aria tree tool over this. - For URL fields, use format: "url" EXAMPLES: 1. Extract a single value: instruction: "extract the product price" schema: { type: "object", properties: { price: { type: "number" } } } 2. Extract multiple fields: instruction: "extract product name and price" schema: { type: "object", properties: { name: { type: "string" }, price: { type: "number" } } } 3. Extract arrays: instruction: "extract all product names and prices" schema: { type: "object", properties: { products: { type: "array", items: { type: "object", properties: { name: { type: "string" }, price: { type: "number" } } } } } } 4. Extract a URL: instruction: "extract the link" schema: { type: "object", properties: { url: { type: "string", format: "url" } } }`,
    inputSchema: z.object({
      instruction: z.string(),
      schema: schemaInput,
    }),
    execute: async ({ instruction, schema }) => {
      try {
        let parsedSchema: ZodTypeAny | undefined;
        if (schema) {
          parsedSchema = jsonSchemaToZod(schema as JsonSchema);
        }

        // Only include `model` when an execution model was configured.
        const modelOverride = executionModel ? { model: executionModel } : {};
        const result = await v3.extract(instruction, parsedSchema, {
          ...modelOverride,
          timeout: toolTimeout,
        });

        return { success: true, result };
      } catch (error) {
        // Timeouts propagate so the caller can handle them; everything else
        // is reported back to the model.
        if (error instanceof TimeoutError) {
          throw error;
        }
        return { success: false, error: error?.message ?? String(error) };
      }
    },
  });
};
Available: ${Object.keys(variables).join(", ")}` : "Text to type into the target field"; return tool({ description: `FORM FILL - SPECIALIZED MULTI-FIELD INPUT TOOL CRITICAL: Use this for ANY form with 2+ input fields (text inputs, textareas, etc.) IMPORTANT: Ensure the fields are visible within the current viewport WHY THIS TOOL EXISTS: - Forms are the #1 use case for multi-field input - Optimized specifically for input/textarea elements - 4-6x faster than individual typing actions Use fillFormVision: Pure form filling (inputs, textareas only) MANDATORY USE CASES (always use fillFormVision for these): - Registration forms: name, email, password fields - Contact forms: name, email, message fields - Checkout forms: address, payment info fields - Profile updates: multiple user data fields - Search filters: multiple criteria inputs`, inputSchema: z.object({ fields: z .array( z.object({ action: z .string() .describe( "Description of the typing action, e.g. 'type foo into the bar field'", ), value: z.string().describe(valueDescription), coordinates: z .object({ x: z.number(), y: z.number(), }) .describe("Coordinates of the target field"), }), ) .min(2, "Provide at least two fields to fill"), }), execute: async ({ fields }): Promise => { try { const page = await v3.context.awaitActivePage(); // Process coordinates and substitute variables for each field // Keep original values (with %tokens%) for logging/caching, substituted values for typing const processedFields = fields.map((field) => { const processed = processCoordinates( field.coordinates.x, field.coordinates.y, provider, v3, ); return { ...field, originalValue: field.value, // Keep original with %tokens% for cache value: substituteVariables(field.value, variables), coordinates: { x: processed.x, y: processed.y }, }; }); v3.logger({ category: "agent", message: `Agent calling tool: fillFormVision`, level: 1, auxiliary: { arguments: { value: JSON.stringify({ fields }), // Don't log substituted values type: "object", 
}, }, }); // Only request XPath when caching is enabled to avoid unnecessary computation const shouldCollectXpath = v3.isAgentReplayActive(); const actions: Action[] = []; for (const field of processedFields) { // Click the field, only requesting XPath when caching is enabled const xpath = await page.click( field.coordinates.x, field.coordinates.y, { returnXpath: shouldCollectXpath, }, ); await page.type(field.value); // Build Action with XPath for deterministic replay (only when caching) // Use originalValue (with %tokens%) so cache stores references, not sensitive values if (shouldCollectXpath) { const normalizedXpath = ensureXPath(xpath); if (normalizedXpath) { actions.push({ selector: normalizedXpath, description: field.action, method: "type", arguments: [field.originalValue], }); } } // Small delay between fields await new Promise((resolve) => setTimeout(resolve, 100)); } const screenshotBase64 = await waitAndCaptureScreenshot(page, 100); // Record as "act" step with proper Actions for deterministic replay (only when caching) if (shouldCollectXpath && actions.length > 0) { v3.recordAgentReplayStep({ type: "act", instruction: `Fill ${fields.length} form fields`, actions, actionDescription: `Fill ${fields.length} form fields`, }); } return { success: true, playwrightArguments: processedFields, screenshotBase64, }; } catch (error) { return { success: false, error: `Error filling form: ${error.message}`, }; } }, toModelOutput: (result) => { if (result.success === false || result.error !== undefined) { return { type: "content", value: [ { type: "text", text: JSON.stringify({ success: result.success, error: result.error, }), }, ], }; } const content: ModelOutputContentItem[] = [ { type: "text", text: JSON.stringify({ success: result.success, fieldsCount: result.playwrightArguments?.length ?? 
/**
 * Creates the DOM-mode `fillForm` agent tool.
 *
 * The tool asks `v3.observe` for one observation per requested typing action,
 * then executes each observation in order with `v3.act`. All resulting actions
 * are recorded as a single "fillForm" replay step.
 *
 * @param v3 - Stagehand instance providing `observe`, `act`, logger and replay recorder.
 * @param executionModel - Optional model override passed to `observe`.
 * @param variables - Optional variables; when present they are passed to `act`
 *   and their names are advertised to the model as `%variableName%` tokens.
 * @param toolTimeout - Optional timeout (ms) passed to both `observe` and `act`.
 * @returns An AI SDK tool. `TimeoutError`s are rethrown; any other failure is
 *   reported as `{ success: false, error }`.
 */
export const fillFormTool = (
  v3: V3,
  executionModel?: string | AgentModelConfig,
  variables?: Variables,
  toolTimeout?: number,
) => {
  const hasVariables = variables && Object.keys(variables).length > 0;
  // The pattern must name both placeholders explicitly; without them the
  // instruction ("type into the ") gives the model nothing to follow.
  const actionDescription = hasVariables
    ? `Must follow the pattern: "type <value> into the <field>". Use %variableName% to substitute a variable value. Available: ${Object.keys(variables).join(", ")}. Examples: "type %email% into the email input", "type %password% into the password input"`
    : 'Must follow the pattern: "type <value> into the <field>". Examples: "type john@example.com into the email input", "type John into the first name input"';

  return tool({
    description:
      'FORM FILL - MULTI-FIELD INPUT TOOL\nFill 2+ form inputs/textareas at once. Each action MUST include the exact text to type and the target field, e.g. "type john@example.com into the email field".',
    inputSchema: z.object({
      fields: z
        .array(
          z.object({
            action: z.string().describe(actionDescription),
          }),
        )
        .min(1, "Provide at least one field to fill"),
    }),
    execute: async ({ fields }) => {
      try {
        v3.logger({
          category: "agent",
          message: `Agent calling tool: fillForm`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify(fields),
              type: "object",
            },
          },
        });

        const instruction = `Return observation results for the following actions: ${fields
          .map((f) => f.action)
          .join(", ")}`;

        const observeOptions = executionModel
          ? { model: executionModel, timeout: toolTimeout }
          : { timeout: toolTimeout };
        const observeResults = await v3.observe(instruction, observeOptions);

        // The act options do not depend on the individual observation, so
        // build them once rather than on every iteration.
        const actOptions = variables
          ? { variables, timeout: toolTimeout }
          : { timeout: toolTimeout };

        const completed: unknown[] = [];
        const replayableActions: Action[] = [];
        // Fields are filled sequentially: each act must finish before the next starts.
        for (const res of observeResults) {
          const actResult = await v3.act(res, actOptions);
          completed.push(actResult);
          if (Array.isArray(actResult.actions)) {
            replayableActions.push(...(actResult.actions as Action[]));
          }
        }

        v3.recordAgentReplayStep({
          type: "fillForm",
          fields,
          observeResults,
          actions: replayableActions,
        });

        return {
          success: true,
          actions: completed,
          playwrightArguments: replayableActions,
        };
      } catch (error) {
        // Timeouts propagate so the caller can handle them; everything else
        // is reported back to the model.
        if (error instanceof TimeoutError) {
          throw error;
        }
        return {
          success: false,
          error: error?.message ?? String(error),
        };
      }
    },
  });
};
/**
 * Options accepted by `createAgentTools` to configure the agent's toolset.
 */
export interface V3AgentToolOptions {
  /**
   * Model override forwarded to the `act`, `extract` and `fillForm` tools.
   */
  executionModel?: string | AgentModelConfig;
  /**
   * Optional log sink.
   * NOTE(review): `createAgentTools` in this file does not read this option
   * (tools log through `v3.logger`) — confirm whether it is still needed.
   */
  logger?: (message: LogLine) => void;
  /**
   * Tool mode determines which set of tools are available.
   * - 'dom' (default): Uses DOM-based tools (act, fillForm) - removes coordinate-based tools
   *   (click, type, dragAndDrop, clickAndHold, fillFormVision)
   * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.) - removes fillForm
   *   and swaps the `scroll` tool for its coordinate-aware variant
   */
  mode?: AgentToolMode;
  /**
   * The model provider. Passed to the coordinate-based tools for
   * model-specific coordinate handling.
   */
  provider?: string;
  /**
   * Tools to exclude from the available toolset.
   * These tools will be filtered out after mode-based filtering.
   */
  excludeTools?: string[];
  /**
   * Variables available to the agent; forwarded to the `act`, `fillForm`,
   * `fillFormVision` and `type` tools.
   * The form/type tools visible in this package advertise the variable names
   * to the model as `%variableName%` tokens.
   */
  variables?: Variables;
  /**
   * Timeout in milliseconds for async tool calls.
   * When set, every tool except `wait` and `think` is wrapped with a timeout
   * guard; when unset (or 0) tools run without one.
   */
  toolTimeout?: number;
  /**
   * Whether to enable the Browserbase-powered web search tool.
   * Only takes effect when `browserbaseApiKey` is also provided; otherwise the
   * Brave search tool is used if the BRAVE_API_KEY env var is set.
   */
  useSearch?: boolean;
  /**
   * The Browserbase API key used for the search tool.
   * Presumably resolved by the caller from the BROWSERBASE_API_KEY env var or
   * the Stagehand constructor — this file only consumes the value.
   */
  browserbaseApiKey?: string;
}
/**
 * Returns a copy of `tools` with mode-specific and explicitly excluded tools removed.
 *
 * - "hybrid" mode drops the DOM-based `fillForm` tool.
 * - Any other mode (DOM, the default) drops the coordinate-based tools:
 *   `click`, `type`, `dragAndDrop`, `clickAndHold` and `fillFormVision`.
 * - Names listed in `excludeTools` are dropped afterwards; unknown names are ignored.
 *
 * The input object is never mutated.
 */
function filterTools(
  tools: ToolSet,
  mode: AgentToolMode,
  excludeTools?: string[],
): ToolSet {
  const removedByMode =
    mode === "hybrid"
      ? ["fillForm"]
      : ["click", "type", "dragAndDrop", "clickAndHold", "fillFormVision"];

  // Work on a shallow copy so the caller's toolset stays intact.
  const remaining: ToolSet = { ...tools };
  for (const toolName of [...removedByMode, ...(excludeTools ?? [])]) {
    delete remaining[toolName];
  }
  return remaining;
}
"dom"; const provider = options?.provider; const excludeTools = options?.excludeTools; const variables = options?.variables; const toolTimeout = options?.toolTimeout; const timeoutHints: Record = { act: "(it may continue executing in the background) — try using a different description for the action", ariaTree: "— the page may be too large", extract: "— try using a smaller or simpler schema", fillForm: "(it may continue executing in the background) — try filling fewer fields at once or use a different tool", }; const unwrappedTools: ToolSet = { act: actTool(v3, executionModel, variables, toolTimeout), ariaTree: ariaTreeTool(v3, toolTimeout), click: clickTool(v3, provider), clickAndHold: clickAndHoldTool(v3, provider), dragAndDrop: dragAndDropTool(v3, provider), extract: extractTool(v3, executionModel, toolTimeout), fillForm: fillFormTool(v3, executionModel, variables, toolTimeout), fillFormVision: fillFormVisionTool(v3, provider, variables), goto: gotoTool(v3), keys: keysTool(v3), navback: navBackTool(v3), screenshot: screenshotTool(v3), scroll: mode === "hybrid" ? scrollVisionTool(v3, provider) : scrollTool(v3), type: typeTool(v3, provider, variables), }; if (options?.useSearch && options.browserbaseApiKey) { unwrappedTools.search = browserbaseSearchTool( v3, options.browserbaseApiKey, ); } else if (process.env.BRAVE_API_KEY) { unwrappedTools.search = braveSearchTool(v3); } const allTools: ToolSet = { ...Object.fromEntries( Object.entries(unwrappedTools).map(([name, t]) => [ name, wrapToolWithTimeout( t, `${name}()`, v3, toolTimeout, timeoutHints[name], ), ]), ), think: thinkTool(), wait: waitTool(v3, mode), }; return filterTools(allTools, mode, excludeTools); } export type AgentTools = ReturnType; /** * Type map of all agent tools for strong typing of tool calls and results. * Note: `search` is optional — enabled via useSearch: true (Browserbase) or BRAVE_API_KEY env var (legacy). 
/**
 * Creates the `keys` agent tool, which sends keyboard input to whatever
 * element currently has focus on the active page.
 *
 * - `method: "type"` types `value` as text via `page.type`.
 * - `method: "press"` presses `value` as a key or key combo via `page.keyPress`.
 *
 * The input is sent `repeat` times (at least once) and recorded as a "keys"
 * replay step.
 *
 * @param v3 - Stagehand instance providing the page context, logger and replay recorder.
 * @returns An AI SDK tool. `execute` never throws: failures are reported as
 *   `{ success: false, error }`.
 */
export const keysTool = (v3: V3) =>
  tool({
    description: `Send keyboard input to the page without targeting a specific element. Unlike the type tool which clicks then types into coordinates, this sends keystrokes directly to wherever focus currently is. Use method="type" to enter text into the currently focused element. Preferred when: input is already focused, text needs to flow across multiple fields (e.g., verification codes) Use method="press" for navigation keys (Enter, Tab, Escape, Backspace, arrows) and keyboard shortcuts (Cmd+A, Ctrl+C, Shift+Tab).`,
    inputSchema: z.object({
      method: z.enum(["press", "type"]),
      value: z
        .string()
        .describe(
          "The text to type, or the key/combo to press (Enter, Tab, Cmd+A)",
        ),
      repeat: z.number().optional(),
    }),
    execute: async ({ method, value, repeat }) => {
      try {
        const page = await v3.context.awaitActivePage();
        v3.logger({
          category: "agent",
          message: `Agent calling tool: keys`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify({ method, value, repeat }),
              type: "object",
            },
          },
        });

        // Normalise `repeat` to a whole, finite count so the number of
        // iterations matches the `times` value that is reported and recorded,
        // and a non-finite value can never produce an endless loop.
        const requestedRepeats =
          typeof repeat === "number" && Number.isFinite(repeat)
            ? Math.floor(repeat)
            : 1;
        const times = Math.max(1, requestedRepeats);

        if (method === "type") {
          for (let i = 0; i < times; i++) {
            await page.type(value, { delay: 100 });
          }
          v3.recordAgentReplayStep({
            type: "keys",
            instruction: `type "${value}"`,
            playwrightArguments: { method, text: value, times },
          });
          return { success: true, method, value, times };
        }

        if (method === "press") {
          for (let i = 0; i < times; i++) {
            await page.keyPress(value, { delay: 100 });
          }
          v3.recordAgentReplayStep({
            type: "keys",
            instruction: `press ${value}`,
            playwrightArguments: { method, keys: value, times },
          });
          return { success: true, method, value, times };
        }

        // Defensive fallback; the input schema restricts `method` to the two cases above.
        return { success: false, error: `Unsupported method: ${method}` };
      } catch (error) {
        // Thrown values are not guaranteed to be Error instances; always
        // return a string so the model sees a usable error.
        return {
          success: false,
          error: error instanceof Error ? error.message : String(error),
        };
      }
    },
  });
/**
 * Creates the `screenshot` agent tool, which captures a viewport-only PNG of
 * the active page.
 *
 * @param v3 - Stagehand instance providing the page context and logger.
 * @returns An AI SDK tool. On success `execute` resolves to the base64 image,
 *   a timestamp and the page URL; failures are reported as
 *   `{ success: false, error }` instead of being thrown.
 */
export const screenshotTool = (v3: V3) =>
  tool({
    description:
      "Takes a screenshot (PNG) of the current page. Use this to quickly verify page state.",
    inputSchema: z.object({}),
    execute: async () => {
      try {
        v3.logger({
          category: "agent",
          message: `Agent calling tool: screenshot`,
          level: 1,
        });
        const page = await v3.context.awaitActivePage();
        const buffer = await page.screenshot({ fullPage: false });
        const pageUrl = page.url();
        return {
          success: true,
          base64: buffer.toString("base64"),
          timestamp: Date.now(),
          pageUrl,
        };
      } catch (error) {
        // Thrown values are not guaranteed to be Error instances; a blind cast
        // would surface "Error taking screenshot: undefined".
        const reason = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          error: `Error taking screenshot: ${reason}`,
        };
      }
    },
    // Failures go to the model as JSON text; successes send only the image.
    toModelOutput: (result) => {
      if (result.success === false || result.error !== undefined) {
        return {
          type: "content",
          value: [{ type: "text", text: JSON.stringify(result) }],
        };
      }
      return {
        type: "content",
        value: [{ type: "media", mediaType: "image/png", data: result.base64 }],
      };
    },
  });
/**
 * Plain scroll tool used in DOM mode.
 *
 * Takes no coordinates: the wheel event is anchored at the centre of the
 * viewport and moves by `percentage` (default 80) of the viewport height.
 * Every scroll is recorded as a "scroll" replay step.
 */
export const scrollTool = (v3: V3) =>
  tool({
    description:
      "Scroll the page up or down by a percentage of the viewport height. Default is 80%, and what should be typically used for general page scrolling",
    inputSchema: z.object({
      direction: z.enum(["up", "down"]),
      percentage: z.number().min(1).max(200).optional(),
    }),
    execute: async ({
      direction,
      percentage = 80,
    }): Promise<ScrollToolResult> => {
      const loggedArguments = JSON.stringify({ direction, percentage });
      v3.logger({
        category: "agent",
        message: `Agent calling tool: scroll`,
        level: 1,
        auxiliary: {
          arguments: {
            value: loggedArguments,
            type: "object",
          },
        },
      });

      const page = await v3.context.awaitActivePage();

      // Viewport size is read from the page itself.
      const viewport = await page.mainFrame().evaluate<{
        w: number;
        h: number;
      }>("({ w: window.innerWidth, h: window.innerHeight })");

      const anchor = {
        x: Math.floor(viewport.w / 2),
        y: Math.floor(viewport.h / 2),
      };
      const scrollDistance = Math.round((viewport.h * percentage) / 100);
      // Negative delta scrolls up, positive scrolls down.
      const deltaY = direction === "up" ? -scrollDistance : scrollDistance;

      await page.scroll(anchor.x, anchor.y, 0, deltaY);

      v3.recordAgentReplayStep({
        type: "scroll",
        deltaX: 0,
        deltaY,
        anchor,
      });

      return {
        success: true,
        message: `Scrolled ${percentage}% ${direction} (${scrollDistance}px)`,
        scrolledPixels: scrollDistance,
      };
    },
    // Failed results are passed through as JSON text; successful ones are
    // reduced to the three summary fields.
    toModelOutput: (result) => {
      const failed = result.success === false || result.error !== undefined;
      if (failed) {
        return {
          type: "content",
          value: [{ type: "text", text: JSON.stringify(result) }],
        };
      }

      const { success, message, scrolledPixels } = result;
      return {
        type: "json",
        value: { success, message, scrolledPixels },
      };
    },
  });
Only provide coordinates when scrolling inside a nested scrollable element (e.g., a dropdown menu, modal with overflow, or scrollable sidebar). Default is 80%, and what should be typically used for general page scrolling`, inputSchema: z.object({ direction: z.enum(["up", "down"]), coordinates: z .array(z.number()) .optional() .describe( "Only use coordinates for scrolling inside a nested scrollable element - provide (x, y) within that element", ), percentage: z.number().min(1).max(200).optional(), }), execute: async ({ direction, coordinates, percentage = 80, }): Promise => { const page = await v3.context.awaitActivePage(); const { w, h } = await page.mainFrame().evaluate<{ w: number; h: number; }>("({ w: window.innerWidth, h: window.innerHeight })"); // Process coordinates if provided, otherwise use viewport center let cx: number; let cy: number; if (coordinates) { const processed = processCoordinates( coordinates[0], coordinates[1], provider, v3, ); cx = processed.x; cy = processed.y; } else { cx = Math.floor(w / 2); cy = Math.floor(h / 2); } v3.logger({ category: "agent", message: `Agent calling tool: scroll`, level: 1, auxiliary: { arguments: { value: JSON.stringify({ direction, coordinates, percentage, processed: { cx, cy }, }), type: "object", }, }, }); const scrollDistance = Math.round((h * percentage) / 100); const deltaY = direction === "up" ? -scrollDistance : scrollDistance; await page.scroll(cx, cy, 0, deltaY); const screenshotBase64 = await waitAndCaptureScreenshot(page, 100); v3.recordAgentReplayStep({ type: "scroll", deltaX: 0, deltaY, anchor: { x: cx, y: cy }, }); return { success: true, message: coordinates ? 
`Scrolled ${percentage}% ${direction} at (${cx}, ${cy})` : `Scrolled ${percentage}% ${direction}`, scrolledPixels: scrollDistance, screenshotBase64, }; }, toModelOutput: (result) => { if (result.success === false || result.error !== undefined) { return { type: "content", value: [{ type: "text", text: JSON.stringify(result) }], }; } const content: ModelOutputContentItem[] = [ { type: "text", text: JSON.stringify({ success: result.success, message: result.message, scrolledPixels: result.scrolledPixels, }), }, ]; if (result.screenshotBase64) { content.push({ type: "media", mediaType: "image/png", data: result.screenshotBase64, }); } return { type: "content", value: content }; }, }); ================================================ FILE: packages/core/lib/v3/agent/tools/think.ts ================================================ import { tool } from "ai"; import { z } from "zod"; export const thinkTool = () => tool({ description: `Use this tool to think through complex problems or plan a sequence of steps. This is for internal reasoning only and doesn't perform any actions. Use this to: 1. Plan a multi-step approach before taking action 2. Break down complex tasks 3. Reason through edge cases 4. Evaluate options when you're unsure what to do next The output is only visible to you; use it to track your own reasoning process.`, inputSchema: z.object({ reasoning: z .string() .describe( "Your step-by-step reasoning or planning process. 
/**
 * Builds the coordinate-based `type` tool: it clicks the given point on the
 * active page and then types the supplied text.
 *
 * @param v3 - Stagehand instance used for page access, logging and replay recording.
 * @param provider - Model provider id, forwarded to `processCoordinates`.
 * @param variables - Optional variables; `%variableName%` tokens in `text` are
 *   substituted before typing, while the model only ever sees the token form.
 * @returns The tool definition (schema, `execute`, `toModelOutput`).
 */
export const typeTool = (v3: V3, provider?: string, variables?: Variables) => {
  const variableNames = variables ? Object.keys(variables) : [];
  const textDescription =
    variableNames.length > 0
      ? `The text to type into the element. Use %variableName% to substitute a variable value. Available: ${variableNames.join(", ")}`
      : "The text to type into the element";

  return tool({
    description:
      "Type text into an element using its coordinates. This will click the element and then type the text into it (this is the most reliable way to type into an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
    inputSchema: z.object({
      describe: z
        .string()
        .describe(
          "Describe the element to type into in a short, specific phrase that mentions the element type and a good visual description",
        ),
      text: z.string().describe(textDescription),
      coordinates: z
        .array(z.number())
        .describe("The (x, y) coordinates to type into the element"),
    }),
    execute: async ({
      describe,
      coordinates,
      text,
    }): Promise<TypeToolResult> => {
      try {
        // The schema accepts arrays of any length, so reject a missing x or y
        // here instead of clicking at undefined/NaN coordinates.
        const [rawX, rawY] = coordinates;
        if (!Number.isFinite(rawX) || !Number.isFinite(rawY)) {
          throw new Error(
            "coordinates must contain two finite numbers: [x, y]",
          );
        }

        const page = await v3.context.awaitActivePage();
        const processed = processCoordinates(rawX, rawY, provider, v3);

        // Substitute any %variableName% tokens in the text
        const actualText = substituteVariables(text, variables);

        // Log the original text (tokens, not substituted values).
        v3.logger({
          category: "agent",
          message: `Agent calling tool: type`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify({ describe, text }),
              type: "object",
            },
          },
        });

        // Only request XPath when caching is enabled to avoid unnecessary computation
        const shouldCollectXpath = v3.isAgentReplayActive();
        const xpath = await page.click(processed.x, processed.y, {
          returnXpath: shouldCollectXpath,
        });
        await page.type(actualText);
        const screenshotBase64 = await waitAndCaptureScreenshot(page);

        // Record as an "act" step with proper Action for deterministic replay (only when caching)
        if (shouldCollectXpath) {
          const normalizedXpath = ensureXPath(xpath);
          if (normalizedXpath) {
            const action: Action = {
              selector: normalizedXpath,
              description: describe,
              method: "type",
              arguments: [text],
            };
            v3.recordAgentReplayStep({
              type: "act",
              instruction: describe,
              actions: [action],
              actionDescription: describe,
            });
          }
        }

        return {
          success: true,
          describe,
          // Return original text (with %variableName% tokens) to avoid exposing sensitive values to LLM
          text,
          screenshotBase64,
        };
      } catch (error) {
        // Caught values are not guaranteed to be Error instances.
        const reason = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          error: `Error typing: ${reason}`,
        };
      }
    },
    toModelOutput: (result) => {
      // Failures are returned to the model as a single JSON text part.
      if (result.success === false || result.error !== undefined) {
        return {
          type: "content",
          value: [{ type: "text", text: JSON.stringify(result) }],
        };
      }

      const content: ModelOutputContentItem[] = [
        {
          type: "text",
          text: JSON.stringify({
            success: result.success,
            describe: result.describe,
            text: result.text,
          }),
        },
      ];
      // Attach the post-action screenshot when one was captured.
      if (result.screenshotBase64) {
        content.push({
          type: "media",
          mediaType: "image/png",
          data: result.screenshotBase64,
        });
      }
      return { type: "content", value: content };
    },
  });
};
/**
 * Keys to exclude from tool outputs when mapping to actions.
 * These are large data fields that shouldn't be included in the actions array.
 * Users can access this data through result.messages if needed.
 */
const EXCLUDED_OUTPUT_KEYS = ["screenshotBase64"] as const;

/**
 * Returns a shallow copy of `output` without the keys listed in
 * EXCLUDED_OUTPUT_KEYS (e.g. screenshotBase64).
 */
function stripExcludedKeys(
  output: Record<string, unknown>,
): Record<string, unknown> {
  const blocked: readonly string[] = EXCLUDED_OUTPUT_KEYS;
  const kept: Record<string, unknown> = {};
  for (const key of Object.keys(output)) {
    if (blocked.includes(key)) continue;
    kept[key] = output[key];
  }
  return kept;
}

/**
 * Converts one tool call (name, result, args, reasoning) into the list of
 * AgentActions that represent it. `act` and `fillForm` have dedicated
 * mappers; every other tool yields a single standard action.
 */
export function mapToolResultToActions({
  toolCallName,
  toolResult,
  args,
  reasoning,
}: ActionMappingOptions): AgentAction[] {
  if (toolCallName === "act") {
    return mapActToolResult(toolResult, args, reasoning);
  }
  if (toolCallName === "fillForm") {
    return mapFillFormToolResult(toolResult, args, reasoning);
  }
  return [createStandardAction(toolCallName, toolResult, args, reasoning)];
}

/**
 * Maps an `act` tool result to a single action, copying
 * `playwrightArguments` from the result when present.
 */
function mapActToolResult(
  toolResult: unknown,
  args: Record<string, unknown>,
  reasoning?: string,
): AgentAction[] {
  const isObjectResult = Boolean(toolResult) && typeof toolResult === "object";
  if (!isObjectResult) {
    return [createStandardAction("act", toolResult, args, reasoning)];
  }

  // AI SDK wraps the tool result in an output property
  const wrapper = toolResult as Record<string, unknown>;
  const payload = (wrapper.output as Record<string, unknown>) || wrapper;

  const actAction: AgentAction = {
    type: "act",
    reasoning,
    taskCompleted: false,
    ...args,
  };
  if (payload.playwrightArguments) {
    actAction.playwrightArguments = payload.playwrightArguments;
  }
  return [actAction];
}

/**
 * Maps a `fillForm` tool result to one `fillForm` action followed by one
 * `act` action per entry of the result's `playwrightArguments` array.
 */
function mapFillFormToolResult(
  toolResult: unknown,
  args: Record<string, unknown>,
  reasoning?: string,
): AgentAction[] {
  const isObjectResult = Boolean(toolResult) && typeof toolResult === "object";
  if (!isObjectResult) {
    return [createStandardAction("fillForm", toolResult, args, reasoning)];
  }

  // AI SDK wraps the tool result in an output property
  const wrapper = toolResult as Record<string, unknown>;
  const payload = (wrapper.output as Record<string, unknown>) || wrapper;
  const observed = Array.isArray(payload?.playwrightArguments)
    ? payload.playwrightArguments
    : [];

  const mapped: AgentAction[] = [
    {
      type: "fillForm",
      reasoning,
      taskCompleted: false,
      ...args,
    },
  ];
  for (const observeResult of observed) {
    mapped.push({
      type: "act",
      reasoning: "acting from fillform tool",
      taskCompleted: false,
      playwrightArguments: observeResult,
    });
  }
  return mapped;
}

/**
 * Builds the default action for a tool call: the call's args spread over
 * `type` / `reasoning` / `taskCompleted`, then the tool's object output
 * (minus excluded keys) merged on top.
 */
function createStandardAction(
  toolCallName: string,
  toolResult: unknown,
  args: Record<string, unknown>,
  reasoning?: string,
): AgentAction {
  const isDone = toolCallName === "done";
  const action: AgentAction = {
    type: toolCallName,
    reasoning,
    taskCompleted: isDone ? (args?.taskComplete as boolean) : false,
    ...args,
  };

  // For screenshot tool, exclude base64 data and just indicate a screenshot was taken,
  // if somebody really wants the base64 data, they can access it through messages
  if (toolCallName === "screenshot") {
    action.result = "screenshotTaken";
    return action;
  }

  // ariaTree output is very large and unnecessary, so it is never merged.
  if (toolCallName === "ariaTree" || !toolResult) {
    return action;
  }

  // Merge only plain-object outputs; arrays and primitives are ignored.
  const { output } = toolResult as { output?: unknown };
  if (output && typeof output === "object" && !Array.isArray(output)) {
    Object.assign(action, stripExcludedKeys(output as Record<string, unknown>));
  }
  return action;
}
/**
 * Tracks Browserbase captcha solver state via console messages and provides
 * a blocking `waitIfSolving()` that agents call before each step/action.
 *
 * Accepts a page-provider callback so the listener is automatically
 * re-attached when the active page changes (e.g. popup / new tab).
 *
 * All concurrent callers of `waitIfSolving()` share the same underlying
 * promise, so multiple waiters are safely resolved together.
 */
export class CaptchaSolver {
  /** True between a "solving started" message and the matching finish/error/timeout. */
  private solving = false;
  private _solvedSinceLastConsume = false;
  private _erroredSinceLastConsume = false;
  private listener: ((msg: ConsoleMessage) => void) | null = null;
  private attachedPage: Page | null = null;
  private pageProvider: (() => Promise<Page>) | null = null;

  /** Shared promise that all concurrent waitIfSolving() callers await. */
  private waitPromise: Promise<void> | null = null;
  /** Resolves the shared waitPromise. */
  private resolveWait: (() => void) | null = null;
  /** Timeout handle for the SOLVE_TIMEOUT_MS deadline. */
  private waitTimer: ReturnType<typeof setTimeout> | null = null;

  /**
   * Initialise with a callback that returns the current active page.
   * The listener is lazily (re-)attached whenever the active page changes.
   */
  init(pageProvider: () => Promise<Page>): void {
    this.pageProvider = pageProvider;
  }

  /** Whether a captcha solve is currently in progress. */
  isSolving(): boolean {
    return this.solving;
  }

  /**
   * Ensure the console listener is attached to the current active page.
   * If the active page has changed since the last call, the old listener
   * is removed and a new one is installed.
   *
   * Does nothing when no provider is set, or when the provider was cleared
   * or replaced (dispose()/init()) while the page lookup was in flight.
   */
  async ensureAttached(): Promise<void> {
    if (!this.pageProvider) return;
    const provider = this.pageProvider;
    const page = await provider();

    // dispose() or a new init() may have run during the await; attaching now
    // would leave a listener on a page that nothing will ever detach.
    if (this.pageProvider !== provider) return;
    if (page === this.attachedPage) return;

    // Detach from the old page
    this.detachListener();
    this.attachedPage = page;

    this.listener = (msg: ConsoleMessage) => {
      const text = msg.text();
      if (text === SOLVING_STARTED) {
        this.solving = true;
      } else if (text === SOLVING_FINISHED) {
        this.solving = false;
        this._solvedSinceLastConsume = true;
        this.settle();
      } else if (text === SOLVING_ERRORED) {
        this.solving = false;
        this._erroredSinceLastConsume = true;
        this.settle();
      }
    };
    page.on("console", this.listener);
  }

  /**
   * Returns a promise that resolves immediately if no captcha is being
   * solved, or blocks until the solver finishes, errors, or the
   * SOLVE_TIMEOUT_MS deadline is reached (the deadline starts when the
   * first waiter arrives and is recorded as an error).
   *
   * Also re-attaches the listener to the current active page if it has
   * changed since the last call.
   *
   * All concurrent callers share the same promise, so no waiter is
   * orphaned.
   */
  async waitIfSolving(): Promise<void> {
    await this.ensureAttached();
    if (!this.solving) return;

    // Return the existing shared promise if one is already pending
    if (this.waitPromise) return this.waitPromise;

    this.waitPromise = new Promise<void>((resolve) => {
      this.resolveWait = resolve;
      this.waitTimer = setTimeout(() => {
        this.solving = false;
        this._erroredSinceLastConsume = true;
        this.settle();
      }, SOLVE_TIMEOUT_MS);
    });
    return this.waitPromise;
  }

  /**
   * Returns and resets the solve event flags.
   * Call after `waitIfSolving()` to check whether a captcha was solved
   * (or errored) since the last consume. This captures events even if
   * the solve completed between two `waitIfSolving()` calls.
   */
  consumeSolveResult(): { solved: boolean; errored: boolean } {
    const result = {
      solved: this._solvedSinceLastConsume,
      errored: this._erroredSinceLastConsume,
    };
    this._solvedSinceLastConsume = false;
    this._erroredSinceLastConsume = false;
    return result;
  }

  /**
   * Remove the console listener and reset all state.
   * Any pending waiters are released.
   */
  dispose(): void {
    this.detachListener();
    this.attachedPage = null;
    this.pageProvider = null;
    this.solving = false;
    this._solvedSinceLastConsume = false;
    this._erroredSinceLastConsume = false;
    this.settle();
  }

  // ------------------------------------------------------------------
  // Internal helpers
  // ------------------------------------------------------------------

  /** Remove the console listener from the currently attached page. */
  private detachListener(): void {
    if (this.attachedPage && this.listener) {
      this.attachedPage.off("console", this.listener);
    }
    this.listener = null;
    // If a solve was in progress, mark it as errored so consumers
    // know it was interrupted (consistent with the timeout path).
    if (this.solving) {
      this._erroredSinceLastConsume = true;
    }
    // Reset solving state so waiters aren't stuck waiting for events
    // that can never arrive from the detached page.
    this.solving = false;
    this.settle();
  }

  /** Resolve the shared wait promise and clear the timeout. */
  private settle(): void {
    if (this.waitTimer) {
      clearTimeout(this.waitTimer);
      this.waitTimer = null;
    }
    if (this.resolveWait) {
      const resolve = this.resolveWait;
      this.resolveWait = null;
      this.waitPromise = null;
      resolve();
    }
  }
}
/**
 * Playwright key names paired with every upper-case alias accepted for them.
 * Covers key spellings from both the Anthropic and OpenAI CUA APIs.
 */
const KEY_ALIASES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ["Enter", ["ENTER", "RETURN"]],
  ["Escape", ["ESCAPE", "ESC"]],
  ["Backspace", ["BACKSPACE"]],
  ["Tab", ["TAB"]],
  [" ", ["SPACE"]],
  ["Delete", ["DELETE", "DEL"]],
  ["ArrowUp", ["ARROWUP", "ARROW_UP", "UP"]],
  ["ArrowDown", ["ARROWDOWN", "ARROW_DOWN", "DOWN"]],
  ["ArrowLeft", ["ARROWLEFT", "ARROW_LEFT", "LEFT"]],
  ["ArrowRight", ["ARROWRIGHT", "ARROW_RIGHT", "RIGHT"]],
  ["Shift", ["SHIFT"]],
  ["Control", ["CONTROL", "CTRL"]],
  // OPTION is the macOS alternative name
  ["Alt", ["ALT", "OPTION"]],
  // COMMAND / CMD: macOS, SUPER: Linux, WINDOWS / WIN: Windows
  ["Meta", ["META", "COMMAND", "CMD", "SUPER", "WINDOWS", "WIN"]],
  ["Home", ["HOME"]],
  ["End", ["END"]],
  ["PageUp", ["PAGEUP", "PAGE_UP", "PGUP"]],
  ["PageDown", ["PAGEDOWN", "PAGE_DOWN", "PGDN"]],
];

/**
 * Flat lookup from upper-case alias to Playwright key name, derived from
 * KEY_ALIASES.
 */
const KEY_MAP: Record<string, string> = {};
for (const [playwrightKey, aliases] of KEY_ALIASES) {
  for (const alias of aliases) {
    KEY_MAP[alias] = playwrightKey;
  }
}

/**
 * Maps a key name from various formats to Playwright-compatible format.
 * Matching is case-insensitive; keys without a known alias (and empty
 * input) are returned unchanged.
 * @param key The key name in any supported format
 * @returns The Playwright-compatible key name
 */
export function mapKeyToPlaywright(key: string): string {
  if (!key) {
    return key;
  }
  const mapped = KEY_MAP[key.toUpperCase()];
  return mapped ? mapped : key;
}
/**
 * Execute a custom tool and format the response for Google's API.
 *
 * Looks the tool up by its own (non-inherited) name in `tools`, runs its
 * `execute` with `toolArgs`, and wraps the JSON-stringified result in a
 * `functionResponse` part. Any failure (unknown tool, tool without an
 * `execute` function, or an error thrown by the tool) is logged and
 * returned as an error `functionResponse` with `success: false`; this
 * function does not throw for those cases.
 *
 * @param toolName - Name of the tool to run.
 * @param toolArgs - Arguments passed to the tool's `execute`.
 * @param tools - Registered custom tools.
 * @param functionCall - The originating function call (currently unused).
 * @param logger - Receives progress and error log lines.
 * @returns The function response part and a success flag.
 */
export async function executeGoogleCustomTool(
  toolName: string,
  toolArgs: Record<string, unknown>,
  tools: ToolSet,
  functionCall: FunctionCall,
  logger: (message: LogLine) => void,
): Promise<CustomToolExecutionResult> {
  try {
    logger({
      category: "agent",
      message: `Executing custom tool: ${toolName} with args: ${JSON.stringify(toolArgs)}`,
      level: 1,
    });

    // Own-property lookup: names such as "toString" must not resolve to
    // members inherited from Object.prototype.
    const tool = Object.prototype.hasOwnProperty.call(tools, toolName)
      ? tools[toolName]
      : undefined;
    if (!tool) {
      throw new Error(`Custom tool "${toolName}" is not registered`);
    }
    if (typeof tool.execute !== "function") {
      throw new Error(`Custom tool "${toolName}" has no execute function`);
    }

    const toolResult = await tool.execute(toolArgs, {
      toolCallId: `tool_${Date.now()}`,
      messages: [],
    });

    logger({
      category: "agent",
      message: `Tool ${toolName} completed successfully. Result: ${JSON.stringify(toolResult)}`,
      level: 1,
    });

    // Create function response with the result
    const functionResponsePart: Part = {
      functionResponse: {
        name: toolName,
        response: {
          result: JSON.stringify(toolResult),
        },
      },
    };

    return {
      functionResponse: functionResponsePart,
      success: true,
    };
  } catch (toolError) {
    const errorMessage =
      toolError instanceof Error ? toolError.message : String(toolError);

    logger({
      category: "agent",
      message: `Error executing custom tool ${toolName}: ${errorMessage}`,
      level: 0,
    });

    // Create error function response
    const functionResponsePart: Part = {
      functionResponse: {
        name: toolName,
        response: {
          error: errorMessage,
        },
      },
    };

    return {
      functionResponse: functionResponsePart,
      success: false,
    };
  }
}

/**
 * Check if a function call is a custom tool, i.e. its name is an own key of
 * `tools`. Inherited keys (e.g. "constructor") do not count.
 */
export function isCustomTool(
  functionCall: FunctionCall,
  tools?: ToolSet,
): boolean {
  const name = functionCall.name;
  return Boolean(
    tools && name && Object.prototype.hasOwnProperty.call(tools, name),
  );
}
/** JSON-schema fields read while converting a schema node to Google's format. */
type JsonSchemaFragment = {
  type?: string | string[];
  description?: string;
  items?: unknown;
  properties?: Record<string, unknown>;
  required?: string[];
};

/** Google parameter schema node produced by the converters in this module. */
type GoogleSchemaNode = {
  type: Type;
  description?: string;
  items?: GoogleSchemaNode;
  properties?: Record<string, GoogleSchemaNode>;
  required?: string[];
};

/**
 * Convert a single ToolSet tool to Google's FunctionDeclaration format.
 *
 * @param name - Tool name used as the function name.
 * @param tool - Tool whose `inputSchema` (a Zod schema) is converted.
 * @returns The declaration, or `null` when conversion throws (the error is
 *   written to `console.error`).
 */
function convertToolToFunctionDeclaration(
  name: string,
  tool: { description?: string; inputSchema: unknown },
): FunctionDeclaration | null {
  try {
    // Convert Zod schema to JSON schema
    const schema = tool.inputSchema as StagehandZodSchema;
    const jsonSchema = toJsonSchema(schema) as {
      properties?: Record<string, unknown>;
      required?: string[];
      type?: string;
    };

    const parameters = convertJsonSchemaToGoogleParameters(jsonSchema);

    return {
      name,
      description: tool.description || `Execute ${name}`,
      parameters,
    };
  } catch (error) {
    console.error(
      `Error converting tool ${name} to function declaration:`,
      error,
    );
    return null;
  }
}

/**
 * Convert one JSON-schema property into a Google schema node.
 *
 * Keeps the description, the element schema of arrays (`items`) and the
 * nested `properties` / `required` of objects, so structured arguments are
 * not flattened to a bare type. A `type` given as a list (e.g.
 * `["string", "null"]`) uses its first non-"null" entry; a missing type
 * falls back to "string".
 */
function convertJsonPropertyToGoogleSchema(value: unknown): GoogleSchemaNode {
  const node = (
    value && typeof value === "object" ? value : {}
  ) as JsonSchemaFragment;
  const declaredType = Array.isArray(node.type)
    ? node.type.find((entry) => entry !== "null")
    : node.type;

  const converted: GoogleSchemaNode = {
    type: mapJsonTypeToGoogleType(declaredType || "string"),
    ...(node.description ? { description: node.description } : {}),
  };

  // Only forward an element schema when the source schema declares one.
  if (converted.type === Type.ARRAY && node.items) {
    converted.items = convertJsonPropertyToGoogleSchema(node.items);
  }

  if (converted.type === Type.OBJECT && node.properties) {
    const nested: Record<string, GoogleSchemaNode> = {};
    for (const [key, child] of Object.entries(node.properties)) {
      nested[key] = convertJsonPropertyToGoogleSchema(child);
    }
    if (Object.keys(nested).length > 0) {
      converted.properties = nested;
      if (node.required && node.required.length > 0) {
        converted.required = node.required;
      }
    }
  }

  return converted;
}

/**
 * Convert JSON schema to Google's parameter format.
 *
 * The result is always an OBJECT whose properties are the converted
 * top-level properties of `schema`; `required` is included only when the
 * source lists at least one required key.
 */
function convertJsonSchemaToGoogleParameters(schema: {
  properties?: Record<string, unknown>;
  required?: string[];
  type?: string;
}): {
  type: Type;
  properties: Record<string, GoogleSchemaNode>;
  required?: string[];
} {
  const properties: Record<string, GoogleSchemaNode> = {};

  if (schema.properties) {
    for (const [key, value] of Object.entries(schema.properties)) {
      properties[key] = convertJsonPropertyToGoogleSchema(value);
    }
  }

  return {
    type: Type.OBJECT,
    properties,
    ...(schema.required && schema.required.length > 0
      ? { required: schema.required }
      : {}),
  };
}

/**
 * Map JSON schema types to Google's Type enum.
 * Matching is case-insensitive; "integer" maps to NUMBER and any
 * unrecognised type maps to STRING.
 */
function mapJsonTypeToGoogleType(jsonType: string): Type {
  switch (jsonType.toLowerCase()) {
    case "string":
      return Type.STRING;
    case "number":
    case "integer":
      return Type.NUMBER;
    case "boolean":
      return Type.BOOLEAN;
    case "array":
      return Type.ARRAY;
    case "object":
      return Type.OBJECT;
    default:
      return Type.STRING;
  }
}
getZFactory(outputSchema as StagehandZodSchema) : z; const baseDoneSchema = buildBaseDoneSchema(factory); // Merge base done schema with user-provided output schema if present const doneToolSchema = outputSchema ? baseDoneSchema.extend({ output: outputSchema.describe( "The specific data the user requested from this task", ), }) : baseDoneSchema; const outputInstructions = outputSchema ? `\n\nThe user also requested the following information from this task. Provide it in the "output" field:\n${JSON.stringify( Object.fromEntries( Object.entries(outputSchema.shape).map( ([key, value]: [string, StagehandZodSchema]) => [ key, value.description || "no description", ], ), ), null, 2, )}` : ""; const systemPrompt = `You are a web automation assistant that was tasked with completing a task. The task was: "${instruction}" Review what was accomplished and provide your final assessment in whether the task was completed successfully. you have been provided with the history of the actions taken so far, use this to determine if the task was completed successfully.${outputInstructions} Call the "done" tool with: 1. A brief summary of what was done 2. Whether the task was completed successfully${outputSchema ? "\n3. The requested output data based on what you found" : ""}`; const doneTool = tool({ description: outputSchema ? "Complete the task with your assessment and the requested output data." : "Complete the task with your final assessment.", inputSchema: doneToolSchema, execute: async (params) => { return { success: true, ...params }; }, }); const userPrompt: ModelMessage = { role: "user", content: outputSchema ? "Provide your final assessment and the requested output data." 
/**
 * True when a content block is a `tool_result` whose nested content array
 * contains at least one `image` block.
 */
function isToolResultWithImage(contentItem: AnthropicContentBlock): boolean {
  return (
    contentItem.type === "tool_result" &&
    "content" in contentItem &&
    Array.isArray(contentItem.content) &&
    (contentItem.content as AnthropicContentBlock[]).some(
      (nestedItem: AnthropicContentBlock) => nestedItem.type === "image",
    )
  );
}

/**
 * Finds all items in the conversation history that contain images, i.e.
 * items whose content array holds a tool result with a nested image block.
 * @param items - Array of conversation items to check
 * @returns Array of indices where images were found, in ascending order
 */
export function findItemsWithImages(items: ResponseInputItem[]): number[] {
  return items.reduce<number[]>((found, item, index) => {
    if (
      Array.isArray(item.content) &&
      item.content.some(isToolResultWithImage)
    ) {
      found.push(index);
    }
    return found;
  }, []);
}

/**
 * Compresses conversation history by removing images from older items
 * while keeping the most recent images intact. Image-bearing tool results
 * in older items have their content replaced by the text "screenshot taken".
 * The input array's items are modified in place and the same array is returned.
 * @param items - Array of conversation items to process
 * @param keepMostRecentCount - Number of most recent image-containing items to preserve (default: 2)
 * @returns Object with processed items
 */
export function compressConversationImages(
  items: ResponseInputItem[],
  keepMostRecentCount: number = 2,
): { items: ResponseInputItem[] } {
  const imageIndices = findItemsWithImages(items);
  // Entries ranked below this cutoff are the "older" ones to compress.
  const cutoff = imageIndices.length - keepMostRecentCount;

  imageIndices.forEach((itemIndex, rank) => {
    if (!(rank < cutoff)) return;

    const item = items[itemIndex];
    if (!Array.isArray(item.content)) return;

    item.content = item.content.map((contentItem: AnthropicContentBlock) =>
      isToolResultWithImage(contentItem)
        ? ({
            ...contentItem,
            content: "screenshot taken",
          } as AnthropicContentBlock)
        : contentItem,
    );
  });

  return { items };
}
hasImage = false; if (item.parts && Array.isArray(item.parts)) { hasImage = item.parts.some((part: GooglePart) => { // Check for functionResponse with data containing images if (part.functionResponse?.response?.data) { const data = part.functionResponse.response .data as FunctionResponseData[]; return data.some((dataItem) => dataItem.inlineData?.mimeType?.startsWith("image/"), ); } // Check for functionResponse with parts containing images if (part.functionResponse?.parts) { return part.functionResponse.parts.some((responsePart) => responsePart.inlineData?.mimeType?.startsWith("image/"), ); } // Check for direct inline data return part.inlineData?.mimeType?.startsWith("image/"); }); } if (hasImage) { itemsWithImages.push(index); } }); return itemsWithImages; } /** * Finds all items in the conversation history that contain images (OpenAI format) * @param items - Array of conversation items to check * @returns Array of indices where images were found */ export function findOpenAIItemsWithImages( items: OpenAIResponseInputItem[], ): number[] { const itemsWithImages: number[] = []; items.forEach((item, index) => { let hasImage = false; // Check for computer_call_output with image if ( "type" in item && item.type === "computer_call_output" && "output" in item ) { const output = item.output as unknown as { type: string; image_url: string; }; hasImage = output?.type === "input_image" && !!output?.image_url; } if (hasImage) { itemsWithImages.push(index); } }); return itemsWithImages; } /** * Compresses OpenAI conversation history by removing images from older items * while keeping the most recent images intact * @param items - Array of conversation items to process * @param keepMostRecentCount - Number of most recent image-containing items to preserve (default: 2) * @returns Object with processed items */ export function compressOpenAIConversationImages( items: OpenAIResponseInputItem[], keepMostRecentCount: number = 2, ): { items: OpenAIResponseInputItem[] } { const 
itemsWithImages = findOpenAIItemsWithImages(items); items.forEach((item, index) => { const imageIndex = itemsWithImages.indexOf(index); const shouldCompress = imageIndex >= 0 && imageIndex < itemsWithImages.length - keepMostRecentCount; if (shouldCompress) { // For computer_call_output with image, replace with text if ( "type" in item && item.type === "computer_call_output" && "output" in item ) { const output = item.output as unknown as { type: string }; if (output?.type === "input_image") { // Replace the image with a text message (item as unknown as { output: string }).output = "screenshot taken"; } } } }); return { items, }; } /** * Compresses Google conversation history by removing images from older items * while keeping the most recent images intact * @param items - Array of conversation items to process * @param keepMostRecentCount - Number of most recent image-containing items to preserve (default: 2) * @returns Object with processed items */ export function compressGoogleConversationImages( items: GoogleContent[], keepMostRecentCount: number = 2, ): { items: GoogleContent[] } { const itemsWithImages = findGoogleItemsWithImages(items); items.forEach((item, index) => { const imageIndex = itemsWithImages.indexOf(index); const shouldCompress = imageIndex >= 0 && imageIndex < itemsWithImages.length - keepMostRecentCount; if (shouldCompress && item.parts && Array.isArray(item.parts)) { item.parts = item.parts.map((part: GooglePart) => { // Replace functionResponse with data containing images if (part.functionResponse?.response?.data) { const data = part.functionResponse.response .data as FunctionResponseData[]; const hasImage = data.some((dataItem) => dataItem.inlineData?.mimeType?.startsWith("image/"), ); if (hasImage) { return { ...part, functionResponse: { ...part.functionResponse, data: [] as FunctionResponseData[], response: { ...part.functionResponse.response, compressed: "screenshot taken", }, }, }; } } // Replace functionResponse with parts containing 
images if (part.functionResponse?.parts) { const hasImageInParts = part.functionResponse.parts.some( (responsePart) => responsePart.inlineData?.mimeType?.startsWith("image/"), ); if (hasImageInParts) { return { ...part, functionResponse: { ...part.functionResponse, parts: part.functionResponse.parts.filter( (responsePart) => !responsePart.inlineData?.mimeType?.startsWith("image/"), ), response: { ...part.functionResponse.response, compressed: "screenshot taken", }, }, }; } } // Replace direct inline data images if (part.inlineData?.mimeType?.startsWith("image/")) { return { text: "screenshot taken", }; } return part; }); } }); return { items, }; } ================================================ FILE: packages/core/lib/v3/agent/utils/messageProcessing.ts ================================================ import type { ModelMessage } from "ai"; // Vision action tools that include screenshots in their results const VISION_ACTION_TOOLS = [ "click", "type", "dragAndDrop", "wait", "fillFormVision", "scroll", ]; function isToolMessage( message: unknown, ): message is { role: "tool"; content: unknown[] } { return ( !!message && typeof message === "object" && (message as { role?: unknown }).role === "tool" && Array.isArray((message as { content?: unknown }).content) ); } function isScreenshotPart(part: unknown): boolean { return ( !!part && typeof part === "object" && (part as { toolName?: unknown }).toolName === "screenshot" ); } function isVisionActionPart(part: unknown): boolean { if (!part || typeof part !== "object") return false; const toolName = (part as { toolName?: unknown }).toolName; return typeof toolName === "string" && VISION_ACTION_TOOLS.includes(toolName); } function isVisionPart(part: unknown): boolean { return isScreenshotPart(part) || isVisionActionPart(part); } function isAriaTreePart(part: unknown): boolean { return ( !!part && typeof part === "object" && (part as { toolName?: unknown }).toolName === "ariaTree" ); } /** * Compress old screenshot/ariaTree 
data in messages in-place.
 *
 * Strategy:
 * - Vision results (screenshots OR vision action tools like click/type/etc):
 *   only the 2 most recent messages keep their payload
 * - ariaTree results: only the most recent message keeps its payload, older
 *   ones are replaced with a placeholder
 *
 * @param messages - The messages array to modify in-place
 * @returns Number of items compressed (a message that is old in both
 *   categories is counted once per category)
 */
export function processMessages(messages: ModelMessage[]): number {
  // One scan over the history: note which tool messages carry vision parts
  // (screenshot + vision actions) and which carry ariaTree parts.
  const visionPositions: number[] = [];
  const ariaTreePositions: number[] = [];

  messages.forEach((candidate, position) => {
    if (!isToolMessage(candidate)) return;
    const parts = candidate.content as unknown[];
    if (parts.some(isVisionPart)) visionPositions.push(position);
    if (parts.some(isAriaTreePart)) ariaTreePositions.push(position);
  });

  let compressedCount = 0;

  // slice(0, -2) is empty for two or fewer entries, so the two newest vision
  // messages are always left untouched.
  for (const position of visionPositions.slice(0, -2)) {
    const stale = messages[position];
    if (isToolMessage(stale)) {
      // Each helper only touches its own part type, so calling both is safe.
      compressScreenshotMessage(stale);
      compressVisionActionMessage(stale);
      compressedCount += 1;
    }
  }

  // Same idea for ariaTree results, keeping just the newest one.
  for (const position of ariaTreePositions.slice(0, -1)) {
    const stale = messages[position];
    if (isToolMessage(stale)) {
      compressAriaTreeMessage(stale);
      compressedCount += 1;
    }
  }

  return compressedCount;
}

/**
 * Tool result part structure from AI SDK.
* The output field uses a discriminated union - type determines value format:
 * - type: "content" -> value: Array<{type: "text", ...} | {type: "media", ...}>
 * - type: "text" -> value: string
 * - type: "json" -> value: JSONValue
 * - type: "error-text" -> value: string
 * - type: "error-json" -> value: JSONValue
 */
interface ToolResultPart {
  output?: {
    type: string;
    value?: unknown;
  };
}

/**
 * Tells whether a tool output uses the "content" variant, i.e. the only
 * variant whose value is expected to be an array.
 */
function isContentTypeOutput(output: {
  type: string;
  value?: unknown;
}): boolean {
  const { type } = output;
  return type === "content";
}

/**
 * Replaces the output of every screenshot part in the message with a short
 * text placeholder. Works in-place.
 * Parts without an output, or whose output is not of type "content", are left
 * alone so the message stays schema-valid; the whole output object is swapped
 * so that type and value always agree.
 */
function compressScreenshotMessage(message: {
  role: "tool";
  content: unknown[];
}): void {
  message.content.filter(isScreenshotPart).forEach((part) => {
    const toolResult = part as ToolResultPart;
    const current = toolResult.output;
    // Nothing to do unless there is a "content"-typed output to replace.
    if (!current || !isContentTypeOutput(current)) return;

    toolResult.output = {
      type: "content",
      value: [{ type: "text", text: "screenshot taken" }],
    };
  });
}

/**
 * Compress vision action message content in-place by removing the screenshot
 * but keeping the action result text.
 * Only modifies outputs with type "content" to maintain schema validity.
*/
function compressVisionActionMessage(message: {
  role: "tool";
  content: unknown[];
}): void {
  for (const part of message.content) {
    if (!isVisionActionPart(part)) continue;

    const toolResult = part as ToolResultPart;
    const current = toolResult.output;
    // Only "content" outputs hold an array value that can be filtered.
    if (
      !current ||
      !isContentTypeOutput(current) ||
      !Array.isArray(current.value)
    ) {
      continue;
    }

    // Drop media entries (and anything that is not an object); keep the rest.
    const withoutMedia = (current.value as Array<{ type?: string }>).filter(
      (entry) => entry && typeof entry === "object" && entry.type !== "media",
    );

    // Swap the whole output so type and value stay consistent.
    toolResult.output = {
      type: "content",
      value: withoutMedia,
    };
  }
}

/**
 * Replaces the output of every ariaTree part in the message with a short text
 * placeholder. Works in-place.
 * Parts without an output, or whose output is not of type "content", are left
 * alone so the message stays schema-valid; the whole output object is swapped
 * so that type and value always agree.
 */
function compressAriaTreeMessage(message: {
  role: "tool";
  content: unknown[];
}): void {
  message.content.filter(isAriaTreePart).forEach((part) => {
    const toolResult = part as ToolResultPart;
    const current = toolResult.output;
    if (!current || !isContentTypeOutput(current)) return;

    toolResult.output = {
      type: "content",
      value: [
        {
          type: "text",
          text: "ARIA tree extracted for context of page elements",
        },
      ],
    };
  });
}
================================================ FILE: packages/core/lib/v3/agent/utils/screenshotHandler.ts ================================================
import type { Page } from "../../understudy/page.js";

/**
 * Default delay in milliseconds to wait after vision actions before capturing screenshot.
 * Allows the page to settle after interactions.
 */
const DEFAULT_DELAY_MS = 500;

/**
 * Waits for the page to settle and captures a screenshot.
 * If the screenshot fails (e.g., page closed, navigation in progress),
 * returns undefined instead of throwing - allowing the action to still succeed.
*
 * @param page - The page to capture
 * @param delayMs - Delay before capturing (default: 500ms, pass 0 to skip delay)
 * @returns The screenshot as a base64 string, or undefined when capturing throws
 */
export async function waitAndCaptureScreenshot(
  page: Page,
  delayMs: number = DEFAULT_DELAY_MS,
): Promise<string | undefined> {
  // The settle delay runs outside the try block, so only the capture itself is best-effort.
  if (delayMs > 0) {
    await page.waitForTimeout(delayMs);
  }

  try {
    const buffer = await page.screenshot({ fullPage: false });
    return buffer.toString("base64");
  } catch {
    return undefined;
  }
}
================================================ FILE: packages/core/lib/v3/agent/utils/validateExperimentalFeatures.ts ================================================
import {
  ExperimentalNotConfiguredError,
  StagehandInvalidArgumentError,
} from "../../types/public/sdkErrors.js";
import type {
  AgentConfig,
  AgentExecuteOptionsBase,
} from "../../types/public/index.js";

export interface AgentValidationOptions {
  /** Whether experimental mode is enabled */
  isExperimental: boolean;
  /** Agent config options (integrations, tools, stream, cua, etc.) */
  // NOTE(review): the type argument was missing in this view; AgentConfig is
  // inferred from this file's imports - confirm.
  agentConfig?: Partial<AgentConfig>;
  /** Execute options (callbacks, signal, messages, etc.) */
  // NOTE(review): the type argument was missing in this view;
  // AgentExecuteOptionsBase is inferred from this file's imports - confirm.
  executeOptions?:
    | (Partial<AgentExecuteOptionsBase> & { callbacks?: unknown })
    | null;
  /** Whether this is streaming mode (can be derived from agentConfig.stream) */
  isStreaming?: boolean;
}

/**
 * Validates agent configuration and experimental feature usage.
 *
 * This utility consolidates all validation checks for both CUA and non-CUA agent paths:
 * - Invalid argument errors for CUA (streaming, abort signal, message continuation, excludeTools, output schema are not supported)
 * - Experimental feature checks for integrations and tools (both CUA and non-CUA)
 * - Experimental feature checks for hybrid mode (requires experimental: true)
 * - Experimental feature checks for non-CUA only (callbacks, signal, messages, streaming, excludeTools, output schema)
 *
 * Throws StagehandInvalidArgumentError for invalid/unsupported configurations.
 * Throws ExperimentalNotConfiguredError if experimental features are used without experimental mode.
*/
export function validateExperimentalFeatures(
  options: AgentValidationOptions,
): void {
  const { isExperimental, agentConfig, executeOptions, isStreaming } = options;

  // An explicit `mode` decides; the deprecated `cua: true` flag only counts
  // when `mode` was left undefined.
  const isCuaMode =
    agentConfig?.mode === undefined
      ? agentConfig?.cua === true
      : agentConfig?.mode === "cua";

  // Empty collections do not count as "using" the feature.
  const excludesTools = Boolean(
    executeOptions?.excludeTools && executeOptions.excludeTools.length > 0,
  );
  const passesVariables = Boolean(
    executeOptions?.variables &&
      Object.keys(executeOptions.variables).length > 0,
  );

  // CUA-specific validation: these features are not available at all.
  if (isCuaMode) {
    const cuaChecks: Array<[unknown, string]> = [
      [agentConfig?.stream, "streaming"],
      [executeOptions?.signal, "abort signal"],
      [executeOptions?.messages, "message continuation"],
      [excludesTools, "excludeTools"],
      [executeOptions?.output, "output schema"],
      [passesVariables, "variables"],
    ];
    const unsupportedFeatures = cuaChecks
      .filter(([inUse]) => Boolean(inUse))
      .map(([, label]) => label);

    if (unsupportedFeatures.length > 0) {
      const verb = unsupportedFeatures.length === 1 ? "is" : "are";
      throw new StagehandInvalidArgumentError(
        `${unsupportedFeatures.join(", ")} ${verb} not supported with CUA (Computer Use Agent) mode.`,
      );
    }
  }

  // Experimental mode unlocks everything below.
  if (isExperimental) return;

  const features: string[] = [];

  // Integrations and custom tools are gated for both CUA and non-CUA agents.
  const hasIntegrations = Boolean(
    agentConfig?.integrations && agentConfig.integrations.length > 0,
  );
  const hasTools = Boolean(
    agentConfig?.tools && Object.keys(agentConfig.tools).length > 0,
  );
  if (hasIntegrations || hasTools) {
    features.push("MCP integrations and custom tools");
  }

  // The remaining gates apply to non-CUA agents only.
  if (!isCuaMode) {
    // Streaming may be flagged explicitly or through the agent config.
    if (isStreaming || agentConfig?.stream) {
      features.push("streaming");
    }

    if (executeOptions) {
      const gatedOptions: Array<[unknown, string]> = [
        [executeOptions.callbacks, "callbacks"],
        [executeOptions.signal, "abort signal"],
        [executeOptions.messages, "message continuation"],
        [excludesTools, "excludeTools"],
        [executeOptions.output, "output schema"],
        [passesVariables, "variables"],
      ];
      for (const [inUse, label] of gatedOptions) {
        if (inUse) features.push(label);
      }
    }
  }

  if (features.length > 0) {
    throw new ExperimentalNotConfiguredError(`Agent ${features.join(", ")}`);
  }
}
================================================ FILE: packages/core/lib/v3/agent/utils/variables.ts ================================================
import type { Variables, VariableValue } from "../../types/public/agent.js";

/**
 * Resolves a VariableValue to its primitive string value.
 * Handles both simple primitives ("secret") and rich objects ({ value: "secret", description: "..." }).
*/
export function resolveVariableValue(v: VariableValue): string {
  if (typeof v === "object" && v !== null && "value" in v) {
    return String(v.value);
  }
  return String(v);
}

/**
 * Extracts the optional description from a VariableValue.
 * Returns undefined for simple primitive values.
 */
export function getVariableDescription(v: VariableValue): string | undefined {
  if (typeof v === "object" && v !== null && "value" in v) {
    return v.description;
  }
  return undefined;
}

/** Escapes every character that has a special meaning inside a regular expression. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Substitutes %variableName% tokens in text with resolved variable values.
 * Works with both simple and rich variable formats.
 *
 * All tokens are replaced in a single pass over `text`, so a substituted value
 * is inserted literally: a value that happens to contain another `%name%`
 * token is not expanded again, and the result does not depend on key order.
 *
 * @param text - Text that may contain %variableName% tokens
 * @param variables - Variables to substitute; when omitted the text is returned unchanged
 * @returns The text with every known token replaced; unknown tokens are left as-is
 */
export function substituteVariables(
  text: string,
  variables?: Variables,
): string {
  if (!variables) return text;

  const replacements = new Map<string, string>();
  for (const [key, v] of Object.entries(variables)) {
    replacements.set(`%${key}%`, resolveVariableValue(v));
  }
  // An empty alternation would match the empty string everywhere.
  if (replacements.size === 0) return text;

  // Longest token first, so a token that starts with a shorter one wins at the same position.
  const tokenPattern = new RegExp(
    Array.from(replacements.keys())
      .sort((a, b) => b.length - a.length)
      .map(escapeRegExp)
      .join("|"),
    "g",
  );

  // A replacer function keeps `$` sequences in values from being interpreted.
  return text.replace(tokenPattern, (token) => replacements.get(token) ?? token);
}

/**
 * Flattens Variables to Record<string, string> for internal consumers
 * that only need key→value mappings (e.g., actHandler, cache replay).
 *
 * @returns The flattened mapping, or undefined when `variables` is missing or empty
 */
export function flattenVariables(
  variables?: Variables,
): Record<string, string> | undefined {
  if (!variables || Object.keys(variables).length === 0) return undefined;
  const result: Record<string, string> = {};
  for (const [key, v] of Object.entries(variables)) {
    result[key] = resolveVariableValue(v);
  }
  return result;
}
================================================ FILE: packages/core/lib/v3/agent/utils/xpath.ts ================================================
/**
 * Utility functions for XPath handling in agent tools.
 */

/**
 * Ensures a value is properly formatted as an XPath selector.
 * Returns null if the value is not a string or is blank after trimming.
 *
 * @param value - The value to normalize as an XPath
 * @returns The trimmed XPath string prefixed with "xpath=" or null
 */
export function ensureXPath(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  if (!trimmed) return null;
  return trimmed.startsWith("xpath=") ? trimmed : `xpath=${trimmed}`;
}
================================================ FILE: packages/core/lib/v3/api.ts ================================================
import makeFetchCookie from "fetch-cookie";
import { loadApiKeyFromEnv } from "../utils.js";
import { STAGEHAND_VERSION } from "../version.js";
import {
  StagehandAPIError,
  StagehandAPIUnauthorizedError,
  StagehandHttpError,
  StagehandResponseBodyError,
  StagehandResponseParseError,
  StagehandServerError,
  ExperimentalNotConfiguredError,
} from "./types/public/index.js";
import type {
  ActResult,
  AgentConfig,
  AgentExecuteOptions,
  AgentResult,
  ExtractResult,
  ObserveResult,
  LogLine,
  StagehandMetrics,
  BrowserbaseRegion,
  ActOptions,
  ExtractOptions,
  ObserveOptions,
  Api,
} from "./types/public/index.js";
import type {
  SerializableResponse,
  AgentCacheTransferPayload,
} from "./types/private/index.js";
import type { ModelConfiguration } from "./types/public/model.js";
import { toJsonSchema } from "./zodCompat.js";
import type { StagehandZodSchema } from "./zodCompat.js";

// =============================================================================
// Multi-region API URL mapping
// =============================================================================

/**
 * Mapping of Browserbase regions to their corresponding Stagehand API base URLs.
 * Users should configure their client to hit the API endpoint that matches
 * the region where their browser session is running.
 */
// NOTE(review): the type arguments were missing in this view; keys are assumed
// to be BrowserbaseRegion because lookups cast to that type - confirm.
export const REGION_API_URLS: Record<BrowserbaseRegion, string> = {
  "us-west-2": "https://api.stagehand.browserbase.com",
  "us-east-1": "https://api.use1.stagehand.browserbase.com",
  "eu-central-1": "https://api.euc1.stagehand.browserbase.com",
  "ap-southeast-1": "https://api.apse1.stagehand.browserbase.com",
};

/**
 * Returns the full API URL (with /v1 suffix) for a given Browserbase region.
 * If no region is specified or the region is unknown, defaults to us-west-2.
*
 * @param region - The Browserbase region (e.g., "us-west-2", "eu-central-1")
 * @returns The full API URL including /v1 suffix
 */
export function getApiUrlForRegion(
  region: BrowserbaseRegion | undefined,
): string {
  // Only own keys of the mapping count as known regions; a plain index lookup
  // would also resolve inherited members such as "constructor" or "toString".
  const isKnownRegion =
    typeof region === "string" &&
    Object.prototype.hasOwnProperty.call(REGION_API_URLS, region);
  const baseUrl =
    (isKnownRegion
      ? REGION_API_URLS[region as BrowserbaseRegion]
      : undefined) ?? REGION_API_URLS["us-west-2"];
  return `${baseUrl}/v1`;
}

// =============================================================================
// Client-specific types (can't be Zod schemas due to functions/Page objects)
// =============================================================================
//
// These types mirror the Api.* schemas from types/public/api.ts but include
// non-serializable SDK fields (like Page objects) that get stripped before
// sending requests over the wire.
//
// Relationship to wire format:
// - Client accepts: SDK types (ActOptions, ExtractOptions, etc.) with optional `page`
// - Wire sends: Api.* types (page stripped, Zod schema converted to JSON schema)
// - Client returns: SDK result types (ActResult, ExtractResult, etc.)
// =============================================================================

/**
 * Constructor parameters for StagehandAPIClient
 */
interface StagehandAPIConstructorParams {
  apiKey: string;
  projectId?: string;
  logger: (message: LogLine) => void;
  /**
   * When true, enables server-side caching by default for all requests.
   * When false, disables server-side caching.
   * Defaults to true (caching enabled).
   * Can be overridden per-method in act(), extract(), and observe() options.
   */
  serverCache?: boolean;
}

/**
 * Parameters for starting a session via the API client.
 * Extends Api.SessionStartRequest with client-specific field (modelApiKey).
*
 * Wire format: Api.SessionStartRequest (modelApiKey sent via header, not body)
 */
interface ClientSessionStartParams extends Api.SessionStartRequest {
  /** Model API key - sent via x-model-api-key header, not in request body */
  modelApiKey: string;
}

/**
 * Generic API response wrapper matching Api.*Response schemas.
 * Discriminated on `success`: the payload is only present on the success variant.
 */
type ApiResponse<T> =
  | { success: true; data: T }
  | { success: false; message: string };

/**
 * Union of all API request body types for type-safe execute() calls
 */
type ApiRequestBody =
  | Api.ActRequest
  | Api.ExtractRequest
  | Api.ObserveRequest
  | Api.NavigateRequest
  | Api.AgentExecuteRequest;

/**
 * Parameters for executing an action via the streaming API
 */
interface ExecuteActionParams {
  method: "act" | "extract" | "observe" | "navigate" | "end" | "agentExecute";
  args?: ApiRequestBody;
  // NOTE(review): the type arguments were missing in this view; string keys
  // with unknown values are assumed - confirm against the original file.
  params?: Record<string, unknown>;
  /**
   * Override the instance-level serverCache setting for this request.
   * When true, enables server-side caching.
   * When false, disables server-side caching.
   */
  serverCache?: boolean;
}

/**
 * Client parameters for act() method.
 * Derives structure from Api.ActRequest but uses SDK's ActOptions (which includes `page`).
 * Before serialization, `page` is stripped to produce Api.ActRequest wire format.
 */
interface ClientActParameters {
  input: Api.ActRequest["input"];
  options?: ActOptions;
  frameId?: Api.ActRequest["frameId"];
}

/**
 * Client parameters for extract() method.
 * Derives structure from Api.ExtractRequest but uses SDK's ExtractOptions (which includes `page`)
 * and accepts Zod schema (converted to JSON schema for wire format).
 */
interface ClientExtractParameters {
  instruction?: Api.ExtractRequest["instruction"];
  schema?: StagehandZodSchema;
  options?: ExtractOptions;
  frameId?: Api.ExtractRequest["frameId"];
}

/**
 * Client parameters for observe() method.
 * Derives structure from Api.ObserveRequest but uses SDK's ObserveOptions (which includes `page`).
 * Before serialization, `page` is stripped to produce Api.ObserveRequest wire format.
*/ interface ClientObserveParameters { instruction?: Api.ObserveRequest["instruction"]; options?: ObserveOptions; frameId?: Api.ObserveRequest["frameId"]; } export class StagehandAPIClient { private apiKey: string; private projectId?: string; private sessionId?: string; private modelApiKey: string; private modelProvider?: string; private region?: BrowserbaseRegion; private logger: (message: LogLine) => void; private fetchWithCookies; private serverCache: boolean; private lastFinishedEventData: Record | null = null; private latestAgentCacheEntry: AgentCacheTransferPayload | null = null; constructor({ apiKey, projectId, logger, serverCache, }: StagehandAPIConstructorParams) { this.apiKey = apiKey; this.projectId = projectId; this.logger = logger; this.serverCache = serverCache ?? true; // Create a single cookie jar instance that will persist across all requests this.fetchWithCookies = makeFetchCookie(fetch); } async init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, // browser, TODO for local browsers }: ClientSessionStartParams): Promise { if (!modelApiKey) { throw new StagehandAPIError("modelApiKey is required"); } this.modelApiKey = modelApiKey; // Extract provider from modelName (e.g., "openai/gpt-5-nano" -> "openai") this.modelProvider = modelName?.includes("/") ? 
modelName.split("/")[0] : undefined; // Store the region for multi-region API URL resolution this.region = browserbaseSessionCreateParams?.region; this.logger({ category: "init", message: "Creating new browserbase session...", level: 1, }); // Build wire-format request body (Api.SessionStartRequest shape) const requestBody: Api.SessionStartRequest = { modelName, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, // browser, TODO: only send when connected to local fastify }; const sessionResponse = await this.request("/sessions/start", { method: "POST", body: JSON.stringify(requestBody), }); if (sessionResponse.status === 401) { throw new StagehandAPIUnauthorizedError( "Unauthorized. Ensure you provided a valid API key.", ); } else if (sessionResponse.status !== 200) { const errorText = await sessionResponse.text(); this.logger({ category: "api", message: `API error (${sessionResponse.status}): ${errorText}`, level: 0, }); throw new StagehandHttpError(`Unknown error: ${sessionResponse.status}`); } const sessionResponseBody = (await sessionResponse.json()) as ApiResponse; if (sessionResponseBody.success === false) { throw new StagehandAPIError(sessionResponseBody.message); } // Temporary reroute for rollout if (!sessionResponseBody.data?.available && browserbaseSessionID) { sessionResponseBody.data.sessionId = browserbaseSessionID; } this.sessionId = sessionResponseBody.data.sessionId; return sessionResponseBody.data; } async act({ input, options, frameId, }: ClientActParameters): Promise { // Strip non-serializable `page` and SDK-only fields from options before wire serialization let wireOptions: Api.ActRequest["options"]; let serverCache: boolean | undefined; if (options) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const { page: _, serverCache: enableCache, ...restOptions } = options; serverCache = enableCache; if (Object.keys(restOptions).length > 0) { if (restOptions.model) { 
restOptions.model = this.prepareModelConfig(restOptions.model); } wireOptions = restOptions as unknown as Api.ActRequest["options"]; } } // Build wire-format request body const requestBody: Api.ActRequest = { input, options: wireOptions, frameId, }; return this.execute({ method: "act", args: requestBody, serverCache, }); } async extract({ instruction, schema: zodSchema, options, frameId, }: ClientExtractParameters): Promise> { // Convert Zod schema to JSON schema for wire format const jsonSchema = zodSchema ? toJsonSchema(zodSchema) : undefined; // Strip non-serializable `page` and SDK-only fields from options before wire serialization let wireOptions: Api.ExtractRequest["options"]; let serverCache: boolean | undefined; if (options) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const { page: _, serverCache: enableCache, ...restOptions } = options; serverCache = enableCache; if (Object.keys(restOptions).length > 0) { if (restOptions.model) { restOptions.model = this.prepareModelConfig(restOptions.model); } wireOptions = restOptions as unknown as Api.ExtractRequest["options"]; } } // Build wire-format request body const requestBody: Api.ExtractRequest = { instruction, schema: jsonSchema, options: wireOptions, frameId, }; return this.execute>({ method: "extract", args: requestBody, serverCache, }); } async observe({ instruction, options, frameId, }: ClientObserveParameters): Promise { // Strip non-serializable `page` and SDK-only fields from options before wire serialization let wireOptions: Api.ObserveRequest["options"]; let serverCache: boolean | undefined; if (options) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const { page: _, serverCache: enableCache, ...restOptions } = options; serverCache = enableCache; if (Object.keys(restOptions).length > 0) { if (restOptions.model) { restOptions.model = this.prepareModelConfig(restOptions.model); } wireOptions = restOptions as unknown as Api.ObserveRequest["options"]; } } // Build 
wire-format request body const requestBody: Api.ObserveRequest = { instruction, options: wireOptions, frameId, }; return this.execute({ method: "observe", args: requestBody, serverCache, }); } async goto( url: string, options?: Api.NavigateRequest["options"], frameId?: string, ): Promise { const requestBody: Api.NavigateRequest = { url, options, frameId }; return this.execute({ method: "navigate", args: requestBody, }); } async agentExecute( agentConfig: AgentConfig, executeOptions: AgentExecuteOptions | string, frameId?: string, shouldCache?: boolean, ): Promise { // Check if integrations are being used in API mode (not supported) if (agentConfig.integrations && agentConfig.integrations.length > 0) { throw new ExperimentalNotConfiguredError("MCP integrations"); } // Strip non-serializable `page` from executeOptions before wire serialization let wireExecuteOptions: Api.AgentExecuteRequest["executeOptions"]; if (typeof executeOptions === "string") { wireExecuteOptions = { instruction: executeOptions }; } else if (executeOptions.page) { // eslint-disable-next-line @typescript-eslint/no-unused-vars const { page: _, ...rest } = executeOptions; wireExecuteOptions = rest; } else { wireExecuteOptions = executeOptions; } const wireAgentConfig: Api.AgentExecuteRequest["agentConfig"] = { systemPrompt: agentConfig.systemPrompt, mode: agentConfig.mode ?? (agentConfig.cua === true ? "cua" : undefined), cua: agentConfig.mode === undefined ? agentConfig.cua : undefined, model: agentConfig.model ? this.prepareModelConfig(agentConfig.model) : undefined, executionModel: agentConfig.executionModel ? this.prepareModelConfig(agentConfig.executionModel) : undefined, }; // Build wire-format request body const requestBody: Api.AgentExecuteRequest = { agentConfig: wireAgentConfig, executeOptions: wireExecuteOptions, frameId, shouldCache, }; const result = await this.execute({ method: "agentExecute", args: requestBody, }); const finishedData = this.consumeFinishedEventData() ?? 
null;
    this.latestAgentCacheEntry =
      finishedData?.cacheEntry !== undefined
        ? (finishedData.cacheEntry as AgentCacheTransferPayload)
        : null;
    return result;
  }

  /**
   * Returns the currently buffered agent cache entry (or null) and clears the
   * buffer, so a given entry is handed out at most once.
   */
  consumeLatestAgentCacheEntry(): AgentCacheTransferPayload | null {
    const pending = this.latestAgentCacheEntry;
    this.latestAgentCacheEntry = null;
    return pending;
  }

  /** POSTs to the session's `/end` endpoint and resolves with the raw response. */
  async end(): Promise {
    return this.request(`/sessions/${this.sessionId}/end`, {
      method: "POST",
    });
  }

  /**
   * Fetches the session replay from the API and folds the per-action token
   * usage into a StagehandMetrics object.
   *
   * Usage is bucketed by the action's method ("act", "extract", "observe",
   * "agent", compared case-insensitively). Every action that reports token
   * usage also counts towards the totals, whatever its method.
   *
   * @throws StagehandAPIError when no session id is set, or when the API
   *   answers with `success: false`.
   * @throws StagehandHttpError when the response status is not 200.
   */
  async getReplayMetrics(): Promise {
    if (!this.sessionId) {
      throw new StagehandAPIError("sessionId is required to fetch metrics.");
    }

    const response = await this.request(`/sessions/${this.sessionId}/replay`, {
      method: "GET",
    });

    if (response.status !== 200) {
      const errorText = await response.text();
      this.logger({
        category: "api",
        message: `Failed to fetch metrics. Status ${response.status}: ${errorText}`,
        level: 0,
      });
      throw new StagehandHttpError(
        `Failed to fetch metrics with status ${response.status}: ${errorText}`,
      );
    }

    const payload = (await response.json()) as
      | Api.ReplayResponse
      | { success: false; error?: string };
    if (!payload.success) {
      const failure = payload as { success: false; error?: string };
      throw new StagehandAPIError(
        `Failed to fetch metrics: ${failure.error || "Unknown error"}`,
      );
    }

    // Every counter starts at zero; the loops below only ever add to them.
    const metrics: StagehandMetrics = {
      actPromptTokens: 0,
      actCompletionTokens: 0,
      actReasoningTokens: 0,
      actCachedInputTokens: 0,
      actInferenceTimeMs: 0,
      extractPromptTokens: 0,
      extractCompletionTokens: 0,
      extractReasoningTokens: 0,
      extractCachedInputTokens: 0,
      extractInferenceTimeMs: 0,
      observePromptTokens: 0,
      observeCompletionTokens: 0,
      observeReasoningTokens: 0,
      observeCachedInputTokens: 0,
      observeInferenceTimeMs: 0,
      agentPromptTokens: 0,
      agentCompletionTokens: 0,
      agentReasoningTokens: 0,
      agentCachedInputTokens: 0,
      agentInferenceTimeMs: 0,
      totalPromptTokens: 0,
      totalCompletionTokens: 0,
      totalReasoningTokens: 0,
      totalCachedInputTokens: 0,
      totalInferenceTimeMs: 0,
    };

    // Walk every action of every page in the replay.
    const replay = (payload as Api.ReplayResponse).data;
    for (const page of replay?.pages || []) {
      for (const action of page.actions || []) {
        const method = (action.method || "").toLowerCase();
        const tokenUsage = action.tokenUsage;
        if (tokenUsage) {
          const inputTokens = tokenUsage.inputTokens || 0;
          const outputTokens = tokenUsage.outputTokens || 0;
          // reasoningTokens / cachedInputTokens are read defensively: they are
          // only used when the key is actually present on the usage object.
          const reasoningTokens =
            "reasoningTokens" in tokenUsage
              ? Number(
                  (tokenUsage as { reasoningTokens?: number })
                    .reasoningTokens ?? 0,
                )
              : 0;
          const cachedInputTokens =
            "cachedInputTokens" in tokenUsage
              ? Number(
                  (tokenUsage as { cachedInputTokens?: number })
                    .cachedInputTokens ?? 0,
                )
              : 0;
          const timeMs = tokenUsage.timeMs || 0;

          // Route the usage into the bucket that matches the method.
          switch (method) {
            case "act":
              metrics.actPromptTokens += inputTokens;
              metrics.actCompletionTokens += outputTokens;
              metrics.actReasoningTokens += reasoningTokens;
              metrics.actCachedInputTokens += cachedInputTokens;
              metrics.actInferenceTimeMs += timeMs;
              break;
            case "extract":
              metrics.extractPromptTokens += inputTokens;
              metrics.extractCompletionTokens += outputTokens;
              metrics.extractReasoningTokens += reasoningTokens;
              metrics.extractCachedInputTokens += cachedInputTokens;
              metrics.extractInferenceTimeMs += timeMs;
              break;
            case "observe":
              metrics.observePromptTokens += inputTokens;
              metrics.observeCompletionTokens += outputTokens;
              metrics.observeReasoningTokens += reasoningTokens;
              metrics.observeCachedInputTokens += cachedInputTokens;
              metrics.observeInferenceTimeMs += timeMs;
              break;
            case "agent":
              metrics.agentPromptTokens += inputTokens;
              metrics.agentCompletionTokens += outputTokens;
              metrics.agentReasoningTokens += reasoningTokens;
              metrics.agentCachedInputTokens += cachedInputTokens;
              metrics.agentInferenceTimeMs += timeMs;
              break;
            default:
              break;
          }

          // Totals are updated for any method that reported token usage.
          metrics.totalPromptTokens += inputTokens;
metrics.totalCompletionTokens += outputTokens;
          metrics.totalReasoningTokens += reasoningTokens;
          metrics.totalCachedInputTokens += cachedInputTokens;
          metrics.totalInferenceTimeMs += timeMs;
        }
      }
    }

    return metrics;
  }

  /**
   * Prepares a model configuration for the API payload by ensuring the `apiKey`
   * is included. If the model is passed as a string, converts it to an object
   * with `modelName` and `apiKey`.
   *
   * In API mode, we only attempt to load an API key from env vars when the
   * model provider differs from the one used to init the session.
   *
   * @param model - Model name string, or a configuration object.
   * @returns The configuration with `apiKey` filled in. An object that already
   *   carries a truthy `apiKey` is returned as-is.
   */
  private prepareModelConfig(
    model: ModelConfiguration,
  ): { modelName: string; apiKey: string } & Record {
    // Single lookup shared by the string and object forms: the provider is
    // the part before the first "/" (e.g. "openai/gpt-5-nano" -> "openai").
    // Env vars are consulted only for a provider other than the session's,
    // and the session key remains the fallback.
    const resolveApiKey = (modelName?: string) => {
      const provider = modelName?.includes("/")
        ? modelName.split("/")[0]
        : undefined;
      return provider && provider !== this.modelProvider
        ? (loadApiKeyFromEnv(provider, this.logger) ?? this.modelApiKey)
        : this.modelApiKey;
    };

    if (typeof model === "string") {
      return {
        modelName: model,
        apiKey: resolveApiKey(model),
      };
    }

    if (!model.apiKey) {
      return {
        ...model,
        apiKey: resolveApiKey(model.modelName),
      };
    }

    return model as { modelName: string; apiKey: string } & Record<
      string,
      unknown
    >;
  }

  /**
   * Returns the `data` payload of the last "finished" system event captured by
   * `execute` and clears it, so it can be consumed only once.
   */
  private consumeFinishedEventData(): T | null {
    const data = this.lastFinishedEventData as T | null;
    this.lastFinishedEventData = null;
    return data;
  }

  /**
   * POSTs an action to `/sessions/{id}/{method}` and consumes the streamed
   * response as `data: <json>` events separated by blank lines.
   *
   * - "log" events are forwarded to the logger.
   * - A "system" event with status "error" throws a plain Error carrying the
   *   server message.
   * - A "system" event with status "finished" records its payload for
   *   `consumeFinishedEventData` and returns the result with the cache status
   *   attached.
   *
   * A final event that arrives without its terminating blank line is handled
   * the same way as one seen inside the stream.
   *
   * @throws StagehandHttpError when the response is not ok.
   * @throws StagehandResponseBodyError when the response has no body.
   * @throws StagehandResponseParseError when an event is not valid JSON.
   * @throws StagehandServerError when the stream ends without a finished event.
   */
  private async execute({
    method,
    args,
    params,
    serverCache,
  }: ExecuteActionParams): Promise {
    this.lastFinishedEventData = null;
    const urlParams = new URLSearchParams(params as Record);
    const queryString = urlParams.toString();
    const url = `/sessions/${this.sessionId}/${method}${queryString ? `?${queryString}` : ""}`;

    const response = await this.request(
      url,
      {
        method: "POST",
        body: JSON.stringify(args),
      },
      serverCache,
    );

    // Capture cache status from response header
    const cacheStatus = response.headers.get("browserbase-cache-status") as
      | "HIT"
      | "MISS"
      | null;

    if (!response.ok) {
      const errorBody = await response.text();
      throw new StagehandHttpError(
        `HTTP error! status: ${response.status}, body: ${errorBody}`,
      );
    }

    if (!response.body) {
      throw new StagehandResponseBodyError();
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";

    while (true) {
      const { value, done } = await reader.read();

      if (done && !buffer) {
        throw new StagehandServerError(
          "Stream ended without completion signal",
        );
      }

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n\n");
      // The last piece may be an incomplete event; keep it for the next read.
      buffer = lines.pop() || "";

      for (const line of lines) {
        if (!line.startsWith("data: ")) continue;

        try {
          const eventData = JSON.parse(line.slice(6));

          if (eventData.type === "system") {
            if (eventData.data.status === "error") {
              const { error: errorMsg } = eventData.data;
              // Throw plain Error to match local SDK behavior (useApi: false)
              throw new Error(errorMsg);
            }
            if (eventData.data.status === "finished") {
              this.lastFinishedEventData = eventData.data;
              // If caching was bypassed for this request, suppress cache status
              // so we don't log or surface a MISS that the server emits anyway.
              const cacheEnabled = this.shouldUseCache(serverCache);
              return this.attachCacheStatus(
                eventData.data.result as T,
                method,
                cacheEnabled ? cacheStatus : null,
                cacheEnabled ? eventData : { data: {} },
              );
            }
          } else if (eventData.type === "log") {
            const msg = eventData.data.message;
            // Skip server-side internal logs that don't apply to API mode
            if (msg?.message === "Connecting to local browser") {
              continue;
            }
            this.logger(eventData.data.message);
          }
        } catch (e) {
          // Let Error instances pass through (server errors thrown above)
          // Only wrap SyntaxError from JSON.parse as parse errors
          if (e instanceof Error && !(e instanceof SyntaxError)) {
            throw e;
          }

          const errorMessage = e instanceof Error ? e.message : String(e);
          this.logger({
            category: "api",
            message: `Failed to parse SSE event: ${errorMessage}`,
            level: 0,
          });
          throw new StagehandResponseParseError(
            `Failed to parse server response: ${errorMessage}`,
          );
        }
      }

      if (done) {
        // Process any remaining data in buffer before exiting. This is the
        // case where the final event was not followed by a blank line.
        if (buffer.trim() && buffer.startsWith("data: ")) {
          // Collected inside the try and thrown after it, so the bare catch
          // below (meant for malformed trailing data) cannot swallow it.
          let trailingServerError: Error | null = null;
          try {
            const eventData = JSON.parse(buffer.slice(6));
            if (eventData.type === "system") {
              if (eventData.data.status === "finished") {
                // Mirror the in-stream "finished" handling: remember the
                // payload and honor the per-request cache bypass.
                this.lastFinishedEventData = eventData.data;
                const cacheEnabled = this.shouldUseCache(serverCache);
                return this.attachCacheStatus(
                  eventData.data.result as T,
                  method,
                  cacheEnabled ? cacheStatus : null,
                  cacheEnabled ? eventData : { data: {} },
                );
              }
              if (eventData.data.status === "error") {
                // Plain Error, as for an error event seen inside the stream.
                trailingServerError = new Error(eventData.data.error);
              }
            }
          } catch {
            this.logger({
              category: "api",
              message: `Incomplete data in final buffer: ${buffer.substring(0, 100)}`,
              level: 0,
            });
          }
          if (trailingServerError) {
            throw trailingServerError;
          }
        }
        throw new StagehandServerError(
          "Stream ended without completion signal",
        );
      }
    }
  }

  /**
   * Resolves the final cache status from the response header or SSE event data,
   * logs it, and attaches it to act/extract/observe results before returning.
   *
   * The header value wins when present; otherwise a boolean `cacheHit` on the
   * event data is mapped to "HIT"/"MISS". For any other method, or when no
   * status can be resolved, the result is returned untouched.
   */
  private attachCacheStatus(
    result: T,
    method: string,
    cacheStatus: "HIT" | "MISS" | null,
    eventData: { data: { cacheHit?: boolean } },
  ): T {
    const finalCacheStatus =
      cacheStatus ||
      (typeof eventData.data.cacheHit === "boolean"
        ? eventData.data.cacheHit
          ? "HIT"
          : "MISS"
        : undefined);

    const isCacheableMethod =
      method === "act" || method === "extract" || method === "observe";
    if (!finalCacheStatus || !isCacheableMethod) {
      return result;
    }

    this.logger({
      category: "cache",
      message: `${method} server cache ${finalCacheStatus.toLowerCase()}`,
      level: 1,
    });

    // Only object results can carry the extra field.
    if (result && typeof result === "object") {
      (result as ActResult | ExtractResult | ObserveResult).cacheStatus =
        finalCacheStatus;
    }

    return result;
  }

  /**
   * Determine if caching should be enabled for a request.
   * Method-level setting takes precedence over instance-level setting.
   */
  private shouldUseCache(methodServerCache?: boolean): boolean {
    return methodServerCache !== undefined
      ? methodServerCache
      : this.serverCache;
  }

  private async request(
    path: string,
    options: RequestInit,
    serverCache?: boolean,
  ): Promise {
    const defaultHeaders: Record = {
      "x-bb-api-key": this.apiKey,
      ...(this.projectId ? { "x-bb-project-id": this.projectId } : {}),
      "x-bb-session-id": this.sessionId,
      // we want real-time logs, so we stream the response
      "x-stream-response": "true",
      "x-model-api-key": this.modelApiKey,
      "x-language": "typescript",
      "x-sdk-version": STAGEHAND_VERSION,
    };

    // Add cache bypass header if caching is disabled
    if (!this.shouldUseCache(serverCache)) {
      defaultHeaders["browserbase-cache-bypass"] = "true";
    }

    if (options.method === "POST" && options.body) {
      defaultHeaders["Content-Type"] = "application/json";
    }

    // Use STAGEHAND_API_URL env var if set, otherwise use region-based URL
    // Ensure /v1 suffix is present for consistency
    let baseUrl: string;
    if (process.env.STAGEHAND_API_URL) {
      const envUrl = process.env.STAGEHAND_API_URL.replace(/\/+$/, "");
      // Append /v1 if not already present
      baseUrl = envUrl.endsWith("/v1")
        ?
envUrl : `${envUrl}/v1`;
    } else {
      baseUrl = getApiUrlForRegion(this.region);
    }

    const response = await this.fetchWithCookies(`${baseUrl}${path}`, {
      ...options,
      headers: {
        ...defaultHeaders,
        ...options.headers,
      },
    });

    return response;
  }
}

================================================
FILE: packages/core/lib/v3/cache/ActCache.ts
================================================
import { createHash } from "crypto";
import type { ActHandler } from "../handlers/actHandler.js";
import type { LLMClient } from "../llm/LLMClient.js";
import type { Action, ActResult, Logger } from "../types/public/index.js";
import type { Page } from "../understudy/page.js";
import { CacheStorage } from "./CacheStorage.js";
import { safeGetPageUrl, waitForCachedSelector } from "./utils.js";
import {
  ActCacheContext,
  ActCacheDeps,
  CachedActEntry,
} from "../types/private/index.js";
import { StagehandNotInitializedError } from "../types/public/sdkErrors.js";
import { withTimeout } from "../timeoutConfig.js";

/**
 * Cache for `act()` calls: stores the actions produced for an
 * instruction/URL/variable-key combination and replays them later through the
 * act handler's deterministic action path.
 */
export class ActCache {
  private readonly storage: CacheStorage;
  private readonly logger: Logger;
  private readonly getActHandler: () => ActHandler | null;
  private readonly getDefaultLlmClient: () => LLMClient;
  private readonly domSettleTimeoutMs?: number;

  constructor(deps: ActCacheDeps) {
    this.storage = deps.storage;
    this.logger = deps.logger;
    this.getActHandler = deps.getActHandler;
    this.getDefaultLlmClient = deps.getDefaultLlmClient;
    this.domSettleTimeoutMs = deps.domSettleTimeoutMs;
  }

  /** Whether the backing storage is enabled. */
  get enabled(): boolean {
    return this.storage.enabled;
  }

  /**
   * Builds the lookup context for an instruction on the given page.
   *
   * The instruction is trimmed, the variables are shallow-copied, and the
   * cache key is derived from the trimmed instruction, the page URL and the
   * sorted variable names (not their values).
   *
   * @returns The context, or null when the cache is disabled.
   */
  async prepareContext(
    instruction: string,
    page: Page,
    variables?: Record,
  ): Promise {
    if (!this.enabled) return null;

    const trimmedInstruction = instruction.trim();
    const variablesCopy = variables ? { ...variables } : undefined;
    const variableKeys = variablesCopy
      ? Object.keys(variablesCopy).sort()
      : [];
    const pageUrl = await safeGetPageUrl(page);

    return {
      instruction: trimmedInstruction,
      cacheKey: this.buildActCacheKey(trimmedInstruction, pageUrl, variableKeys),
      pageUrl,
      variableKeys,
      variables: variablesCopy,
    };
  }

  /**
   * Looks up the entry for `context` and, when it is usable, replays it.
   *
   * An entry is rejected (null is returned) when it cannot be read, has a
   * version other than 1, has no actions, was stored with a different set of
   * variable names, or when the context lacks a value for one of them.
   */
  async tryReplay(
    context: ActCacheContext,
    page: Page,
    timeout?: number,
    llmClientOverride?: LLMClient,
  ): Promise {
    if (!this.enabled) return null;

    const lookup = await this.storage.readJson(`${context.cacheKey}.json`);
    if (lookup.error && lookup.path) {
      this.logger({
        category: "cache",
        message: `failed to read act cache entry: ${lookup.path}`,
        level: 2,
        auxiliary: {
          error: { value: String(lookup.error), type: "string" },
        },
      });
      return null;
    }

    const entry = lookup.value;
    if (
      !entry ||
      entry.version !== 1 ||
      !Array.isArray(entry.actions) ||
      entry.actions.length === 0
    ) {
      return null;
    }

    // The stored keys are sorted here; the context keys were sorted when the
    // context was prepared.
    const storedKeys = Array.isArray(entry.variableKeys)
      ? [...entry.variableKeys].sort()
      : [];
    const requestedKeys = [...context.variableKeys];
    if (!this.doVariableKeysMatch(storedKeys, requestedKeys)) {
      return null;
    }

    if (requestedKeys.length > 0) {
      const supplied = context.variables;
      if (!supplied || !this.hasAllVariableValues(requestedKeys, supplied)) {
        this.logger({
          category: "cache",
          message: "act cache miss: missing variables for replay",
          level: 2,
          auxiliary: {
            instruction: { value: context.instruction, type: "string" },
          },
        });
        return null;
      }
    }

    this.logger({
      category: "cache",
      message: "act cache hit",
      level: 1,
      auxiliary: {
        instruction: { value: context.instruction, type: "string" },
        url: {
          value: entry.url ?? context.pageUrl,
          type: "string",
        },
      },
    });

    return await this.replayCachedActions(
      context,
      entry,
      page,
      timeout,
      llmClientOverride,
    );
  }

  /** Persists the outcome of an act call under the context's cache key. */
  async store(context: ActCacheContext, result: ActResult): Promise {
    if (!this.enabled) return;

    const entry: CachedActEntry = {
      version: 1,
      instruction: context.instruction,
      url: context.pageUrl,
      variableKeys: context.variableKeys,
      actions: result.actions ?? [],
      actionDescription: result.actionDescription,
      message: result.message,
    };

    const written = await this.storage.writeJson(
      `${context.cacheKey}.json`,
      entry,
    );
    if (written.error && written.path) {
      this.logger({
        category: "cache",
        message: "failed to write act cache entry",
        level: 1,
        auxiliary: {
          error: { value: String(written.error), type: "string" },
        },
      });
      return;
    }

    this.logger({
      category: "cache",
      message: "act cache stored",
      level: 2,
      auxiliary: {
        instruction: { value: context.instruction, type: "string" },
        url: { value: context.pageUrl, type: "string" },
      },
    });
  }

  /** SHA-256 hex digest over the instruction, URL and variable names. */
  private buildActCacheKey(
    instruction: string,
    url: string,
    variableKeys: string[],
  ): string {
    const hash = createHash("sha256");
    hash.update(JSON.stringify({ instruction, url, variableKeys }));
    return hash.digest("hex");
  }

  /**
   * Replays the entry's actions in order, stopping at the first failure, and
   * combines the per-action results into a single ActResult.
   *
   * When the replay succeeds but the actions reported by the handler differ
   * from the stored ones, the cache entry is rewritten with the new actions.
   *
   * @throws StagehandNotInitializedError when no act handler is available.
   */
  private async replayCachedActions(
    context: ActCacheContext,
    entry: CachedActEntry,
    page: Page,
    timeout?: number,
    llmClientOverride?: LLMClient,
  ): Promise {
    const actHandler = this.getActHandler();
    if (!actHandler) {
      throw new StagehandNotInitializedError("act()");
    }

    const llmClient = llmClientOverride ?? this.getDefaultLlmClient();

    const runReplay = async (): Promise => {
      const stepResults: ActResult[] = [];
      for (const cachedAction of entry.actions) {
        await waitForCachedSelector({
          page,
          selector: cachedAction.selector,
          timeout: this.domSettleTimeoutMs,
          logger: this.logger,
          context: "act",
        });
        const stepResult = await actHandler.takeDeterministicAction(
          cachedAction,
          page,
          this.domSettleTimeoutMs,
          llmClient,
          undefined,
          context.variables,
        );
        stepResults.push(stepResult);
        if (!stepResult.success) {
          break;
        }
      }

      if (stepResults.length === 0) {
        return {
          success: false,
          message: "Failed to perform act: cached entry has no actions",
          actionDescription: entry.actionDescription ?? entry.instruction,
          actions: [],
        };
      }

      const success = stepResults.every((step) => step.success);
      const actions = stepResults.flatMap((step) => step.actions ?? []);

      // Prefer the messages produced during replay, then the stored message,
      // then a generic summary.
      const joinedMessages = stepResults
        .map((step) => step.message)
        .filter((text) => text && text.trim().length > 0)
        .join(" → ");
      const message =
        joinedMessages ||
        entry.message ||
        `Replayed ${entry.actions.length} cached action${
          entry.actions.length === 1 ? "" : "s"
        }.`;

      const actionDescription =
        entry.actionDescription ||
        stepResults[stepResults.length - 1]?.actionDescription ||
        entry.actions[entry.actions.length - 1]?.description ||
        entry.instruction;

      const shouldRefresh =
        success &&
        actions.length > 0 &&
        this.haveActionsChanged(entry.actions, actions);
      if (shouldRefresh) {
        await this.refreshCacheEntry(context, {
          ...entry,
          actions,
          message,
          actionDescription,
        });
      }

      return {
        success,
        message,
        actionDescription,
        actions,
      };
    };

    return await withTimeout(runReplay(), timeout, "act()");
  }

  /**
   * True when the two action lists differ in length, or when any pair at the
   * same index differs in selector, description, method (missing treated as
   * "") or in any argument (compared with `!==`).
   */
  private haveActionsChanged(original: Action[], updated: Action[]): boolean {
    if (original.length !== updated.length) {
      return true;
    }

    for (const [index, before] of original.entries()) {
      const after = updated[index];
      if (!after) {
        return true;
      }

      const headerDiffers =
        before.selector !== after.selector ||
        before.description !== after.description ||
        (before.method ?? "") !== (after.method ?? "");
      if (headerDiffers) {
        return true;
      }

      const beforeArgs = before.arguments ?? [];
      const afterArgs = after.arguments ?? [];
      if (beforeArgs.length !== afterArgs.length) {
        return true;
      }
      for (const [argIndex, beforeArg] of beforeArgs.entries()) {
        if (beforeArg !== afterArgs[argIndex]) {
          return true;
        }
      }
    }

    return false;
  }

  /** Rewrites the entry (with the context's variable names) after a replay changed its actions. */
  private async refreshCacheEntry(
    context: ActCacheContext,
    entry: CachedActEntry,
  ): Promise {
    const refreshed = { ...entry, variableKeys: context.variableKeys };
    const written = await this.storage.writeJson(
      `${context.cacheKey}.json`,
      refreshed,
    );

    if (written.error && written.path) {
      this.logger({
        category: "cache",
        message: "failed to update act cache entry after self-heal",
        level: 0,
        auxiliary: {
          error: { value: String(written.error), type: "string" },
        },
      });
      return;
    }

    this.logger({
      category: "cache",
      message: "act cache entry updated after self-heal",
      level: 2,
      auxiliary: {
        instruction: { value: context.instruction, type: "string" },
        url: { value: context.pageUrl, type: "string" },
      },
    });
  }

  /** Position-wise equality of two key lists; callers pass them pre-sorted. */
  private doVariableKeysMatch(
    entryKeys: string[],
    contextKeys: string[],
  ): boolean {
    return (
      entryKeys.length === contextKeys.length &&
      entryKeys.every((key, index) => key === contextKeys[index])
    );
  }

  /** True when every listed key is present (`in`) on the variables object. */
  private hasAllVariableValues(
    variableKeys: string[],
    variables: Record,
  ): boolean {
    return variableKeys.every((key) => key in variables);
  }
}

================================================
FILE: packages/core/lib/v3/cache/AgentCache.ts
================================================
import { createHash } from "crypto";
import type { ActHandler } from "../handlers/actHandler.js";
import type { LLMClient } from "../llm/LLMClient.js";
import type {
  AgentReplayActStep,
  AgentReplayFillFormStep,
  AgentReplayGotoStep,
  AgentReplayKeysStep,
  AgentReplayNavBackStep,
  AgentReplayScrollStep,
  AgentReplayStep,
  AgentReplayWaitStep,
  CachedAgentEntry,
  SanitizedAgentExecuteOptions,
  ActFn,
  AgentCacheContext,
  AgentCacheDeps,
  AgentCacheTransferPayload,
} from "../types/private/index.js";
import type {
  Action,
AgentResult,
  AgentStreamResult,
  AgentConfig,
  AgentExecuteOptionsBase,
  AvailableModel,
  Logger,
} from "../types/public/index.js";
import type { Page } from "../understudy/page.js";
import type { V3Context } from "../understudy/context.js";
import { CacheStorage } from "./CacheStorage.js";
import {
  cloneForCache,
  safeGetPageUrl,
  waitForCachedSelector,
} from "./utils.js";

// Option names (compared lower-cased) that are dropped from model options
// before those options are folded into the cache config signature.
const SENSITIVE_CONFIG_KEYS = new Set(["apikey", "api_key", "api-key"]);

/**
 * Cache for agent runs: records replay steps during a live run, persists them
 * through CacheStorage, and replays a stored entry when a matching context is
 * seen again.
 */
export class AgentCache {
  private readonly storage: CacheStorage;
  private readonly logger: Logger;
  private readonly getActHandler: () => ActHandler | null;
  private readonly getContext: () => V3Context | null;
  private readonly getDefaultLlmClient: () => LLMClient;
  private readonly getBaseModelName: () => AvailableModel;
  private readonly getSystemPrompt: () => string | undefined;
  private readonly domSettleTimeoutMs?: number;
  private readonly act: ActFn;
  // When true, the most recently stored entry is also kept in memory until it
  // is picked up via consumeBufferedEntry().
  private readonly bufferLatestEntry: boolean;
  // Steps recorded for the run in progress; null while nothing is recording.
  private recording: AgentReplayStep[] | null = null;
  // Last stored entry awaiting consumeBufferedEntry(); only set when
  // bufferLatestEntry is true.
  private latestEntry: AgentCacheTransferPayload | null = null;

  constructor({
    storage,
    logger,
    getActHandler,
    getContext,
    getDefaultLlmClient,
    getBaseModelName,
    getSystemPrompt,
    domSettleTimeoutMs,
    act,
    bufferLatestEntry,
  }: AgentCacheDeps) {
    this.storage = storage;
    this.logger = logger;
    this.getActHandler = getActHandler;
    this.getContext = getContext;
    this.getDefaultLlmClient = getDefaultLlmClient;
    this.getBaseModelName = getBaseModelName;
    this.getSystemPrompt = getSystemPrompt;
    this.domSettleTimeoutMs = domSettleTimeoutMs;
    this.act = act;
    // Buffering is opt-in: an omitted flag means "do not buffer".
    this.bufferLatestEntry = bufferLatestEntry ?? false;
  }

  /** Whether the backing storage is enabled. */
  get enabled(): boolean {
    return this.storage.enabled;
  }

  /** True when the cache is enabled and the instruction is not blank. */
  shouldAttemptCache(instruction: string): boolean {
    return this.enabled && instruction.trim().length > 0;
  }

  /**
   * Reduces execute options to the fields that take part in the cache key:
   * `maxSteps` (when it is a number) and `highlightCursor` (when it is a
   * boolean). Everything else is dropped.
   */
  sanitizeExecuteOptions(
    options?: AgentExecuteOptionsBase,
  ): SanitizedAgentExecuteOptions {
    if (!options) return {};
    const sanitized: SanitizedAgentExecuteOptions = {};
    if (typeof options.maxSteps === "number") {
      sanitized.maxSteps = options.maxSteps;
    }
    if (
      "highlightCursor" in options &&
      typeof (options as { highlightCursor?: unknown }).highlightCursor ===
        "boolean"
    ) {
      sanitized.highlightCursor = (
        options as { highlightCursor?: boolean }
      ).highlightCursor;
    }
    return sanitized;
  }

  /**
   * Serializes the parts of the agent configuration that identify a cached
   * run into a JSON string: base model name, system prompts, CUA mode, the
   * (sanitized) model and execution model, sorted tool names and integration
   * markers.
   *
   * NOTE: the string is passed to prepareContext() and hashed into the cache
   * key, so the property order of the serialized object must stay stable.
   */
  buildConfigSignature(agentOptions?: AgentConfig): string {
    const toolKeys = agentOptions?.tools
      ? Object.keys(agentOptions.tools).sort()
      : undefined;
    // String integrations are kept as-is; any other integration value is
    // represented by the fixed marker "client".
    const integrationSignatures = agentOptions?.integrations
      ? agentOptions.integrations.map((integration) =>
          typeof integration === "string" ? integration : "client",
        )
      : undefined;
    const serializedModel = this.serializeAgentModelForCache(
      agentOptions?.model,
    );
    const serializedExecutionModel = this.serializeAgentModelForCache(
      agentOptions?.executionModel,
    );

    // An explicit `mode` decides CUA mode; the `cua` flag is only consulted
    // when `mode` is undefined.
    const isCuaMode =
      agentOptions?.mode !== undefined
        ? agentOptions.mode === "cua"
        : agentOptions?.cua === true;

    return JSON.stringify({
      v3Model: this.getBaseModelName(),
      systemPrompt: this.getSystemPrompt() ?? "",
      agent: {
        cua: isCuaMode,
        model: serializedModel ?? null,
        // The execution model is left out of the signature in CUA mode.
        executionModel: isCuaMode ? null : serializedExecutionModel,
        systemPrompt: agentOptions?.systemPrompt ?? null,
        toolKeys,
        integrations: integrationSignatures,
      },
    });
  }

  /**
   * Builds the cache context for an agent run: trims the instruction, reads
   * the page URL as the start URL, and derives the cache key from the
   * instruction, start URL, options, config signature and the sorted variable
   * names (values are not part of the key).
   *
   * @returns The context, or null when caching should not be attempted.
   */
  async prepareContext(params: {
    instruction: string;
    options: SanitizedAgentExecuteOptions;
    configSignature: string;
    page: Page;
    variables?: Record;
  }): Promise {
    if (!this.shouldAttemptCache(params.instruction)) {
      return null;
    }
    const instruction = params.instruction.trim();
    const startUrl = await safeGetPageUrl(params.page);
    const variableKeys = params.variables
      ? Object.keys(params.variables).sort()
      : [];
    const cacheKey = this.buildAgentCacheKey(
      instruction,
      startUrl,
      params.options,
      params.configSignature,
      variableKeys,
    );
    return {
      instruction,
      startUrl,
      options: params.options,
      configSignature: params.configSignature,
      cacheKey,
      variableKeys,
      variables: params.variables,
    };
  }

  /**
   * Reads `agent-<cacheKey>.json` and replays it.
   *
   * @returns The replayed result, or null when the cache is disabled, the
   *   entry cannot be read, is missing, has a version other than 1, or the
   *   replay itself yields null.
   */
  async tryReplay(
    context: AgentCacheContext,
    llmClientOverride?: LLMClient,
  ): Promise {
    if (!this.enabled) return null;

    const {
      value: entry,
      error,
      path,
    } = await this.storage.readJson(
      `agent-${context.cacheKey}.json`,
    );
    if (error && path) {
      this.logger({
        category: "cache",
        message: `failed to read agent cache entry: ${path}`,
        level: 1,
        auxiliary: {
          error: { value: String(error), type: "string" },
        },
      });
      return null;
    }
    if (!entry || entry.version !== 1) {
      return null;
    }

    this.logger({
      category: "cache",
      message: "agent cache hit",
      level: 1,
      auxiliary: {
        instruction: { value: context.instruction, type: "string" },
        url: { value: context.startUrl, type: "string" },
      },
    });

    return await this.replayAgentCacheEntry(context, entry, llmClientOverride);
  }

  /**
   * Attempts to replay a cached agent execution and returns it as a stream result.
   *
   * This method exists because the agent API exposes two execution modes:
   * - `execute()` - Returns a Promise directly
   * - `stream()` - Returns an AgentStreamResult with async iterables for real-time output
   *
   * When a cache hit occurs, we need to return the appropriate type for each mode:
   * - For `execute()`, we use `tryReplay()` which returns AgentResult
   * - For `stream()`, we use `tryReplayAsStream()` which wraps the result in a
   *   stream-compatible interface
   *
   * This ensures consumers using `stream()` can still iterate over `textStream`
   * and await `result` even when the response comes from cache, maintaining
   * API consistency regardless of whether the result was cached or live.
*/
  async tryReplayAsStream(
    context: AgentCacheContext,
    llmClientOverride?: LLMClient,
  ): Promise {
    const cached = await this.tryReplay(context, llmClientOverride);
    return cached ? this.createCachedStreamResult(cached) : null;
  }

  /**
   * Creates a mock AgentStreamResult that wraps a cached AgentResult.
   *
   * When serving from cache there is no live LLM stream to consume, only the
   * final result. The mock therefore:
   * - yields the cached message once on `textStream` (and when the object
   *   itself is async-iterated),
   * - yields a single "text-delta" followed by "finish" on `fullStream`,
   * - exposes the cached result, the message text, zeroed usage and empty
   *   step/tool lists as already-resolved promises.
   *
   * This lets consumers use the same iteration patterns whether the result is
   * live or cached.
   */
  private createCachedStreamResult(
    cachedResult: AgentResult,
  ): AgentStreamResult {
    const cachedText = cachedResult.message ?? "";

    async function* emitText(): AsyncGenerator {
      yield cachedText;
    }

    async function* emitFull(): AsyncGenerator<{
      type: string;
      textDelta?: string;
    }> {
      yield { type: "text-delta", textDelta: cachedText };
      yield { type: "finish" };
    }

    // The literal only covers the members listed here, hence the double cast.
    return {
      textStream: emitText(),
      fullStream: emitFull(),
      result: Promise.resolve(cachedResult),
      text: Promise.resolve(cachedText),
      usage: Promise.resolve({
        promptTokens: 0,
        completionTokens: 0,
        totalTokens: 0,
      }),
      finishReason: Promise.resolve("stop" as const),
      experimental_providerMetadata: Promise.resolve(undefined),
      response: Promise.resolve({
        id: "cached",
        timestamp: new Date(),
        modelId: "cached",
      }),
      rawResponse: Promise.resolve({ headers: {} }),
      warnings: Promise.resolve([]),
      steps: Promise.resolve([]),
      toolCalls: Promise.resolve([]),
      toolResults: Promise.resolve([]),
      [Symbol.asyncIterator]: () => emitText(),
    } as unknown as AgentStreamResult;
  }

  /**
   * Adds caching to a live AgentStreamResult.
   *
   * Recording starts immediately. The stream's `result` promise is replaced
   * by one that, once the original settles:
   * - on fulfilment, ends the recording and stores a cache entry when the run
   *   succeeded and at least one step was recorded, then resolves with the
   *   original result;
   * - on rejection, discards the recording and rethrows the original error.
   *
   * The passed-in stream object is mutated and returned.
   *
   * @param context - The cache context for this execution
   * @param streamResult - The stream result from the agent handler
   * @param beginRecording - Callback to start recording (from V3)
   * @param endRecording - Callback to end recording and get steps (from V3)
   * @param discardRecording - Callback to discard recording on error (from V3)
   * @returns The same stream result, with caching wired into `result`
   */
  wrapStreamForCaching(
    context: AgentCacheContext,
    streamResult: AgentStreamResult,
    beginRecording: () => void,
    endRecording: () => AgentReplayStep[],
    discardRecording: () => void,
  ): AgentStreamResult {
    beginRecording();

    const liveResult = streamResult.result;
    streamResult.result = (async () => {
      // Only a rejection of the live result discards the recording; failures
      // in the bookkeeping below are not routed through this handler.
      const result = await liveResult.catch((error) => {
        discardRecording();
        throw error;
      });

      const recordedSteps = endRecording();
      if (result.success && recordedSteps.length > 0) {
        await this.store(context, recordedSteps, result);
      }
      return result;
    })();

    return streamResult;
  }

  /**
   * Persists a finished agent run as `agent-<cacheKey>.json`. When
   * buffering is enabled, a copy of the written entry is also kept in memory
   * for consumeBufferedEntry().
   */
  async store(
    context: AgentCacheContext,
    steps: AgentReplayStep[],
    result: AgentResult,
  ): Promise {
    if (!this.enabled) return;

    const entry: CachedAgentEntry = {
      version: 1,
      instruction: context.instruction,
      startUrl: context.startUrl,
      options: context.options,
      configSignature: context.configSignature,
      steps: cloneForCache(steps),
      result: this.pruneAgentResult(result),
      timestamp: new Date().toISOString(),
    };

    const written = await this.storage.writeJson(
      `agent-${context.cacheKey}.json`,
      entry,
    );
    if (written.error && written.path) {
      this.logger({
        category: "cache",
        message: "failed to write agent cache entry",
        level: 1,
        auxiliary: {
          error: { value: String(written.error), type: "string" },
        },
      });
      return;
    }

    this.logger({
      category: "cache",
      message: "agent cache stored",
      level: 2,
      auxiliary: {
        instruction: { value: context.instruction, type: "string" },
        steps: { value: String(steps.length), type: "string" },
      },
    });

    if (!this.bufferLatestEntry) return;
    this.latestEntry = {
      cacheKey: context.cacheKey,
      entry: cloneForCache(entry),
    };
  }

  /**
   * Returns the buffered entry, if buffering is enabled and one is pending,
   * and clears it so it is handed out only once.
   */
  consumeBufferedEntry(): AgentCacheTransferPayload | null {
    const buffered = this.bufferLatestEntry ? this.latestEntry : null;
    if (!buffered) {
      return null;
    }
    this.latestEntry = null;
    return buffered;
  }

  /**
   * Writes an entry received as a transfer payload into local storage under
   * the payload's cache key. Does nothing when the cache is disabled or the
   * payload is null.
   */
  async storeTransferredEntry(
    payload: AgentCacheTransferPayload | null,
  ): Promise {
    if (!this.enabled || !payload) return;

    const entry = cloneForCache(payload.entry);
    const written = await this.storage.writeJson(
      `agent-${payload.cacheKey}.json`,
      entry,
    );
    if (written.error && written.path) {
      this.logger({
        category: "cache",
        message: "failed to import remote agent cache entry",
        level: 0,
        auxiliary: {
          error: { value: String(written.error), type: "string" },
        },
      });
      return;
    }

    this.logger({
      category: "cache",
      message: "agent cache imported from server",
      level: 2,
      auxiliary: {
        instruction: { value: entry.instruction, type: "string" },
        steps: { value: String(entry.steps?.length ?? 0), type: "string" },
      },
    });
  }

  /**
   * Clone the agent result and prune bulky fields (e.g. screenshot base64 blobs)
   * before persisting it to disk. This keeps cache entries compact without
   * mutating the live AgentResult returned to callers.
*/
  private pruneAgentResult(result: AgentResult): AgentResult {
    const copy = cloneForCache(result);
    if (Array.isArray(copy.actions)) {
      for (const action of copy.actions) {
        // Screenshot actions keep their metadata but lose the image payload.
        if (action?.type === "screenshot") {
          delete action.base64;
        }
      }
    }
    return copy;
  }

  /** Starts a fresh recording, replacing any recording in progress. */
  beginRecording(): void {
    this.recording = [];
  }

  /**
   * Stops recording and returns a clone of the recorded steps, or an empty
   * list when nothing was being recorded.
   */
  endRecording(): AgentReplayStep[] {
    const recorded = this.recording;
    if (!recorded) return [];
    const snapshot = cloneForCache(recorded);
    this.recording = null;
    return snapshot;
  }

  /** Drops the current recording without returning it. */
  discardRecording(): void {
    this.recording = null;
  }

  /** True while a recording is in progress. */
  isRecording(): boolean {
    return Array.isArray(this.recording);
  }

  /**
   * Appends a clone of the step to the active recording. A no-op when nothing
   * is being recorded; a failure to clone or append is logged, not thrown.
   */
  recordStep(step: AgentReplayStep): void {
    const activeRecording = this.recording;
    if (!Array.isArray(activeRecording)) return;
    try {
      activeRecording.push(cloneForCache(step));
    } catch (err) {
      this.logger({
        category: "cache",
        message: "failed to record agent replay step",
        level: 2,
        auxiliary: {
          error: { value: String(err), type: "string" },
        },
      });
    }
  }

  // NOTE(review): despite the name, this reports the recording state, exactly
  // like isRecording() — confirm that callers intend that.
  isReplayActive(): boolean {
    return this.isRecording();
  }

  /**
   * Reduces a model setting to what goes into the config signature: null when
   * absent, the string itself for a string model, otherwise the model name,
   * paired with its sanitized options when any remain after sanitizing.
   */
  private serializeAgentModelForCache(
    model?: AgentConfig["model"],
  ): null | string | { modelName: string; options?: Record } {
    if (!model) return null;
    if (typeof model === "string") return model;

    const { modelName, ...modelOptions } = model;
    if (Object.keys(modelOptions).length === 0) {
      return modelName;
    }

    const sanitizedOptions = this.sanitizeModelOptionsForCache(
      modelOptions as Record,
    );
    if (!sanitizedOptions) {
      return modelName;
    }
    return { modelName, options: sanitizedOptions };
  }

  private buildAgentCacheKey(
    instruction: string,
    startUrl: string,
    options: SanitizedAgentExecuteOptions,
    configSignature: string,
    variableKeys?: string[],
  ): string {
    const payload = {
      instruction,
      startUrl,
      options,
      configSignature,
      variableKeys: variableKeys ??
[], }; return createHash("sha256").update(JSON.stringify(payload)).digest("hex"); } private sanitizeModelOptionsForCache( value: Record, ): Record | undefined { const sanitizedEntries: Record = {}; for (const [key, rawValue] of Object.entries(value)) { if (SENSITIVE_CONFIG_KEYS.has(key.toLowerCase())) { continue; } const sanitizedValue = this.sanitizeModelValueForCache(rawValue); if (sanitizedValue !== undefined) { sanitizedEntries[key] = sanitizedValue; } } return Object.keys(sanitizedEntries).length > 0 ? sanitizedEntries : undefined; } private sanitizeModelValueForCache(value: unknown): unknown { if (Array.isArray(value)) { const sanitizedArray = value .map((item) => this.sanitizeModelValueForCache(item)) .filter((item) => item !== undefined); return sanitizedArray; } if (value && typeof value === "object") { return this.sanitizeModelOptionsForCache( value as Record, ); } return value; } private async replayAgentCacheEntry( context: AgentCacheContext, entry: CachedAgentEntry, llmClientOverride?: LLMClient, ): Promise { const ctx = this.getContext(); const handler = this.getActHandler(); if (!ctx || !handler) return null; const effectiveClient = llmClientOverride ?? this.getDefaultLlmClient(); try { const updatedSteps: AgentReplayStep[] = []; let stepsChanged = false; for (const step of entry.steps ?? []) { const replayedStep = (await this.executeAgentReplayStep( step, ctx, handler, effectiveClient, context.variables, )) ?? step; stepsChanged ||= replayedStep !== step; updatedSteps.push(replayedStep); } const result = cloneForCache(entry.result); result.usage = { input_tokens: 0, output_tokens: 0, reasoning_tokens: 0, cached_input_tokens: 0, inference_time_ms: 0, }; result.metadata = { ...(result.metadata ?? 
{}), cacheHit: true, cacheTimestamp: entry.timestamp, }; if (stepsChanged) { await this.refreshAgentCacheEntry(context, entry, updatedSteps); } return result; } catch (err) { this.logger({ category: "cache", message: "agent cache replay failed", level: 1, auxiliary: { error: { value: String(err), type: "string" }, }, }); return null; } } /** Dispatches one cached step to its replay routine by step.type. act and fillForm return whatever their routine returns (possibly an updated step); goto, scroll, wait, navback and keys are executed and the original step is returned; done, extract, screenshot and ariaTree are returned without doing anything; any other type is logged at level 2 and returned as-is. */ private async executeAgentReplayStep( step: AgentReplayStep, ctx: V3Context, handler: ActHandler, llmClient: LLMClient, variables?: Record, ): Promise { switch (step.type) { case "act": return await this.replayAgentActStep( step as AgentReplayActStep, ctx, handler, llmClient, variables, ); case "fillForm": return await this.replayAgentFillFormStep( step as AgentReplayFillFormStep, ctx, handler, llmClient, variables, ); case "goto": await this.replayAgentGotoStep(step as AgentReplayGotoStep, ctx); return step; case "scroll": await this.replayAgentScrollStep(step as AgentReplayScrollStep, ctx); return step; case "wait": await this.replayAgentWaitStep(step as AgentReplayWaitStep); return step; case "navback": await this.replayAgentNavBackStep(step as AgentReplayNavBackStep, ctx); return step; case "keys": await this.replayAgentKeysStep(step as AgentReplayKeysStep, ctx); return step; case "done": case "extract": case "screenshot": case "ariaTree": return step; default: this.logger({ category: "cache", message: `agent cache skipping step type: ${step.type}`, level: 2, }); return step; } } /** Replays an act step. With cached actions, each one is first awaited through waitForCachedSelector and then run with handler.takeDeterministicAction; a copy of the step with the resulting actions is returned only when haveActionsChanged reports a difference. Without cached actions the instruction is re-run through this.act. */ private async replayAgentActStep( step: AgentReplayActStep, ctx: V3Context, handler: ActHandler, llmClient: LLMClient, variables?: Record, ): Promise { const actions = Array.isArray(step.actions) ? 
step.actions : []; if (actions.length > 0) { const page = await ctx.awaitActivePage(); const updatedActions: Action[] = []; for (const action of actions) { await waitForCachedSelector({ page, selector: action.selector, timeout: this.domSettleTimeoutMs, logger: this.logger, context: "agent act", }); const result = await handler.takeDeterministicAction( action, page, this.domSettleTimeoutMs, llmClient, undefined, variables, ); /* on success keep the actions the handler reports; otherwise keep a copy of the cached action */ if (result.success && Array.isArray(result.actions)) { updatedActions.push(...cloneForCache(result.actions)); } else { updatedActions.push(cloneForCache(action)); } } if (this.haveActionsChanged(actions, updatedActions)) { return { ...step, actions: updatedActions }; } return step; } /* no cached actions: fall back to running the instruction through act() */ await this.act(step.instruction, { timeout: step.timeout, variables }); return step; } /** Replays a fillForm step using step.actions when that array is non-empty, otherwise step.observeResults. Returns the step unchanged when there is nothing to replay, or a copy with the resulting actions when haveActionsChanged reports a difference. */ private async replayAgentFillFormStep( step: AgentReplayFillFormStep, ctx: V3Context, handler: ActHandler, llmClient: LLMClient, variables?: Record, ): Promise { const actions = Array.isArray(step.actions) && step.actions.length > 0 ? step.actions : (step.observeResults ?? 
[]); if (!Array.isArray(actions) || actions.length === 0) { return step; } const page = await ctx.awaitActivePage(); const updatedActions: Action[] = []; for (const action of actions) { await waitForCachedSelector({ page, selector: action.selector, timeout: this.domSettleTimeoutMs, logger: this.logger, context: "fillForm", }); const result = await handler.takeDeterministicAction( action, page, this.domSettleTimeoutMs, llmClient, undefined, // ensureTimeRemaining is not used in this context variables, ); if (result.success && Array.isArray(result.actions)) { updatedActions.push(...cloneForCache(result.actions)); } else { updatedActions.push(cloneForCache(action)); } } if (this.haveActionsChanged(actions, updatedActions)) { return { ...step, actions: updatedActions }; } return step; } /** Navigates the active page to step.url, waiting for step.waitUntil (default "load"). */ private async replayAgentGotoStep( step: AgentReplayGotoStep, ctx: V3Context, ): Promise { const page = await ctx.awaitActivePage(); await page.goto(step.url, { waitUntil: step.waitUntil ?? "load" }); } /** Scrolls the active page by deltaX/deltaY (each defaults to 0) at step.anchor; when the step has no anchor, the centre of the viewport is computed inside the page and used instead. */ private async replayAgentScrollStep( step: AgentReplayScrollStep, ctx: V3Context, ): Promise { const page = await ctx.awaitActivePage(); let anchor = step.anchor; if (!anchor) { anchor = await page .mainFrame() .evaluate<{ x: number; y: number }>(() => ({ x: Math.max(0, Math.floor(window.innerWidth / 2)), y: Math.max(0, Math.floor(window.innerHeight / 2)), })); } const deltaX = step.deltaX ?? 0; const deltaY = step.deltaY ?? 0; await page.scroll( Math.round(anchor.x ?? 0), Math.round(anchor.y ?? 0), deltaX, deltaY, ); } /** Sleeps for step.timeMs milliseconds; returns immediately when timeMs is missing, zero or negative. */ private async replayAgentWaitStep(step: AgentReplayWaitStep): Promise { if (!step.timeMs || step.timeMs <= 0) return; await new Promise((resolve) => setTimeout(resolve, step.timeMs)); } /** Goes back in the active page's history, waiting for step.waitUntil (default "domcontentloaded"). */ private async replayAgentNavBackStep( step: AgentReplayNavBackStep, ctx: V3Context, ): Promise { const page = await ctx.awaitActivePage(); await page.goBack({ waitUntil: step.waitUntil ?? 
"domcontentloaded" }); } /** Replays a keyboard step from step.playwrightArguments: method "type" types text, method "press" presses keys, each repeated times times (at least once) with a 100 ms delay option. Any other combination does nothing. */ private async replayAgentKeysStep( step: AgentReplayKeysStep, ctx: V3Context, ): Promise { const page = await ctx.awaitActivePage(); const { method, text, keys, times } = step.playwrightArguments; const repeatCount = Math.max(1, times ?? 1); if (method === "type" && text) { for (let i = 0; i < repeatCount; i++) { await page.type(text, { delay: 100 }); } } else if (method === "press" && keys) { for (let i = 0; i < repeatCount; i++) { await page.keyPress(keys, { delay: 100 }); } } } /** Reports whether two action lists differ in length or, position by position, in selector, description, method or arguments. Missing description/method compare as empty strings, non-array arguments as empty arrays, and argument values are compared with strict inequality. */ private haveActionsChanged(original: Action[], updated: Action[]): boolean { if (original.length !== updated.length) { return true; } for (let i = 0; i < original.length; i += 1) { const orig = original[i]; const next = updated[i]; if (!orig || !next) { return true; } if (orig.selector !== next.selector) { return true; } if ((orig.description ?? "") !== (next.description ?? "")) { return true; } if ((orig.method ?? "") !== (next.method ?? "")) { return true; } const origArgs = Array.isArray(orig.arguments) ? orig.arguments : []; const nextArgs = Array.isArray(next.arguments) ? 
next.arguments : []; if (origArgs.length !== nextArgs.length) { return true; } for (let j = 0; j < origArgs.length; j += 1) { if (origArgs[j] !== nextArgs[j]) { return true; } } } return false; } /** Writes the entry back to storage under agent-{cacheKey}.json with the updated steps (cloned) and a fresh ISO timestamp. A write failure is logged at level 0; success is logged at level 2. */ private async refreshAgentCacheEntry( context: AgentCacheContext, entry: CachedAgentEntry, updatedSteps: AgentReplayStep[], ): Promise { const updatedEntry: CachedAgentEntry = { ...entry, steps: cloneForCache(updatedSteps), timestamp: new Date().toISOString(), }; const { error, path } = await this.storage.writeJson( `agent-${context.cacheKey}.json`, updatedEntry, ); if (error && path) { this.logger({ category: "cache", message: "failed to update agent cache entry after self-heal", level: 0, auxiliary: { error: { value: String(error), type: "string" }, }, }); return; } this.logger({ category: "cache", message: "agent cache entry updated after self-heal", level: 2, auxiliary: { instruction: { value: context.instruction, type: "string" }, steps: { value: String(updatedSteps.length), type: "string" }, }, }); } } ================================================ FILE: packages/core/lib/v3/cache/CacheStorage.ts ================================================ import fs from "fs"; import path from "path"; import type { Logger } from "../types/public/index.js"; import { ReadJsonResult, WriteJsonResult } from "../types/private/index.js"; /** Deep-copies a value through a JSON round trip. When JSON.stringify returns undefined for the value, the value itself is handed back instead of being parsed. */ const jsonClone = (value: T): T => { const serialized = JSON.stringify(value); if (serialized === undefined) { return value; } return JSON.parse(serialized) as T; }; /** JSON cache store that is backed by a directory on disk, by an in-memory Map, or by nothing at all (disabled), depending on which factory created it. */ export class CacheStorage { private constructor( private readonly logger: Logger, private readonly dir?: string, private readonly memoryStore?: Map, ) {} /** Creates a disk-backed store rooted at the resolved cacheDir, creating the directory recursively. Returns a disabled store when cacheDir is falsy or when the directory cannot be created (the failure is logged at level 1 using options.label, default "cache directory"). */ static create( cacheDir: string | undefined, logger: Logger, options?: { label?: string }, ): CacheStorage { if (!cacheDir) { return new CacheStorage(logger); } const resolved = path.resolve(cacheDir); try { fs.mkdirSync(resolved, { recursive: true }); return new CacheStorage(logger, resolved); } catch (err) { const label = 
options?.label ?? "cache directory"; logger({ category: "cache", message: `unable to initialize ${label}: ${resolved}`, level: 1, auxiliary: { error: { value: String(err), type: "string" }, }, }); return new CacheStorage(logger); } } /** Creates a store that keeps entries in a fresh in-memory Map instead of on disk. */ static createMemory(logger: Logger): CacheStorage { return new CacheStorage(logger, undefined, new Map()); } /** The backing directory, or undefined for memory-backed and disabled stores. */ get directory(): string | undefined { return this.dir; } /** True when the store has either a directory or a memory map behind it. */ get enabled(): boolean { return !!this.dir || !!this.memoryStore; } /** Joins fileName onto the backing directory; null when there is no directory. */ private resolvePath(fileName: string): string | null { if (!this.dir) return null; return path.join(this.dir, fileName); } /** Reads and parses one entry. Memory stores return a JSON clone of the stored value. Disk stores parse the file contents. A missing entry (or a disabled store) yields a null value with no error; any other read or parse failure yields a null value together with the error and the file path. */ async readJson(fileName: string): Promise> { if (this.memoryStore) { if (!this.memoryStore.has(fileName)) { return { value: null }; } const existing = this.memoryStore.get(fileName) as T; return { value: jsonClone(existing) }; } const filePath = this.resolvePath(fileName); if (!filePath) { return { value: null }; } try { const raw = await fs.promises.readFile(filePath, "utf8"); return { value: JSON.parse(raw) as T }; } catch (err) { const code = (err as NodeJS.ErrnoException)?.code; /* a file that does not exist is an ordinary cache miss, not an error */ if (code === "ENOENT") { return { value: null }; } return { value: null, error: err, path: filePath }; } } /** Stores one entry. Memory stores keep a JSON clone of the data. Disk stores create the parent directory if needed and write the data as JSON with 2-space indentation. Returns an empty object on success (and for disabled stores), or the error plus the file path on failure. */ async writeJson(fileName: string, data: unknown): Promise { if (this.memoryStore) { this.memoryStore.set(fileName, jsonClone(data)); return {}; } const filePath = this.resolvePath(fileName); if (!filePath) { return {}; } try { await fs.promises.mkdir(path.dirname(filePath), { recursive: true }); await fs.promises.writeFile( filePath, JSON.stringify(data, null, 2), "utf8", ); return {}; } catch (err) { return { error: err, path: filePath }; } } } ================================================ FILE: packages/core/lib/v3/cache/serverAgentCache.ts ================================================ import { AgentCache } from "./AgentCache.js"; import { CacheStorage } from "./CacheStorage.js"; import type { V3 } from "../v3.js"; import type { AgentCacheTransferPayload } from "../types/private/index.js"; 
import type { ActHandler } from "../handlers/actHandler.js"; import type { V3Context } from "../understudy/context.js"; import type { AvailableModel, V3Options } from "../types/public/index.js"; import type { ModelConfiguration } from "../types/public/model.js"; import type { LLMClient } from "../llm/LLMClient.js"; /** Handle for a temporarily installed in-memory agent cache: complete() restores the original cache and returns the buffered entry (or null); discard() only restores the original cache. */ export interface ServerAgentCacheHandle { complete(): AgentCacheTransferPayload | null; discard(): void; } // TODO (refactor-caching): this reflective access is a known temporary escape hatch. // Once the caching internals are reworked, replace it with proper V3 helpers so // we stop poking private fields from the outside. /** Reads a field of the V3 instance by name, bypassing TypeScript visibility checks. */ function getInternalField(instance: V3, key: string): T { return (instance as unknown as Record)[key] as T; } /** Assigns a field of the V3 instance by name, bypassing TypeScript visibility checks. */ function setInternalField(instance: V3, key: string, value: unknown): void { (instance as unknown as Record)[key] = value; } /** Builds an AgentCache on top of in-memory storage, wired to the V3 instance's logger, act handler, context, LLM client resolver, model name, system prompt, DOM settle timeout and act method, with bufferLatestEntry enabled. */ function createMemoryAgentCache(stagehand: V3): AgentCache { const resolveLlmClient = getInternalField< (model?: ModelConfiguration) => LLMClient >(stagehand, "resolveLlmClient"); return new AgentCache({ storage: CacheStorage.createMemory(stagehand.logger), logger: stagehand.logger, getActHandler: () => getInternalField(stagehand, "actHandler"), getContext: () => getInternalField(stagehand, "ctx"), getDefaultLlmClient: () => resolveLlmClient.call(stagehand), getBaseModelName: () => getInternalField(stagehand, "modelName"), getSystemPrompt: () => getInternalField(stagehand, "opts").systemPrompt, domSettleTimeoutMs: getInternalField( stagehand, "domSettleTimeoutMs", ), act: stagehand.act.bind(stagehand), bufferLatestEntry: true, }); } /** Replaces the instance's agentCache with an in-memory cache and returns a handle. Both handle methods put the original cache back, and the restore runs at most once. */ export function __internalCreateInMemoryAgentCacheHandle( stagehand: V3, ): ServerAgentCacheHandle { const originalCache = getInternalField(stagehand, "agentCache"); const memoryCache = createMemoryAgentCache(stagehand); setInternalField(stagehand, "agentCache", memoryCache); let restored = false; const restore = () => { if (!restored) { setInternalField(stagehand, 
"agentCache", originalCache); restored = true; } }; return { complete: () => { const entry = memoryCache.consumeBufferedEntry(); restore(); return entry; }, discard: () => { restore(); }, }; } ================================================ FILE: packages/core/lib/v3/cache/utils.ts ================================================ import type { Logger } from "../types/public/index.js"; import { Page } from "../understudy/page.js"; /** Selector wait timeout, in milliseconds, used when the caller supplies none. */ const DEFAULT_WAIT_TIMEOUT_MS = 15000; /** Deep-copies a JSON-serializable value through a JSON round trip. When JSON.stringify cannot represent the value at the top level (it returns undefined for undefined, functions and symbols), the value is returned unchanged instead of letting JSON.parse throw, which matches jsonClone in CacheStorage.ts. */ export function cloneForCache(value: T): T { const serialized = JSON.stringify(value); /* nothing to parse: hand the original value back rather than throwing a SyntaxError */ if (serialized === undefined) { return value; } return JSON.parse(serialized) as T; } /** Returns page.url(), or an empty string when reading it throws. */ export async function safeGetPageUrl(page: Page): Promise { try { return page.url(); } catch { return ""; } } /** * Waits for a cached action's selector to be attached to the DOM before executing. * Logs a warning and proceeds if the wait times out (non-blocking). */ export async function waitForCachedSelector(params: { page: Page; selector: string | undefined; timeout: number | undefined; logger: Logger; context?: string; }): Promise { const { page, selector, timeout, logger, context } = params; if (!selector) return; try { await page.waitForSelector(selector, { state: "attached", timeout: timeout ?? DEFAULT_WAIT_TIMEOUT_MS, }); } catch (err) { logger({ category: "cache", message: `waitForSelector failed for ${context ?? "cached"} action selector, proceeding anyway`, level: 2, auxiliary: { selector: { value: selector, type: "string" }, error: { value: String(err), type: "string" }, }, }); } } ================================================ FILE: packages/core/lib/v3/cli.js ================================================ #!/usr/bin/env node import process from "node:process"; import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js"; // currently the CLI is only used to spawn the shutdown supervisor // in the future, we may want to add more CLI commands here if (!maybeRunShutdownSupervisorFromArgv(process.argv.slice(2))) { console.error( "Unsupported stagehand CLI invocation. 
Expected --supervisor with valid args.", ); process.exit(1); } ================================================ FILE: packages/core/lib/v3/dom/a11yScripts/index.ts ================================================ /** Returns the window scroll offsets, trying scrollX/scrollY first, then pageXOffset/pageYOffset, then the documentElement scroll position. Values that are not usable numbers, and any thrown error, yield 0. */ export function getScrollOffsets(): { sx: number; sy: number } { try { const sx = window.scrollX ?? window.pageXOffset ?? document.documentElement?.scrollLeft ?? 0; const sy = window.scrollY ?? window.pageYOffset ?? document.documentElement?.scrollTop ?? 0; return { sx: Number(sx) || 0, sy: Number(sy) || 0 }; } catch { return { sx: 0, sy: 0 }; } } /** Returns the left and top of this element's bounding client rect, substituting 0 for missing or non-numeric values and when the lookup throws. */ export function getBoundingRectLite(this: Element): { left: number; top: number; } { try { const rect = this.getBoundingClientRect(); return { left: Number(rect?.left ?? 0) || 0, top: Number(rect?.top ?? 0) || 0, }; } catch { return { left: 0, top: 0 }; } } /** Starts from document.activeElement and keeps descending while the current element has an open shadow root with its own active element. Returns the innermost element found, or null when there is none or an error is thrown. */ export function resolveDeepActiveElement(): Element | null { try { const deepActive = (doc: Document | ShadowRoot): Element | null => { let el: Element | null = doc.activeElement ?? null; while (el && el.shadowRoot && el.shadowRoot.activeElement) { el = el.shadowRoot.activeElement; } return el ?? 
null; }; return deepActive(document); } catch { return null; } } /** Builds an absolute XPath for this node by walking up through its ancestors. Each step is the lower-cased node name (names containing a colon use a name() test; text and comment nodes use text() and comment()) with a 1-based index counted among preceding siblings of the same node type and name. A document-fragment ancestor is emitted as a "//" hop and the walk continues from its host. Returns "/" for an empty path or when anything throws. */ export function nodeToAbsoluteXPath(this: Node | null | undefined): string { const compute = (node: Node | null | undefined): string => { try { const sibIndex = (n: Node | null | undefined): number => { if (!n || !n.parentNode) return 1; let i = 1; const targetKey = `${n.nodeType}:${(n.nodeName || "").toLowerCase()}`; for (let p = n.previousSibling; p; p = p.previousSibling) { const key = `${p.nodeType}:${(p.nodeName || "").toLowerCase()}`; if (key === targetKey) i += 1; } return i; }; const step = (n: Node | null | undefined): string => { if (!n) return ""; if (n.nodeType === Node.DOCUMENT_NODE) return ""; if (n.nodeType === Node.DOCUMENT_FRAGMENT_NODE) return "//"; if (n.nodeType === Node.TEXT_NODE) return `text()[${sibIndex(n)}]`; if (n.nodeType === Node.COMMENT_NODE) return `comment()[${sibIndex(n)}]`; const tag = (n.nodeName || "").toLowerCase(); const name = tag.includes(":") ? `*[name()='${tag}']` : tag; return `${name}[${sibIndex(n)}]`; }; const parts: string[] = []; let cur: Node | null | undefined = node; /* collect steps leaf-first; they are reversed before joining */ while (cur) { if (cur.nodeType === Node.DOCUMENT_FRAGMENT_NODE) { parts.push("//"); cur = (cur as ShadowRoot).host ?? null; continue; } const s = step(cur); if (s) parts.push(s); cur = cur.parentNode; } parts.reverse(); let out = ""; /* join the parts without ever producing more than two consecutive slashes */ for (const part of parts) { if (part === "//") { out = out ? (out.endsWith("/") ? `${out}/` : `${out}//`) : "//"; } else { out = out ? out.endsWith("/") ? 
`${out}${part}` : `${out}/${part}` : `/${part}`; } } return out || "/"; } catch { return "/"; } }; return compute(this); } /** True only when document.hasFocus() returns exactly true; false when it throws. */ export function documentHasFocusStrict(): boolean { try { return document.hasFocus() === true; } catch { return false; } } ================================================ FILE: packages/core/lib/v3/dom/genA11yScripts.ts ================================================ import fs from "node:fs"; import path from "node:path"; import { pathToFileURL } from "node:url"; import esbuild from "esbuild"; import { getCurrentDirPath } from "../runtimePaths.js"; const here = getCurrentDirPath(); const srcDir = path.join(here, "./a11yScripts"); const outDir = path.join(here, "./build"); const entry = path.join(srcDir, "index.ts"); const moduleOut = path.join(outDir, "a11yScripts.mjs"); const bundleOut = path.join(outDir, "a11yScripts.bundle.js"); /** Bundles a11yScripts/index.ts twice with esbuild (an ESM module and an IIFE bundle exposed as __stagehandA11yScriptsFactory), imports the ESM output to stringify every exported function, and writes build/a11yScripts.generated.ts containing the bootstrap snippet, the per-function sources and the global reference names. */ async function main(): Promise { fs.mkdirSync(outDir, { recursive: true }); esbuild.buildSync({ entryPoints: [entry], bundle: true, format: "esm", platform: "browser", target: "es2020", minify: true, outfile: moduleOut, }); esbuild.buildSync({ entryPoints: [entry], bundle: true, format: "iife", platform: "browser", target: "es2020", globalName: "__stagehandA11yScriptsFactory", minify: true, outfile: bundleOut, }); const bundleRaw = fs.readFileSync(bundleOut, "utf8").trim(); /* the bootstrap installs the bundle on globalThis only if it is not already there */ const bootstrap = `if (!globalThis.__stagehandA11yScripts) { ${bundleRaw}\n globalThis.__stagehandA11yScripts = __stagehandA11yScriptsFactory;\n}`; const compiledModule = (await import( pathToFileURL(moduleOut).href )) as Record; const entries = Object.entries(compiledModule).filter( ([, value]) => typeof value === "function", ); /* sort by export name so the generated file is stable between runs */ const sorted = entries.sort(([a], [b]) => a.localeCompare(b)); const scriptMap: Record = Object.fromEntries( sorted.map(([name, fn]) => { const callable = fn as (...args: unknown[]) => unknown; return [name, callable.toString()]; }), ); const banner = `/*\n * AUTO-GENERATED FILE. 
DO NOT EDIT.\n * Update sources in lib/v3/dom/a11yScripts and run genA11yScripts.ts.\n */`; const globalRefs: Record = Object.fromEntries( sorted.map(([name]) => [name, `globalThis.__stagehandA11yScripts.${name}`]), ); const content = `${banner} export const a11yScriptBootstrap = ${JSON.stringify(bootstrap)}; export const a11yScriptSources = ${JSON.stringify(scriptMap, null, 2)} as const; export const a11yScriptGlobalRefs = ${JSON.stringify(globalRefs, null, 2)} as const; export type A11yScriptName = keyof typeof a11yScriptSources; `; /* emit the generated TypeScript module into the build directory */ fs.writeFileSync(path.join(outDir, "a11yScripts.generated.ts"), content); /* remove the intermediate esbuild outputs; unlink failures are deliberately ignored */ await fs.promises.unlink(moduleOut).catch(() => {}); await fs.promises.unlink(bundleOut).catch(() => {}); } void main(); ================================================ FILE: packages/core/lib/v3/dom/genDomScripts.ts ================================================ /** * Build the v3 DOM script into a single JS file and then export its contents * as a string constant (`v3ScriptContent`) for CDP injection (document-start). 
*/ import fs from "node:fs"; import path from "node:path"; import esbuild from "esbuild"; import { getCurrentDirPath } from "../runtimePaths.js"; const here = getCurrentDirPath(); const outDir = path.join(here, "./build"); fs.mkdirSync(outDir, { recursive: true }); /* step 1: bundle piercer.entry.ts as a minified IIFE and re-export the bundle text as v3ScriptContent */ esbuild.buildSync({ entryPoints: [path.join(here, "piercer.entry.ts")], bundle: true, format: "iife", platform: "browser", target: "es2020", minify: true, legalComments: "none", outfile: path.join(outDir, "v3-index.js"), }); const script = fs.readFileSync(path.join(outDir, "v3-index.js"), "utf8"); const content = `export const v3ScriptContent = ${JSON.stringify(script)};`; fs.writeFileSync(path.join(outDir, "scriptV3Content.ts"), content); /* step 2: the same for rerenderMissingShadows.entry.ts, exported as reRenderScriptContent */ esbuild.buildSync({ entryPoints: [path.join(here, "rerenderMissingShadows.entry.ts")], bundle: true, format: "iife", platform: "browser", target: "es2020", minify: true, legalComments: "none", outfile: path.join(outDir, "rerender-index.js"), }); const rerenderScript = fs.readFileSync( path.join(outDir, "rerender-index.js"), "utf8", ); const rerenderContent = `export const reRenderScriptContent = ${JSON.stringify( rerenderScript, )};`; fs.writeFileSync( path.join(outDir, "reRenderScriptContent.ts"), rerenderContent, ); ================================================ FILE: packages/core/lib/v3/dom/genLocatorScripts.ts ================================================ import fs from "node:fs"; import path from "node:path"; import { pathToFileURL } from "node:url"; import esbuild from "esbuild"; import { getCurrentDirPath } from "../runtimePaths.js"; const here = getCurrentDirPath(); const outDir = path.join(here, "./build"); const entry = path.join(here, "./locatorScripts/index.ts"); const moduleOutfile = path.join(outDir, "locatorScripts.mjs"); const bundleOutfile = path.join(outDir, "locatorScripts.bundle.js"); /** Bundles locatorScripts/index.ts twice with esbuild (an ESM module and an IIFE bundle exposed as __stagehandLocatorScriptsFactory), imports the ESM output to stringify every exported function, writes build/locatorScripts.generated.ts, and finally deletes the two intermediate outputs. */ async function main(): Promise { fs.mkdirSync(outDir, { recursive: true }); esbuild.buildSync({ entryPoints: [entry], bundle: true, format: "esm", platform: 
"browser", target: "es2020", minify: true, outfile: moduleOutfile, }); esbuild.buildSync({ entryPoints: [entry], bundle: true, format: "iife", platform: "browser", target: "es2020", globalName: "__stagehandLocatorScriptsFactory", minify: true, outfile: bundleOutfile, }); const bundleRaw = fs.readFileSync(bundleOutfile, "utf8").trim(); /* the bootstrap installs the bundle on globalThis only if it is not already there */ const bootstrap = `if (!globalThis.__stagehandLocatorScripts) { ${bundleRaw}\n globalThis.__stagehandLocatorScripts = __stagehandLocatorScriptsFactory;\n}`; const compiledModule = (await import( pathToFileURL(moduleOutfile).href )) as Record; const entries = Object.entries(compiledModule).filter( ([, value]) => typeof value === "function", ); /* sort by export name so the generated file is stable between runs */ const sorted = entries.sort(([a], [b]) => a.localeCompare(b)); const scriptMap: Record = Object.fromEntries( sorted.map(([name, fn]) => { const callable = fn as (...args: unknown[]) => unknown; return [name, callable.toString()]; }), ); const banner = `/*\n * AUTO-GENERATED FILE. DO NOT EDIT.\n * Update sources in lib/v3/dom/locatorScripts and run genLocatorScripts.ts.\n */`; const globalRefs: Record = Object.fromEntries( sorted.map(([name]) => [ name, `globalThis.__stagehandLocatorScripts.${name}`, ]), ); const content = `${banner}\nexport const locatorScriptBootstrap = ${JSON.stringify(bootstrap)};\nexport const locatorScriptSources = ${JSON.stringify(scriptMap, null, 2)} as const;\nexport const locatorScriptGlobalRefs = ${JSON.stringify(globalRefs, null, 2)} as const;\nexport type LocatorScriptName = keyof typeof locatorScriptSources;\n`; fs.writeFileSync(path.join(outDir, "locatorScripts.generated.ts"), content); /* remove the intermediate esbuild outputs; unlink failures are deliberately ignored */ await fs.promises.unlink(moduleOutfile).catch(() => {}); await fs.promises.unlink(bundleOutfile).catch(() => {}); } void main(); ================================================ FILE: packages/core/lib/v3/dom/genScreenshotScripts.ts ================================================ import fs from "node:fs"; import path from "node:path"; import { pathToFileURL } from "node:url"; import 
esbuild from "esbuild"; import { getCurrentDirPath } from "../runtimePaths.js"; const here = getCurrentDirPath(); const srcDir = path.join(here, "./screenshotScripts"); const outDir = path.join(here, "./build"); const entry = path.join(srcDir, "index.ts"); const moduleOut = path.join(outDir, "screenshotScripts.mjs"); /** Bundles screenshotScripts/index.ts as a minified ESM module with esbuild, imports it to stringify every exported function, writes build/screenshotScripts.generated.ts, and deletes the intermediate module. */ async function main(): Promise { fs.mkdirSync(outDir, { recursive: true }); esbuild.buildSync({ entryPoints: [entry], bundle: true, format: "esm", platform: "browser", target: "es2020", minify: true, outfile: moduleOut, }); const compiledModule = (await import( pathToFileURL(moduleOut).href )) as Record; const entries = Object.entries(compiledModule).filter( ([, value]) => typeof value === "function", ); /* sort by export name so the generated file is stable between runs */ const sorted = entries.sort(([a], [b]) => a.localeCompare(b)); const scriptMap: Record = Object.fromEntries( sorted.map(([name, fn]) => { const callable = fn as (...args: unknown[]) => unknown; return [name, callable.toString()]; }), ); const banner = `/*\n * AUTO-GENERATED FILE. 
DO NOT EDIT.\n * Update sources in lib/v3/dom/screenshotScripts and run genScreenshotScripts.ts.\n */`; const content = `${banner} export const screenshotScriptSources = ${JSON.stringify(scriptMap, null, 2)} as const; export type ScreenshotScriptName = keyof typeof screenshotScriptSources; `; fs.writeFileSync( path.join(outDir, "screenshotScripts.generated.ts"), content, ); await fs.promises.unlink(moduleOut).catch(() => {}); } void main(); ================================================ FILE: packages/core/lib/v3/dom/global.d.ts ================================================ export interface StagehandV3Backdoor { /** Closed shadow-root accessors */ getClosedRoot(host: Element): ShadowRoot | undefined; /** Stats + quick health check */ stats(): { installed: true; url: string; isTop: boolean; open: number; closed: number; }; } declare global { interface Window { __stagehandV3Injected?: boolean; __stagehandV3__?: StagehandV3Backdoor; } } ================================================ FILE: packages/core/lib/v3/dom/index.ts ================================================ export * from "./piercer.runtime.js"; ================================================ FILE: packages/core/lib/v3/dom/locatorScripts/counts.ts ================================================ import { countXPathMatches } from "./xpathResolver.js"; export interface TextMatchSample { tag: string; id: string; class: string; text: string; } export interface TextMatchResult { count: number; sample: TextMatchSample[]; error: null; } export function countCssMatchesPrimary(selectorRaw: string): number { const selector = String(selectorRaw ?? 
"").trim(); if (!selector) return 0; const seen = new WeakSet(); const visit = (root: Node | null | undefined): number => { if (!root || seen.has(root)) return 0; seen.add(root); let total = 0; try { const queryable = root as unknown as ParentNode & { querySelectorAll?: Document["querySelectorAll"]; }; if (typeof queryable.querySelectorAll === "function") { total += queryable.querySelectorAll(selector).length; } } catch { // ignore query errors } try { const doc = root instanceof Document ? root : ((root as Element)?.ownerDocument ?? document); const walker = doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); let node: Node | null; while ((node = walker.nextNode())) { if (node instanceof Element && node.shadowRoot) { total += visit(node.shadowRoot); } } } catch { // ignore traversal errors } return total; }; try { return visit(document); } catch { try { return document.querySelectorAll(selector).length; } catch { return 0; } } } export function countCssMatchesPierce(selectorRaw: string): number { const selector = String(selectorRaw ?? 
"").trim(); if (!selector) return 0; const backdoor = window.__stagehandV3__; if (!backdoor || typeof backdoor.getClosedRoot !== "function") { try { return document.querySelectorAll(selector).length; } catch { return 0; } } const seen = new WeakSet(); const queue: Node[] = []; const enqueue = (node: Node | null | undefined) => { if (!node || seen.has(node)) return; seen.add(node); queue.push(node); }; enqueue(document); let total = 0; const visitElement = (element: Element) => { const open = element.shadowRoot; if (open) enqueue(open); try { const closed = backdoor.getClosedRoot(element); if (closed) enqueue(closed); } catch { // ignore } }; while (queue.length) { const root = queue.shift(); if (!root) continue; try { const queryable = root as unknown as ParentNode & { querySelectorAll?: Document["querySelectorAll"]; }; if (typeof queryable.querySelectorAll === "function") { total += queryable.querySelectorAll(selector).length; } } catch { // ignore query errors } try { const doc = root instanceof Document ? root : root instanceof ShadowRoot ? (root.host?.ownerDocument ?? document) : ((root as Element).ownerDocument ?? document); const walker = doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); let node: Node | null; while ((node = walker.nextNode())) { if (node instanceof Element) { visitElement(node); } } } catch { // ignore traversal errors } } return total; } export function countTextMatches(rawNeedle: string): TextMatchResult { const needle = String(rawNeedle ?? ""); if (!needle) { return { count: 0, sample: [], error: null }; } const needleLc = needle.toLowerCase(); const skipTags = new Set([ "SCRIPT", "STYLE", "TEMPLATE", "NOSCRIPT", "HEAD", "TITLE", "LINK", "META", "HTML", "BODY", ]); const shouldSkip = (node: Element | null | undefined): boolean => { if (!node) return false; const tag = node.tagName?.toUpperCase() ?? 
""; return skipTags.has(tag); }; const extractText = (element: Element): string => { try { if (shouldSkip(element)) return ""; const inner = (element as HTMLElement).innerText; if (typeof inner === "string" && inner.trim()) return inner.trim(); } catch { // ignore } try { const text = element.textContent; if (typeof text === "string") return text.trim(); } catch { // ignore } return ""; }; const matches = (element: Element): boolean => { const text = extractText(element); return !!text && text.toLowerCase().includes(needleLc); }; const backdoor = window.__stagehandV3__; const getClosedRoot: (host: Element) => ShadowRoot | null = backdoor && typeof backdoor.getClosedRoot === "function" ? (host: Element): ShadowRoot | null => { try { return backdoor.getClosedRoot(host) ?? null; } catch { return null; } } : (host: Element): ShadowRoot | null => { void host; return null; }; const seen = new WeakSet(); const queue: Node[] = []; const enqueue = (node: Node | null | undefined) => { if (!node || seen.has(node)) return; seen.add(node); queue.push(node); }; const walkerFor = (root: Node): TreeWalker | null => { try { const doc = root instanceof Document ? root : ((root as Element)?.ownerDocument ?? document); return doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT); } catch { return null; } }; const matchesList: Array<{ element: Element; tag: string; id: string; className: string; text: string; }> = []; enqueue(document); while (queue.length) { const root = queue.shift(); if (!root) continue; if (root instanceof Element && matches(root)) { matchesList.push({ element: root, tag: root.tagName ?? "", id: root.id ?? "", className: (root as HTMLElement).className ?? "", text: extractText(root), }); } const walker = walkerFor(root); if (!walker) continue; let node: Node | null; while ((node = walker.nextNode())) { if (!(node instanceof Element)) continue; if (matches(node)) { matchesList.push({ element: node, tag: node.tagName ?? "", id: node.id ?? 
"", className: (node as HTMLElement).className ?? "", text: extractText(node), }); } const open = node.shadowRoot; if (open) enqueue(open); const closed = getClosedRoot(node); if (closed) enqueue(closed); } } const innermost: typeof matchesList = []; for (const item of matchesList) { const el = item.element; let skip = false; for (const other of matchesList) { if (item === other) continue; try { if (el.contains(other.element)) { skip = true; break; } } catch { // ignore containment errors } } if (!skip) innermost.push(item); } const count = innermost.length; const sample = innermost.slice(0, 5).map((item) => ({ tag: item.tag, id: item.id, class: item.className, text: item.text, })); return { count, sample, error: null }; } export function countXPathMatchesMainWorld(rawXp: string): number { return countXPathMatches(rawXp, { pierceShadow: true }); } ================================================ FILE: packages/core/lib/v3/dom/locatorScripts/index.ts ================================================ export * from "./scripts.js"; export * from "./selectors.js"; export * from "./counts.js"; export * from "./waitForSelector.js"; ================================================ FILE: packages/core/lib/v3/dom/locatorScripts/scripts.ts ================================================ /* * DOM-side helpers used by Locator Runtime.callFunctionOn invocations. * * NOTE: These functions run inside the page context. Keep them dependency-free * and resilient to exceptions (match the best-effort semantics of the old * inline string snippets). */ export interface ClickEventOptions { bubbles?: boolean; cancelable?: boolean; composed?: boolean; detail?: number; } export function ensureFileInputElement(this: Element): boolean { try { const tag = (this as HTMLElement).tagName?.toLowerCase() ?? ""; if (tag !== "input") return false; const type = String((this as HTMLInputElement).type ?? 
/**
 * Fire a synthetic `click` MouseEvent at the receiver element.
 *
 * Missing boolean options are treated as `false`; `detail` defaults to 1.
 * The event's `view` is the element's own window when it has one, otherwise
 * the global `window`.
 *
 * If building or dispatching the event throws, the element's native
 * `click()` is attempted instead; a failure there is swallowed.
 *
 * @param options - Optional MouseEvent flags (`bubbles`, `cancelable`,
 *   `composed`, `detail`).
 */
export function dispatchDomClick(
  this: Element,
  options?: ClickEventOptions,
): void {
  try {
    const settings = options ?? {};
    const init: MouseEventInit = {
      bubbles: Boolean(settings.bubbles),
      cancelable: Boolean(settings.cancelable),
      composed: Boolean(settings.composed),
      detail: typeof settings.detail === "number" ? settings.detail : 1,
      view: this?.ownerDocument?.defaultView ?? window,
    };
    this.dispatchEvent(new MouseEvent("click", init));
    return;
  } catch {
    // Synthetic dispatch failed; fall through to the native click below.
  }

  try {
    (this as HTMLElement).click();
  } catch {
    /* ignore */
  }
}
/**
 * Fill the receiver element with a value, either directly in-page or by
 * preparing it and asking the caller to supply the text.
 *
 * Outcomes:
 * - `{ status: "done" }`: the value was assigned here and `input` + `change`
 *   events were dispatched. Only happens for `<input>` types listed in
 *   `inputTypesToSetValue`.
 * - `{ status: "needsinput", value }`: the element was focused/selected via
 *   `prepareElementForTyping` and `value` still has to be entered by the
 *   caller (presumably with real keystrokes; confirm against the caller).
 * - `{ status: "error", reason }`: element is detached, the element or input
 *   type is unsupported, the value is not numeric for `type="number"`, or the
 *   control did not retain the assigned value.
 *
 * Any exception thrown inside the `try` is reported as `needsinput` (not
 * `error`) with the most recently recorded `fallbackValue` and an
 * `exception` / `exception:<message>` reason.
 *
 * @param rawValue - Text to put into the element.
 * @returns A `FillElementResult` describing what was done or is still needed.
 */
export function fillElementValue(
  this: Element,
  rawValue: string,
): FillElementResult {
  const element = this as HTMLElement;
  if (!element.isConnected) {
    return { status: "error", reason: "notconnected" };
  }

  // Use the element's own document/window so the instanceof checks and event
  // constructors below come from the element's realm when available.
  const doc = element.ownerDocument || document;
  const win = doc.defaultView || window;
  // Value handed back to the caller if something below throws. It is updated
  // as the value gets normalized, so its position relative to the statements
  // that can throw matters.
  let fallbackValue = rawValue ?? "";

  try {
    // Dispatches a bubbling, composed `input` event followed by a bubbling
    // `change` event. Prefers InputEvent (carrying the inserted text) and
    // degrades to a plain Event when InputEvent is missing or its
    // constructor throws.
    const dispatchInputAndChange = (eventValue: string): void => {
      let inputEvent: Event;
      if (typeof win.InputEvent === "function") {
        try {
          inputEvent = new win.InputEvent("input", {
            bubbles: true,
            composed: true,
            data: eventValue,
            inputType: "insertText",
          });
        } catch {
          inputEvent = new win.Event("input", {
            bubbles: true,
            composed: true,
          });
        }
      } else {
        inputEvent = new win.Event("input", { bubbles: true, composed: true });
      }
      element.dispatchEvent(inputEvent);
      const changeEvent = new win.Event("change", { bubbles: true });
      element.dispatchEvent(changeEvent);
    };

    if (element instanceof win.HTMLInputElement) {
      const type = (element.type || "").toLowerCase();
      // Only types present in one of the two allow-lists are handled.
      if (!inputTypesToTypeInto.has(type) && !inputTypesToSetValue.has(type)) {
        return { status: "error", reason: `unsupported-input-type:${type}` };
      }

      let valueForTyping = rawValue;
      if (type === "number") {
        // Number inputs: trim, and reject anything Number() cannot parse.
        // An empty string is allowed through.
        const trimmed = rawValue.trim();
        if (trimmed !== "" && Number.isNaN(Number(trimmed))) {
          return { status: "error", reason: "invalid-number-value" };
        }
        valueForTyping = trimmed;
      }
      fallbackValue = valueForTyping;

      if (inputTypesToSetValue.has(type)) {
        // These types get their value assigned directly instead of typed.
        const trimmed = rawValue.trim();
        fallbackValue = trimmed;
        prepareElementForTyping.call(element);
        // Call the `value` setter defined on HTMLInputElement.prototype when
        // it exists, otherwise assign normally.
        // NOTE(review): presumably this bypasses a `value` property redefined
        // on the instance by a UI framework; confirm.
        const prototype = win.HTMLInputElement.prototype;
        const descriptor = Object.getOwnPropertyDescriptor(prototype, "value");
        const nativeSetter = descriptor?.set;
        if (typeof nativeSetter === "function") {
          nativeSetter.call(element, trimmed);
        } else {
          element.value = trimmed;
        }
        // NOTE(review): `_valueTracker` looks like React's internal value
        // tracker; it is updated only if present. Confirm.
        const tracker = (
          element as unknown as {
            _valueTracker?: { setValue?: (next: string) => void };
          }
        )._valueTracker;
        tracker?.setValue?.(trimmed);
        // Reading back a different value means the control did not keep the
        // string as given.
        if (element.value !== trimmed) {
          return { status: "error", reason: "malformed-value" };
        }
        dispatchInputAndChange(trimmed);
        return { status: "done" };
      }

      // Typable input: focus/select it and let the caller provide the text.
      prepareElementForTyping.call(element);
      return { status: "needsinput", value: valueForTyping };
    }

    if (element instanceof win.HTMLTextAreaElement) {
      prepareElementForTyping.call(element);
      fallbackValue = rawValue;
      return { status: "needsinput", value: rawValue };
    }

    if (element instanceof win.HTMLSelectElement) {
      // Select elements use setInputFiles/selectOption instead.
      return { status: "error", reason: "unsupported-element" };
    }

    if (element.isContentEditable) {
      prepareElementForTyping.call(element);
      fallbackValue = rawValue;
      return { status: "needsinput", value: rawValue };
    }

    return { status: "error", reason: "unsupported-element" };
  } catch (error) {
    // Best effort: surface the failure as a reason string but still ask the
    // caller to provide the value.
    let reason = "exception";
    if (error && typeof error === "object") {
      const message = (error as { message?: unknown }).message;
      if (typeof message === "string" && message.trim().length > 0) {
        reason = `exception:${message}`;
      }
    }
    return { status: "needsinput", value: fallbackValue, reason };
  }
}
/**
 * Report whether the receiver element counts as visible.
 *
 * An element is visible when all of the following hold:
 * - it is connected to a document;
 * - its computed `display` is not `none` and `visibility` is not `hidden`;
 * - its computed `opacity` parses to a finite, non-zero number;
 * - the larger side of its bounding box is non-zero;
 * - it has at least one client rect.
 *
 * Any exception while inspecting the element yields `false`.
 *
 * @returns `true` when every check passes, otherwise `false`.
 */
export function isElementVisible(this: Element): boolean {
  try {
    const node = this as HTMLElement;
    if (!node.isConnected) return false;

    // Prefer the element's own window, fall back to the global one.
    const computed =
      node.ownerDocument?.defaultView?.getComputedStyle(node) ??
      window.getComputedStyle(node);
    if (!computed) return false;

    const hiddenByCss =
      computed.display === "none" || computed.visibility === "hidden";
    if (hiddenByCss) return false;

    const alpha = parseFloat(computed.opacity ?? "1");
    const transparent = !Number.isFinite(alpha) || alpha === 0;
    if (transparent) return false;

    const box = node.getBoundingClientRect();
    if (!box) return false;

    const largestSide = Math.max(box.width, box.height);
    return largestSide !== 0 && node.getClientRects().length !== 0;
  } catch {
    return false;
  }
}
/**
 * Read the current value of the receiver element as a string.
 *
 * - `input`, `textarea` and `select` elements return their `value`.
 * - Content-editable elements return their `textContent`.
 * - Everything else, and any failure while reading, returns `""`.
 *
 * `null`/`undefined` values are reported as the empty string.
 */
export function readElementInputValue(this: Element): string {
  try {
    const node = this as HTMLElement;
    const tagName = (node.tagName || "").toLowerCase();

    switch (tagName) {
      case "input":
      case "textarea":
      case "select": {
        const control = node as
          | HTMLInputElement
          | HTMLTextAreaElement
          | HTMLSelectElement;
        return String(control.value ?? "");
      }
      default:
        return node.isContentEditable ? String(node.textContent ?? "") : "";
    }
  } catch {
    return "";
  }
}
/**
 * Collect up to `limit` elements matching a CSS selector, searching the
 * document first and then every open shadow root reachable from it, in
 * breadth-first order.
 *
 * Each root is queried at most once and each element is reported at most
 * once. Query and traversal errors on a root are ignored so one bad root
 * does not abort the search.
 *
 * @param selector - CSS selector; an empty selector yields no matches.
 * @param limit - Maximum number of elements to return.
 * @returns Matching elements in discovery order.
 */
const collectCssMatches = (selector: string, limit: number): Element[] => {
  const found: Element[] = [];
  if (!selector) return found;

  const visitedRoots = new WeakSet<Document | ShadowRoot>();
  const knownElements = new Set<Element>();
  // FIFO work list; `cursor` walks it instead of shifting entries off.
  const pending: Array<Document | ShadowRoot> = [document];

  for (
    let cursor = 0;
    cursor < pending.length && found.length < limit;
    cursor += 1
  ) {
    const scope = pending[cursor];
    if (!scope || visitedRoots.has(scope)) continue;
    visitedRoots.add(scope);

    let reachedLimit = false;
    try {
      for (const candidate of scope.querySelectorAll(selector)) {
        if (knownElements.has(candidate)) continue;
        knownElements.add(candidate);
        found.push(candidate);
        if (found.length >= limit) {
          reachedLimit = true;
          break;
        }
      }
    } catch {
      // ignore querySelectorAll issues
    }
    if (reachedLimit) break;

    // Queue the open shadow roots hosted inside this root.
    try {
      const owner =
        scope instanceof Document
          ? scope
          : (scope.host?.ownerDocument ?? document);
      const walker = owner.createTreeWalker(scope, NodeFilter.SHOW_ELEMENT);
      for (
        let current = walker.nextNode();
        current;
        current = walker.nextNode()
      ) {
        if (!(current instanceof Element)) continue;
        const shadow = current.shadowRoot;
        if (shadow) pending.push(shadow);
      }
    } catch {
      // ignore traversal issues
    }
  }

  return found;
};
/**
 * Resolve the N-th innermost element whose text contains a needle.
 *
 * The document and every shadow root reachable from it (open roots, plus
 * closed roots when `window.__stagehandV3__.getClosedRoot` is available) are
 * walked breadth-first. An element matches when its text (`innerText`,
 * falling back to `textContent`, trimmed) contains the needle
 * case-insensitively. Structural tags (`HTML`, `BODY`, `SCRIPT`, …) never
 * match. Matches that contain another match are dropped, so only the
 * innermost elements remain; `targetIndexRaw` indexes into that list.
 *
 * Each element's text is extracted exactly once: reading `innerText` can be
 * expensive, and only the element itself is needed to answer the query.
 *
 * @param rawNeedle - Text to look for; an empty needle resolves to `null`.
 * @param targetIndexRaw - Zero-based index among innermost matches.
 * @returns The selected element, or `null` when there is no such match.
 */
export function resolveTextSelector(
  rawNeedle: string,
  targetIndexRaw?: number,
): Element | null {
  const needle = String(rawNeedle ?? "");
  if (!needle) return null;
  const needleLc = needle.toLowerCase();
  const targetIndex = parseTargetIndex(targetIndexRaw);

  const skipTags = new Set([
    "SCRIPT",
    "STYLE",
    "TEMPLATE",
    "NOSCRIPT",
    "HEAD",
    "TITLE",
    "LINK",
    "META",
    "HTML",
    "BODY",
  ]);

  const extractText = (node: Element): string => {
    try {
      const tag = node.tagName?.toUpperCase() ?? "";
      if (skipTags.has(tag)) return "";
      const inner = (node as HTMLElement).innerText;
      if (typeof inner === "string" && inner.trim()) return inner.trim();
    } catch {
      // ignore
    }
    try {
      const text = node.textContent;
      if (typeof text === "string") return text.trim();
    } catch {
      // ignore
    }
    return "";
  };

  const backdoor = window.__stagehandV3__;
  const getClosedRoot: (host: Element) => ShadowRoot | null =
    backdoor && typeof backdoor.getClosedRoot === "function"
      ? (host: Element): ShadowRoot | null => {
          try {
            return backdoor.getClosedRoot(host) ?? null;
          } catch {
            return null;
          }
        }
      : (): ShadowRoot | null => null;

  const seen = new WeakSet<Node>();
  const queue: Node[] = [];
  const enqueue = (node: Node | null | undefined): void => {
    if (!node || seen.has(node)) return;
    seen.add(node);
    queue.push(node);
  };

  const walkerFor = (root: Node): TreeWalker | null => {
    try {
      const doc =
        root instanceof Document
          ? root
          : ((root as Element)?.ownerDocument ?? document);
      return doc.createTreeWalker(root, NodeFilter.SHOW_ELEMENT);
    } catch {
      return null;
    }
  };

  // Matching elements in discovery order.
  const matched: Element[] = [];
  const consider = (candidate: Element): void => {
    const text = extractText(candidate);
    if (text && text.toLowerCase().includes(needleLc)) matched.push(candidate);
  };

  enqueue(document);
  while (queue.length) {
    const root = queue.shift();
    if (!root) continue;
    if (root instanceof Element) consider(root);

    const walker = walkerFor(root);
    if (!walker) continue;

    let node: Node | null;
    while ((node = walker.nextNode())) {
      if (!(node instanceof Element)) continue;
      consider(node);
      const open = node.shadowRoot;
      if (open) enqueue(open);
      const closed = getClosedRoot(node);
      if (closed) enqueue(closed);
    }
  }

  // Keep only matches that do not contain another match.
  const innermost = matched.filter(
    (el, i) =>
      !matched.some((other, j) => {
        if (i === j) return false;
        try {
          return el.contains(other);
        } catch {
          // ignore containment errors
          return false;
        }
      }),
  );

  return innermost[targetIndex] ?? null;
}
/**
 * Decide whether a selector should be treated as XPath rather than CSS.
 *
 * A selector is XPath when it starts with `/` or carries an `xpath=` prefix.
 * The prefix is matched case-insensitively so that it agrees with
 * `parseXPathSteps`, which strips the prefix with `/^xpath=/i`; otherwise a
 * selector such as `XPath=//div` would be handed to `querySelector` as CSS.
 *
 * @param selector - Selector string (already trimmed by the caller).
 * @returns `true` for XPath selectors, `false` otherwise.
 */
const isXPath = (selector: string): boolean => {
  return /^xpath=/i.test(selector) || selector.startsWith("/");
};
/**
 * Tell whether an element (or its absence) satisfies a wait state.
 *
 * - `detached`: satisfied only when there is no element.
 * - `attached`: satisfied only when there is an element.
 * - `hidden`: needs an element whose computed style is `display: none`,
 *   `visibility: hidden` or `opacity: 0`, or whose bounding box has a zero
 *   width or height.
 * - anything else is handled as `visible`: needs an element with none of
 *   the CSS conditions above and a strictly positive width and height.
 *
 * For `hidden`/`visible`, a missing element or an error while measuring
 * yields `false`.
 */
const checkState = (
  el: Element | null,
  state: WaitForSelectorState,
): boolean => {
  switch (state) {
    case "detached":
      return el === null;
    case "attached":
      return el !== null;
    default:
      break;
  }

  if (el === null) return false;

  try {
    const css = window.getComputedStyle(el);
    const box = el.getBoundingClientRect();

    if (state === "hidden") {
      return (
        css.display === "none" ||
        css.visibility === "hidden" ||
        css.opacity === "0" ||
        box.width === 0 ||
        box.height === 0
      );
    }

    // state === "visible"
    return (
      css.display !== "none" &&
      css.visibility !== "hidden" &&
      css.opacity !== "0" &&
      box.width > 0 &&
      box.height > 0
    );
  } catch {
    return false;
  }
};
/**
 * Wait for an element matching the selector to reach the specified state.
 * Supports both CSS selectors and XPath expressions (prefix with "xpath=" or
 * start with "/").
 *
 * The condition is checked immediately, then re-checked whenever a
 * MutationObserver on the document root (and, when piercing, on shadow roots
 * found by `setupShadowObservers`) reports a change. If the document has no
 * root element yet, observation starts on `DOMContentLoaded`.
 *
 * All observers, the DOMContentLoaded listener and the timer are released as
 * soon as the promise settles.
 *
 * @param selectorRaw - CSS selector or XPath expression to wait for
 * @param stateRaw - Element state: 'attached' | 'detached' | 'visible' | 'hidden'
 *   (defaults to 'visible')
 * @param timeoutRaw - Maximum time to wait in milliseconds; non-positive or
 *   non-numeric values fall back to 30000
 * @param pierceShadowRaw - Whether to search inside shadow DOM (default true)
 * @returns Promise that resolves to true when condition is met, or rejects
 *   with an Error on timeout
 */
export function waitForSelector(
  selectorRaw: string,
  stateRaw?: string,
  timeoutRaw?: number,
  pierceShadowRaw?: boolean,
): Promise<boolean> {
  const selector = String(selectorRaw ?? "").trim();
  const state =
    (String(stateRaw ?? "visible") as WaitForSelectorState) || "visible";
  const timeout =
    typeof timeoutRaw === "number" && timeoutRaw > 0 ? timeoutRaw : 30000;
  const pierceShadow = pierceShadowRaw !== false;

  return new Promise<boolean>((resolve, reject) => {
    const conditionMet = (): boolean =>
      checkState(findElement(selector, pierceShadow), state);

    // Check immediately
    if (conditionMet()) {
      resolve(true);
      return;
    }

    let settled = false;
    let timeoutId: ReturnType<typeof setTimeout> | null = null;
    let domReadyHandler: (() => void) | null = null;
    const observers: MutationObserver[] = [];

    // Release every resource held by this wait (timer, observers, listener).
    const cleanup = (): void => {
      if (timeoutId !== null) {
        clearTimeout(timeoutId);
        timeoutId = null;
      }
      for (const obs of observers) {
        obs.disconnect();
      }
      observers.length = 0;
      if (domReadyHandler) {
        document.removeEventListener("DOMContentLoaded", domReadyHandler);
        domReadyHandler = null;
      }
    };

    const check = (): void => {
      if (settled) return;
      if (conditionMet()) {
        settled = true;
        cleanup();
        resolve(true);
      }
    };

    // Declared before any code path that can return from the executor: the
    // DOMContentLoaded handler below calls it later, and a `const` that was
    // never initialised would throw a ReferenceError at that point.
    const setupObservers = (): void => {
      const root = document.body || document.documentElement;
      if (!root) return;

      // Main document observer
      const mainObserver = new MutationObserver(check);
      mainObserver.observe(root, {
        childList: true,
        subtree: true,
        attributes: true,
        attributeFilter: ["style", "class", "hidden", "disabled"],
      });
      observers.push(mainObserver);

      // Shadow DOM observers (if piercing)
      if (pierceShadow) {
        setupShadowObservers(check, observers);
      }
    };

    // Single timeout shared by both the ready and not-yet-ready paths.
    timeoutId = setTimeout(() => {
      if (settled) return;
      settled = true;
      cleanup();
      reject(
        new Error(
          `waitForSelector: Timeout ${timeout}ms exceeded waiting for "${selector}" to be ${state}`,
        ),
      );
    }, timeout);

    if (document.body || document.documentElement) {
      setupObservers();
      return;
    }

    // Handle case where the document has no root element yet.
    const onDomReady = (): void => {
      document.removeEventListener("DOMContentLoaded", onDomReady);
      domReadyHandler = null;
      check();
      // Do not create observers that nobody would disconnect.
      if (!settled) setupObservers();
    };
    domReadyHandler = onDomReady;
    document.addEventListener("DOMContentLoaded", onDomReady);
  });
}
/**
 * Split an XPath expression into traversal steps.
 *
 * An optional `xpath=` prefix (any letter case) is removed first. The
 * remainder is cut at every `/` that is neither inside a quoted string nor
 * inside `[...]`. A step introduced by `//` gets the `desc` axis; every
 * other step gets `child`. Empty steps are skipped, and each remaining step
 * is handed to `parseStep` for its tag and predicates.
 *
 * @param input - XPath expression, with or without the `xpath=` prefix.
 * @returns The parsed steps in order; empty for blank input.
 */
export function parseXPathSteps(input: string): XPathStep[] {
  const expression = String(input || "")
    .trim()
    .replace(/^xpath=/i, "");

  const parsed: XPathStep[] = [];
  let cursor = 0;

  while (cursor < expression.length) {
    // Consume the axis marker, if any.
    const isDescendant = expression.startsWith("//", cursor);
    if (isDescendant) {
      cursor += 2;
    } else if (expression[cursor] === "/") {
      cursor += 1;
    }

    // Advance to the next top-level "/" (or the end of the expression).
    const stepStart = cursor;
    let depth = 0;
    let openQuote: string | null = null;
    scan: for (; cursor < expression.length; cursor += 1) {
      const ch = expression[cursor];
      if (openQuote !== null) {
        if (ch === openQuote) openQuote = null;
        continue;
      }
      switch (ch) {
        case "'":
        case '"':
          openQuote = ch;
          break;
        case "[":
          depth += 1;
          break;
        case "]":
          depth -= 1;
          break;
        case "/":
          if (depth === 0) break scan;
          break;
        default:
          break;
      }
    }

    const stepText = expression.slice(stepStart, cursor).trim();
    if (stepText) {
      const { tag, predicates } = parseStep(stepText);
      parsed.push({ axis: isDescendant ? "desc" : "child", tag, predicates });
    }
  }

  return parsed;
}
/**
 * Pull the contents of each `[...]` group out of a predicate string such as
 * `[@attr='val'][2]`.
 *
 * A `]` inside a single- or double-quoted value does not close the group
 * (e.g. `[@title='a[0]']`). Text outside brackets is ignored, each result is
 * trimmed, and a group left unclosed at the end of the string is returned
 * with whatever follows its `[`.
 *
 * @param str - The bracketed part of an XPath step.
 * @returns Predicate bodies in source order, without the brackets.
 */
function extractPredicates(str: string): string[] {
  const collected: string[] = [];
  let openedAt = -1; // index just past the current "[", or -1 when outside
  let activeQuote = "";

  for (let pos = 0; pos < str.length; pos += 1) {
    const ch = str[pos];

    if (openedAt < 0) {
      if (ch === "[") openedAt = pos + 1;
      continue;
    }

    if (activeQuote) {
      if (ch === activeQuote) activeQuote = "";
      continue;
    }

    if (ch === "'" || ch === '"') {
      activeQuote = ch;
      continue;
    }

    if (ch === "]") {
      collected.push(str.slice(openedAt, pos).trim());
      openedAt = -1;
    }
  }

  // Unterminated group: keep the remainder.
  if (openedAt >= 0) {
    collected.push(str.slice(openedAt).trim());
  }

  return collected;
}
/**
 * Patterns recognised by `parseAtomicPredicate`, compiled once.
 *
 * None of them uses the `g`/`y` flags, so they hold no match state and can be
 * shared between calls. The quoted-string fragment contributes two capture
 * groups (single-quoted, double-quoted); attribute patterns have the
 * attribute name in a capture group before them.
 */
const ATOMIC_PREDICATE_PATTERNS = (() => {
  const attrName = "[a-zA-Z_][\\w.-]*";
  const quoted = "(?:'([^']*)'|\"([^\"]*)\")";
  const textNode = "(?:text\\(\\)|\\.)";
  return {
    index: /^\d+$/,
    normalizedAttrEquals: new RegExp(
      `^normalize-space\\(\\s*@(${attrName})\\s*\\)\\s*=\\s*${quoted}$`,
    ),
    normalizedTextEquals: new RegExp(
      `^normalize-space\\(\\s*${textNode}\\s*\\)\\s*=\\s*${quoted}$`,
    ),
    attrEquals: new RegExp(`^@(${attrName})\\s*=\\s*${quoted}$`),
    attrExists: new RegExp(`^@(${attrName})$`),
    attrContains: new RegExp(
      `^contains\\(\\s*@(${attrName})\\s*,\\s*${quoted}\\s*\\)$`,
    ),
    attrStartsWith: new RegExp(
      `^starts-with\\(\\s*@(${attrName})\\s*,\\s*${quoted}\\s*\\)$`,
    ),
    textEquals: new RegExp(`^${textNode}\\s*=\\s*${quoted}$`),
    textContains: new RegExp(
      `^contains\\(\\s*${textNode}\\s*,\\s*${quoted}\\s*\\)$`,
    ),
  };
})();

/**
 * Parse a single predicate with no `and` / `or` / `not(...)` around it.
 *
 * Recognised forms, tried in this order:
 * - `n` (digits only): positional index, clamped to at least 1
 * - `normalize-space(@attr)='v'`: attribute equality with `normalize: true`
 * - `normalize-space(text())='v'` / `normalize-space(.)='v'`: text equality
 *   with `normalize: true`
 * - `@attr='v'`: attribute equality
 * - `@attr`: attribute existence
 * - `contains(@attr,'v')` / `starts-with(@attr,'v')`
 * - `text()='v'` / `.='v'`: text equality
 * - `contains(text(),'v')` / `contains(.,'v')`: text contains
 *
 * Values may be single- or double-quoted.
 *
 * @param input - Trimmed predicate text without the surrounding brackets.
 * @returns The parsed predicate, or `null` when the form is not recognised.
 */
function parseAtomicPredicate(input: string): XPathPredicate | null {
  const patterns = ATOMIC_PREDICATE_PATTERNS;

  if (patterns.index.test(input)) {
    return { type: "index", index: Math.max(1, Number(input)) };
  }

  const normalizeAttrMatch = input.match(patterns.normalizedAttrEquals);
  if (normalizeAttrMatch) {
    return {
      type: "attrEquals",
      name: normalizeAttrMatch[1],
      value: normalizeAttrMatch[2] ?? normalizeAttrMatch[3] ?? "",
      normalize: true,
    };
  }

  const normalizeTextMatch = input.match(patterns.normalizedTextEquals);
  if (normalizeTextMatch) {
    return {
      type: "textEquals",
      value: normalizeTextMatch[1] ?? normalizeTextMatch[2] ?? "",
      normalize: true,
    };
  }

  const attrEqualsMatch = input.match(patterns.attrEquals);
  if (attrEqualsMatch) {
    return {
      type: "attrEquals",
      name: attrEqualsMatch[1],
      value: attrEqualsMatch[2] ?? attrEqualsMatch[3] ?? "",
    };
  }

  const attrExistsMatch = input.match(patterns.attrExists);
  if (attrExistsMatch) {
    return { type: "attrExists", name: attrExistsMatch[1] };
  }

  const attrContainsMatch = input.match(patterns.attrContains);
  if (attrContainsMatch) {
    return {
      type: "attrContains",
      name: attrContainsMatch[1],
      value: attrContainsMatch[2] ?? attrContainsMatch[3] ?? "",
    };
  }

  const attrStartsMatch = input.match(patterns.attrStartsWith);
  if (attrStartsMatch) {
    return {
      type: "attrStartsWith",
      name: attrStartsMatch[1],
      value: attrStartsMatch[2] ?? attrStartsMatch[3] ?? "",
    };
  }

  const textEqualsMatch = input.match(patterns.textEquals);
  if (textEqualsMatch) {
    return {
      type: "textEquals",
      value: textEqualsMatch[1] ?? textEqualsMatch[2] ?? "",
    };
  }

  const textContainsMatch = input.match(patterns.textContains);
  if (textContainsMatch) {
    return {
      type: "textContains",
      value: textContainsMatch[1] ?? textContainsMatch[2] ?? "",
    };
  }

  // Anything else (bare string literals, position(), last(), ...) is not
  // supported.
  return null;
}
inner : null;
}

/**
 * Reports whether every `(` in `input` has a matching `)` in the right order.
 * Parentheses inside single- or double-quoted text are ignored.
 */
function hasBalancedParens(input: string): boolean {
  let depth = 0;
  let quote: string | null = null;
  for (let i = 0; i < input.length; i += 1) {
    const ch = input[i];
    if (quote) {
      if (ch === quote) quote = null;
      continue;
    }
    if (ch === "'" || ch === '"') {
      quote = ch;
      continue;
    }
    if (ch === "(") depth += 1;
    else if (ch === ")") depth -= 1;
    // A closing paren before its opener can never be balanced.
    if (depth < 0) return false;
  }
  return depth === 0;
}

/** Collapses every whitespace run to a single space and trims both ends. */
const normalizeSpace = (value: string): string =>
  value.replace(/\s+/g, " ").trim();

/** Returns the element's `textContent` as a string ("" when it is null). */
function textValue(element: Element): string {
  return String(element.textContent ?? "");
}

/** Applies `normalizeSpace` only when `normalize` is truthy. */
function normalizeMaybe(value: string, normalize?: boolean): string {
  return normalize ? normalizeSpace(value) : value;
}

/**
 * Tests one element against one parsed predicate.
 *
 * When `predicate.normalize` is set, both the element's value and the
 * predicate's value are whitespace-normalized before comparing.
 * `index` predicates (and any unrecognized type) evaluate to `true` here;
 * positional filtering is done by `applyPredicates`.
 *
 * @param element Element under test.
 * @param predicate Parsed predicate to evaluate.
 * @returns Whether the element satisfies the predicate.
 */
export function evaluatePredicate(
  element: Element,
  predicate: XPathPredicate,
): boolean {
  switch (predicate.type) {
    case "and":
      return predicate.predicates.every((p) => evaluatePredicate(element, p));
    case "or":
      return predicate.predicates.some((p) => evaluatePredicate(element, p));
    case "not":
      return !evaluatePredicate(element, predicate.predicate);
    case "attrExists":
      return element.getAttribute(predicate.name) !== null;
    case "attrEquals": {
      const attr = element.getAttribute(predicate.name);
      // A missing attribute never matches, even against an empty string.
      if (attr === null) return false;
      return (
        normalizeMaybe(attr, predicate.normalize) ===
        normalizeMaybe(predicate.value, predicate.normalize)
      );
    }
    case "attrContains": {
      const attr = element.getAttribute(predicate.name);
      if (attr === null) return false;
      return normalizeMaybe(attr, predicate.normalize).includes(
        normalizeMaybe(predicate.value, predicate.normalize),
      );
    }
    case "attrStartsWith": {
      const attr = element.getAttribute(predicate.name);
      if (attr === null) return false;
      return normalizeMaybe(attr, predicate.normalize).startsWith(
        normalizeMaybe(predicate.value, predicate.normalize),
      );
    }
    case "textEquals": {
      const value = normalizeMaybe(textValue(element), predicate.normalize);
      return value === normalizeMaybe(predicate.value, predicate.normalize);
    }
    case "textContains": {
      const value = normalizeMaybe(textValue(element), predicate.normalize);
      return value.includes(
        normalizeMaybe(predicate.value, predicate.normalize),
      );
    }
    case "index":
      return true;
    default:
      return true;
  }
}

/**
 * Narrows `elements` by applying `predicates` left to right.
 *
 * An `index` predicate keeps only the element at that 1-based position of the
 * list as it stands at that point (or nothing when out of range); every other
 * predicate filters with `evaluatePredicate`.
 *
 * @returns The surviving elements; `[]` as soon as the list becomes empty.
 */
export function applyPredicates(
  elements: Element[],
  predicates: XPathPredicate[],
): Element[] {
  let current = elements;
  for (const predicate of predicates) {
    if (!current.length) return [];
    if (predicate.type === "index") {
      const idx = predicate.index - 1;
      current = idx >= 0 && idx < current.length ? [current[idx]!] : [];
      continue;
    }
    current = current.filter((el) => evaluatePredicate(el, predicate));
  }
  return current;
}

================================================
FILE: packages/core/lib/v3/dom/locatorScripts/xpathResolver.ts
================================================
import {
  applyPredicates,
  parseXPathSteps,
  type XPathStep,
} from "./xpathParser.js";

/** Looks up the shadow root recorded for a host element, or null. */
type ClosedRootGetter = (host: Element) => ShadowRoot | null;

export type XPathResolveOptions = {
  /** Shadow piercing is on unless this is explicitly `false`. */
  pierceShadow?: boolean;
};

type ShadowContext = {
  getClosedRoot: ClosedRootGetter | null;
  hasShadow: boolean;
};

/** Trims the selector and strips a leading, case-insensitive `xpath=` prefix. */
const normalizeXPath = (selector: string): string => {
  const raw = String(selector ?? "").trim();
  if (!raw) return "";
  return raw.replace(/^xpath=/i, "").trim();
};

/** Resolves the first match of `rawXp`; shorthand for index 0. */
export function resolveXPathFirst(
  rawXp: string,
  options?: XPathResolveOptions,
): Element | null {
  return resolveXPathAtIndex(rawXp, 0, options);
}

/**
 * Resolves the match at a zero-based `index` for `rawXp`.
 *
 * Returns `null` for a negative or non-finite index, or an empty selector.
 * Fractional indexes are floored.
 */
export function resolveXPathAtIndex(
  rawXp: string,
  index: number,
  options?: XPathResolveOptions,
): Element | null {
  if (!Number.isFinite(index) || index < 0) return null;
  const xp = normalizeXPath(rawXp);
  if (!xp) return null;
  const targetIndex = Math.floor(index);
  const pierceShadow = options?.pierceShadow !== false;
  const shadowCtx = pierceShadow ?
getShadowContext() : null;
  // Piercing disabled: native evaluation only (an evaluation error yields null).
  if (!pierceShadow) {
    return resolveNativeAtIndexWithError(xp, targetIndex).value;
  }
  // No shadow roots detected: prefer native evaluation and fall back to the
  // composed walker only when native evaluation threw.
  if (!shadowCtx?.hasShadow) {
    const native = resolveNativeAtIndexWithError(xp, targetIndex);
    if (!native.error) return native.value;
    const composed = resolveXPathComposedMatches(xp, shadowCtx?.getClosedRoot);
    return composed[targetIndex] ?? null;
  }
  const composed = resolveXPathComposedMatches(xp, shadowCtx.getClosedRoot);
  return composed[targetIndex] ?? null;
}

/**
 * Counts the matches for `rawXp`, using the same native/composed selection as
 * `resolveXPathAtIndex`. Returns 0 for an empty selector.
 */
export function countXPathMatches(
  rawXp: string,
  options?: XPathResolveOptions,
): number {
  const xp = normalizeXPath(rawXp);
  if (!xp) return 0;
  const pierceShadow = options?.pierceShadow !== false;
  const shadowCtx = pierceShadow ? getShadowContext() : null;
  if (!pierceShadow) {
    return resolveNativeCountWithError(xp).count;
  }
  if (!shadowCtx?.hasShadow) {
    const count = resolveNativeCountWithError(xp);
    if (!count.error) return count.count;
    return resolveXPathComposedMatches(xp, shadowCtx?.getClosedRoot).length;
  }
  return resolveXPathComposedMatches(xp, shadowCtx.getClosedRoot).length;
}

/**
 * Evaluates `rawXp` step by step over the composed tree (light DOM plus
 * shadow roots), starting from `document`.
 *
 * For each step, candidates are gathered from every current root (children
 * for the `child` axis, all descendants otherwise), filtered by tag, then by
 * the step's predicates, and de-duplicated. Predicates, including positional
 * ones, are applied to each root's candidate list separately.
 *
 * @param rawXp XPath selector, optionally prefixed with `xpath=`.
 * @param getClosedRoot Optional lookup for shadow roots that are not exposed
 *   through `element.shadowRoot`.
 * @returns Matching elements; `[]` when the selector is empty, yields no
 *   steps, or any step matches nothing.
 */
export function resolveXPathComposedMatches(
  rawXp: string,
  getClosedRoot?: ClosedRootGetter | null,
): Element[] {
  const xp = normalizeXPath(rawXp);
  if (!xp) return [];
  const steps = parseXPathSteps(xp);
  if (!steps.length) return [];
  const closedRoot = getClosedRoot ?? null;
  let current: Array = [
    document,
  ];
  for (const step of steps) {
    const next: Element[] = [];
    const seen = new Set();
    for (const root of current) {
      if (!root) continue;
      const pool =
        step.axis === "child"
          ? composedChildren(root, closedRoot)
          : composedDescendants(root, closedRoot);
      if (!pool.length) continue;
      const tagMatches = pool.filter((candidate) =>
        matchesTag(candidate, step),
      );
      const matches = applyPredicates(tagMatches, step.predicates);
      for (const candidate of matches) {
        // The same element can be reached from several roots; keep it once.
        if (!seen.has(candidate)) {
          seen.add(candidate);
          next.push(candidate);
        }
      }
    }
    if (!next.length) return [];
    current = next;
  }
  return current as Element[];
}

/** A `*` step matches any element; otherwise `localName` must equal the step tag. */
function matchesTag(element: Element, step: XPathStep): boolean {
  if (step.tag === "*") return true;
  return element.localName === step.tag;
}

/**
 * Collects what is needed to decide whether shadow-aware resolution is required.
 *
 * `getClosedRoot` wraps `window.__stagehandV3__.getClosedRoot` (when present)
 * so that a throw or an undefined result becomes `null`. `hasShadow` is taken
 * from the backdoor's open/closed counters; if those report none (or are
 * unavailable), the document's elements are scanned for an open `shadowRoot`.
 */
function getShadowContext(): ShadowContext {
  const backdoor = window.__stagehandV3__;
  const getClosedRoot: ClosedRootGetter | null =
    backdoor && typeof backdoor.getClosedRoot === "function"
      ? (host: Element): ShadowRoot | null => {
          try {
            return backdoor.getClosedRoot(host) ?? null;
          } catch {
            return null;
          }
        }
      : null;
  let hasShadow = false;
  try {
    if (backdoor && typeof backdoor.stats === "function") {
      const stats = backdoor.stats();
      hasShadow = (stats?.open ?? 0) > 0 || (stats?.closed ?? 0) > 0;
    }
  } catch {
    // ignore stats errors
  }
  if (!hasShadow) {
    try {
      const walker = document.createTreeWalker(
        document,
        NodeFilter.SHOW_ELEMENT,
      );
      while (walker.nextNode()) {
        const el = walker.currentNode as Element;
        if (el.shadowRoot) {
          hasShadow = true;
          break;
        }
      }
    } catch {
      // ignore scan errors
    }
  }
  return { getClosedRoot, hasShadow };
}

/**
 * Returns the element children of `node` in the composed tree.
 *
 * - Document: its `documentElement` only.
 * - ShadowRoot / DocumentFragment: its element children.
 * - Element: its light-DOM children, then the children of its open shadow
 *   root, then the children of the root returned by `getClosedRoot`.
 * - Anything else (or a missing node): an empty list.
 */
function composedChildren(
  node: Node | null | undefined,
  getClosedRoot: ClosedRootGetter | null,
): Element[] {
  const out: Element[] = [];
  if (!node) return out;
  if (node instanceof Document) {
    if (node.documentElement) out.push(node.documentElement);
    return out;
  }
  if (node instanceof ShadowRoot || node instanceof DocumentFragment) {
    out.push(...Array.from(node.children ?? []));
    return out;
  }
  if (node instanceof Element) {
    out.push(...Array.from(node.children ?? []));
    const open = node.shadowRoot;
    if (open) out.push(...Array.from(open.children ?? []));
    if (getClosedRoot) {
      const closed = getClosedRoot(node);
      if (closed) out.push(...Array.from(closed.children ?? []));
    }
    return out;
  }
  return out;
}

/**
 * Returns every composed-tree descendant of `node` in depth-first pre-order,
 * each element at most once. Implemented with an explicit stack; children are
 * pushed in reverse so they are popped in their original order.
 */
function composedDescendants(
  node: Node | null | undefined,
  getClosedRoot: ClosedRootGetter | null,
): Element[] {
  const out: Element[] = [];
  const seen = new Set();
  const stack = [...composedChildren(node, getClosedRoot)].reverse();
  while (stack.length) {
    const next = stack.pop();
    if (!next || seen.has(next)) continue;
    seen.add(next);
    out.push(next);
    const children = composedChildren(next, getClosedRoot);
    for (let i = children.length - 1; i >= 0; i -= 1) {
      stack.push(children[i]!);
    }
  }
  return out;
}

/**
 * Evaluates `xp` with `document.evaluate` and returns the snapshot item at
 * `index`. `error` is true only when evaluation threw.
 */
function resolveNativeAtIndexWithError(
  xp: string,
  index: number,
): { value: Element | null; error: boolean } {
  try {
    const snapshot = document.evaluate(
      xp,
      document,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null,
    );
    return {
      // NOTE(review): the snapshot item is cast to Element without a node-type check.
      value: snapshot.snapshotItem(index) as Element | null,
      error: false,
    };
  } catch {
    return { value: null, error: true };
  }
}

/**
 * Evaluates `xp` with `document.evaluate` and returns the snapshot length.
 * `error` is true only when evaluation threw (count is then 0).
 */
function resolveNativeCountWithError(xp: string): {
  count: number;
  error: boolean;
} {
  try {
    const snapshot = document.evaluate(
      xp,
      document,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null,
    );
    return { count: snapshot.snapshotLength, error: false };
  } catch {
    return { count: 0, error: true };
  }
}

================================================
FILE: packages/core/lib/v3/dom/piercer.entry.ts
================================================
import { installV3ShadowPiercer } from "./piercer.runtime.js";

// Installs the attachShadow patch as a side effect of loading this entry module.
installV3ShadowPiercer({ debug: true, tagExisting: false });

================================================
FILE: packages/core/lib/v3/dom/piercer.runtime.ts
================================================
/** Options accepted by `installV3ShadowPiercer`. */
export interface V3ShadowPatchOptions {
  debug?: boolean;
  /** When true, open shadow roots already in the document are recorded at install time. */
  tagExisting?: boolean;
}

/** API exposed on `window.__stagehandV3__` once the piercer is installed. */
export interface StagehandV3Backdoor {
  /** Closed shadow-root accessors */
  getClosedRoot(host: Element): ShadowRoot | undefined;
  /** Stats + quick health check */
  stats(): {
installed: true;
    url: string;
    isTop: boolean;
    open: number;
    closed: number;
  };
}

/** State shared between the patched `attachShadow` and the window backdoor. */
type V3InternalState = {
  hostToRoot: WeakMap;
  openCount: number;
  closedCount: number;
  debug: boolean;
};

declare global {
  interface Window {
    __stagehandV3Injected?: boolean;
    __stagehandV3__?: StagehandV3Backdoor;
  }
}

/**
 * Patches `Element.prototype.attachShadow` so every shadow root created from
 * now on is recorded against its host, and exposes the records through
 * `window.__stagehandV3__`.
 *
 * Calling this again when the prototype already carries the patched function
 * reuses the existing state and only rebinds the backdoor.
 *
 * Note: every attached root is recorded, open or closed, so `getClosedRoot`
 * returns whichever root was recorded for the host.
 *
 * @param opts `tagExisting` records open roots already present in the
 *   document. `opts.debug` is currently not read: logging is hard-coded on.
 */
export function installV3ShadowPiercer(opts: V3ShadowPatchOptions = {}): void {
  // hardcoded debug (remove later if desired)
  const DEBUG = true;

  type PatchedFn = Element["attachShadow"] & {
    __v3Patched?: boolean;
    __v3State?: V3InternalState;
  };

  // (Re)creates the window backdoor as a view over the given shared state.
  const bindBackdoor = (state: V3InternalState): void => {
    const { hostToRoot } = state;
    window.__stagehandV3__ = {
      getClosedRoot: (host: Element) => hostToRoot.get(host),
      stats: () => ({
        installed: true,
        url: location.href,
        isTop: window.top === window,
        open: state.openCount,
        closed: state.closedCount,
      }),
    } satisfies StagehandV3Backdoor;
  };

  // Look at the *current* function on the prototype. If it's already our patched
  // function, reuse its shared state and rebind the backdoor (no new WeakMap).
  const currentFn = Element.prototype.attachShadow as PatchedFn;
  if (currentFn.__v3Patched && currentFn.__v3State) {
    currentFn.__v3State.debug = DEBUG; // keep debug toggle consistent
    bindBackdoor(currentFn.__v3State);
    // idempotent: do not log "installed" again
    return;
  }

  // First-time install: create shared state and replace the prototype method
  const state: V3InternalState = {
    hostToRoot: new WeakMap(),
    openCount: 0,
    closedCount: 0,
    debug: DEBUG,
  };

  const original = currentFn; // keep a reference to call through
  const patched: PatchedFn = function (
    this: Element,
    init: ShadowRootInit,
  ): ShadowRoot {
    const mode = init?.mode ?? "open";
    // The original call happens outside the try block, so its errors still reach the caller.
    const root = original.call(this, init);
    try {
      state.hostToRoot.set(this, root);
      if (mode === "closed") state.closedCount++;
      else state.openCount++;
      if (state.debug) {
        console.info("[v3-piercer] attachShadow", {
          tag: (this as Element).tagName?.toLowerCase() ?? "",
          mode,
          url: location.href,
        });
      }
    } catch {
      // bookkeeping and logging failures must not affect the page
    }
    return root;
  } as PatchedFn;

  // Mark the *patched* function with metadata so re-entry sees it
  patched.__v3Patched = true;
  patched.__v3State = state;

  Object.defineProperty(Element.prototype, "attachShadow", {
    configurable: true,
    writable: true,
    value: patched,
  });

  // Optionally tag existing open roots (closed cannot be discovered post-hoc)
  if (opts.tagExisting) {
    try {
      const walker = document.createTreeWalker(
        document,
        NodeFilter.SHOW_ELEMENT,
      );
      while (walker.nextNode()) {
        const el = walker.currentNode as Element;
        if (el.shadowRoot) {
          state.hostToRoot.set(el, el.shadowRoot);
          state.openCount++;
        }
      }
    } catch {
      // ignore scan errors
    }
  }

  window.__stagehandV3Injected = true;
  bindBackdoor(state);

  if (state.debug) {
    console.info("[v3-piercer] installed", {
      url: location.href,
      isTop: window.top === window,
      readyState: document.readyState,
    });
  }
}

================================================
FILE: packages/core/lib/v3/dom/rerenderMissingShadows.entry.ts
================================================
import { rerenderMissingShadowHosts } from "./rerenderMissingShadows.runtime.js";

rerenderMissingShadowHosts();

================================================
FILE: packages/core/lib/v3/dom/rerenderMissingShadows.runtime.ts
================================================
/**
 * Finds registered custom elements that have no shadow root known to the page
 * or to the piercer, and replaces each with a deep clone of itself.
 *
 * Does nothing when the piercer backdoor is not installed. Errors are never
 * thrown to the caller: per-element failures are ignored and any other
 * failure is reported with `console.info`.
 *
 * NOTE(review): presumably the clone is meant to make the element run its
 * setup again so `attachShadow` goes through the patched hook; confirm.
 */
export function rerenderMissingShadowHosts(): void {
  try {
    const piercer = window.__stagehandV3__;
    if (!piercer || typeof piercer.getClosedRoot !== "function") return;

    const needsReset: Element[] = [];
    const walker = document.createTreeWalker(document, NodeFilter.SHOW_ELEMENT);
    while (walker.nextNode()) {
      const el = walker.currentNode as Element;
      const tag = el.tagName?.toLowerCase() ?? "";
      // Only hyphenated tags that are registered custom elements are considered.
      if (!tag.includes("-")) continue;
      if (typeof customElements?.get !== "function") continue;
      if (!customElements.get(tag)) continue;
      const hasOpen = !!el.shadowRoot;
      const hasClosed = !!piercer.getClosedRoot(el);
      if (hasOpen || hasClosed) continue;
      needsReset.push(el);
    }

    // Replacement happens after the walk so the tree is not mutated mid-traversal.
    for (const host of needsReset) {
      try {
        const clone = host.cloneNode(true);
        host.replaceWith(clone);
      } catch {
        // ignore individual failures
      }
    }

    if (piercer.stats && needsReset.length) {
      console.info("[v3-piercer] rerender", { count: needsReset.length });
    }
  } catch (err) {
    console.info("[v3-piercer] rerender error", { message: String(err ?? "") });
  }
}

================================================
FILE: packages/core/lib/v3/dom/screenshotScripts/index.ts
================================================
export { resolveMaskRect } from "./resolveMaskRect.js";

================================================
FILE: packages/core/lib/v3/dom/screenshotScripts/resolveMaskRect.ts
================================================
/** Rectangle to mask, plus the token of the top-layer root it is relative to (if any). */
export type MaskRect = {
  x: number;
  y: number;
  width: number;
  height: number;
  rootToken?: string | null;
};

/**
 * Computes the mask rectangle for the element bound as `this`.
 *
 * Returns `null` when there is no element, when its computed style is
 * `visibility: hidden` or `display: none`, or when its bounding box has no area.
 *
 * If the element sits inside an open `<dialog>` or an open popover, the
 * coordinates are relative to that root (adjusted for its border and scroll
 * offsets). When `maskToken` is given, the root is tagged with a
 * `data-stagehand-mask-root` attribute whose value is returned as `rootToken`;
 * an existing value starting with `maskToken` is reused.
 * Otherwise the coordinates are page coordinates (viewport rect plus window
 * scroll) and `rootToken` is `null`.
 *
 * @param maskToken Optional prefix used to tag the top-layer root.
 */
export function resolveMaskRect(
  this: Element | null,
  maskToken?: string,
): MaskRect | null {
  // `closest` that yields null on an unsupported/invalid selector.
  function safeClosest(el: Element | null, selector: string): Element | null {
    try {
      return el && typeof el.closest === "function"
        ? el.closest(selector)
        : null;
    } catch {
      return null;
    }
  }

  // `matches` that yields false on an unsupported/invalid selector.
  function safeMatches(el: Element | null, selector: string): boolean {
    try {
      return !!el && typeof el.matches === "function" && el.matches(selector);
    } catch {
      return false;
    }
  }

  // Nearest open dialog, else nearest `[popover]` ancestor that is currently open.
  function findTopLayerRoot(el: Element | null): Element | null {
    const dialog = safeClosest(el, "dialog[open]");
    if (dialog) return dialog;
    const popover = safeClosest(el, "[popover]");
    if (popover && safeMatches(popover, ":popover-open")) return popover;
    return null;
  }

  if (!this || typeof this.getBoundingClientRect !== "function") return null;
  const rect = this.getBoundingClientRect();
  if (!rect) return null;
  const style = window.getComputedStyle(this);
  if (!style) return null;
  if (style.visibility === "hidden" || style.display === "none") return null;
  if (rect.width <= 0 || rect.height <= 0) return null;

  const root = findTopLayerRoot(this);
  if (root) {
    const rootRect = root.getBoundingClientRect();
    if (!rootRect) return null;
    let rootToken: string | null = null;
    if (maskToken) {
      try {
        const existing = root.getAttribute("data-stagehand-mask-root");
        if (existing && existing.startsWith(maskToken)) {
          rootToken = existing;
        } else {
          rootToken =
            maskToken + "_root_" + Math.random().toString(36).slice(2);
          root.setAttribute("data-stagehand-mask-root", rootToken);
        }
      } catch {
        rootToken = null;
      }
    }
    return {
      x:
        rect.left - rootRect.left - (root.clientLeft || 0) + (root.scrollLeft || 0),
      y: rect.top - rootRect.top - (root.clientTop || 0) + (root.scrollTop || 0),
      width: rect.width,
      height: rect.height,
      rootToken,
    };
  }

  return {
    x: rect.left + window.scrollX,
    y: rect.top + window.scrollY,
    width: rect.width,
    height: rect.height,
    rootToken: null,
  };
}

================================================
FILE: packages/core/lib/v3/external_clients/aisdk.ts
================================================
import {
  CoreAssistantMessage,
  ModelMessage,
  CoreSystemMessage,
  Tool,
  CoreUserMessage,
  generateObject,
  generateText,
  ImagePart,
  TextPart,
} from "ai";
import type { LanguageModelV2 } from "@ai-sdk/provider";
import { CreateChatCompletionOptions, LLMClient } from "../llm/LLMClient.js";
import { AvailableModel } from "../types/public/index.js";
import { ChatCompletion } from "openai/resources";

/**
 * LLM client that routes chat completions through an AI SDK language model.
 */
export class AISdkClient extends LLMClient {
  public type = "aisdk" as const;
  private model: LanguageModelV2;

  /** @param model AI SDK model; its `modelId` is passed to the base client. */
  constructor({ model }: { model: LanguageModelV2 }) {
    super(model.modelId as AvailableModel);
    this.model = model;
  }

  /**
   * Runs one completion.
   *
   * Messages are converted to AI SDK messages first:
   *  - system messages with array content become the newline-joined text parts;
   *  - user messages keep text and image parts;
   *  - assistant messages are text-only (image parts become "[Image]");
   *  - string content is passed through unchanged.
   *
   * With `options.response_model`, `generateObject` is called with its schema
   * and `data` is the generated object. Otherwise `generateText` is called
   * with the declared tools (none when `options.tools` is absent) and `data`
   * is the response text.
   *
   * @returns `{ data, usage }`, with every missing usage counter reported as 0.
   */
  async createChatCompletion({
    options,
  }: CreateChatCompletionOptions): Promise {
    const formattedMessages: ModelMessage[] = options.messages.map(
      (message) => {
        if (Array.isArray(message.content)) {
          if (message.role === "system") {
            const systemMessage: CoreSystemMessage = {
              role: "system",
              content: message.content
                .map((c) => ("text" in c ? c.text : ""))
                .join("\n"),
            };
            return systemMessage;
          }

          const contentParts = message.content.map((content) => {
            if ("image_url" in content) {
              const imageContent: ImagePart = {
                type: "image",
                image: content.image_url.url,
              };
              return imageContent;
            } else {
              const textContent: TextPart = {
                type: "text",
                text: content.text,
              };
              return textContent;
            }
          });

          if (message.role === "user") {
            const userMessage: CoreUserMessage = {
              role: "user",
              content: contentParts,
            };
            return userMessage;
          } else {
            // Assistant content is reduced to text; images are replaced by a placeholder.
            const textOnlyParts = contentParts.map((part) => ({
              type: "text" as const,
              text: part.type === "image" ? "[Image]" : part.text,
            }));
            const assistantMessage: CoreAssistantMessage = {
              role: "assistant",
              content: textOnlyParts,
            };
            return assistantMessage;
          }
        }

        return {
          role: message.role,
          content: message.content,
        };
      },
    );

    if (options.response_model) {
      const response = await generateObject({
        model: this.model,
        messages: formattedMessages,
        schema: options.response_model.schema,
      });

      return {
        data: response.object,
        usage: {
          prompt_tokens: response.usage.inputTokens ?? 0,
          completion_tokens: response.usage.outputTokens ?? 0,
          reasoning_tokens: response.usage.reasoningTokens ?? 0,
          cached_input_tokens: response.usage.cachedInputTokens ?? 0,
          total_tokens: response.usage.totalTokens ?? 0,
        },
      } as T;
    }

    const tools: Record = {};
    // `tools` may be omitted by the caller; iterating `undefined` would throw.
    for (const rawTool of options.tools ?? []) {
      tools[rawTool.name] = {
        description: rawTool.description,
        inputSchema: rawTool.parameters,
      } as Tool;
    }

    const response = await generateText({
      model: this.model,
      messages: formattedMessages,
      tools,
    });

    return {
      data: response.text,
      usage: {
        prompt_tokens: response.usage.inputTokens ?? 0,
        completion_tokens: response.usage.outputTokens ?? 0,
        reasoning_tokens: response.usage.reasoningTokens ?? 0,
        cached_input_tokens: response.usage.cachedInputTokens ?? 0,
        total_tokens: response.usage.totalTokens ?? 0,
      },
    } as T;
  }
}

================================================
FILE: packages/core/lib/v3/external_clients/customOpenAI.ts
================================================
/**
 * Welcome to the Stagehand custom OpenAI client!
 *
 * This is a client for models that are compatible with the OpenAI API, like Ollama, Gemini, etc.
 * You can just pass in an OpenAI instance to the client and it will work.
*/

import type { AvailableModel } from "../types/public/model.js";
import { CreateChatCompletionOptions, LLMClient } from "../llm/LLMClient.js";
import OpenAI from "openai";
import type {
  ChatCompletion,
  ChatCompletionAssistantMessageParam,
  ChatCompletionContentPartImage,
  ChatCompletionContentPartText,
  ChatCompletionCreateParamsNonStreaming,
  ChatCompletionMessageParam,
  ChatCompletionSystemMessageParam,
  ChatCompletionUserMessageParam,
} from "openai/resources/chat/completions";
import { toJsonSchema } from "../zodCompat.js";
import { validateZodSchema } from "../../utils.js";
import {
  CreateChatCompletionResponseError,
  ZodSchemaValidationError,
} from "../types/public/sdkErrors.js";

/** LLM client that sends chat completions through a caller-supplied OpenAI SDK instance. */
export class CustomOpenAIClient extends LLMClient {
  public type = "openai" as const;
  private client: OpenAI;

  /**
   * @param modelName Model identifier sent with every request.
   * @param client OpenAI SDK instance used to issue the requests.
   */
  constructor({ modelName, client }: { modelName: string; client: OpenAI }) {
    super(modelName as AvailableModel);
    this.client = client;
    this.modelName = modelName as AvailableModel;
  }

  /**
   * Runs one non-streaming chat completion.
   *
   * With `options.response_model`, the request asks for a JSON object and a
   * trailing user message carries the JSON schema; the reply is parsed and
   * validated. A parse or validation failure retries the whole call while
   * `retries` is above 0, then throws `CreateChatCompletionResponseError`.
   * Without a response model, `data` is the first choice's message content.
   *
   * `options.image` is not sent to the model; a warning is printed instead.
   *
   * @param retries Remaining retry attempts for invalid structured output (default 3).
   * @returns `{ data, usage }`, with missing usage counters reported as 0.
   * @throws CreateChatCompletionResponseError when a structured response is
   *   empty, or is still invalid once retries are exhausted.
   */
  async createChatCompletion({
    options,
    retries = 3,
    logger,
  }: CreateChatCompletionOptions): Promise {
    const { image, requestId, ...optionsWithoutImageAndRequestId } = options;

    // TODO: Implement vision support
    if (image) {
      console.warn(
        "Image provided. Vision is not currently supported for openai",
      );
    }

    logger({
      category: "openai",
      message: "creating chat completion",
      level: 1,
      auxiliary: {
        options: {
          value: JSON.stringify({
            ...optionsWithoutImageAndRequestId,
            requestId,
          }),
          type: "object",
        },
        modelName: {
          value: this.modelName,
          type: "string",
        },
      },
    });

    let responseFormat:
      | ChatCompletionCreateParamsNonStreaming["response_format"]
      | undefined;
    if (options.response_model) {
      responseFormat = {
        type: "json_object",
      };
    }

    /* eslint-disable */
    // Remove unsupported options
    const { response_model, ...openaiOptions } = {
      ...optionsWithoutImageAndRequestId,
      model: this.modelName,
    };

    logger({
      category: "openai",
      message: "creating chat completion",
      level: 1,
      auxiliary: {
        openaiOptions: {
          value: JSON.stringify(openaiOptions),
          type: "object",
        },
      },
    });

    // Convert messages to the OpenAI shapes; system and assistant messages keep text parts only.
    const formattedMessages: ChatCompletionMessageParam[] =
      options.messages.map((message) => {
        if (Array.isArray(message.content)) {
          const contentParts = message.content.map((content) => {
            if ("image_url" in content) {
              const imageContent: ChatCompletionContentPartImage = {
                image_url: {
                  url: content.image_url.url,
                },
                type: "image_url",
              };
              return imageContent;
            } else {
              const textContent: ChatCompletionContentPartText = {
                text: content.text,
                type: "text",
              };
              return textContent;
            }
          });

          if (message.role === "system") {
            const formattedMessage: ChatCompletionSystemMessageParam = {
              ...message,
              role: "system",
              content: contentParts.filter(
                (content): content is ChatCompletionContentPartText =>
                  content.type === "text",
              ),
            };
            return formattedMessage;
          } else if (message.role === "user") {
            const formattedMessage: ChatCompletionUserMessageParam = {
              ...message,
              role: "user",
              content: contentParts,
            };
            return formattedMessage;
          } else {
            const formattedMessage: ChatCompletionAssistantMessageParam = {
              ...message,
              role: "assistant",
              content: contentParts.filter(
                (content): content is ChatCompletionContentPartText =>
                  content.type === "text",
              ),
            };
            return formattedMessage;
          }
        }

        return {
          ...message,
          content: message.content,
        } as ChatCompletionMessageParam;
      });

    // The schema is sent as a prompt instruction; the request itself only asks for `json_object`.
    if (options.response_model) {
      const schemaJson = JSON.stringify(
        toJsonSchema(options.response_model.schema),
        null,
        2,
      );
      formattedMessages.push({
        role: "user",
        content: `Respond with valid JSON matching this schema:\n${schemaJson}\n\nDo not include any other text, formatting or markdown in your output. Do not include \`\`\` or \`\`\`json in your response. Only the JSON object itself.`,
      });
    }

    const body: ChatCompletionCreateParamsNonStreaming = {
      ...openaiOptions,
      model: this.modelName,
      messages: formattedMessages,
      response_format: responseFormat,
      stream: false,
      tools: options.tools?.map((tool) => ({
        function: {
          name: tool.name,
          description: tool.description,
          parameters: tool.parameters,
        },
        type: "function",
      })),
    };

    const response = await this.client.chat.completions.create(body);

    logger({
      category: "openai",
      message: "response",
      level: 1,
      auxiliary: {
        response: {
          value: JSON.stringify(response),
          type: "object",
        },
        requestId: {
          value: requestId,
          type: "string",
        },
      },
    });

    if (options.response_model) {
      const extractedData = response.choices[0].message.content;
      if (!extractedData) {
        throw new CreateChatCompletionResponseError("No content in response");
      }

      let parsedData: unknown;
      try {
        parsedData = JSON.parse(extractedData);
        validateZodSchema(options.response_model.schema, parsedData);
      } catch (e) {
        const isParseError = e instanceof SyntaxError;
        logger({
          category: "openai",
          message: isParseError
            ? "Response is not valid JSON"
            : "Response failed Zod schema validation",
          level: 0,
        });

        // Retry the whole request while attempts remain.
        if (retries > 0) {
          return this.createChatCompletion({
            options,
            logger,
            retries: retries - 1,
          });
        }

        if (e instanceof ZodSchemaValidationError) {
          logger({
            category: "openai",
            message: `Error during chat completion: ${e.message}`,
            level: 0,
            auxiliary: {
              errorDetails: {
                value: `Message: ${e.message}${e.stack ? "\nStack: " + e.stack : ""}`,
                type: "string",
              },
              requestId: { value: requestId, type: "string" },
            },
          });
          throw new CreateChatCompletionResponseError(e.message);
        }

        throw new CreateChatCompletionResponseError(
          isParseError
            ? "Failed to parse model response as JSON"
            : e instanceof Error
              ? e.message
              : "Unknown error during response processing",
        );
      }

      return {
        data: parsedData,
        usage: {
          prompt_tokens: response.usage?.prompt_tokens ?? 0,
          completion_tokens: response.usage?.completion_tokens ?? 0,
          total_tokens: response.usage?.total_tokens ?? 0,
        },
      } as T;
    }

    return {
      data: response.choices[0].message.content,
      usage: {
        prompt_tokens: response.usage?.prompt_tokens ?? 0,
        completion_tokens: response.usage?.completion_tokens ?? 0,
        total_tokens: response.usage?.total_tokens ?? 0,
      },
    } as T;
  }
}

================================================
FILE: packages/core/lib/v3/flowlogger/EventEmitter.ts
================================================
import { EventEmitter } from "node:events";

type WildcardEventListener = (...args: unknown[]) => void;

/**
 * EventEmitter whose `"*"` event name subscribes to every emitted event.
 *
 * Only `on`, `off` and `emit` are overridden. Wildcard listeners receive the
 * event arguments but not the event name, and run after the regular listeners.
 */
export class EventEmitterWithWildcardSupport extends EventEmitter {
  private readonly wildcardListeners = new Set();

  /** Registers a listener; `"*"` adds it to the wildcard set instead of the base emitter. */
  override on(
    eventName: string | symbol,
    listener: (...args: unknown[]) => void,
  ): this {
    if (eventName === "*") {
      this.wildcardListeners.add(listener);
      return this;
    }
    return super.on(eventName, listener);
  }

  /** Removes a listener; `"*"` removes it from the wildcard set. */
  override off(
    eventName: string | symbol,
    listener: (...args: unknown[]) => void,
  ): this {
    if (eventName === "*") {
      this.wildcardListeners.delete(listener);
      return this;
    }
    return super.off(eventName, listener);
  }

  /**
   * Emits to the regular listeners, then to every wildcard listener.
   * @returns true when any regular or wildcard listener exists for the event.
   */
  override emit(eventName: string | symbol, ...args: unknown[]): boolean {
    const handled = super.emit(eventName, ...args);
    for (const listener of this.wildcardListeners) {
      listener(...args);
    }
    return handled || this.wildcardListeners.size > 0;
  }
}

================================================
FILE: packages/core/lib/v3/flowlogger/EventSink.ts
================================================
import fs from "node:fs";
import path from "node:path";
import { FlowEvent } from "./FlowLogger.js";
import type { EventStoreApi, EventStoreQuery } from "./EventStore.js";
import {
  prettifyColorStderrLine,
  prettifyEvent,
  prettifyIsCdpEvent,
  prettifySanitizeEvent,
} from "./prettify.js";

// =============================================================================
// Event Sink Contracts
// =============================================================================

// Destination for flow events: accepts events, optionally answers queries, and can be shut down.
export interface EventSink {
  emit(event: FlowEvent): Promise;
  query(query: EventStoreQuery): Promise;
  destroy(): Promise;
}

// Checks whether an event matches a query used by queryable sinks. `eventId` matches both the event itself and descendants of that event.
// Every criterion that is set must match. `eventType` is compared as a whole-string pattern in which `*` matches any run of characters.
function matchesEventStoreQuery(
  event: FlowEvent,
  query: EventStoreQuery,
): boolean {
  if (query.sessionId && event.sessionId !== query.sessionId) return false;
  if (query.eventId) {
    const matchesEvent =
      event.eventId === query.eventId ||
      event.eventParentIds.includes(query.eventId);
    if (!matchesEvent) {
      return false;
    }
  }
  if (query.eventType) {
    // Escape regex metacharacters first, then turn each escaped `*` back into `.*`.
    const pattern = new RegExp(
      `^${query.eventType
        .replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
        .replace(/\\\*/g, ".*")}$`,
    );
    if (!pattern.test(event.eventType)) {
      return false;
    }
  }
  return true;
}

// =============================================================================
// File Sink Helpers
// =============================================================================

// Returns true when a file sink's stream is still open and writable.
function isWritable(stream: fs.WriteStream | null): stream is fs.WriteStream {
  return !!(stream && !stream.destroyed && stream.writable);
}

// Writes a serialized event to a file sink and converts callback-style stream completion into a promise.
function writeToStream(stream: fs.WriteStream, value: string): Promise {
  return new Promise((resolve, reject) => {
    try {
      stream.write(value, (error?: Error | null) => {
        if (error) {
          reject(error);
          return;
        }
        resolve();
      });
    } catch (error) {
      // A synchronous throw from `write` is reported the same way as a callback error.
      reject(error);
    }
  });
}

// =============================================================================
// Event Sink Implementations
// =============================================================================

// Base class for sinks that append serialized events to one file inside the session directory.
// Subclasses decide the file name and how an event is serialized.
abstract class FileEventSink implements EventSink {
  private readonly streamPromise: Promise; // Lazily opens the one file stream owned by this sink when the session directory resolves. Never rejects: resolves to `null` when no stream is available.

  // Creates a best-effort file sink bound to a single session directory.
  constructor(sessionDirPromise: Promise, fileName: string) {
    this.streamPromise = sessionDirPromise
      .then((sessionDir) => {
        if (!sessionDir) {
          return null;
        }
        const stream = fs.createWriteStream(path.join(sessionDir, fileName), {
          flags: "a",
        });
        // A stream "error" event with no listener is thrown as an uncaught
        // exception. Swallow it here: after an error the stream is destroyed,
        // so `isWritable()` makes later emits no-ops.
        stream.on("error", () => {
          // best effort only
        });
        return stream;
      })
      // A rejected session-directory promise must not become an unhandled
      // rejection when nothing has awaited the stream yet.
      .catch((): null => null);
  }

  // Converts an event to the text appended to the file; a falsy result skips the event.
  protected abstract serialize(event: FlowEvent): Promise;

  // Serializes and appends a single event. File sinks are intentionally best-effort and never allowed to affect library execution flow.
  async emit(event: FlowEvent): Promise {
    try {
      const stream = await this.streamPromise;
      if (!isWritable(stream)) {
        return;
      }
      const serialized = await this.serialize(event);
      if (!serialized) {
        return;
      }
      await writeToStream(stream, serialized);
    } catch {
      // best effort only
    }
  }

  // File sinks are write-only and do not support query reads.
  async query(): Promise {
    return [];
  }

  // Closes the underlying file stream when the owning store shuts down.
  async destroy(): Promise {
    const stream = await this.streamPromise.catch((): null => null);
    if (!isWritable(stream)) {
      return;
    }
    await new Promise((resolve) => {
      stream.end(resolve);
    });
  }
}

export class JsonlFileEventSink extends FileEventSink {
  // Writes full verbatim events to `session_events.jsonl`.
  constructor(sessionDirPromise: Promise) {
    super(sessionDirPromise, "session_events.jsonl");
  }

  // Serializes the full event for lossless machine-readable storage.
  protected async serialize(event: FlowEvent): Promise {
    return `${JSON.stringify(event)}\n`;
  }
}

export class PrettyLogFileEventSink extends FileEventSink {
  // Writes human-readable pretty lines to `session_events.log`.
  constructor(
    sessionDirPromise: Promise,
    private readonly store: Pick, // Queried during prettification so each line can recover recent ancestry tags.
  ) {
    super(sessionDirPromise, "session_events.log");
  }

  // Pretty-prints the event using recent in-memory ancestry.
  protected async serialize(event: FlowEvent): Promise {
    const line = await prettifyEvent(this.store, prettifySanitizeEvent(event));
    return line ? `${line}\n` : null;
  }
}

export class PrettyStderrEventSink implements EventSink {
  // Writes pretty lines to stderr for verbose local debugging. CDP events are intentionally omitted here to keep stderr high-signal.
  constructor(private readonly store: Pick) {} // Queried during prettification so stderr lines can include recent ancestry tags.

  // Best-effort stderr writer used only for interactive debugging output.
  async emit(event: FlowEvent): Promise {
    try {
      if (prettifyIsCdpEvent(event)) {
        return;
      }
      const line = await prettifyEvent(
        this.store,
        prettifySanitizeEvent(event),
      );
      if (!line) {
        return;
      }
      await new Promise((resolve, reject) => {
        try {
          process.stderr.write(
            `${prettifyColorStderrLine(line)}\n`,
            (error?: Error | null) => {
              if (error) {
                reject(error);
                return;
              }
              resolve();
            },
          );
        } catch (error) {
          reject(error);
        }
      });
    } catch {
      // best effort only
    }
  }

  // Stderr sink is write-only and does not support query reads.
  async query(): Promise {
    return [];
  }

  // No teardown is required for stderr.
  async destroy(): Promise {}
}

export class InMemoryEventSink implements EventSink {
  // Retains recent events for query lookups. Tests usually attach this sink explicitly when they need full historical payloads.
  constructor(protected readonly limit = Infinity) {}

  protected readonly events: FlowEvent[] = []; // Retained history; `emit()` appends to it and trims old entries when `limit` is exceeded.

  // Gives subclasses a hook to transform events before they are retained.
  protected storeEvent(event: FlowEvent): FlowEvent {
    return event;
  }

  // Stores a new event and trims the oldest retained entries once the sink exceeds its configured limit.
  async emit(event: FlowEvent): Promise {
    this.events.push(this.storeEvent(event));
    if (this.events.length > this.limit) {
      this.events.splice(0, this.events.length - this.limit);
    }
  }

  // Returns retained events that match the query, ordered by creation time.
  // Ties are broken by `eventId`; a truthy `query.limit` keeps only the last `limit` matches.
  async query(query: EventStoreQuery): Promise {
    const filtered = this.events.filter((event) =>
      matchesEventStoreQuery(event, query),
    );
    filtered.sort((left, right) => {
      const createdAtOrder = left.eventCreatedAt.localeCompare(
        right.eventCreatedAt,
      );
      if (createdAtOrder !== 0) {
        return createdAtOrder;
      }
      return left.eventId.localeCompare(right.eventId);
    });
    return query.limit ? filtered.slice(-query.limit) : filtered;
  }

  // Clears retained history when the owning store shuts down.
  async destroy(): Promise {
    this.events.length = 0;
  }
}

export class ShallowInMemoryEventSink extends InMemoryEventSink {
  // Retains only ancestry metadata for the default query sink so verbose or long-running sessions do not hold onto large payloads such as screenshots.
  protected override storeEvent(event: FlowEvent): FlowEvent {
    return new FlowEvent({
      eventType: event.eventType,
      eventId: event.eventId,
      eventCreatedAt: event.eventCreatedAt,
      sessionId: event.sessionId,
      eventParentIds: [...event.eventParentIds],
      data: {},
    });
  }
}

================================================
FILE: packages/core/lib/v3/flowlogger/EventStore.ts
================================================
import fs from "node:fs";
import path from "node:path";
import type { V3Options } from "../types/public/index.js";
import {
  EventSink,
  JsonlFileEventSink,
  PrettyLogFileEventSink,
  PrettyStderrEventSink,
  ShallowInMemoryEventSink,
} from "./EventSink.js";
import { FlowEvent } from "./FlowLogger.js";

const DEFAULT_IN_MEMORY_EVENT_LIMIT = 500; // Per-session ancestry window retained by the default shallow query sink.
const CONFIG_DIR = process.env.BROWSERBASE_CONFIG_DIR || ""; // Base directory for session metadata + file-backed flow logs.
const FLOW_LOGS_ENABLED = process.env.BROWSERBASE_FLOW_LOGS === "1"; // Force-enables the pretty stderr flow sink even when `verbose !== 2`.
const SENSITIVE_KEYS =
  /key|secret|token|api-key|apikey|api_key|password|passwd|pwd|credential|auth/i; // Redacts obvious secrets before session options are written to disk.
// =============================================================================
// Public Contracts
// =============================================================================

// Filter accepted by `query()`. Every field is optional; `limit` keeps only the most recent matches.
export interface EventStoreQuery {
  sessionId?: string;
  eventId?: string;
  eventType?: string;
  limit?: number;
}

// Surface shared by the event store and anything that needs to emit into it or read history back out of it.
export interface EventStoreApi {
  readonly sessionId: string;
  emit(event: FlowEvent): Promise<void>;
  query(query: EventStoreQuery): Promise<FlowEvent[]>;
  destroy(): Promise<void>;
}

// =============================================================================
// Filesystem Helpers
// =============================================================================

// Redacts secrets before session options are written to `session.json` inside a config-dir-backed session directory.
// Returns a deep plain-object copy: any property whose key matches `SENSITIVE_KEYS` is replaced with "******",
// arrays and nested objects are sanitized recursively, and primitives/functions are passed through untouched.
function sanitizeOptions(options: V3Options): Record<string, unknown> {
  // Objects currently being walked on the active recursion path. Options may carry live client instances
  // that reference themselves; without this guard the recursion would never terminate.
  const ancestors = new WeakSet<object>();

  const sanitize = (value: unknown): unknown => {
    if (typeof value !== "object" || value === null) return value;
    if (ancestors.has(value)) return "[Circular]";

    ancestors.add(value);
    try {
      if (Array.isArray(value)) return value.map((entry) => sanitize(entry));

      const result: Record<string, unknown> = {};
      for (const [key, entry] of Object.entries(value)) {
        result[key] = SENSITIVE_KEYS.test(key) ? "******" : sanitize(entry);
      }
      return result;
    } finally {
      // Only ancestors count as cycles; the same object may legitimately appear in two sibling branches.
      ancestors.delete(value);
    }
  };

  return sanitize({ ...options }) as Record<string, unknown>;
}

// Resolves the configured Browserbase config directory used by file sinks.
// Returns an empty string when no config directory is configured.
export function getConfigDir(): string {
  return CONFIG_DIR ? path.resolve(CONFIG_DIR) : "";
}

// Creates the per-session directory used by file sinks and writes best-effort metadata such as the sanitized `session.json` file and `latest` symlink.
async function createSessionDir( sessionId: string, options?: V3Options, ): Promise { const configDir = getConfigDir(); if (!configDir) { return null; } const sessionDir = path.join(configDir, "sessions", sessionId); await fs.promises.mkdir(sessionDir, { recursive: true }); if (options) { await fs.promises.writeFile( path.join(sessionDir, "session.json"), JSON.stringify(sanitizeOptions(options), null, 2), "utf-8", ); } const latestLink = path.join(configDir, "sessions", "latest"); try { try { await fs.promises.unlink(latestLink); } catch { // ignore missing link } await fs.promises.symlink(sessionId, latestLink, "dir"); } catch { // symlink best effort only } return sessionDir; } // ============================================================================= // Event Store // ============================================================================= // Per-session flow event sink manager. // This is not an event bus. V3 forwards already-emitted FlowEvents into it so // the store can fan them out to configured sinks, answer `query()` calls from // its one query sink, and tear down its sinks when the session closes. // We keep this as a separate object instead of wiring sinks directly with // `v3.bus.on("*", sink.emit)` because pretty sinks need access to a shared // query interface while rendering. Prettified lines often need to look up // related parent/child events to recover the readable ancestry tags and labels. // Passing sinks into each other to share that state gets messy quickly, so the // EventStore contains the circular dependency: all sinks live here, and any // sink that needs historical context can call the one `EventStore.query()` // entrypoint backed by the main query sink for this session. export class EventStore implements EventStoreApi { private readonly sinks = new Set(); // All sinks attached for this session; constructor registers them here and `destroy()` tears them down. 
private destroyed = false; // Flipped by `destroy()` so later emits and teardown calls become no-ops. public query: (query: EventStoreQuery) => Promise; // Always reads from the one query sink chosen at construction time. // Creates the per-instance store owned by a single V3 session. This store is intentionally single-session; it ignores events for other session ids. constructor( // Usually matches `browserbaseSessionId` today, but it is the store's own Stagehand session identifier and may diverge in the future. public readonly sessionId: string, options?: V3Options, querySink: EventSink = new ShallowInMemoryEventSink( DEFAULT_IN_MEMORY_EVENT_LIMIT, ), ) { const sessionDirPromise = createSessionDir(sessionId, options); this.registerSink(querySink); this.query = async (query) => { if (query.sessionId && query.sessionId !== this.sessionId) { return []; } return querySink.query({ ...query, sessionId: this.sessionId, }); }; if (getConfigDir()) { this.registerSink(new JsonlFileEventSink(sessionDirPromise)); this.registerSink(new PrettyLogFileEventSink(sessionDirPromise, this)); } if (FLOW_LOGS_ENABLED) { this.registerSink(new PrettyStderrEventSink(this)); } } // Adds a sink to the direct fanout list used by `emit()`. private registerSink(sink: EventSink): void { this.sinks.add(sink); } // Emits an event to all attached sinks when it belongs to this store's single session. emit = async (event: FlowEvent): Promise => { if (!(event instanceof FlowEvent)) { return; } if (this.destroyed || event.sessionId !== this.sessionId) { return; } await Promise.allSettled([...this.sinks].map((sink) => sink.emit(event))); }; // Tears down all sinks when the V3 instance is closed. 
async destroy(): Promise { if (this.destroyed) { return; } this.destroyed = true; await Promise.all( [...this.sinks].map((sink) => sink.destroy().catch(() => { // best effort cleanup }), ), ); this.sinks.clear(); } } ================================================ FILE: packages/core/lib/v3/flowlogger/FlowLogger.ts ================================================ import { AsyncLocalStorage } from "node:async_hooks"; import { v7 as uuidv7 } from "uuid"; import type { LanguageModelMiddleware } from "ai"; import { z } from "zod"; import { EventEmitterWithWildcardSupport } from "./EventEmitter.js"; // ============================================================================= // Flow Event Model // ============================================================================= export const FlowEventDataSchema = z.record(z.string(), z.unknown()); export const FlowEventInputSchema = z.object({ eventType: z.string(), eventId: z.string().optional(), eventParentIds: z.array(z.string()).optional(), eventCreatedAt: z.string().optional(), sessionId: z.string().optional(), data: FlowEventDataSchema.optional(), }); export type FlowEventData = z.infer; export type FlowEventInput = z.input; // the same as FlowEventInput, but with all fields required (non-optional) type FlowEventFields = Omit< FlowEventInput, "eventId" | "eventParentIds" | "eventCreatedAt" | "sessionId" | "data" > & { eventId: string; eventParentIds: string[]; eventCreatedAt: string; sessionId: string; data: FlowEventData; }; export class FlowEvent implements FlowEventFields { // "ModuleMethodSomethingEvent" -> hashToSmallInt("Modu) -> 5. eventId = "...5" private static deriveEventIdSuffix(eventType: string): string { const prefixMatch = eventType.match(/^[A-Z][a-z0-9]*/); const prefix = prefixMatch?.[0] ?? eventType.slice(0, 4); let hash = 0; for (const ch of prefix.slice(0, 4)) { hash = (hash * 31 + ch.charCodeAt(0)) % 10; } return String(hash); // e.g. 
"0" or "9" } // Builds a sortable UUID-like event id while preserving a stable, human-friendly suffix derived from the event family. static createEventId(eventType: string): string { const rawEventId = uuidv7(); return `${rawEventId.slice(0, -1)}${FlowEvent.deriveEventIdSuffix(eventType)}`; } // Base required fields for all events: eventType: string; eventId: string; eventParentIds: string[]; eventCreatedAt: string; // `sessionId` usually matches `browserbaseSessionId` today, but FlowLogger treats it as a generic Stagehand session identifier because those may diverge in the future. sessionId: string; data: FlowEventData; // event payload (e.g. params, action, result, error, etc.) // Normalizes the event shape used everywhere in the flow logger pipeline. This is called at emission time right before an event is attached to the event bus and any sinks. constructor(input: FlowEventInput) { if (!input.sessionId) { throw new Error("FlowEvent.sessionId is required."); } if (input.eventType.endsWith("Event")) { this.eventType = input.eventType; } else { this.eventType = `${input.eventType}Event`; } this.eventId = input.eventId ?? FlowEvent.createEventId(this.eventType); this.eventParentIds = input.eventParentIds ?? []; this.eventCreatedAt = input.eventCreatedAt ?? new Date().toISOString(); this.sessionId = input.sessionId; this.data = input.data ?? {}; } } export interface FlowLoggerContext { // Mirrors `FlowEvent.sessionId`; it is currently the Stagehand session id and often matches `browserbaseSessionId`, but callers should not rely on that. sessionId: string; eventBus: EventEmitterWithWildcardSupport; // Shared per-session bus; `emit()` writes to it and V3 forwards wildcard events into the instance-owned EventStore. parentEvents: FlowEvent[]; // Active parent stack for the current async chain; wrappers push/pop this as logged work starts and ends. 
} type AsyncOriginalMethod< TArgs extends unknown[] = unknown[], TResult = unknown, TThis = unknown, > = (this: TThis, ...args: TArgs) => Promise; type FlowLoggerLogOptions = FlowEventInput & { context?: FlowLoggerContext; }; // AsyncLocalStorage is the authoritative source for the active flow parent stack inside a single async call-chain. const loggerContext = new AsyncLocalStorage(); // Converts raw inline image/base64 payload lengths into a compact kb string for LLM prompt summaries. function dataToKb(data: string): string { return ((data.length * 0.75) / 1024).toFixed(1); } // ============================================================================= // Flow Logger Internals // ============================================================================= type CdpLogEventType = "call" | "response" | "responseError" | "message"; type CdpLogPayload = { method: string; params?: unknown; result?: unknown; error?: string; targetId?: string | null; }; const CDP_EVENT_NAMES: Record = { call: "CdpCallEvent", response: "CdpResponseEvent", responseError: "CdpResponseErrorEvent", message: "CdpMessageEvent", }; export class FlowLogger { // Copies the mutable parts of a context before it is re-entered in a later async callback. This prevents later parent-stack mutations from leaking backward into stored snapshots. private static cloneContext(ctx: FlowLoggerContext): FlowLoggerContext { return { ...ctx, parentEvents: ctx.parentEvents.map((event) => ({ ...event, eventParentIds: [...event.eventParentIds], })), }; } // Chooses the safest context to re-enter when callers already have a stored context // and ALS may or may not already contain one for the same session. // If the current ALS stack extends the stored stack, we keep the richer ALS view. // If the stored stack is deeper, we preserve that instead. // If they diverge, we prefer the current ALS view because it reflects the currently executing call-chain. 
private static resolveReentryContext(
  context: FlowLoggerContext,
): FlowLoggerContext {
  const currentContext = loggerContext.getStore() ?? null;
  // If ALS is empty or belongs to another session, the caller's stored
  // snapshot is the only safe context we can re-enter.
  if (!currentContext || currentContext.sessionId !== context.sessionId) {
    return FlowLogger.cloneContext(context);
  }

  const providedParentIds = context.parentEvents.map(
    (event) => event.eventId,
  );
  const currentParentIds = currentContext.parentEvents.map(
    (event) => event.eventId,
  );

  const currentExtendsProvided = providedParentIds.every(
    (eventId, index) => currentParentIds[index] === eventId,
  );
  // ALS already has the provided chain as a prefix, so we keep the richer
  // currently-executing stack instead of truncating it.
  if (currentExtendsProvided) {
    return FlowLogger.cloneContext(currentContext);
  }

  const providedExtendsCurrent = currentParentIds.every(
    (eventId, index) => providedParentIds[index] === eventId,
  );
  // The stored snapshot is deeper than the current ALS stack, which usually
  // means we are re-entering from a later async callback and need to restore
  // the missing parent chain.
  if (providedExtendsCurrent) {
    return FlowLogger.cloneContext(context);
  }

  // If the two chains diverged, prefer the live ALS chain because it reflects
  // the work currently executing on this async path.
  return FlowLogger.cloneContext(currentContext);
}

// Materializes and emits a single flow event on the active ALS context.
// This is the lowest-level write path used by all higher-level logging helpers
// after they have decided which parent chain and session the event belongs to.
// Throws (via `currentContext`) when no ALS context is active. When the caller
// does not pass `eventParentIds`, the ids of the current parent stack are used.
private static emit(event: FlowEventInput): FlowEvent | null {
  const ctx = FlowLogger.currentContext;
  const emittedEvent = new FlowEvent({
    ...event,
    eventParentIds:
      event.eventParentIds ??
      ctx.parentEvents.map((parent) => parent.eventId),
    sessionId: ctx.sessionId,
  });
  ctx.eventBus.emit(emittedEvent.eventType, emittedEvent);
  return emittedEvent;
}

// Wraps a unit of async work with started/completed/error events while maintaining
// the parent stack inside the active context.
// Resolves with whatever `originalMethod` resolves with and rethrows whatever it throws.
private static async runWithAutoStatusEventLogging<TResult>(
  options: FlowLoggerLogOptions,
  originalMethod: AsyncOriginalMethod<[], TResult>,
): Promise<TResult> {
  const ctx = FlowLogger.currentContext;
  const { data, eventParentIds, eventType } = options;
  // Tracked as a boolean rather than by storing the thrown value: a falsy
  // rejection (`throw undefined`, `throw 0`, `throw ""`) must still suppress
  // the completed event.
  let failed = false;

  // if eventParentIds is explicitly [], this is a root event, clear the parent events in context
  if (eventParentIds && eventParentIds.length === 0) {
    ctx.parentEvents = [];
  }

  const startedEvent = FlowLogger.emit({
    eventType,
    data,
    eventParentIds,
  });
  // Push after emitting so nested work sees this event as its direct parent
  // for the rest of the wrapped method's lifetime.
  ctx.parentEvents.push(startedEvent);

  try {
    return await originalMethod();
  } catch (error) {
    failed = true;
    // Error events attach directly under the started event even though the
    // stack is still live, so the failure edge is explicit in the tree.
    FlowLogger.emit({
      eventType: `${eventType}ErrorEvent`,
      eventParentIds: [...startedEvent.eventParentIds, startedEvent.eventId],
      data: {
        error: error instanceof Error ? error.message : String(error),
        durationMs:
          Date.now() - new Date(startedEvent.eventCreatedAt).getTime(),
      },
    });
    throw error;
  } finally {
    // The top frame is always popped. A completed event is emitted only when
    // that frame is the one this wrapper pushed and the wrapped method did not
    // throw; if nested code has mutated the stack unexpectedly, we skip the
    // completed event rather than emitting a misleading lifecycle edge.
    const parentEvent = ctx.parentEvents.pop();
    if (parentEvent?.eventId === startedEvent.eventId && !failed) {
      FlowLogger.emit({
        eventType: `${eventType}CompletedEvent`,
        eventParentIds: [
          ...startedEvent.eventParentIds,
          startedEvent.eventId,
        ],
        data: {
          durationMs:
            Date.now() - new Date(startedEvent.eventCreatedAt).getTime(),
        },
      });
    }
  }
}

// Emits a CDP event under a caller-supplied context. CDP transport code uses this
// instead of `runWithLogging()` because request/response/message events
// are separate lifecycle edges with explicit parent ids.
// Returns null without emitting for `*.enable` calls and for unsolicited
// messages whose method is listed in `NOISY_CDP_EVENTS`.
private static logCdpEvent(
  context: FlowLoggerContext,
  eventType: CdpLogEventType,
  { method, params, result, error, targetId }: CdpLogPayload,
  eventParentIds?: string[],
): FlowEvent | null {
  if (method.endsWith(".enable") || method === "enable") {
    return null;
  }
  if (eventType === "message" && FlowLogger.NOISY_CDP_EVENTS.has(method)) {
    return null;
  }

  // Run under a clone so emitting here cannot disturb the caller's parent stack.
  return loggerContext.run(FlowLogger.cloneContext(context), () =>
    FlowLogger.emit({
      eventType: CDP_EVENT_NAMES[eventType],
      eventParentIds,
      data: {
        method,
        params,
        result,
        error,
        targetId,
      },
    }),
  );
}

// Emits an LLM request/response event only when a flow context is active.
// LLM logging is best-effort, so callers should not fail if it is invoked outside a tracked async chain.
private static emitLlmEvent(event: FlowEventInput): void {
  const context = FlowLogger.resolveContext();
  if (!context) {
    return;
  }
  loggerContext.run(context, () => {
    FlowLogger.emit(event);
  });
}

// Builds the one-line prompt summary used in LLM request events for AI SDK middleware calls.
// The summary describes the last non-system message and always ends with the tool count,
// e.g. `user: some text +{3 tools}`.
private static buildMiddlewarePromptSummary(params: {
  prompt?: unknown;
  tools?: unknown;
}): string {
  const toolCount = Array.isArray(params.tools) ? params.tools.length : 0;
  // This runs on the request path of the middleware, so an unexpected
  // non-array prompt is summarized as "no message" instead of throwing.
  const messages = (
    Array.isArray(params.prompt) ? params.prompt : []
  ) as Array<{
    role?: string;
    content?: unknown;
  }>;
  const lastMsg = messages
    .filter((message) => message.role !== "system")
    .pop();

  let rolePrefix = lastMsg?.role ?? "?";
  let promptSummary = `(no text) +{${toolCount} tools}`;

  if (!lastMsg) {
    return `?: ${promptSummary}`;
  }

  if (typeof lastMsg.content === "string") {
    promptSummary = `${lastMsg.content} +{${toolCount} tools}`;
  } else if (Array.isArray(lastMsg.content)) {
    const toolResult = (
      lastMsg.content as Array<{
        type?: string;
        toolName?: string;
        output?: { type?: string; value?: unknown };
      }>
    ).find((part) => part.type === "tool-result");

    if (toolResult) {
      // Tool results are labelled by tool name rather than by message role.
      rolePrefix = `tool result: ${toolResult.toolName}()`;
      if (toolResult.output?.type === "json" && toolResult.output.value) {
        promptSummary = `${JSON.stringify(toolResult.output.value)} +{${toolCount} tools}`;
      } else if (Array.isArray(toolResult.output?.value)) {
        promptSummary = `${
          extractLlmMessageSummary({
            content: toolResult.output.value,
          }) ?? "(no text)"
        } +{${toolCount} tools}`;
      }
    } else {
      promptSummary = `${
        extractLlmMessageSummary({ content: lastMsg.content }) ?? "(no text)"
      } +{${toolCount} tools}`;
    }
  }

  return `${rolePrefix}: ${promptSummary}`;
}

// Builds the one-line output summary used in LLM response events for AI SDK middleware calls.
private static buildMiddlewareOutputSummary(result: {
  text?: string;
  content?: unknown;
  toolCalls?: unknown[];
}): string {
  // Precedence: `text`, then `content` (string or parts array), then a
  // tool-call count, and finally the "[empty]" placeholder.
  let outputSummary = result.text || "";
  if (!outputSummary && result.content) {
    if (typeof result.content === "string") {
      outputSummary = result.content;
    } else if (Array.isArray(result.content)) {
      outputSummary = (
        result.content as Array<{
          type?: string;
          text?: string;
          toolName?: string;
        }>
      )
        .map((contentPart) => {
          // Each part renders as its text, a `tool call: name()` marker, or `[type]`.
          if (contentPart.text) {
            return contentPart.text;
          }
          if (contentPart.type === "tool-call") {
            return `tool call: ${contentPart.toolName}()`;
          }
          return `[${contentPart.type}]`;
        })
        .join(" ");
    }
  }
  if (!outputSummary && result.toolCalls?.length) {
    return `[${result.toolCalls.length} tool calls]`;
  }
  return outputSummary || "[empty]";
}

// =============================================================================
// Flow Logger Public Lifecycle API
// =============================================================================

// Initialize a new logging context. Call this at the start of a session.
// The new context starts with an empty parent stack and is installed into
// AsyncLocalStorage via `enterWith()` before being returned.
static init(
  sessionId: string,
  eventBus: EventEmitterWithWildcardSupport,
): FlowLoggerContext {
  const ctx: FlowLoggerContext = {
    sessionId,
    eventBus,
    parentEvents: [],
  };
  loggerContext.enterWith(ctx);
  return ctx;
}

// Clears the parent stack for a session when a V3 instance shuts down.
// This does not emit a final event; it just tears down in-memory context.
// Falls back to the ALS context when none is passed and is a no-op when neither exists.
static async close(context?: FlowLoggerContext | null): Promise {
  const ctx = context ?? loggerContext.getStore() ?? null;
  if (!ctx) return;
  ctx.parentEvents = [];
}

// Returns the current ALS-backed flow context and throws when code
// executes outside a tracked flow. Use `resolveContext()` for best-effort lookups.
static get currentContext(): FlowLoggerContext {
  const ctx = loggerContext.getStore() ?? null;
  if (!ctx) {
    throw new Error("FlowLogger context is missing.");
  }
  return ctx;
}

// Returns a cloned FlowLogger context for the current async call-chain when one exists,
// otherwise falls back to the provided instance-owned context.
// This is the non-throwing lookup for callers that can continue without ALS.
static resolveContext(
  fallbackContext?: FlowLoggerContext | null,
): FlowLoggerContext | null {
  const currentContext = loggerContext.getStore() ?? null;
  if (currentContext) {
    return FlowLogger.cloneContext(currentContext);
  }
  return fallbackContext ? FlowLogger.cloneContext(fallbackContext) : null;
}

// Decorator-style wrapper used on class methods that should emit their own started/completed/error envelope.
// It resolves the flow context from either the decorator options or `this.flowLoggerContext`,
// then delegates the actual lifecycle handling to `runWithLogging()`.
// NOTE(review): the type-argument lists in this signature (after `Parameters`,
// `Awaited`, `ThisParameterType`, `Promise`) appear to have been lost from this
// copy of the file — confirm against the upstream source before relying on it.
static wrapWithLogging(
  options: FlowLoggerLogOptions,
) {
  return function <
    TWrappedMethod extends AsyncOriginalMethod<
      Parameters,
      Awaited>,
      ThisParameterType
    >,
  >(originalMethod: TWrappedMethod): TWrappedMethod {
    const wrappedMethod = async function (
      this: ThisParameterType,
      ...args: Parameters
    ): Promise>> {
      // Explicit decorator options win; otherwise read the context off the receiver.
      let context = options.context;
      if (!context) {
        context = (
          this as { flowLoggerContext?: FlowLoggerContext } | null | undefined
        )?.flowLoggerContext;
      }

      return await FlowLogger.runWithLogging(
        {
          ...options,
          context,
        },
        // Re-bind `this` so the original method still runs against its own instance.
        (...boundArgs: Parameters) =>
          originalMethod.apply(this, boundArgs) as Promise<
            Awaited>
          >,
        args,
      );
    };

    return wrappedMethod as unknown as TWrappedMethod;
  };
}

// Wraps an async function or zero-arg closure with flow events.
// This is the imperative entrypoint used by handlers that cannot use the decorator form.
// Standard case: the logged params are the same tuple passed to the wrapped method.
// NOTE(review): `TMethod` and `TResult` are referenced below but their
// type-parameter lists (and several type arguments) appear to have been lost
// from this copy of the file — confirm against the upstream source.
static runWithLogging(
  options: FlowLoggerLogOptions,
  originalMethod: TMethod,
  params: Readonly>,
): Promise>>;
// Special case: log an arbitrary params tuple while executing a zero-arg closure.
static runWithLogging(
  options: FlowLoggerLogOptions,
  originalMethod: AsyncOriginalMethod<[], TResult>,
  params: ReadonlyArray,
): Promise>;
static runWithLogging(
  options: FlowLoggerLogOptions,
  originalMethod: AsyncOriginalMethod,
  params: ReadonlyArray,
): Promise {
  // The logged payload is the caller's `data` plus a copy of the call params.
  const eventData = {
    ...(options.data ?? {}),
    params: [...params],
  };

  // Deferred so it only runs once the right ALS context has been entered.
  const execute = (): Promise =>
    FlowLogger.runWithAutoStatusEventLogging(
      {
        ...options,
        data: eventData,
      },
      () => originalMethod(...params),
    );

  // No explicit context and no active ALS means there is nothing to attach
  // this work to, so we leave execution untouched instead of fabricating a
  // root event.
  if (!options.context && !(loggerContext.getStore() ?? null)) {
    return originalMethod(...params);
  }

  if (options.context) {
    // Re-enter the caller-owned context so wrapper events land under the same
    // session tree even when this code executes outside the original ALS
    // chain.
    return loggerContext.run(
      FlowLogger.resolveReentryContext(options.context),
      execute,
    );
  }

  // ALS is already active and no explicit context was given: log under it as-is.
  return execute();
}

// Re-enters an existing FlowLogger context without emitting wrapper events.
// Use this when work already belongs to a known parent and needs AsyncLocalStorage set manually.
// Runs `fn` inside the context chosen by `resolveReentryContext()` and returns its result unchanged.
static withContext<T>(context: FlowLoggerContext, fn: () => T): T {
  return loggerContext.run(FlowLogger.resolveReentryContext(context), fn);
}

// ===========================================================================
// CDP Events
// ===========================================================================

// CDP message methods that `logCdpEvent()` drops when they arrive as unsolicited messages.
private static readonly NOISY_CDP_EVENTS = new Set([
  "Target.targetInfoChanged",
  "Runtime.executionContextCreated",
  "Runtime.executionContextDestroyed",
  "Runtime.executionContextsCleared",
  "Page.lifecycleEvent",
  "Network.dataReceived",
  "Network.loadingFinished",
  "Network.requestWillBeSentExtraInfo",
  "Network.responseReceivedExtraInfo",
  "Network.requestWillBeSent",
  "Network.responseReceived",
]);

// Logs the start of a CDP command. CDP transport calls this before sending a
// message over the websocket so the eventual response can attach to it.
// Returns the emitted event, or null when `logCdpEvent()` filtered it out.
static logCdpCallEvent(
  context: FlowLoggerContext,
  data: {
    method: string;
    params?: object;
    targetId?: string | null;
  },
): FlowEvent | null {
  return FlowLogger.logCdpEvent(context, "call", data);
}

// Logs the terminal response for a previously emitted CDP call event.
// A truthy `data.error` selects the response-error event type; the new event
// is parented under the call event and the call event's own ancestors.
static logCdpResponseEvent(
  context: FlowLoggerContext,
  parentEvent: Pick<FlowEvent, "eventId" | "eventParentIds">,
  data: {
    method: string;
    result?: unknown;
    error?: string;
    targetId?: string | null;
  },
): void {
  FlowLogger.logCdpEvent(
    context,
    data.error ? "responseError" : "response",
    data,
    [...parentEvent.eventParentIds, parentEvent.eventId],
  );
}

// Logs an unsolicited CDP message under the most recent related call event.
// The message is parented under `parentEvent` and that event's own ancestors.
static logCdpMessageEvent(
  context: FlowLoggerContext,
  parentEvent: Pick<FlowEvent, "eventId" | "eventParentIds">,
  data: {
    method: string;
    params?: unknown;
    targetId?: string | null;
  },
): void {
  FlowLogger.logCdpEvent(context, "message", data, [
    ...parentEvent.eventParentIds,
    parentEvent.eventId,
  ]);
}

// ===========================================================================
// LLM Events
// ===========================================================================

// Emits a best-effort LLM request event when logging occurs inside an active flow context.
static logLlmRequest({
  requestId,
  model,
  prompt,
}: {
  requestId: string;
  model: string;
  prompt?: string;
}): void {
  FlowLogger.emitLlmEvent({
    eventType: "LlmRequestEvent",
    data: {
      requestId,
      model,
      prompt,
    },
  });
}

// Emits a best-effort LLM response event when logging occurs inside an active flow context.
static logLlmResponse({
  requestId,
  model,
  output,
  inputTokens,
  outputTokens,
}: {
  requestId: string;
  model: string;
  output?: string;
  inputTokens?: number;
  outputTokens?: number;
}): void {
  FlowLogger.emitLlmEvent({
    eventType: "LlmResponseEvent",
    data: {
      requestId,
      model,
      output,
      inputTokens,
      outputTokens,
    },
  });
}

// ===========================================================================
// LLM Logging Middleware
// ===========================================================================

// Creates AI SDK middleware that wraps a generate call with FlowLogger LLM request/response events
// while leaving model execution behavior unchanged.
static createLlmLoggingMiddleware( modelId: string, ): Pick { return { wrapGenerate: async ({ doGenerate, params }) => { const llmRequestId = uuidv7(); FlowLogger.logLlmRequest({ requestId: llmRequestId, model: modelId, prompt: FlowLogger.buildMiddlewarePromptSummary(params), }); const result = await doGenerate(); const res = result as { text?: string; content?: unknown; toolCalls?: unknown[]; }; FlowLogger.logLlmResponse({ requestId: llmRequestId, model: modelId, output: FlowLogger.buildMiddlewareOutputSummary(res), inputTokens: result.usage?.inputTokens, outputTokens: result.usage?.outputTokens, }); return result; }, }; } } // ============================================================================= // LLM Event Extraction Helpers // ============================================================================= type ContentPart = { type?: string; text?: string; content?: unknown[]; source?: { data?: string }; image_url?: { url?: string }; inlineData?: { data?: string }; }; type LlmMessageContent = { content?: unknown; text?: string; parts?: unknown[]; }; // Extracts text and image markers from an LLM content array. // This is shared by the request-summary helpers below so different provider message // shapes render consistently in the flow log. function extractLlmMessageContent(content: unknown[]): { text?: string; extras: string[]; } { const result = { text: undefined as string | undefined, extras: [] as string[], }; for (const part of content) { const p = part as ContentPart; // Text if (!result.text && p.text) { result.text = p.type === "text" || !p.type ? 
p.text : undefined; } // Images - various formats if (p.type === "image" || p.type === "image_url") { const url = p.image_url?.url; if (url?.startsWith("data:")) result.extras.push(`${dataToKb(url)}kb image`); else if (p.source?.data) result.extras.push(`${dataToKb(p.source.data)}kb image`); else result.extras.push("image"); } else if (p.source?.data) { result.extras.push(`${dataToKb(p.source.data)}kb image`); } else if (p.inlineData?.data) { result.extras.push(`${dataToKb(p.inlineData.data)}kb image`); } // Recurse into tool_result content if (p.type === "tool_result" && Array.isArray(p.content)) { const nested = extractLlmMessageContent(p.content); if (!result.text && nested.text) { result.text = nested.text; } result.extras.push(...nested.extras); } } return result; } // Produces a single compact summary from a provider-specific message payload // so request and tool-result logs stay readable. function extractLlmMessageSummary( input: LlmMessageContent, options?: { trimInstructionPrefix?: boolean; extras?: string[]; }, ): string | undefined { const result = { text: undefined as string | undefined, extras: [...(options?.extras ?? [])], }; if (typeof input.content === "string") { result.text = input.content; } else if (typeof input.text === "string") { result.text = input.text; } else if (Array.isArray(input.parts)) { const summary = extractLlmMessageContent(input.parts); result.text = summary.text; result.extras.push(...summary.extras); } else if (Array.isArray(input.content)) { const summary = extractLlmMessageContent(input.content); result.text = summary.text; result.extras.push(...summary.extras); } if (options?.trimInstructionPrefix && result.text) { result.text = result.text.replace(/^[Ii]nstruction: /, ""); } const text = result.text; if (!text && result.extras.length === 0) return undefined; let summary = text || ""; if (result.extras.length > 0) { const extrasStr = result.extras.map((e) => `+{${e}}`).join(" "); summary = summary ? 
`${summary} ${extrasStr}` : extrasStr; } return summary || undefined; } // Formats the last user-facing prompt into the one-line form used by standard LLM request logs, // for example: `some text +{5.8kb image} +{schema}`. export function extractLlmPromptSummary( messages: Array<{ role: string; content: unknown }>, options?: { toolCount?: number; hasSchema?: boolean }, ): string | undefined { try { const lastUserMsg = messages.filter((m) => m.role === "user").pop(); if (!lastUserMsg) return undefined; return extractLlmMessageSummary(lastUserMsg, { trimInstructionPrefix: true, extras: [ ...(options?.hasSchema ? ["schema"] : []), ...(options?.toolCount ? [`${options.toolCount} tools`] : []), ], }); } catch { return undefined; } } // Extract a text summary from CUA-style messages. This accepts Anthropic, OpenAI, and Google-style payloads. export function extractLlmCuaPromptSummary( messages: unknown[], ): string | undefined { try { const lastMsg = messages .filter((m) => { const msg = m as { role?: string; type?: string }; return msg.role === "user" || msg.type === "tool_result"; }) .pop() as | { content?: unknown; parts?: unknown[]; text?: string } | undefined; if (!lastMsg) return undefined; return extractLlmMessageSummary(lastMsg); } catch { return undefined; } } // Formats the response side of a CUA exchange into a single short log line. export function extractLlmCuaResponseSummary(output: unknown): string { try { const items: unknown[] = (output as { candidates?: [{ content?: { parts?: unknown[] } }] }) ?.candidates?.[0]?.content?.parts ?? (Array.isArray(output) ? output : []); const summary = items .map((item) => { const i = item as { type?: string; text?: string; name?: string; functionCall?: { name?: string }; }; if (i.text) return i.text; if (i.functionCall?.name) return i.functionCall.name; if (i.type === "tool_use" && i.name) return i.name; return i.type ?? 
"[item]"; }) .join(" "); return summary; } catch { return "[error]"; } } ================================================ FILE: packages/core/lib/v3/flowlogger/prettify.ts ================================================ import { toTitleCase } from "../../utils.js"; import { FlowEvent } from "./FlowLogger.js"; import type { EventStoreApi } from "./EventStore.js"; const MAX_LINE_LENGTH = 160; // Maximum width for a prettified log line. // ============================================================================= // Pretty Formatting // ============================================================================= // All functions in this section intentionally share the `prettify` prefix so the formatting pipeline is easy to scan and reason about in one place. // Sanitizes individual values before they are included in prettified output. This currently shortens CDP ids but otherwise preserves structure. function prettifySanitizeValue(value: unknown): unknown { if (typeof value === "string") { return truncateCdpIds(value); } if (Array.isArray(value)) { return value.map((entry) => prettifySanitizeValue(entry)); } if (value && typeof value === "object") { return Object.fromEntries( Object.entries(value).map(([key, entry]) => [ key, prettifySanitizeValue(entry), ]), ); } return value; } // Produces a prettified-safe copy of the event without mutating the original event that other sinks may still need to serialize verbatim. export function prettifySanitizeEvent(event: FlowEvent): FlowEvent { if (!event.eventType.startsWith("Cdp")) { return event; } return { ...event, data: prettifySanitizeValue(event.data) as Record, }; } // Collapses newlines and tabs, then truncates a string to the configured pretty log width while preserving the tail for ids and result summaries. 
function prettifyTruncateLine(value: string, maxLen: number): string {
  const collapsed = value.replace(/[\r\n\t]+/g, " ");
  if (collapsed.length <= maxLen) {
    return collapsed;
  }
  // Nothing fits in a non-positive width.
  if (maxLen <= 0) {
    return "";
  }

  // Roughly 30% of the budget goes to the tail, one character to the ellipsis, the rest to the head.
  const endLen = Math.floor(maxLen * 0.3);
  const startLen = Math.max(0, maxLen - endLen - 1);
  // `slice(-0)` is `slice(0)` and would return the whole string, so a zero-length tail must be special-cased.
  const tail = endLen > 0 ? collapsed.slice(-endLen) : "";
  return `${collapsed.slice(0, startLen)}…${tail}`;
}

// Converts any event argument into a compact string representation for pretty logs.
// Strings are wrapped in single quotes, primitives/null/undefined go through String(), and objects
// are JSON-serialized. Values that cannot be serialized (cycles, BigInt members, a toJSON that
// yields undefined) become "[unserializable]".
function prettifyFormatValue(value: unknown): string {
  if (typeof value === "string") return `'${value}'`;
  if (value == null || typeof value !== "object") return String(value);
  try {
    // JSON.stringify may return undefined at runtime even though it is typed as string.
    return JSON.stringify(value) ?? "[unserializable]";
  } catch {
    return "[unserializable]";
  }
}

// Formats one or more call arguments into a comma-separated pretty string.
// A single non-array argument is treated as a one-element list; undefined entries are dropped.
function prettifyFormatArgs(args?: unknown | unknown[]): string {
  if (args === undefined) {
    return "";
  }
  return (Array.isArray(args) ? args : [args])
    .filter((entry) => entry !== undefined)
    .map((entry) => prettifyFormatValue(entry))
    .filter((entry) => entry.length > 0)
    .join(", ");
}

// Returns the short id fragment used by pretty tags: the last four characters, or "-" when there is no id.
function shortId(id: string | null | undefined): string {
  return id ? id.slice(-4) : "-";
}

// Shortens 32-character CDP ids so pretty logs stay readable while still leaving enough information to correlate related targets.
// Only ids that directly follow an `id`/`Id` key fragment (optionally with `:` and `"`) are
// rewritten, keeping the first and last four characters around an ellipsis.
function truncateCdpIds(value: string): string {
  return value.replace(
    /([iI]d:?"?)([0-9A-F]{32})(?="?[,})\s]|$)/g,
    (_, prefix: string, id: string) =>
      `${prefix}${id.slice(0, 4)}…${id.slice(-4)}`,
  );
}

let nonce = 0;

// Formats timestamps for pretty logs while appending a tiny nonce so lines emitted in the same millisecond remain stable and sortable.
function prettifyFormatTimestamp(date: Date): string {
  const pad = (value: number, width = 2) => String(value).padStart(width, "0");
  // Layout: YYYY-MM-DD HH:MM:SS.mmmNN, where NN is a module-level counter wrapping at 100.
  return `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())} ${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}${pad(nonce++ % 100)}`;
}

// Removes noisy quoting artifacts from the final pretty line.
// Every quote that is not preceded by a backslash is dropped. A lookbehind is used so the
// preceding character is not consumed; otherwise every second quote in a run such as `''`
// would survive. A quote left at either end of the string is then stripped as well.
function prettifyRemoveQuotes(value: string): string {
  return value
    .replace(/(?<!\\)["']/g, "")
    .replace(/^["']|["']$/g, "")
    .trim();
}

// Strips event lifecycle suffixes so related started/completed/error variants can be grouped under one logical operation name.
function prettifyEventName(eventType: string): string {
  return eventType
    .replace(/CompletedEvent$/, "")
    .replace(/ErrorEvent$/, "")
    .replace(/Event$/, "");
}

// Extracts the operation name from a Stagehand/Page/Understudy/Agent event.
// The lifecycle suffix is removed first, then each known prefix is stripped in turn.
function prettifyEventAction(eventType: string): string {
  return prettifyEventName(eventType)
    .replace(/^Agent/, "")
    .replace(/^Stagehand/, "")
    .replace(/^Understudy/, "")
    .replace(/^Page/, "");
}

// Formats `Target.method(args)` style entries while gracefully handling events whose action portion is intentionally blank, such as `StagehandEvent`.
// The method name is rendered with a lower-cased first letter; an empty method yields `Target(args)`.
function prettifyFormatMethodCall(
  target: string,
  method: string,
  args: unknown,
): string {
  const member = method ? `.${method[0].toLowerCase()}${method.slice(1)}` : "";
  return `▷ ${target}${member}(${prettifyFormatEventArgs(args)})`;
}

// Marks agent lifecycle events for ancestry tags.
function prettifyIsAgentEvent(event: FlowEvent): boolean {
  return prettifyEventName(event.eventType).startsWith("Agent");
}

// Marks Stagehand lifecycle events for ancestry tags.
function prettifyIsStagehandEvent(event: FlowEvent): boolean {
  return prettifyEventName(event.eventType).startsWith("Stagehand");
}

// Marks page and Understudy actions for the action tag.
function prettifyIsActionEvent(event: FlowEvent): boolean {
  const name = prettifyEventName(event.eventType);
  return name.startsWith("Page") || name.startsWith("Understudy");
}

// CDP transport traffic is identified by a "Cdp" prefix on the normalized event name.
export function prettifyIsCdpEvent(event: FlowEvent): boolean {
  const name = prettifyEventName(event.eventType);
  return name.startsWith("Cdp");
}

// LLM request/response events are identified by an "Llm" prefix on the normalized event name.
function prettifyIsLlmEvent(event: FlowEvent): boolean {
  const name = prettifyEventName(event.eventType);
  return name.startsWith("Llm");
}

// True for the "...CompletedEvent" lifecycle variant; such events reuse the tags of the operation that started them.
function prettifyIsCompletedEvent(event: FlowEvent): boolean {
  return /CompletedEvent$/.test(event.eventType);
}

// True for the "...ErrorEvent" lifecycle variant; such events reuse the tags of the operation that started them.
function prettifyIsErrorEvent(event: FlowEvent): boolean {
  return /ErrorEvent$/.test(event.eventType);
}

// Builds the bracketed tag shown in stderr/file pretty logs: `[icon #abcd LABEL]`.
// Without an id there is nothing to tag, so a placeholder arrow is emitted instead.
function prettifyFormatTag(
  label: string | null | undefined,
  id: string | null | undefined,
  icon: string,
): string {
  if (!id) {
    return "⤑";
  }
  const labelSuffix = label ? ` ${label}` : "";
  return `[${icon} #${shortId(id)}${labelSuffix}]`;
}

// Renders a millisecond duration from a completed/error event as seconds with two decimals; non-numbers yield null.
function prettifyFormatDuration(durationMs?: unknown): string | null {
  if (typeof durationMs !== "number") {
    return null;
  }
  const seconds = durationMs / 1000;
  return `${seconds.toFixed(2)}s`;
}

// Reduces a prompt or output payload to one displayable string, capped at half the pretty line width.
// null/undefined produce undefined; strings are used verbatim; everything else is formatted first.
function prettifySummarizePrompt(value: unknown): string | undefined {
  if (value == null) {
    return undefined;
  }
  const text = typeof value === "string" ? value : prettifyFormatValue(value);
  return prettifyTruncateLine(text, MAX_LINE_LENGTH / 2);
}

// Replaces large object references from live runtime objects with placeholders before they are stringified for pretty output.
function prettifyCompactValue(value: unknown): unknown {
  return prettifyCompactNode(value, new Set<object>());
}

// Keys whose values are replaced by a `[Key]` placeholder instead of being traversed.
const prettifyLiveObjectKeys: ReadonlySet<string> = new Set([
  "page",
  "frame",
  "locator",
  "conn",
  "mainSession",
  "sessions",
  "registry",
  "networkManager",
  "apiClient",
]);

// Recursive worker for prettifyCompactValue. `ancestors` holds the objects on the current
// traversal path so a self-referential structure is cut off with "[Circular]" instead of
// recursing until the stack overflows; shared (non-cyclic) references are still expanded.
function prettifyCompactNode(value: unknown, ancestors: Set<object>): unknown {
  if (typeof value !== "object" || value === null) {
    return value;
  }
  if (ancestors.has(value)) {
    return "[Circular]";
  }
  ancestors.add(value);
  try {
    if (Array.isArray(value)) {
      return value.map((entry) => prettifyCompactNode(entry, ancestors));
    }
    const result: Record<string, unknown> = {};
    for (const [key, entry] of Object.entries(value)) {
      result[key] = prettifyLiveObjectKeys.has(key)
        ? `[${toTitleCase(key)}]`
        : prettifyCompactNode(entry, ancestors);
    }
    return result;
  } finally {
    ancestors.delete(value);
  }
}

// Formats event arguments after compacting any live object references.
function prettifyFormatEventArgs(args?: unknown | unknown[]): string {
  return prettifyFormatArgs(prettifyCompactValue(args) as unknown | unknown[]);
}

// Finds the nearest event in the current parent chain that satisfies the given predicate. Pretty tags use this to recover agent/stagehand/action/llm ancestry.
// The event itself is tested first unless `includeSelf` is explicitly false; parents are then
// scanned from the end of `eventParentIds` towards the start. Parents missing from `parentMap` are skipped.
function prettifyFindNearestEvent(
  event: FlowEvent,
  parentMap: Map<string, FlowEvent>,
  predicate: (candidate: FlowEvent) => boolean,
  options?: { includeSelf?: boolean },
): FlowEvent | null {
  if (options?.includeSelf !== false && predicate(event)) {
    return event;
  }
  for (let index = event.eventParentIds.length - 1; index >= 0; index -= 1) {
    const parent = parentMap.get(event.eventParentIds[index]);
    if (parent && predicate(parent)) {
      return parent;
    }
  }
  return null;
}

// Builds the semantic ancestry tags shown on each pretty log line.
// 2026-03-16 22:04:15.45540 [🅰 #1083] [🆂 #7bf4 ACT] [🆄 #2125 CLICK] [🅲 #8B8B CDP] ⏴ Network.policyUpdated({})
function prettifyBuildContextTags(
  event: FlowEvent,
  parentMap: Map<string, FlowEvent>,
): string[] {
  // Completed/error events should inherit tags from their started parent so the completion line points back to the original operation id.
  const includeSelf =
    !prettifyIsCompletedEvent(event) && !prettifyIsErrorEvent(event);

  const agentEvent = prettifyFindNearestEvent(
    event,
    parentMap,
    prettifyIsAgentEvent,
    { includeSelf },
  );
  const stagehandEvent = prettifyFindNearestEvent(
    event,
    parentMap,
    prettifyIsStagehandEvent,
    { includeSelf },
  );
  const actionEvent = prettifyFindNearestEvent(
    event,
    parentMap,
    prettifyIsActionEvent,
    { includeSelf },
  );
  const llmEvent = prettifyFindNearestEvent(
    event,
    parentMap,
    prettifyIsLlmEvent,
    {
      includeSelf,
    },
  );

  let targetId: string | null = null;
  if (typeof event.data.targetId === "string") {
    targetId = event.data.targetId;
  }

  let stagehandLabel = "";
  if (stagehandEvent) {
    stagehandLabel = prettifyEventAction(
      stagehandEvent.eventType,
    ).toUpperCase();
  }

  let actionLabel = "";
  if (actionEvent) {
    actionLabel = prettifyEventAction(actionEvent.eventType).toUpperCase();
  }

  if (prettifyIsAgentEvent(event)) {
    return [prettifyFormatTag("", agentEvent?.eventId, "🅰")];
  }

  if (prettifyIsStagehandEvent(event)) {
    return [
      prettifyFormatTag("", agentEvent?.eventId, "🅰"),
      prettifyFormatTag(
        // Falls back to the event's own type when no started Stagehand ancestor was found.
        prettifyEventAction(
          stagehandEvent?.eventType ?? event.eventType,
        ).toUpperCase(),
        stagehandEvent?.eventId,
        "🆂",
      ),
    ];
  }

  if (prettifyIsActionEvent(event)) {
    return [
      prettifyFormatTag("", agentEvent?.eventId, "🅰"),
      prettifyFormatTag(stagehandLabel, stagehandEvent?.eventId, "🆂"),
      prettifyFormatTag(
        prettifyEventAction(
          actionEvent?.eventType ?? event.eventType,
        ).toUpperCase(),
        actionEvent?.eventId,
        "🆄",
      ),
    ];
  }

  if (prettifyIsCdpEvent(event)) {
    return [
      prettifyFormatTag("", agentEvent?.eventId, "🅰"),
      prettifyFormatTag(stagehandLabel, stagehandEvent?.eventId, "🆂"),
      prettifyFormatTag(actionLabel, actionEvent?.eventId, "🆄"),
      prettifyFormatTag("CDP", targetId, "🅲"),
    ];
  }

  if (prettifyIsLlmEvent(event)) {
    let requestId: string | null = null;
    if (typeof event.data.requestId === "string") {
      requestId = event.data.requestId;
    }
    return [
      prettifyFormatTag("", agentEvent?.eventId, "🅰"),
      prettifyFormatTag(stagehandLabel, stagehandEvent?.eventId, "🆂"),
      prettifyFormatTag("LLM", requestId ?? llmEvent?.eventId, "🅻"),
    ];
  }

  return [`[#${shortId(event.eventId)}]`];
}

// Formats the details section for started/root events.
// Unknown event families fall back to `EventType(args)` using `data.params` or, failing that, the whole data object.
function prettifyFormatStartedDetails(event: FlowEvent): string {
  const data = event.data as {
    params?: unknown[];
    target?: string;
  };
  const name = prettifyEventName(event.eventType);
  const method = prettifyEventAction(event.eventType);

  if (name.startsWith("Stagehand")) {
    return prettifyFormatMethodCall("Stagehand", method, data.params);
  }
  if (name.startsWith("Page")) {
    return prettifyFormatMethodCall("Page", method, data.params);
  }
  if (name.startsWith("Understudy")) {
    // Understudy calls show their target first, followed by the regular params.
    const args = [
      data.target,
      ...(Array.isArray(data.params) ? data.params : []),
    ].filter((entry) => entry !== undefined);
    return prettifyFormatMethodCall("Understudy", method, args);
  }
  if (name.startsWith("Agent")) {
    return `▷ Agent.execute(${prettifyFormatEventArgs(data.params)})`;
  }
  return `${event.eventType}(${prettifyFormatEventArgs(data.params ?? event.data)})`;
}

// Formats the details section for completed/error events.
// Output shape: `✓|✕ <OPERATION> completed[ in N.NNs][ ERROR <message>]`.
function prettifyFormatCompletedDetails(event: FlowEvent): string {
  const duration = prettifyFormatDuration(event.data.durationMs);
  const prefix = prettifyIsAgentEvent(event)
    ? "Agent.execute() completed"
    : `${prettifyEventAction(event.eventType).toUpperCase() || event.eventType} completed`;
  const message =
    prettifyIsErrorEvent(event) && typeof event.data.error === "string"
      ? ` ERROR ${event.data.error}`
      : "";
  return `${prettifyIsErrorEvent(event) ? "✕" : "✓"} ${prefix}${duration ? ` in ${duration}` : ""}${message}`;
}

// Formats CDP request/response/message details. These are rendered differently from normal Stagehand lifecycle events because they represent transport-level traffic rather than method envelopes.
// Outgoing calls (`CdpCallEvent`) show their params; anything else shows the error if present,
// the params for `CdpMessageEvent`, or the result otherwise.
function prettifyFormatCdpDetails(event: FlowEvent): string {
  const data = event.data as {
    method?: string;
    params?: unknown;
    result?: unknown;
    error?: string;
  };
  const method = data.method ?? "unknown";
  const icon = event.eventType === "CdpCallEvent" ? "⏵" : "⏴";

  let payload: unknown;
  if (event.eventType === "CdpCallEvent") {
    payload = data.params;
  } else if (data.error) {
    payload = { error: data.error };
  } else if (event.eventType === "CdpMessageEvent") {
    payload = data.params;
  } else {
    payload = data.result;
  }
  return `${icon} ${method}(${prettifyFormatEventArgs(payload)})`;
}

// Formats LLM request/response details for pretty logs.
// Requests render as `model ⏴ prompt`; every other LLM event renders as `model ↳ [token counts] output`.
function prettifyFormatLlmDetails(event: FlowEvent): string {
  const data = event.data as {
    model?: string;
    prompt?: unknown;
    output?: unknown;
    inputTokens?: number;
    outputTokens?: number;
  };
  const model = data.model ?? "llm";

  if (event.eventType === "LlmRequestEvent") {
    const prompt = prettifySummarizePrompt(data.prompt);
    return prompt ? `${model} ⏴ ${prompt}` : `${model} ⏴`;
  }

  // Token counts are shown only when at least one side reported a positive number.
  const hasTokens =
    (data.inputTokens ?? 0) > 0 || (data.outputTokens ?? 0) > 0;
  const tokenInfo = hasTokens
    ? ` ꜛ${data.inputTokens ?? 0} ꜜ${data.outputTokens ?? 0}`
    : "";
  const output = prettifySummarizePrompt(data.output);
  return output
    ? `${model} ↳${tokenInfo} ${output}`
    : `${model} ↳${tokenInfo}`;
}

// Converts a flow event into a single pretty log line by combining the current event payload with recent shallow ancestry fetched from the store query sink.
export async function prettifyEvent(
  store: Pick<EventStoreApi, "query">,
  event: FlowEvent,
): Promise<string | null> {
  // Recent events are indexed by id so ancestry tags can be resolved from eventParentIds.
  const recentEvents = await store.query({ limit: 500 });
  const parentMap = new Map<string, FlowEvent>(
    recentEvents.map((recentEvent) => [recentEvent.eventId, recentEvent]),
  );
  const tags = prettifyBuildContextTags(event, parentMap);

  // CDP and LLM events have dedicated formatters; completed/error events come next; everything else is a started event.
  let details = prettifyFormatStartedDetails(event);
  if (prettifyIsCdpEvent(event)) {
    details = prettifyFormatCdpDetails(event);
  } else if (prettifyIsLlmEvent(event)) {
    details = prettifyFormatLlmDetails(event);
  } else if (prettifyIsCompletedEvent(event) || prettifyIsErrorEvent(event)) {
    details = prettifyFormatCompletedDetails(event);
  }
  if (!details) {
    return null;
  }

  // Fall back to "now" when the stored creation time does not parse; check before formatting
  // so an invalid date is never rendered and the timestamp nonce advances once per line.
  const createdAt = new Date(event.eventCreatedAt);
  const timestamp = prettifyFormatTimestamp(
    Number.isNaN(createdAt.getTime()) ? new Date() : createdAt,
  );

  const line = `${timestamp} ${tags.join(" ")} ${details}`;
  const cleaned = prettifyRemoveQuotes(line);
  const processed = prettifyIsCdpEvent(event)
    ? truncateCdpIds(cleaned)
    : cleaned;
  return prettifyTruncateLine(processed, MAX_LINE_LENGTH);
}

// Adds subtle terminal color to stderr-only pretty lines without affecting file sinks.
// Color is skipped when NO_COLOR is set, when FORCE_COLOR is "0", or when FORCE_COLOR is unset/empty
// and stderr is not a TTY or TERM is "dumb".
export function prettifyColorStderrLine(line: string): string {
  if (
    process.env.NO_COLOR !== undefined ||
    (process.env.FORCE_COLOR ?? "") === "0" ||
    (!process.env.FORCE_COLOR &&
      (!process.stderr.isTTY || process.env.TERM === "dumb"))
  ) {
    return line;
  }

  // Wraps a value in an SGR escape sequence followed by a reset.
  const color = (code: string, value: string) =>
    `\u001B[${code}m${value}\u001B[0m`;

  return line
    .replace(/^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{5})/, (_, timestamp) =>
      color("2", timestamp),
    )
    .replace(/\[([🅰🆂🆄🅻🅲])([^\]]*)\]/gu, (_, icon, rest) =>
      color(
        icon === "🅰"
          ? "36"
          : icon === "🆂"
            ? "33"
            : icon === "🆄"
              ? "32"
              : icon === "🅻"
                ? "95"
                : "90",
        `[${icon}${rest}]`,
      ),
    )
    .replace(
      / in (\d+(?:\.\d+)?s)/g,
      (_, duration) => ` ${color("2", "in")} ${color("2", duration)}`,
    )
    .replace(/▷/g, color("96", "▷"))
    .replace(/⏴/g, color("96", "⏴"))
    .replace(/↳/g, color("95", "↳"))
    .replace(/ꜛ/g, color("33", "ꜛ"))
    .replace(/ꜜ/g, color("95", "ꜜ"))
    .replace(/…/g, color("94", "…"))
    .replace(/[(){}=]/g, (char) => color("94", char))
    .replace(
      /([A-Za-z])(\.)([A-Za-z])/g,
      (_, left, dot, right) => `${left}${color("94", dot)}${right}`,
    )
    .replace(/ ✓ /g, ` ${color("32", "✓")} `)
    .replace(/ ✕ /g, ` ${color("31", "✕")} `);
}

================================================
FILE: packages/core/lib/v3/handlers/actHandler.ts
================================================
// lib/v3/handlers/actHandler.ts
import { act as actInference } from "../../inference.js";
import { buildActPrompt, buildStepTwoPrompt } from "../../prompt.js";
import { trimTrailingTextNode } from "../../utils.js";
import { v3Logger } from "../logger.js";
import { ActHandlerParams } from "../types/private/handlers.js";
import { ActResult, Action, V3FunctionName } from "../types/public/methods.js";
import { ActTimeoutError } from "../types/public/sdkErrors.js";
import {
  captureHybridSnapshot,
  diffCombinedTrees,
} from "../understudy/a11y/snapshot/index.js";
import { LLMClient } from "../llm/LLMClient.js";
import { SupportedUnderstudyAction } from "../types/private/index.js";
import { EncodedId } from "../types/private/internal.js";
import {
  AvailableModel,
  ClientOptions,
  ModelConfiguration,
} from "../types/public/model.js";
import type { Variables } from "../types/public/agent.js";
import type { Page } from "../understudy/page.js";
import {
  performUnderstudyMethod,
  waitForDomNetworkQuiet,
} from "./handlerUtils/actHandlerUtils.js";
import { createTimeoutGuard } from "./handlerUtils/timeoutGuard.js";
import { resolveVariableValue } from "../agent/utils/variables.js";

// Shape of the element returned by act inference, as consumed by normalizeActInferenceElement.
type ActInferenceElement = {
  elementId?: string;
  description: string;
  method?: string;
  arguments?: string[];
};

type ActInferenceResponse = Awaited<ReturnType<typeof actInference>>;

/**
 * Runs `act`: asks the LLM which element/method fulfils an instruction, performs it on the
 * page, optionally performs a second LLM-chosen step, and optionally self-heals a failed
 * action by re-inferring the selector from a fresh snapshot.
 */
export class ActHandler {
  private readonly llmClient: LLMClient;
  private readonly defaultModelName: AvailableModel;
  private readonly defaultClientOptions: ClientOptions;
  private readonly resolveLlmClient: (model?: ModelConfiguration) => LLMClient;
  private readonly systemPrompt: string;
  private readonly logInferenceToFile: boolean;
  private readonly selfHeal: boolean;
  private readonly onMetrics?: (
    functionName: V3FunctionName,
    promptTokens: number,
    completionTokens: number,
    reasoningTokens: number,
    cachedInputTokens: number,
    inferenceTimeMs: number,
  ) => void;
  private readonly defaultDomSettleTimeoutMs?: number;

  constructor(
    llmClient: LLMClient,
    defaultModelName: AvailableModel,
    defaultClientOptions: ClientOptions,
    resolveLlmClient: (model?: ModelConfiguration) => LLMClient,
    systemPrompt?: string,
    logInferenceToFile?: boolean,
    selfHeal?: boolean,
    onMetrics?: (
      functionName: V3FunctionName,
      promptTokens: number,
      completionTokens: number,
      reasoningTokens: number,
      cachedInputTokens: number,
      inferenceTimeMs: number,
    ) => void,
    defaultDomSettleTimeoutMs?: number,
  ) {
    this.llmClient = llmClient;
    this.defaultModelName = defaultModelName;
    this.defaultClientOptions = defaultClientOptions;
    this.resolveLlmClient = resolveLlmClient;
    this.systemPrompt = systemPrompt ?? "";
    this.logInferenceToFile = logInferenceToFile ?? false;
    this.selfHeal = !!selfHeal;
    this.onMetrics = onMetrics;
    this.defaultDomSettleTimeoutMs = defaultDomSettleTimeoutMs;
  }

  /** Reports token and timing figures of one act inference to `onMetrics`, defaulting missing figures to 0. */
  private recordActMetrics(response: ActInferenceResponse): void {
    this.onMetrics?.(
      V3FunctionName.ACT,
      response.prompt_tokens ?? 0,
      response.completion_tokens ?? 0,
      response.reasoning_tokens ?? 0,
      response.cached_input_tokens ?? 0,
      response.inference_time_ms ?? 0,
    );
  }

  /**
   * Runs act inference and converts the returned element into an `Action`.
   *
   * @returns The raw response, plus `action` when the element could be normalized
   *          (resolvable element id, and a supported method with arguments when required).
   */
  private async getActionFromLLM({
    instruction,
    domElements,
    xpathMap,
    llmClient,
    requireMethodAndArguments = true,
  }: {
    instruction: string;
    domElements: string;
    xpathMap: Record<EncodedId, string>;
    llmClient: LLMClient;
    requireMethodAndArguments?: boolean;
  }): Promise<{ action?: Action; response: ActInferenceResponse }> {
    const response = await actInference({
      instruction,
      domElements,
      llmClient,
      userProvidedInstructions: this.systemPrompt,
      logger: v3Logger,
      logInferenceToFile: this.logInferenceToFile,
    });

    this.recordActMetrics(response);

    const normalized = normalizeActInferenceElement(
      response.element as ActInferenceElement | undefined,
      xpathMap,
      requireMethodAndArguments,
    );

    if (!normalized) {
      return { response };
    }

    return {
      action: { ...normalized } as Action,
      response,
    };
  }

  /**
   * Performs an instruction on the page.
   *
   * The timeout guard is checked before each awaited step. When the first inference reports
   * `twoStep === true`, a second action is inferred from the diff between the snapshots taken
   * before and after the first action, and both results are combined.
   */
  async act(params: ActHandlerParams): Promise<ActResult> {
    const { instruction, page, variables, timeout, model } = params;

    const llmClient = this.resolveLlmClient(model);

    const ensureTimeRemaining = createTimeoutGuard(
      timeout,
      (ms) => new ActTimeoutError(ms),
    );

    ensureTimeRemaining();
    await waitForDomNetworkQuiet(
      page.mainFrame(),
      this.defaultDomSettleTimeoutMs,
    );
    ensureTimeRemaining();
    const { combinedTree, combinedXpathMap } = await captureHybridSnapshot(
      page,
      { experimental: true },
    );

    const actInstruction = buildActPrompt(
      instruction,
      Object.values(SupportedUnderstudyAction),
      variables,
    );

    ensureTimeRemaining();
    const { action: firstAction, response: actInferenceResponse } =
      await this.getActionFromLLM({
        instruction: actInstruction,
        domElements: combinedTree,
        xpathMap: combinedXpathMap,
        llmClient,
      });

    if (!firstAction) {
      v3Logger({
        category: "action",
        message: "no actionable element returned by LLM",
        level: 1,
      });
      return {
        success: false,
        message: "Failed to perform act: No action found",
        actionDescription: instruction,
        actions: [],
      };
    }

    // First action (self-heal aware path)
    ensureTimeRemaining();
    const firstResult = await this.takeDeterministicAction(
      firstAction,
      page,
      this.defaultDomSettleTimeoutMs,
      llmClient,
      ensureTimeRemaining,
      variables,
    );

    // If not two-step, return the first action result
    if (actInferenceResponse?.twoStep !== true) {
      return firstResult;
    }

    // Take a new focused snapshot and observe again
    ensureTimeRemaining();
    const { combinedTree: combinedTree2, combinedXpathMap: combinedXpathMap2 } =
      await captureHybridSnapshot(page, {
        experimental: true,
      });

    let diffedTree = diffCombinedTrees(combinedTree, combinedTree2);
    if (!diffedTree.trim()) {
      // Fallback: if no diff detected, use the fresh tree to avoid empty context
      diffedTree = combinedTree2;
    }

    const previousAction = `method: ${firstAction.method}, description: ${firstAction.description}, arguments: ${firstAction.arguments}`;

    // The second step is offered every supported action except SELECT_OPTION_FROM_DROPDOWN.
    const stepTwoInstructions = buildStepTwoPrompt(
      instruction,
      previousAction,
      Object.values(SupportedUnderstudyAction).filter(
        (
          action,
        ): action is Exclude<
          SupportedUnderstudyAction,
          SupportedUnderstudyAction.SELECT_OPTION_FROM_DROPDOWN
        > => action !== SupportedUnderstudyAction.SELECT_OPTION_FROM_DROPDOWN,
      ),
      variables,
    );

    ensureTimeRemaining();
    const { action: secondAction } = await this.getActionFromLLM({
      instruction: stepTwoInstructions,
      domElements: diffedTree,
      xpathMap: combinedXpathMap2,
      llmClient,
    });

    if (!secondAction) {
      // No second action found — return first result as-is
      return firstResult;
    }

    ensureTimeRemaining();
    const secondResult = await this.takeDeterministicAction(
      secondAction,
      page,
      this.defaultDomSettleTimeoutMs,
      llmClient,
      ensureTimeRemaining,
      variables,
    );

    // Combine results
    return {
      success: firstResult.success && secondResult.success,
      // The combined message has the same shape whether or not the second step succeeded.
      message: `${firstResult.message} → ${secondResult.message}`,
      actionDescription: firstResult.actionDescription,
      actions: [
        ...(firstResult.actions || []),
        ...(secondResult.actions || []),
      ],
    };
  }

  /**
   * Performs one concrete action on the page.
   *
   * Variable placeholders (`%name%`) are substituted into the arguments used for execution,
   * while the returned `actions` keep the original placeholder arguments. On failure, and when
   * self-heal is enabled, a fresh snapshot is taken, a new selector is inferred, and the
   * original method/arguments are retried once.
   *
   * @throws ActTimeoutError Re-thrown unchanged from the action, the retry, or the timeout guard.
   */
  async takeDeterministicAction(
    action: Action,
    page: Page,
    domSettleTimeoutMs?: number,
    llmClientOverride?: LLMClient,
    ensureTimeRemaining?: () => void,
    variables?: Variables,
  ): Promise<ActResult> {
    ensureTimeRemaining?.();
    const settleTimeout = domSettleTimeoutMs ?? this.defaultDomSettleTimeoutMs;
    const effectiveClient = llmClientOverride ?? this.llmClient;

    const method = action.method?.trim();
    if (!method || method === "not-supported") {
      v3Logger({
        category: "action",
        message: "action has no supported method",
        level: 0,
        auxiliary: {
          act: { value: JSON.stringify(action), type: "object" },
        },
      });
      return {
        success: false,
        message: `Unable to perform action: The method '${method ?? ""}' is not supported in Action. Please use a supported Playwright locator method.`,
        actionDescription:
          action.description || `Action (${method ?? "unknown"})`,
        actions: [],
      };
    }

    // Copy of the unresolved arguments, reported back to the caller in place of substituted values.
    const placeholderArgs = Array.isArray(action.arguments)
      ? [...action.arguments]
      : [];
    const resolvedArgs =
      substituteVariablesInArguments(action.arguments, variables) ?? [];

    try {
      ensureTimeRemaining?.();
      await performUnderstudyMethod(
        page,
        page.mainFrame(),
        method,
        action.selector,
        resolvedArgs,
        settleTimeout,
      );
      return {
        success: true,
        message: `Action [${method}] performed successfully on selector: ${action.selector}`,
        actionDescription: action.description || `action (${method})`,
        actions: [
          {
            selector: action.selector,
            description: action.description || `action (${method})`,
            method,
            arguments: placeholderArgs,
          },
        ],
      };
    } catch (err) {
      if (err instanceof ActTimeoutError) {
        throw err;
      }
      const msg = err instanceof Error ? err.message : String(err);

      // Attempt self-heal: rerun actInference and retry with updated selector
      if (this.selfHeal) {
        v3Logger({
          category: "action",
          message:
            "Error performing action. Reprocessing the page and trying again",
          level: 1,
          auxiliary: {
            error: { value: msg, type: "string" },
            action: {
              value: JSON.stringify(action),
              type: "object",
            },
          },
        });

        try {
          // Build an instruction combining method + description, avoiding duplication
          const actCommand = action.description
            ? action.description.toLowerCase().startsWith(method.toLowerCase())
              ? action.description
              : `${method} ${action.description}`
            : method;

          // Take a fresh snapshot and ask for a new actionable element
          ensureTimeRemaining?.();
          const { combinedTree, combinedXpathMap } =
            await captureHybridSnapshot(page, {
              experimental: true,
            });

          const instruction = buildActPrompt(
            actCommand,
            Object.values(SupportedUnderstudyAction),
            {},
          );

          ensureTimeRemaining?.();
          const { action: fallbackAction, response: fallbackResponse } =
            await this.getActionFromLLM({
              instruction,
              domElements: combinedTree,
              xpathMap: combinedXpathMap,
              llmClient: effectiveClient,
              requireMethodAndArguments: false,
            });

          const fallbackElement = fallbackResponse.element;
          if (!fallbackElement) {
            return {
              success: false,
              message:
                "Failed to self-heal act: No observe results found for action",
              actionDescription: actCommand,
              actions: [],
            };
          }

          // Retry with original method/args but new selector from fallback
          let newSelector = action.selector;
          if (fallbackAction?.selector) {
            newSelector = fallbackAction.selector;
          }

          ensureTimeRemaining?.();
          await performUnderstudyMethod(
            page,
            page.mainFrame(),
            method,
            newSelector,
            resolvedArgs,
            settleTimeout,
          );

          return {
            success: true,
            message: `Action [${method}] performed successfully on selector: ${newSelector}`,
            actionDescription: action.description || `action (${method})`,
            actions: [
              {
                selector: newSelector,
                description: action.description || `action (${method})`,
                method,
                arguments: placeholderArgs,
              },
            ],
          };
        } catch (retryErr) {
          if (retryErr instanceof ActTimeoutError) {
            throw retryErr;
          }
          const retryMsg =
            retryErr instanceof Error ? retryErr.message : String(retryErr);
          return {
            success: false,
            message: `Failed to perform act after self-heal: ${retryMsg}`,
            actionDescription: action.description || `action (${method})`,
            actions: [],
          };
        }
      }

      return {
        success: false,
        message: `Failed to perform act: ${msg}`,
        actionDescription: action.description || `action (${method})`,
        actions: [],
      };
    }
  }
}

/**
 * Converts an inferred element into an `Action` with an `xpath=` selector.
 *
 * Returns undefined when the element is missing, when a supported method and an arguments
 * array are required but absent, when the element id has no "-" or does not resolve to an
 * xpath, or when a dragAndDrop target argument is not a resolvable element id.
 */
function normalizeActInferenceElement(
  element: ActInferenceElement | undefined,
  xpathMap: Record<EncodedId, string>,
  requireMethodAndArguments = true,
): Action | undefined {
  if (!element) {
    return undefined;
  }
  const { elementId, description, method, arguments: args } = element;
  const hasArgs = Array.isArray(args);

  if (
    requireMethodAndArguments &&
    (!method || method === "not-supported" || !hasArgs)
  ) {
    return undefined;
  }

  if (typeof elementId !== "string" || !elementId.includes("-")) {
    return undefined;
  }

  const xp = xpathMap[elementId as EncodedId];
  const trimmed = trimTrailingTextNode(xp);
  if (!trimmed) {
    return undefined;
  }

  // For dragAndDrop, convert element ID in arguments to xpath (target element)
  let resolvedArgs = hasArgs ? args : undefined;
  if (method === "dragAndDrop" && hasArgs && args.length > 0) {
    const targetArg = args[0];
    // Check if argument looks like an element ID (e.g., "1-67")
    if (typeof targetArg === "string" && /^\d+-\d+$/.test(targetArg)) {
      const argXpath = xpathMap[targetArg as EncodedId];
      const trimmedArgXpath = trimTrailingTextNode(argXpath);
      if (trimmedArgXpath) {
        resolvedArgs = [`xpath=${trimmedArgXpath}`, ...args.slice(1)];
      } else {
        // Target element lookup failed, filter out this action
        v3Logger({
          category: "action",
          message: "dragAndDrop target element lookup failed",
          level: 1,
          auxiliary: {
            targetElementId: { value: targetArg, type: "string" },
            sourceElementId: { value: elementId, type: "string" },
          },
        });
        return undefined;
      }
    } else {
      v3Logger({
        category: "action",
        message: "dragAndDrop target element invalid ID format",
        level: 0,
        auxiliary: {
          targetElementId: { value: String(targetArg), type: "string" },
          sourceElementId: { value: elementId, type: "string" },
        },
      });
      return undefined;
    }
  }

  return {
    description,
    method,
    arguments: resolvedArgs,
    selector: `xpath=${trimmed}`,
  } as Action;
}

/**
 * Replaces every `%name%` token in each argument with the resolved value of the matching
 * variable. Returns `args` unchanged when there are no variables or `args` is not an array.
 */
function substituteVariablesInArguments(
  args: string[] | undefined,
  variables?: Variables,
): string[] | undefined {
  if (!variables || !Array.isArray(args)) {
    return args;
  }
  return args.map((arg: string) => {
    let out = arg;
    for (const [key, v] of Object.entries(variables)) {
      const token = `%${key}%`;
      // split/join replaces all occurrences without treating the token as a regex.
      out = out.split(token).join(resolveVariableValue(v));
    }
    return out;
  });
}

================================================
FILE: packages/core/lib/v3/handlers/extractHandler.ts
================================================
// lib/v3/handlers/extractHandler.ts
import { extract as runExtract } from "../../inference.js";
import {
  getZFactory,
  getZodType,
  injectUrls,
  transformSchema,
} from "../../utils.js";
import { v3Logger } from "../logger.js";
import { V3FunctionName } from "../types/public/methods.js";
import { captureHybridSnapshot } from "../understudy/a11y/snapshot/index.js";
import type { ZodTypeAny } from "zod"; import { LLMClient } from "../llm/LLMClient.js"; import { ExtractHandlerParams } from "../types/private/handlers.js"; import { EncodedId, ZodPathSegments } from "../types/private/internal.js"; import { defaultExtractSchema, pageTextSchema, } from "../types/public/methods.js"; import { AvailableModel, ClientOptions, ModelConfiguration, } from "../types/public/model.js"; import { StagehandInvalidArgumentError, ExtractTimeoutError, } from "../types/public/sdkErrors.js"; import { createTimeoutGuard } from "./handlerUtils/timeoutGuard.js"; import type { InferStagehandSchema, StagehandZodObject, StagehandZodSchema, } from "../zodCompat.js"; /** * Scans the provided Zod schema for any `z.string().url()` fields and * replaces them with `z.number()`. * * @param schema - The Zod object schema to transform. * @returns A tuple containing: * 1. The transformed schema (or the original schema if no changes were needed). * 2. An array of {@link ZodPathSegments} objects representing all the replaced URL fields, * with each path segment showing where in the schema the replacement occurred. 
/**
 * Handler behind `extract()`: captures a hybrid a11y snapshot of the page and,
 * when an instruction is given, asks the LLM to populate a schema from it.
 */
export class ExtractHandler {
  private readonly llmClient: LLMClient;
  private readonly defaultModelName: AvailableModel;
  private readonly defaultClientOptions: ClientOptions;
  private readonly resolveLlmClient: (model?: ModelConfiguration) => LLMClient;
  private readonly systemPrompt: string;
  private readonly logInferenceToFile: boolean;
  private readonly experimental: boolean;
  private readonly onMetrics?: (
    functionName: V3FunctionName,
    promptTokens: number,
    completionTokens: number,
    reasoningTokens: number,
    cachedInputTokens: number,
    inferenceTimeMs: number,
  ) => void;

  /**
   * @param llmClient - Stored on the instance; `extract()` obtains the client
   *   it actually uses through `resolveLlmClient`.
   * @param defaultModelName - Stored on the instance.
   * @param defaultClientOptions - Stored on the instance.
   * @param resolveLlmClient - Maps the per-call `model` option to an LLM client.
   * @param systemPrompt - Passed to inference as user-provided instructions
   *   (defaults to `""`).
   * @param logInferenceToFile - Forwarded to inference (defaults to `false`).
   * @param experimental - Forwarded to snapshot capture (defaults to `false`).
   * @param onMetrics - Optional callback invoked with token/time usage after
   *   each inference call.
   */
  constructor(
    llmClient: LLMClient,
    defaultModelName: AvailableModel,
    defaultClientOptions: ClientOptions,
    resolveLlmClient: (model?: ModelConfiguration) => LLMClient,
    systemPrompt?: string,
    logInferenceToFile?: boolean,
    experimental?: boolean,
    onMetrics?: (
      functionName: V3FunctionName,
      promptTokens: number,
      completionTokens: number,
      reasoningTokens: number,
      cachedInputTokens: number,
      inferenceTimeMs: number,
    ) => void,
  ) {
    this.llmClient = llmClient;
    this.defaultModelName = defaultModelName;
    this.defaultClientOptions = defaultClientOptions;
    this.resolveLlmClient = resolveLlmClient;
    this.systemPrompt = systemPrompt ?? "";
    this.logInferenceToFile = logInferenceToFile ?? false;
    this.experimental = experimental ?? false;
    this.onMetrics = onMetrics;
  }

  /**
   * Extracts data from the page.
   *
   * - With neither `instruction` nor `schema`, returns `{ pageText }` holding
   *   the snapshot's combined tree, validated by `pageTextSchema`.
   * - With an `instruction`, runs LLM inference over the snapshot using
   *   `schema` (or `defaultExtractSchema` when none is given). Non-object
   *   schemas are wrapped under a `value` key for inference and unwrapped
   *   before returning; URL fields are swapped for numeric ids during
   *   inference and re-injected afterwards.
   *
   * @throws StagehandInvalidArgumentError when a schema is given without an
   *   instruction.
   * @throws ExtractTimeoutError when `timeout` has elapsed at one of the
   *   checkpoints between steps.
   */
  async extract<T extends StagehandZodSchema>(
    params: ExtractHandlerParams<T>,
  ): Promise<InferStagehandSchema<T> | { pageText: string }> {
    const { instruction, schema, page, selector, timeout, model } = params;

    const llmClient = this.resolveLlmClient(model);
    const ensureTimeRemaining = createTimeoutGuard(
      timeout,
      (ms) => new ExtractTimeoutError(ms),
    );

    // Normalise the selector once for both code paths: strip an optional,
    // case-insensitive "xpath=" prefix and treat an empty result as "no focus".
    const focusSelector = selector?.replace(/^xpath=/i, "") || undefined;

    // No-args → page text (parity with v2)
    if (!instruction && !schema) {
      ensureTimeRemaining();
      const snap = await captureHybridSnapshot(page, {
        experimental: this.experimental,
        focusSelector,
      });
      ensureTimeRemaining();

      // Validate via the same schema used in v2
      return pageTextSchema.parse({ pageText: snap.combinedTree });
    }

    if (!instruction && schema) {
      throw new StagehandInvalidArgumentError(
        "extract() requires an instruction when a schema is provided.",
      );
    }

    // Build the hybrid snapshot (includes combinedTree; combinedUrlMap optional)
    ensureTimeRemaining();
    const { combinedTree, combinedUrlMap } = await captureHybridSnapshot(page, {
      experimental: this.experimental,
      focusSelector,
    });

    v3Logger({
      category: "extraction",
      message: "Starting extraction using a11y snapshot",
      level: 1,
      auxiliary: instruction
        ? { instruction: { value: instruction, type: "string" } }
        : undefined,
    });

    // Normalize schema: if instruction provided without schema, use defaultExtractSchema
    const baseSchema: StagehandZodSchema = (schema ??
      defaultExtractSchema) as StagehandZodSchema;

    // Inference needs an object schema; wrap anything else under WRAP_KEY.
    const isObjectSchema = getZodType(baseSchema) === "object";
    const WRAP_KEY = "value" as const;
    const factory = getZFactory(baseSchema);
    const objectSchema: StagehandZodObject = isObjectSchema
      ? (baseSchema as StagehandZodObject)
      : (factory.object({
          [WRAP_KEY]: baseSchema as ZodTypeAny,
        }) as StagehandZodObject);

    const [transformedSchema, urlFieldPaths] =
      transformUrlStringsToNumericIds(objectSchema);

    ensureTimeRemaining();
    const extractionResponse = await runExtract({
      instruction,
      domElements: combinedTree,
      schema: transformedSchema as StagehandZodObject,
      llmClient,
      userProvidedInstructions: this.systemPrompt,
      logger: v3Logger,
      logInferenceToFile: this.logInferenceToFile,
    });

    // Split bookkeeping fields from the extracted payload.
    const {
      metadata: { completed },
      prompt_tokens,
      completion_tokens,
      reasoning_tokens = 0,
      cached_input_tokens = 0,
      inference_time_ms,
      ...rest
    } = extractionResponse;
    let output: unknown = rest;

    // Update EXTRACT metrics from the LLM calls
    this.onMetrics?.(
      V3FunctionName.EXTRACT,
      prompt_tokens,
      completion_tokens,
      reasoning_tokens,
      cached_input_tokens,
      inference_time_ms,
    );

    // Re-inject URLs for any url() fields we temporarily converted to number()
    const idToUrl: Record<EncodedId, string> = (combinedUrlMap ??
      {}) as Record<EncodedId, string>;
    for (const { segments } of urlFieldPaths) {
      injectUrls(
        output as Record<string, unknown>,
        segments,
        idToUrl as unknown as Record<string, string>,
      );
    }

    // If we wrapped a non-object schema, unwrap the value
    if (!isObjectSchema && output && typeof output === "object") {
      output = (output as Record<string, unknown>)[WRAP_KEY];
    }

    // Keep the log line bounded regardless of how large the result is.
    const resultPreviewLength = 200;
    const resultString = JSON.stringify(output) ?? "undefined";
    const resultPreview =
      resultString.length > resultPreviewLength
        ? resultString.slice(0, resultPreviewLength) + "..."
        : resultString;

    v3Logger({
      category: "extraction",
      message: completed
        ? "Extraction completed successfully"
        : "Extraction incomplete after processing all data",
      level: 1,
      auxiliary: {
        prompt_tokens: { value: String(prompt_tokens), type: "string" },
        completion_tokens: { value: String(completion_tokens), type: "string" },
        inference_time_ms: {
          value: String(inference_time_ms),
          type: "string",
        },
        result: { value: resultPreview, type: "string" },
      },
    });

    return output as InferStagehandSchema<T>;
  }
}
"").trim(); if (s === "/") return "/html"; if (/^xpath=\/$/i.test(s)) return "xpath=/html"; return s; } export async function performUnderstudyMethod( page: Page, frame: Frame, method: string, rawXPath: string, args: ReadonlyArray, domSettleTimeoutMs?: number, ): Promise { const selectorRaw = normalizeRootXPath(rawXPath); try { await FlowLogger.runWithLogging( { eventType: `Understudy${toTitleCase(method)}`, // e.g. "UnderstudyClick" data: { target: selectorRaw, }, }, async () => { // Unified resolver: supports '>>' hops and XPath across iframes. const locator: Locator = await resolveLocatorWithHops( page, frame, selectorRaw, ); const initialUrl = await getFrameUrl(frame); v3Logger({ category: "action", message: "performing understudy method", level: 2, auxiliary: { xpath: { value: selectorRaw, type: "string" }, method: { value: method, type: "string" }, url: { value: initialUrl, type: "string" }, }, }); const ctx: UnderstudyMethodHandlerContext = { method, locator, xpath: selectorRaw, args: args.map((a) => (a == null ? "" : String(a))), frame, page, initialUrl, domSettleTimeoutMs, }; const handler = METHOD_HANDLER_MAP[method] ?? null; if (handler) { await handler(ctx); return; } v3Logger({ category: "action", message: "chosen method is invalid", level: 1, auxiliary: { method: { value: method, type: "string" } }, }); throw new UnderstudyCommandException(`Method ${method} not supported`); }, args, ); } catch (e) { const msg = e instanceof Error ? e.message : String(e); const stack = e instanceof Error ? e.stack : undefined; v3Logger({ category: "action", message: "error performing method", level: 1, auxiliary: { error: { value: msg, type: "string" }, trace: { value: stack ?? 
"", type: "string" }, method: { value: method, type: "string" }, xpath: { value: selectorRaw, type: "string" }, args: { value: JSON.stringify(args), type: "object" }, }, }); if (e instanceof UnderstudyCommandException) { throw e; } throw new UnderstudyCommandException(msg, e); } } /* ===================== Handlers & Map ===================== */ const METHOD_HANDLER_MAP: Record< string, (ctx: UnderstudyMethodHandlerContext) => Promise > = { scrollIntoView, scrollByPixelOffset, scrollTo: scrollElementToPercentage, scroll: scrollElementToPercentage, "mouse.wheel": wheelScroll, fill: fillOrType, type: typeText, press: pressKey, click: clickElement, doubleClick, dragAndDrop, nextChunk: scrollToNextChunk, prevChunk: scrollToPreviousChunk, selectOptionFromDropdown: selectOption, selectOption: selectOption, hover: hover, }; export async function selectOption(ctx: UnderstudyMethodHandlerContext) { const { locator, xpath, args } = ctx; try { const text = args[0]?.toString() || ""; await locator.selectOption(text); } catch (e) { const msg = e instanceof Error ? e.message : String(e); const stack = e instanceof Error ? e.stack : undefined; v3Logger({ category: "action", message: "error selecting option", level: 0, auxiliary: { error: { value: msg, type: "string" }, trace: { value: stack ?? 
/**
 * Scrolls the located element into view via `DOM.scrollIntoViewIfNeeded`.
 *
 * The remote object handle obtained from the locator is released whether or
 * not the scroll command succeeds; a failing release is ignored.
 */
async function scrollIntoView(
  ctx: UnderstudyMethodHandlerContext,
): Promise<void> {
  const { locator, xpath } = ctx;
  v3Logger({
    category: "action",
    message: "scrolling element into view",
    level: 2,
    auxiliary: { xpath: { value: xpath, type: "string" } },
  });

  const { objectId } = await locator.resolveNode();
  const ownerSession = locator.getFrame().session;
  try {
    await ownerSession.send("DOM.scrollIntoViewIfNeeded", { objectId });
  } finally {
    // Best-effort cleanup: never let a release failure mask the real outcome.
    await ownerSession
      .send("Runtime.releaseObject", { objectId })
      .catch(() => {});
  }
}

/**
 * Scrolls the located element to the vertical position given by the first
 * argument, falling back to `"0%"` when no argument is supplied.
 */
async function scrollElementToPercentage(
  ctx: UnderstudyMethodHandlerContext,
): Promise<void> {
  const { locator, xpath, args } = ctx;

  v3Logger({
    category: "action",
    message: "scrolling element vertically to specified percentage",
    level: 2,
    auxiliary: {
      xpath: { value: xpath, type: "string" },
      coordinate: { value: JSON.stringify(args), type: "string" },
    },
  });

  const [yArg = "0%"] = args;
  await locator.scrollTo(yArg);
}

/**
 * Scroll the page by pixel offset, starting from the element's center.
 *
 * `args[0]` / `args[1]` are the horizontal / vertical deltas (each defaults
 * to 0 when absent).
 *
 * @throws UnderstudyCommandException wrapping any failure to locate the
 *   element's center or to scroll.
 */
async function scrollByPixelOffset(
  ctx: UnderstudyMethodHandlerContext,
): Promise<void> {
  const { locator, page, args } = ctx;
  const dx = Number(args[0] ?? 0);
  const dy = Number(args[1] ?? 0);

  try {
    const center = await locator.centroid();
    await page.scroll(center.x, center.y, dx, dy);
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    throw new UnderstudyCommandException(msg, e);
  }
}
/**
 * Replaces the element's content with the first argument (or `""`): the
 * element is first cleared with an empty fill, then filled with the value.
 *
 * @throws UnderstudyCommandException wrapping any failure, after logging it.
 */
async function fillOrType(ctx: UnderstudyMethodHandlerContext): Promise<void> {
  const { locator, xpath, args } = ctx;
  const value = args[0] ?? "";

  try {
    await locator.fill(""); // clear
    await locator.fill(value);
  } catch (e) {
    let reason: string;
    if (e instanceof Error) {
      reason = e.message;
    } else {
      reason = String(e);
    }

    v3Logger({
      category: "action",
      message: "error filling element",
      level: 1,
      auxiliary: {
        error: { value: reason, type: "string" },
        xpath: { value: xpath, type: "string" },
      },
    });
    throw new UnderstudyCommandException(reason, e);
  }
}

/**
 * Types the first argument (or `""`) into the element.
 *
 * @throws UnderstudyCommandException wrapping any failure, after logging it.
 */
async function typeText(ctx: UnderstudyMethodHandlerContext): Promise<void> {
  const { locator, xpath, args } = ctx;
  const text = args[0] ?? "";

  try {
    await locator.type(text);
  } catch (e) {
    let reason: string;
    if (e instanceof Error) {
      reason = e.message;
    } else {
      reason = String(e);
    }

    v3Logger({
      category: "action",
      message: "error typing into element",
      level: 1,
      auxiliary: {
        error: { value: reason, type: "string" },
        xpath: { value: xpath, type: "string" },
      },
    });
    throw new UnderstudyCommandException(reason, e);
  }
}
/**
 * Converts a point expressed in the local viewport of the locator's frame
 * into main-viewport coordinates.
 *
 * The callback runs inside the frame, so it must stay self-contained: it walks
 * up the window chain adding each frame element's top-left offset, and stops
 * early when a frame element is not available.
 */
async function localPointToMainViewport(
  locator: Locator,
  point: { x: number; y: number },
): Promise<{ x: number; y: number }> {
  return locator
    .getFrame()
    .evaluate<{ x: number; y: number }, { x: number; y: number }>(
      ({ x, y }: { x: number; y: number }) => {
        let X = x;
        let Y = y;
        let w: Window = window;
        while (w !== w.top) {
          const fe = w.frameElement as HTMLElement | null;
          if (!fe) break;
          const r = fe.getBoundingClientRect();
          X += r.left;
          Y += r.top;
          w = w.parent as Window;
        }
        return { x: Math.round(X), y: Math.round(Y) };
      },
      point,
    );
}

/**
 * Drags the located element onto the element identified by `args[0]`.
 *
 * Both element centers are converted to main-viewport coordinates and the drag
 * is performed through `page.dragAndDrop` with 10 steps and a delay of 5.
 *
 * @throws UnderstudyCommandException when no target selector is supplied, or
 *   wrapping any failure while resolving the target, measuring either element
 *   or performing the drag (the failure is logged first).
 */
async function dragAndDrop(ctx: UnderstudyMethodHandlerContext): Promise<void> {
  const { page, frame, locator, args, xpath } = ctx;
  const toXPath = String(args[0] ?? "").trim();
  if (!toXPath)
    throw new UnderstudyCommandException(
      "dragAndDrop requires a target XPath arg",
    );

  try {
    // Resolved inside the try so a missing target is logged with from/to context.
    const targetLocator = await resolveLocatorWithHops(page, frame, toXPath);

    // 1) Centers in local (owning-frame) viewport
    const { x: fromLocalX, y: fromLocalY } = await locator.centroid();
    const { x: toLocalX, y: toLocalY } = await targetLocator.centroid();

    // 2) Convert to main-viewport absolute coordinates
    const fromAbs = await localPointToMainViewport(locator, {
      x: fromLocalX,
      y: fromLocalY,
    });
    const toAbs = await localPointToMainViewport(targetLocator, {
      x: toLocalX,
      y: toLocalY,
    });

    // 3) Perform drag in main session
    await page.dragAndDrop(fromAbs.x, fromAbs.y, toAbs.x, toAbs.y, {
      steps: 10,
      delay: 5,
    });
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    v3Logger({
      category: "action",
      message: "error performing dragAndDrop",
      level: 0,
      auxiliary: {
        error: { value: msg, type: "string" },
        from: { value: xpath, type: "string" },
        to: { value: toXPath, type: "string" },
      },
    });
    throw new UnderstudyCommandException(msg, e);
  }
}
/**
 * Hovers over the located element.
 *
 * @throws UnderstudyCommandException wrapping any failure, after logging the
 *   message, stack trace (when available) and xpath.
 */
export async function hover(ctx: UnderstudyMethodHandlerContext) {
  const { locator, xpath } = ctx;

  try {
    await locator.hover();
  } catch (e) {
    const isError = e instanceof Error;
    const reason = isError ? e.message : String(e);
    const trace = (isError ? e.stack : undefined) ?? "";

    v3Logger({
      category: "action",
      message: "error attempting to hover",
      level: 0,
      auxiliary: {
        error: { value: reason, type: "string" },
        trace: { value: trace, type: "string" },
        xpath: { value: xpath, type: "string" },
      },
    });
    throw new UnderstudyCommandException(reason, e);
  }
}

/* ===================== Helpers ===================== */

/** Reads the frame's current URL by evaluating `location.href` inside it. */
async function getFrameUrl(frame: Frame): Promise<string> {
  return frame.evaluate<string>("location.href");
}
let hasDoc: boolean; try { const rs = await frame.evaluate("document.readyState"); hasDoc = rs === "interactive" || rs === "complete"; } catch { hasDoc = false; } if (!hasDoc && overallTimeout > 0) { await frame .waitForLoadState("domcontentloaded", overallTimeout) .catch(() => {}); } const elapsed = Date.now() - settleStart; const remainingBudget = Math.max(0, overallTimeout - elapsed); if (remainingBudget === 0) { return; } await client.send("Network.enable").catch(() => {}); await client.send("Page.enable").catch(() => {}); // Best-effort; some sessions may not support Target.setAutoAttach here. await client .send("Target.setAutoAttach", { autoAttach: true, waitForDebuggerOnStart: false, flatten: true, filter: [ { type: "worker", exclude: true }, { type: "shared_worker", exclude: true }, ], }) .catch(() => {}); return new Promise((resolve) => { const inflight = new Set(); const meta = new Map(); const docByFrame = new Map(); let quietTimer: NodeJS.Timeout | null = null; let stalledRequestSweepTimer: NodeJS.Timeout | null = null; const clearQuiet = () => { if (quietTimer) { clearTimeout(quietTimer); quietTimer = null; } }; const maybeQuiet = () => { if (inflight.size === 0 && !quietTimer) quietTimer = setTimeout(() => resolveDone(), 500); }; const finishReq = (id: string) => { if (!inflight.delete(id)) return; meta.delete(id); for (const [fid, rid] of docByFrame) if (rid === id) docByFrame.delete(fid); clearQuiet(); maybeQuiet(); }; const onRequest = (p: Protocol.Network.RequestWillBeSentEvent) => { // Ignore long-lived streams // ResourceType includes: Document, XHR, Fetch, WebSocket, EventSource, etc. 
if (p.type === "WebSocket" || p.type === "EventSource") return; inflight.add(p.requestId); meta.set(p.requestId, { url: p.request.url, start: Date.now() }); if (p.type === "Document" && p.frameId) docByFrame.set(p.frameId, p.requestId); clearQuiet(); }; const onFinish = (p: { requestId: string }) => finishReq(p.requestId); const onCached = (p: { requestId: string }) => finishReq(p.requestId); const onDataUrl = (p: Protocol.Network.ResponseReceivedEvent) => { if (p.response.url?.startsWith("data:")) finishReq(p.requestId); }; const onFrameStop = (f: Protocol.Page.FrameStoppedLoadingEvent) => { const id = docByFrame.get(f.frameId); if (id) finishReq(id); }; client.on("Network.requestWillBeSent", onRequest); client.on("Network.loadingFinished", onFinish); client.on("Network.loadingFailed", onFinish); client.on("Network.requestServedFromCache", onCached); client.on("Network.responseReceived", onDataUrl); client.on("Page.frameStoppedLoading", onFrameStop); stalledRequestSweepTimer = setInterval(() => { const now = Date.now(); for (const [id, m] of meta) { if (now - m.start > 2_000) { inflight.delete(id); meta.delete(id); v3Logger({ category: "dom", message: "⏳ forcing completion of stalled iframe document", level: 1, auxiliary: { url: { value: (m.url ?? 
"").slice(0, 120), type: "string" }, }, }); } } maybeQuiet(); }, 500); maybeQuiet(); const guard = setTimeout(() => { if (inflight.size) { v3Logger({ category: "dom", message: "⚠️ DOM-settle timeout reached – network requests still pending", level: 1, auxiliary: { count: { value: String(inflight.size), type: "integer" }, }, }); } resolveDone(); }, remainingBudget); const resolveDone = () => { client.off("Network.requestWillBeSent", onRequest); client.off("Network.loadingFinished", onFinish); client.off("Network.loadingFailed", onFinish); client.off("Network.requestServedFromCache", onCached); client.off("Network.responseReceived", onDataUrl); client.off("Page.frameStoppedLoading", onFrameStop); if (quietTimer) clearTimeout(quietTimer); if (stalledRequestSweepTimer) clearInterval(stalledRequestSweepTimer); clearTimeout(guard); resolve(); }; }); } ================================================ FILE: packages/core/lib/v3/handlers/handlerUtils/timeoutGuard.ts ================================================ import { TimeoutError } from "../../types/public/sdkErrors.js"; export type TimeoutGuard = () => void; export function createTimeoutGuard( timeoutMs?: number, errorFactory?: (timeoutMs: number) => Error, ): TimeoutGuard { if (!timeoutMs || timeoutMs <= 0) { return () => {}; } const startTime = Date.now(); return () => { if (Date.now() - startTime >= timeoutMs) { const err = errorFactory?.(timeoutMs) ?? 
new TimeoutError("operation", timeoutMs); throw err; } }; } ================================================ FILE: packages/core/lib/v3/handlers/observeHandler.ts ================================================ // lib/v3/handlers/observeHandler.ts import { observe as runObserve } from "../../inference.js"; import { trimTrailingTextNode } from "../../utils.js"; import { v3Logger } from "../logger.js"; import { V3FunctionName } from "../types/public/methods.js"; import { captureHybridSnapshot } from "../understudy/a11y/snapshot/index.js"; import { LLMClient } from "../llm/LLMClient.js"; import { ObserveHandlerParams, SupportedUnderstudyAction, } from "../types/private/handlers.js"; import { EncodedId } from "../types/private/internal.js"; import { Action } from "../types/public/methods.js"; import { AvailableModel, ClientOptions, ModelConfiguration, } from "../types/public/model.js"; import { ObserveTimeoutError } from "../types/public/sdkErrors.js"; import { createTimeoutGuard } from "./handlerUtils/timeoutGuard.js"; export class ObserveHandler { private readonly llmClient: LLMClient; private readonly defaultModelName: AvailableModel; private readonly defaultClientOptions: ClientOptions; private readonly resolveLlmClient: (model?: ModelConfiguration) => LLMClient; private readonly systemPrompt: string; private readonly logInferenceToFile: boolean; private readonly experimental: boolean; private readonly onMetrics?: ( functionName: V3FunctionName, promptTokens: number, completionTokens: number, reasoningTokens: number, cachedInputTokens: number, inferenceTimeMs: number, ) => void; constructor( llmClient: LLMClient, defaultModelName: AvailableModel, defaultClientOptions: ClientOptions, resolveLlmClient: (model?: ModelConfiguration) => LLMClient, systemPrompt?: string, logInferenceToFile?: boolean, experimental?: boolean, onMetrics?: ( functionName: V3FunctionName, promptTokens: number, completionTokens: number, reasoningTokens: number, cachedInputTokens: number, 
inferenceTimeMs: number, ) => void, ) { this.llmClient = llmClient; this.defaultModelName = defaultModelName; this.defaultClientOptions = defaultClientOptions; this.resolveLlmClient = resolveLlmClient; this.systemPrompt = systemPrompt ?? ""; this.logInferenceToFile = logInferenceToFile ?? false; this.experimental = experimental ?? false; this.onMetrics = onMetrics; } async observe(params: ObserveHandlerParams): Promise { const { instruction, page, timeout, selector, model } = params; const llmClient = this.resolveLlmClient(model); const ensureTimeRemaining = createTimeoutGuard( timeout, (ms) => new ObserveTimeoutError(ms), ); const effectiveInstruction = instruction ?? "Find elements that can be used for any future actions in the page. These may be navigation links, related pages, section/subsection links, buttons, or other interactive elements. Be comprehensive: if there are multiple elements that may be relevant for future actions, return all of them."; v3Logger({ category: "observation", message: "starting observation", level: 1, auxiliary: { instruction: { value: effectiveInstruction, type: "string", }, }, }); // Build the hybrid snapshot (a11y-centric text tree + lookup maps) const focusSelector = selector?.replace(/^xpath=/i, "") ?? ""; ensureTimeRemaining(); const snapshot = await captureHybridSnapshot(page, { experimental: this.experimental, focusSelector: focusSelector || undefined, }); const combinedTree = snapshot.combinedTree; const combinedXpathMap = snapshot.combinedXpathMap ?? 
{}; v3Logger({ category: "observation", message: "Got accessibility tree data", level: 1, }); // Call the LLM to propose actionable elements ensureTimeRemaining(); const observationResponse = await runObserve({ instruction: effectiveInstruction, domElements: combinedTree, llmClient, userProvidedInstructions: this.systemPrompt, logger: v3Logger, logInferenceToFile: this.logInferenceToFile, supportedActions: Object.values(SupportedUnderstudyAction), }); const { prompt_tokens = 0, completion_tokens = 0, reasoning_tokens = 0, cached_input_tokens = 0, inference_time_ms = 0, } = observationResponse; // Update OBSERVE metrics from the LLM observation call this.onMetrics?.( V3FunctionName.OBSERVE, prompt_tokens, completion_tokens, reasoning_tokens, cached_input_tokens, inference_time_ms, ); // Map elementIds -> selectors via combinedXpathMap const elementsWithSelectors = ( await Promise.all( observationResponse.elements.map(async (element) => { const { elementId, ...rest } = element; // rest may or may not have method/arguments if (typeof elementId === "string" && elementId.includes("-")) { const lookUpIndex = elementId as EncodedId; const xpath = combinedXpathMap[lookUpIndex]; const trimmedXpath = trimTrailingTextNode(xpath); if (!trimmedXpath) return undefined; // For dragAndDrop, convert element ID in arguments to xpath (target element) let resolvedArgs = rest.arguments; if ( rest.method === "dragAndDrop" && Array.isArray(rest.arguments) && rest.arguments.length > 0 ) { const targetArg = rest.arguments[0]; // Check if argument looks like an element ID (e.g., "1-67") if ( typeof targetArg === "string" && /^\d+-\d+$/.test(targetArg) ) { const argXpath = combinedXpathMap[targetArg as EncodedId]; const trimmedArgXpath = trimTrailingTextNode(argXpath); if (trimmedArgXpath) { resolvedArgs = [ `xpath=${trimmedArgXpath}`, ...rest.arguments.slice(1), ]; } else { // Target element lookup failed, filter out this action v3Logger({ category: "observation", message: "dragAndDrop 
target element lookup failed", level: 0, auxiliary: { targetElementId: { value: targetArg, type: "string" }, sourceElementId: { value: elementId, type: "string" }, }, }); return undefined; } } else { v3Logger({ category: "observation", message: "dragAndDrop target element invalid ID format", level: 0, auxiliary: { targetElementId: { value: targetArg, type: "string" }, sourceElementId: { value: elementId, type: "string" }, }, }); return undefined; } } return { ...rest, arguments: resolvedArgs, selector: `xpath=${trimmedXpath}`, } as { description: string; method?: string; arguments?: string[]; selector: string; }; } // shadow-root fallback: return { description: "an element inside a shadow DOM", method: "not-supported", arguments: [], selector: "not-supported", }; }), ) ).filter((e: T | undefined): e is T => e !== undefined); v3Logger({ category: "observation", message: "found elements", level: 1, auxiliary: { elements: { value: JSON.stringify(elementsWithSelectors), type: "object", }, }, }); return elementsWithSelectors; } } ================================================ FILE: packages/core/lib/v3/handlers/v3AgentHandler.ts ================================================ import { createAgentTools } from "../agent/tools/index.js"; import { buildAgentSystemPrompt } from "../agent/prompts/agentSystemPrompt.js"; import { LogLine } from "../types/public/logs.js"; import { V3 } from "../v3.js"; import { ModelMessage, ToolSet, wrapLanguageModel, stepCountIs, LanguageModel, type LanguageModelUsage, type StepResult, type GenerateTextOnStepFinishCallback, type StreamTextOnStepFinishCallback, type PrepareStepFunction, } from "ai"; import { StagehandZodObject } from "../zodCompat.js"; import { processMessages } from "../agent/utils/messageProcessing.js"; import { LLMClient } from "../llm/LLMClient.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { AgentExecuteOptions, AgentStreamExecuteOptions, AgentExecuteOptionsBase, AgentResult, AgentContext, 
/** Returns a printable message for any thrown value (Error or otherwise). */
function getErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Prepends a system message with cache control to the messages array.
 * The cache control providerOptions are used by Anthropic and ignored by other providers.
 */
function prependSystemMessage(
  systemPrompt: string,
  messages: ModelMessage[],
): ModelMessage[] {
  return [
    {
      role: "system",
      content: systemPrompt,
      providerOptions: {
        anthropic: {
          cacheControl: { type: "ephemeral" },
        },
      },
    },
    ...messages,
  ];
}

/**
 * Drives the tool-calling agent loop on top of the handler's LLM client.
 *
 * `execute` runs the loop via `generateText` and resolves with the final
 * result; `stream` runs it via `streamText` and returns the stream object with
 * an extra `result` promise attached. Both share `prepareAgent` for set-up and
 * `ensureDone` / `consolidateMetricsAndResult` for wrap-up.
 */
export class V3AgentHandler {
  private v3: V3;
  private logger: (message: LogLine) => void;
  private llmClient: LLMClient;
  private executionModel?: string | AgentModelConfig;
  private systemInstructions?: string;
  private mcpTools?: ToolSet;
  private mode: AgentToolMode;
  private captchaAutoSolveEnabled: boolean;

  /**
   * @param mode Tool mode; defaults to "dom" when omitted.
   * @param captchaAutoSolveEnabled When true, a CaptchaSolver is attached to
   *   each run; defaults to false.
   */
  constructor(
    v3: V3,
    logger: (message: LogLine) => void,
    llmClient: LLMClient,
    executionModel?: string | AgentModelConfig,
    systemInstructions?: string,
    mcpTools?: ToolSet,
    mode?: AgentToolMode,
    captchaAutoSolveEnabled?: boolean,
  ) {
    this.v3 = v3;
    this.logger = logger;
    this.llmClient = llmClient;
    this.executionModel = executionModel;
    this.systemInstructions = systemInstructions;
    this.mcpTools = mcpTools;
    this.mode = mode ?? "dom";
    this.captchaAutoSolveEnabled = captchaAutoSolveEnabled ?? false;
  }

  /**
   * Normalizes the caller input and builds everything a run needs: system
   * prompt, merged tool set, initial messages and the wrapped model.
   *
   * Any failure is logged at level 0 and re-thrown unchanged.
   *
   * @throws MissingEnvironmentVariableError when `useSearch` is requested
   *   without a Browserbase API key.
   * @throws MissingLLMConfigurationError when the LLM client cannot provide a
   *   language model.
   */
  private async prepareAgent(
    instructionOrOptions: string | AgentExecuteOptionsBase,
  ): Promise<AgentContext> {
    try {
      const options =
        typeof instructionOrOptions === "string"
          ? { instruction: instructionOrOptions }
          : instructionOrOptions;
      const maxSteps = options.maxSteps || 20;

      // Get the initial page URL first (needed for the system prompt)
      const initialPageUrl = (await this.v3.context.awaitActivePage()).url();

      // Build the system prompt with mode-aware tool guidance
      const systemPrompt = buildAgentSystemPrompt({
        url: initialPageUrl,
        executionInstruction: options.instruction,
        mode: this.mode,
        systemInstructions: this.systemInstructions,
        captchasAutoSolve: this.v3.isCaptchaAutoSolveEnabled,
        excludeTools: options.excludeTools,
        variables: options.variables,
        useSearch: options.useSearch,
      });

      if (options.useSearch) {
        const bbApiKey = this.v3.browserbaseApiKey;
        if (!bbApiKey) {
          throw new MissingEnvironmentVariableError(
            "BROWSERBASE_API_KEY",
            "agent search (useSearch: true)",
          );
        }
      }

      const tools = this.createTools(
        options.excludeTools,
        options.variables,
        options.toolTimeout,
        options.useSearch,
      );
      // MCP tools are spread last, so they win on a name clash.
      const allTools: ToolSet = { ...tools, ...this.mcpTools };

      // Use provided messages for continuation, or start fresh with the instruction
      const messages: ModelMessage[] = options.messages?.length
        ? [...options.messages, { role: "user", content: options.instruction }]
        : [{ role: "user", content: options.instruction }];

      if (!this.llmClient?.getLanguageModel) {
        throw new MissingLLMConfigurationError();
      }
      const baseModel = this.llmClient.getLanguageModel();

      // TODO: we likely do not need middleware anymore
      const wrappedModel = wrapLanguageModel({
        model: baseModel,
        middleware: {
          ...FlowLogger.createLlmLoggingMiddleware(baseModel.modelId),
        },
      });

      if (
        this.mode === "hybrid" &&
        !baseModel.modelId.includes("gemini-3-flash") &&
        !baseModel.modelId.includes("claude")
      ) {
        this.logger({
          category: "agent",
          message: `Warning: "${baseModel.modelId}" may not perform well in hybrid mode. See recommended models: https://docs.stagehand.dev/v3/basics/agent#hybrid-mode`,
          level: 0,
        });
      }

      return {
        options,
        maxSteps,
        systemPrompt,
        allTools,
        messages,
        wrappedModel,
        initialPageUrl,
      };
    } catch (error) {
      this.logger({
        category: "agent",
        message: `failed to prepare agent: ${error}`,
        level: 0,
      });
      throw error;
    }
  }

  /**
   * Builds the `prepareStep` hook: processes the messages, waits for any
   * in-flight captcha solve, appends a user message describing the solve
   * outcome, then defers to the caller's own `prepareStep` if one was given.
   */
  private createPrepareStep(
    userCallback?: PrepareStepFunction<ToolSet>,
    captchaSolver?: CaptchaSolver,
  ): PrepareStepFunction<ToolSet> {
    return async (options) => {
      processMessages(options.messages);

      if (captchaSolver) {
        if (captchaSolver.isSolving()) {
          this.logger({
            category: "agent",
            message:
              "Captcha detected — waiting for Browserbase to solve it before continuing",
            level: 1,
          });
        }
        await captchaSolver.waitIfSolving();

        const { solved, errored } = captchaSolver.consumeSolveResult();
        if (solved) {
          options.messages.push({
            role: "user",
            content: CAPTCHA_SOLVED_MSG,
          });
          this.logger({
            category: "agent",
            message:
              "Captcha solved — injected notification into agent message stream",
            level: 1,
          });
        }
        if (errored) {
          options.messages.push({
            role: "user",
            content: CAPTCHA_ERRORED_MSG,
          });
          this.logger({
            category: "agent",
            message:
              "Captcha solver failed — injected error notification into agent message stream",
            level: 1,
          });
        }
      }

      if (userCallback) {
        return userCallback(options);
      }
      return options;
    };
  }

  /**
   * Builds the `onStepFinish` hook. For a step that contains tool calls it
   * records the step's reasoning text, maps every tool call to actions
   * (stamped with the page URL and a timestamp), flags completion when the
   * "done" tool is called, and refreshes `state.currentPageUrl`. The caller's
   * own callback, if any, runs last.
   */
  private createStepHandler(
    state: AgentState,
    userCallback?:
      | GenerateTextOnStepFinishCallback<ToolSet>
      | StreamTextOnStepFinishCallback<ToolSet>,
  ) {
    return async (event: StepResult<ToolSet>) => {
      this.logger({
        category: "agent",
        message: `Step finished: ${event.finishReason}`,
        level: 2,
      });

      if (event.toolCalls && event.toolCalls.length > 0) {
        // The text belongs to the step, not to each tool call: collect it once
        // so a multi-tool step does not duplicate reasoning in the final message.
        if (event.text && event.text.length > 0) {
          state.collectedReasoning.push(event.text);
          this.logger({
            category: "agent",
            message: `reasoning: ${event.text}`,
            level: 1,
          });
        }

        for (let i = 0; i < event.toolCalls.length; i++) {
          const toolCall = event.toolCalls[i];
          const args = toolCall.input;
          const toolResult = event.toolResults?.[i];

          if (toolCall.toolName === "done") {
            state.completed = true;
            if (args?.taskComplete) {
              const doneReasoning = args.reasoning;
              const allReasoning = state.collectedReasoning.join(" ");
              state.finalMessage = doneReasoning
                ? `${allReasoning} ${doneReasoning}`.trim()
                : allReasoning || "Task completed successfully";
            }
          }

          const mappedActions = mapToolResultToActions({
            toolCallName: toolCall.toolName,
            toolResult,
            args,
            reasoning: event.text || undefined,
          });

          for (const action of mappedActions) {
            action.pageUrl = state.currentPageUrl;
            action.timestamp = Date.now();
            state.actions.push(action);
          }
        }
        state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
      }

      if (userCallback) {
        await userCallback(event);
      }
    };
  }

  /**
   * Runs the agent to completion in non-streaming mode.
   *
   * @returns The consolidated result. Most runtime failures are converted to
   *   a `{ success: false }` result rather than thrown.
   * @throws StreamingCallbacksInNonStreamingModeError when streaming-only
   *   callbacks are supplied.
   * @throws MissingEnvironmentVariableError propagated from preparation.
   * @throws AgentAbortError when the supplied signal is aborted.
   */
  public async execute(
    instructionOrOptions: string | AgentExecuteOptions,
  ): Promise<AgentResult> {
    const startTime = Date.now();
    const options =
      typeof instructionOrOptions === "object" ? instructionOrOptions : null;
    const signal = options?.signal;

    // Highlight cursor defaults to true for hybrid mode, can be overridden
    const shouldHighlightCursor =
      options?.highlightCursor ?? this.mode === "hybrid";

    const state: AgentState = {
      collectedReasoning: [],
      actions: [],
      finalMessage: "",
      completed: false,
      currentPageUrl: "",
    };

    let messages: ModelMessage[] = [];
    let captchaSolver: CaptchaSolver | undefined;

    try {
      const {
        options: preparedOptions,
        maxSteps,
        systemPrompt,
        allTools,
        messages: preparedMessages,
        wrappedModel,
        initialPageUrl,
      } = await this.prepareAgent(instructionOrOptions);

      // Enable cursor overlay for hybrid mode (coordinate-based interactions)
      if (shouldHighlightCursor && this.mode === "hybrid") {
        const page = await this.v3.context.awaitActivePage();
        await page.enableCursorOverlay().catch(() => {});
      }

      // Set up captcha solver for Browserbase environments
      if (this.captchaAutoSolveEnabled) {
        captchaSolver = new CaptchaSolver();
        captchaSolver.init(() => this.v3.context.awaitActivePage());
      }

      messages = preparedMessages;
      state.currentPageUrl = initialPageUrl;

      const callbacks = options?.callbacks;

      if (callbacks) {
        const streamingOnlyCallbacks = [
          "onChunk",
          "onFinish",
          "onError",
          "onAbort",
        ];
        const invalidCallbacks = streamingOnlyCallbacks.filter(
          (name) => callbacks[name as keyof typeof callbacks] != null,
        );
        if (invalidCallbacks.length > 0) {
          throw new StreamingCallbacksInNonStreamingModeError(invalidCallbacks);
        }
      }

      const result = await this.llmClient.generateText({
        model: wrappedModel,
        messages: prependSystemMessage(systemPrompt, messages),
        tools: allTools,
        stopWhen: (result) => this.handleStop(result, maxSteps),
        temperature: 1,
        toolChoice: "auto",
        prepareStep: this.createPrepareStep(
          callbacks?.prepareStep,
          captchaSolver,
        ),
        onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish),
        abortSignal: preparedOptions.signal,
        providerOptions: {
          google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" },
          openai: { store: false },
        },
      });

      const allMessages = [...messages, ...(result.response?.messages || [])];
      const doneResult = await this.ensureDone(
        state,
        wrappedModel,
        allMessages,
        preparedOptions.instruction,
        preparedOptions.output,
        this.logger,
      );

      return this.consolidateMetricsAndResult(
        startTime,
        state,
        doneResult.messages,
        result,
        maxSteps,
        doneResult.output,
      );
    } catch (error) {
      // Re-throw validation errors that should propagate to the caller
      if (
        error instanceof StreamingCallbacksInNonStreamingModeError ||
        error instanceof MissingEnvironmentVariableError
      ) {
        throw error;
      }

      // Re-throw abort errors wrapped in AgentAbortError for consistent error typing
      if (signal?.aborted) {
        const reason = signal.reason ? String(signal.reason) : "aborted";
        throw new AgentAbortError(reason);
      }

      const errorMessage = getErrorMessage(error);
      this.logger({
        category: "agent",
        message: `Error executing agent task: ${errorMessage}`,
        level: 0,
      });

      // For non-abort errors, return a failure result instead of throwing
      return {
        success: false,
        actions: state.actions,
        message: `Failed to execute task: ${errorMessage}`,
        completed: false,
        messages,
      };
    } finally {
      captchaSolver?.dispose();
    }
  }

  /**
   * Runs the agent in streaming mode.
   *
   * @returns The stream object from `streamText` with a `result` promise
   *   attached. `result` resolves with the consolidated AgentResult once the
   *   stream finishes, and rejects on stream error, on abort (with
   *   AgentAbortError), or when the final done-call fails.
   */
  public async stream(
    instructionOrOptions: string | AgentStreamExecuteOptions,
  ): Promise<AgentStreamResult> {
    const streamOptions =
      typeof instructionOrOptions === "object" ? instructionOrOptions : null;

    // Highlight cursor defaults to true for hybrid mode, can be overridden
    const shouldHighlightCursor =
      streamOptions?.highlightCursor ?? this.mode === "hybrid";

    const {
      options,
      maxSteps,
      systemPrompt,
      allTools,
      messages,
      wrappedModel,
      initialPageUrl,
    } = await this.prepareAgent(instructionOrOptions);

    // Enable cursor overlay for hybrid mode (coordinate-based interactions)
    if (shouldHighlightCursor && this.mode === "hybrid") {
      const page = await this.v3.context.awaitActivePage();
      await page.enableCursorOverlay().catch(() => {});
    }

    // Set up captcha solver for Browserbase environments
    let captchaSolver: CaptchaSolver | undefined;
    if (this.captchaAutoSolveEnabled) {
      captchaSolver = new CaptchaSolver();
      captchaSolver.init(() => this.v3.context.awaitActivePage());
    }

    const callbacks = (instructionOrOptions as AgentStreamExecuteOptions)
      .callbacks as AgentStreamCallbacks | undefined;

    const state: AgentState = {
      collectedReasoning: [],
      actions: [],
      finalMessage: "",
      completed: false,
      currentPageUrl: initialPageUrl,
    };
    const startTime = Date.now();

    let resolveResult: (
      value: AgentResult | PromiseLike<AgentResult>,
    ) => void;
    let rejectResult: (reason: unknown) => void;
    const resultPromise = new Promise<AgentResult>((resolve, reject) => {
      resolveResult = resolve;
      rejectResult = reject;
    });

    const handleError = (error: unknown) => {
      const errorMessage = getErrorMessage(error);
      this.logger({
        category: "agent",
        message: `Error during streaming: ${errorMessage}`,
        level: 0,
      });
      rejectResult(error);
    };

    let streamResult: ReturnType<LLMClient["streamText"]>;
    try {
      streamResult = this.llmClient.streamText({
        model: wrappedModel,
        messages: prependSystemMessage(systemPrompt, messages),
        tools: allTools,
        stopWhen: (result) => this.handleStop(result, maxSteps),
        temperature: 1,
        toolChoice: "auto",
        prepareStep: this.createPrepareStep(
          callbacks?.prepareStep,
          captchaSolver,
        ),
        onStepFinish: this.createStepHandler(state, callbacks?.onStepFinish),
        onError: (event) => {
          captchaSolver?.dispose();
          if (callbacks?.onError) {
            callbacks.onError(event);
          }
          handleError(event.error);
        },
        onChunk: callbacks?.onChunk,
        onFinish: (event) => {
          captchaSolver?.dispose();
          if (callbacks?.onFinish) {
            callbacks.onFinish(event);
          }

          const allMessages = [
            ...messages,
            ...(event.response?.messages || []),
          ];
          this.ensureDone(
            state,
            wrappedModel,
            allMessages,
            options.instruction,
            options.output,
            this.logger,
          )
            .then((doneResult) => {
              const result = this.consolidateMetricsAndResult(
                startTime,
                state,
                doneResult.messages,
                event,
                maxSteps,
                doneResult.output,
              );
              resolveResult(result);
            })
            // Without this, a failing done-call would leave `result` pending
            // forever and surface only as an unhandled rejection.
            .catch(handleError);
        },
        onAbort: (event) => {
          captchaSolver?.dispose();
          if (callbacks?.onAbort) {
            callbacks.onAbort(event);
          }
          // Reject the result promise with AgentAbortError when stream is aborted
          const reason = options.signal?.reason
            ? String(options.signal.reason)
            : "Stream was aborted";
          rejectResult(new AgentAbortError(reason));
        },
        abortSignal: options.signal,
        providerOptions: {
          google: { mediaResolution: "MEDIA_RESOLUTION_HIGH" },
          openai: { store: false },
        },
      });
    } catch (error) {
      captchaSolver?.dispose();
      throw error;
    }

    const agentStreamResult = streamResult as AgentStreamResult;
    agentStreamResult.result = resultPromise;
    return agentStreamResult;
  }

  /**
   * Fills in a final message when none was set, reports token usage to V3
   * metrics when usage is available, and assembles the AgentResult.
   *
   * The elapsed time reported as inference time is wall-clock time since
   * `startTime`.
   */
  private consolidateMetricsAndResult(
    startTime: number,
    state: AgentState,
    inputMessages: ModelMessage[],
    result: {
      text?: string;
      totalUsage?: LanguageModelUsage;
      response?: { messages?: ModelMessage[] };
      steps?: StepResult<ToolSet>[];
    },
    maxSteps?: number,
    output?: Record<string, unknown>,
  ): AgentResult {
    if (!state.finalMessage) {
      const allReasoning = state.collectedReasoning.join(" ").trim();
      const stepCount = result.steps?.length ?? 0;

      if (!state.completed && maxSteps && stepCount >= maxSteps) {
        this.logger({
          category: "agent",
          message: `Agent stopped: reached maximum steps (${maxSteps})`,
          level: 1,
        });
        state.finalMessage = `Agent stopped: reached maximum steps (${maxSteps})`;
      } else {
        state.finalMessage = allReasoning || result.text || "";
      }
    }

    const endTime = Date.now();
    const inferenceTimeMs = endTime - startTime;
    if (result.totalUsage) {
      this.v3.updateMetrics(
        V3FunctionName.AGENT,
        result.totalUsage.inputTokens || 0,
        result.totalUsage.outputTokens || 0,
        result.totalUsage.reasoningTokens || 0,
        result.totalUsage.cachedInputTokens || 0,
        inferenceTimeMs,
      );
    }

    return {
      success: state.completed,
      message: state.finalMessage || "Task execution completed",
      actions: state.actions,
      completed: state.completed,
      output,
      usage: result.totalUsage
        ? {
            input_tokens: result.totalUsage.inputTokens || 0,
            output_tokens: result.totalUsage.outputTokens || 0,
            reasoning_tokens: result.totalUsage.reasoningTokens || 0,
            cached_input_tokens: result.totalUsage.cachedInputTokens || 0,
            inference_time_ms: inferenceTimeMs,
          }
        : undefined,
      messages: inputMessages,
    };
  }

  /**
   * Creates the built-in agent tools for this handler's mode and provider.
   * The Browserbase API key is forwarded only when `useSearch` is set.
   */
  private createTools(
    excludeTools?: string[],
    variables?: Variables,
    toolTimeout?: number,
    useSearch?: boolean,
  ) {
    const provider = this.llmClient?.getLanguageModel?.()?.provider;
    return createAgentTools(this.v3, {
      executionModel: this.executionModel,
      logger: this.logger,
      mode: this.mode,
      provider,
      excludeTools,
      variables,
      toolTimeout,
      useSearch,
      browserbaseApiKey: useSearch ? this.v3.browserbaseApiKey : undefined,
    });
  }

  /**
   * Stop condition for the loop: stop as soon as the last step called the
   * "done" tool, otherwise defer to the `stepCountIs(maxSteps)` condition.
   */
  private handleStop(
    result: Parameters<ReturnType<typeof stepCountIs>>[0],
    maxSteps: number,
  ): boolean | PromiseLike<boolean> {
    const lastStep = result.steps[result.steps.length - 1];
    if (lastStep?.toolCalls?.some((tc) => tc.toolName === "done")) {
      return true;
    }
    return stepCountIs(maxSteps)(result);
  }

  /**
   * Ensures the done tool is called at the end of agent execution.
   * Returns the messages and any extracted output from the done call.
   *
   * When the run already completed this is a no-op that returns the messages
   * unchanged; otherwise the forced done-call updates `state` and appends a
   * corresponding "done" action.
   */
  private async ensureDone(
    state: AgentState,
    model: LanguageModel,
    messages: ModelMessage[],
    instruction: string,
    outputSchema?: StagehandZodObject,
    logger?: (message: LogLine) => void,
  ): Promise<{ messages: ModelMessage[]; output?: Record<string, unknown> }> {
    if (state.completed) return { messages };

    const doneResult = await handleDoneToolCall({
      model,
      inputMessages: messages,
      instruction,
      outputSchema,
      logger,
    });

    state.completed = doneResult.taskComplete;
    state.finalMessage = doneResult.reasoning;

    const doneAction = mapToolResultToActions({
      toolCallName: "done",
      toolResult: {
        success: true,
        reasoning: doneResult.reasoning,
        taskComplete: doneResult.taskComplete,
      },
      args: {
        reasoning: doneResult.reasoning,
        taskComplete: doneResult.taskComplete,
      },
      reasoning: doneResult.reasoning,
    });

    for (const action of doneAction) {
      action.pageUrl = state.currentPageUrl;
      action.timestamp = Date.now();
      state.actions.push(action);
    }

    return {
      messages: [...messages, ...doneResult.messages],
      output: doneResult.output,
    };
  }
}
/**
 * Bridges a computer-use agent client to the browser: supplies screenshots,
 * executes the coordinate-based actions the client requests, and (when replay
 * recording is active) records each action as a replayable step.
 */
export class V3CuaAgentHandler {
  private v3: V3;
  private agent: AgentClient;
  private provider: AgentProvider;
  private logger: (message: LogLine) => void;
  private agentClient: AgentClient;
  private options: AgentHandlerOptions;
  // Set per run in execute(); false until then so the action handler never
  // reads an uninitialised field.
  private highlightCursor = false;
  private captchaSolver: CaptchaSolver | null = null;
  // Number of captcha-area clicks still to be suppressed after an auto-solve.
  private captchaClickGuardRemaining = 0;
  private currentInstruction = "";

  constructor(
    v3: V3,
    logger: (message: LogLine) => void,
    options: AgentHandlerOptions,
    tools?: ToolSet,
  ) {
    this.v3 = v3;
    this.logger = logger;
    this.options = options;

    this.provider = new AgentProvider(logger);
    const client = this.provider.getClient(
      options.modelName,
      options.clientOptions || {},
      options.userProvidedInstructions,
      tools,
    );
    this.agentClient = client;
    this.setupAgentClient();
    this.agent = client;
  }

  /**
   * Ensures the V3 context is still available (not closed).
   * Throws StagehandClosedError if stagehand.close() was called.
   */
  private ensureNotClosed(): void {
    if (!this.v3.context) {
      throw new StagehandClosedError();
    }
  }

  /**
   * If a captcha solver is attached, logs when a solve is in flight, waits
   * for it, then forwards the outcome to handleCaptchaSolveResult.
   */
  private async waitForCaptchaSolver(): Promise<void> {
    if (!this.captchaSolver) return;
    if (this.captchaSolver.isSolving()) {
      this.logger({
        category: "agent",
        message:
          "Captcha detected — waiting for Browserbase to solve it before continuing",
        level: 1,
      });
    }
    await this.captchaSolver.waitIfSolving();
    // The solver may have been cleared while we were waiting.
    this.handleCaptchaSolveResult(this.captchaSolver?.consumeSolveResult());
  }

  /**
   * Registers the screenshot provider and the action handler on the agent
   * client, then kicks off best-effort viewport and URL synchronisation.
   */
  private setupAgentClient(): void {
    // Provide screenshots to the agent client
    this.agentClient.setScreenshotProvider(async () => {
      this.ensureNotClosed();
      const page = await this.v3.context.awaitActivePage();
      const screenshotBuffer = await page.screenshot({ fullPage: false });
      return screenshotBuffer.toString("base64"); // base64 png
    });

    // Provide action executor
    this.agentClient.setActionHandler(async (action) => {
      this.ensureNotClosed();

      // Wait for captcha solver to finish before executing action
      await this.waitForCaptchaSolver();

      action.pageUrl = (await this.v3.context.awaitActivePage()).url();

      if (await this.shouldSkipSolvedCaptchaInteraction(action)) {
        this.captchaClickGuardRemaining = Math.max(
          0,
          this.captchaClickGuardRemaining - 1,
        );
        this.agentClient.addContextNote(
          `The captcha has already been solved automatically. Do not click the captcha checkbox, widget, or challenge again. Continue with the original task outside the captcha area. Original task: ${this.currentInstruction}`,
        );
        this.logger({
          category: "agent",
          message:
            "Skipped click on solved captcha widget — injected follow-up guidance",
          level: 1,
        });
        return;
      }

      const defaultDelay = 500;
      const waitBetween =
        (this.options.clientOptions?.waitBetweenActions as number) ||
        defaultDelay;
      try {
        // Try to inject cursor before each action if enabled
        if (this.highlightCursor) {
          try {
            await this.injectCursor();
          } catch {
            // Ignore cursor injection failures
          }
        }
        await new Promise((r) => setTimeout(r, 300));

        // Skip logging for screenshot actions - they're no-ops, the actual
        // Page.screenshot in captureAndSendScreenshot() is logged separately
        const shouldLog = action.type !== "screenshot";
        if (shouldLog) {
          await FlowLogger.runWithLogging(
            {
              eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. "V3CuaClick"
              data: {
                target: this.computePointerTarget(action),
              },
            },
            async (loggedAction: typeof action) =>
              await this.executeAction(loggedAction),
            [action],
          );
        } else {
          await this.executeAction(action);
        }

        action.timestamp = Date.now();

        await new Promise((r) => setTimeout(r, waitBetween));
        try {
          await this.captureAndSendScreenshot();
        } catch (e) {
          this.logger({
            category: "agent",
            message: `Warning: Failed to take screenshot after action: ${String(
              (e as Error)?.message ?? e,
            )}`,
            level: 1,
          });
        }
      } catch (error) {
        const msg = (error as Error)?.message ?? String(error);
        this.logger({
          category: "agent",
          message: `Error executing action ${action.type}: ${msg}`,
          level: 0,
        });
        throw error;
      }
    });

    void this.updateClientViewport();
    void this.updateClientUrl();
  }

  /** Forwards the handler to clients that support safety confirmation. */
  setSafetyConfirmationHandler(handler?: SafetyConfirmationHandler): void {
    if (
      this.agentClient instanceof GoogleCUAClient ||
      this.agentClient instanceof OpenAICUAClient
    ) {
      this.agentClient.setSafetyConfirmationHandler(handler);
    }
  }

  /**
   * Runs one agent task. Navigates away from a blank page first, attaches a
   * captcha solver when auto-solve is enabled, executes the agent, and
   * reports token usage to V3 metrics when the result carries it.
   */
  async execute(
    optionsOrInstruction: AgentExecuteOptions | string,
  ): Promise<AgentResult> {
    const options =
      typeof optionsOrInstruction === "string"
        ? { instruction: optionsOrInstruction }
        : optionsOrInstruction;

    this.setSafetyConfirmationHandler(options.callbacks?.onSafetyConfirmation);

    this.highlightCursor = options.highlightCursor !== false;
    this.currentInstruction = options.instruction;
    // A click guard left over from a previous run must not suppress clicks in
    // this one.
    this.captchaClickGuardRemaining = 0;

    // Redirect if blank
    const page = await this.v3.context.awaitActivePage();
    const currentUrl = page.url();
    if (!currentUrl || currentUrl === "about:blank") {
      this.logger({
        category: "agent",
        message: `Page URL is empty. Navigating to https://www.google.com ...`,
        level: 1,
      });
      await page.goto("https://www.google.com", { waitUntil: "load" });
    }

    let start: number;
    let result: AgentResult;
    try {
      // Set up captcha solver for Browserbase environments
      if (this.v3.isCaptchaAutoSolveEnabled) {
        this.captchaSolver = new CaptchaSolver();
        this.captchaSolver.init(() => this.v3.context.awaitActivePage());

        // Block the CUA agent loop before each step while a captcha is being solved
        this.agentClient.setPreStepHook(() => this.waitForCaptchaSolver());
      }

      if (this.highlightCursor) {
        try {
          await this.injectCursor();
        } catch (error) {
          const errorMessage =
            error instanceof Error ? error.message : String(error);
          this.logger({
            category: "agent",
            message: `Warning: Failed to inject cursor: ${errorMessage}. Continuing with execution.`,
            level: 1,
          });
          // Continue execution even if cursor injection fails
        }
      }

      start = Date.now();
      result = await this.agent.execute({ options, logger: this.logger });
    } finally {
      // Covers solver set-up failures as well as agent failures.
      this.captchaSolver?.dispose();
      this.captchaSolver = null;
    }

    const inferenceTimeMs = Date.now() - start;
    if (result.usage) {
      this.v3.updateMetrics(
        V3FunctionName.AGENT,
        result.usage.input_tokens,
        result.usage.output_tokens,
        result.usage.reasoning_tokens ?? 0,
        result.usage.cached_input_tokens ?? 0,
        inferenceTimeMs,
      );
    }
    return result;
  }

  /**
   * Records a single-selector replay step for an action, provided the xpath
   * returned by the page normalises to a usable selector.
   */
  private recordXpathAction(
    agentAction: AgentAction,
    xpath: unknown,
    method: string,
    description: string,
    args: string[] = [],
  ): void {
    const normalized = ensureXPath(xpath as string);
    if (!normalized) return;
    const stagehandAction: Action = {
      selector: normalized,
      description,
      method,
      arguments: args,
    };
    this.recordCuaActStep(
      agentAction,
      [stagehandAction],
      stagehandAction.description,
    );
  }

  /**
   * Performs one agent action on the active page. When replay recording is
   * active, the action is also recorded as a replay step.
   *
   * @returns `{ success: true }` for every known action type; an unknown type
   *   is logged and reported as `{ success: false, error }`.
   */
  private async executeAction(
    action: AgentAction,
  ): Promise<ActionExecutionResult> {
    const page = await this.v3.context.awaitActivePage();
    const recording = this.v3.isAgentReplayActive();

    switch (action.type) {
      case "click": {
        const { x, y, button = "left", clickCount } = action;
        if (recording) {
          const xpath = await page.click(x as number, y as number, {
            button: (button as "left" | "right" | "middle") ?? "left",
            clickCount: (clickCount as number) ?? 1,
            returnXpath: true,
          });
          this.recordXpathAction(
            action,
            xpath,
            "click",
            this.describePointerAction("click", x, y),
          );
        } else {
          await page.click(x as number, y as number, {
            button: (button as "left" | "right" | "middle") ?? "left",
            clickCount: (clickCount as number) ?? 1,
          });
        }
        return { success: true };
      }
      case "double_click":
      case "doubleClick": {
        const { x, y } = action;
        if (recording) {
          const xpath = await page.click(x as number, y as number, {
            button: "left",
            clickCount: 2,
            returnXpath: true,
          });
          this.recordXpathAction(
            action,
            xpath,
            "doubleClick",
            this.describePointerAction("double click", x, y),
          );
        } else {
          await page.click(x as number, y as number, {
            button: "left",
            clickCount: 2,
          });
        }
        return { success: true };
      }
      case "tripleClick": {
        const { x, y } = action;
        if (recording) {
          const xpath = await page.click(x as number, y as number, {
            button: "left",
            clickCount: 3,
            returnXpath: true,
          });
          this.recordXpathAction(
            action,
            xpath,
            "tripleClick",
            this.describePointerAction("triple click", x, y),
          );
        } else {
          await page.click(x as number, y as number, {
            clickCount: 3,
          });
        }
        return { success: true };
      }
      case "type": {
        const { text } = action;
        const typed = String(text ?? "");
        await page.type(typed);
        if (recording) {
          // The typed-into element is whatever holds focus after typing.
          const xpath = await computeActiveElementXpath(page);
          this.recordXpathAction(
            action,
            xpath,
            "type",
            this.describeTypeAction(typed),
            [typed],
          );
        }
        return { success: true };
      }
      case "keypress": {
        const { keys } = action;
        const keyList = Array.isArray(keys) ? keys : [keys];
        const stagehandActions: Action[] = [];
        for (const rawKey of keyList) {
          const mapped = mapKeyToPlaywright(String(rawKey ?? ""));
          await page.keyPress(mapped);
          if (recording) {
            stagehandActions.push({
              selector: "xpath=/html",
              description: `press ${mapped}`,
              method: "press",
              arguments: [mapped],
            });
          }
        }
        if (recording && stagehandActions.length > 0) {
          this.recordCuaActStep(
            action,
            stagehandActions,
            stagehandActions
              .map((a) => a.description)
              .filter(Boolean)
              .join(", ") || "keypress",
          );
        }
        return { success: true };
      }
      case "scroll": {
        const { x, y, scroll_x = 0, scroll_y = 0 } = action;
        await page.scroll(
          (x as number) ?? 0,
          (y as number) ?? 0,
          (scroll_x as number) ?? 0,
          (scroll_y as number) ?? 0,
        );
        // Guarded like every other replay step in this switch.
        if (recording) {
          this.v3.recordAgentReplayStep({
            type: "scroll",
            deltaX: Number(scroll_x ?? 0),
            deltaY: Number(scroll_y ?? 0),
            anchor:
              typeof x === "number" && typeof y === "number"
                ? { x: Math.round(x), y: Math.round(y) }
                : undefined,
          });
        }
        return { success: true };
      }
      case "drag": {
        const { path } = action;
        if (Array.isArray(path) && path.length >= 2) {
          // Only the endpoints are used; intermediate points set the step count.
          const start = path[0];
          const end = path[path.length - 1];
          const steps = Math.min(20, Math.max(5, path.length));
          if (recording) {
            const xps = await page.dragAndDrop(start.x, start.y, end.x, end.y, {
              steps,
              delay: 10,
              returnXpath: true,
            });
            const [fromXpath, toXpath] = (xps as [string, string]) || ["", ""];
            const from = ensureXPath(fromXpath);
            const to = ensureXPath(toXpath);
            if (from && to) {
              const stagehandAction: Action = {
                selector: from,
                description: this.describeDragAction(),
                method: "dragAndDrop",
                arguments: [to],
              };
              this.recordCuaActStep(
                action,
                [stagehandAction],
                stagehandAction.description,
              );
            }
          } else {
            await page.dragAndDrop(start.x, start.y, end.x, end.y, {
              steps,
              delay: 10,
            });
          }
        }
        return { success: true };
      }
      case "move": {
        const { x, y } = action;
        if (typeof x === "number" && typeof y === "number") {
          if (recording) {
            const xpath = await page.hover(x, y, { returnXpath: true });
            this.recordXpathAction(
              action,
              xpath,
              "hover",
              this.describePointerAction("hover", x, y),
            );
          } else {
            await page.hover(x, y);
          }
        }
        return { success: true };
      }
      case "wait": {
        const time = action?.timeMs ?? 1000;
        await new Promise((r) => setTimeout(r, time));
        if (time > 0 && recording) {
          this.v3.recordAgentReplayStep({ type: "wait", timeMs: Number(time) });
        }
        return { success: true };
      }
      case "screenshot": {
        // No-op - screenshot is captured by captureAndSendScreenshot() after all actions
        return { success: true };
      }
      case "goto": {
        const { url } = action;
        await page.goto(String(url ?? ""), { waitUntil: "load" });
        if (recording) {
          this.v3.recordAgentReplayStep({
            type: "goto",
            url: String(url ?? ""),
          });
        }
        return { success: true };
      }
      case "back": {
        await page.goBack();
        if (recording) {
          this.v3.recordAgentReplayStep({
            type: "back",
          });
        }
        return { success: true };
      }
      case "forward": {
        await page.goForward();
        if (recording) {
          this.v3.recordAgentReplayStep({
            type: "forward",
          });
        }
        return { success: true };
      }
      case "open_web_browser": {
        // Browser is already open, this is a no-op
        return { success: true };
      }
      case "custom_tool": {
        // Custom tools are handled by the agent client directly
        return { success: true };
      }
      default:
        this.logger({
          category: "agent",
          message: `Unknown action type: ${String(action.type)}`,
          level: 1,
        });
        return {
          success: false,
          error: `Unknown action ${String(action.type)}`,
        };
    }
  }

  /**
   * Human-readable target for logging: coordinates when both are numbers,
   * otherwise the first string among selector, input and description.
   */
  private computePointerTarget(action: AgentAction): string | undefined {
    if (typeof action.x === "number" && typeof action.y === "number") {
      return `(${action.x}, ${action.y})`;
    }
    if (typeof action.selector === "string") return action.selector;
    if (typeof action.input === "string") return action.input;
    if (typeof action.description === "string") return action.description;
    return undefined;
  }

  /** "<kind> at (x, y)" with rounded coordinates, or just `kind` if either is not finite. */
  private describePointerAction(kind: string, x: unknown, y: unknown): string {
    const nx = Number(x);
    const ny = Number(y);
    if (Number.isFinite(nx) && Number.isFinite(ny)) {
      return `${kind} at (${Math.round(nx)}, ${Math.round(ny)})`;
    }
    return kind;
  }

  /** `type "<text>"`, truncating text longer than 30 characters to 27 plus "...". */
  private describeTypeAction(text: string): string {
    const snippet = text.length > 30 ? `${text.slice(0, 27)}...` : text;
    return `type "${snippet}"`;
  }

  private describeDragAction(): string {
    return "drag and drop";
  }

  /**
   * Picks the instruction text for a recorded step: the trimmed `action`
   * string, else the trimmed `reasoning` string, else `fallback`.
   */
  private buildInstructionFallback(
    agentAction: AgentAction,
    fallback: string,
  ): string {
    const raw =
      (typeof agentAction.action === "string" && agentAction.action.trim()) ||
      (typeof agentAction.reasoning === "string" &&
        agentAction.reasoning.trim());
    return raw && raw.length > 0 ? raw : fallback;
  }

  /**
   * Records an "act" replay step for the given actions. Actions without a
   * description inherit the first action's description (or the instruction).
   */
  private recordCuaActStep(
    agentAction: AgentAction,
    stagehandActions: Action[],
    fallback: string,
  ): void {
    if (!stagehandActions.length) return;
    const instruction = this.buildInstructionFallback(agentAction, fallback);
    const description = stagehandActions[0]?.description || instruction;
    const actions = stagehandActions.map((act) => ({
      ...act,
      description: act.description || description,
    }));
    this.v3.recordAgentReplayStep({
      type: "act",
      instruction,
      actions,
      actionDescription: description,
      message:
        typeof agentAction.reasoning === "string" &&
        agentAction.reasoning.trim().length > 0
          ? agentAction.reasoning.trim()
          : undefined,
    });
  }

  /** Best-effort: pushes viewport dimensions to the agent client. */
  private async updateClientViewport(): Promise<void> {
    try {
      // For Google CUA, use configured viewport for coordinate normalization
      // advancedStealth uses fixed 1288x711, otherwise use configured viewport
      if (this.agentClient instanceof GoogleCUAClient) {
        const dims = this.v3.isAdvancedStealth
          ? { width: 1288, height: 711 }
          : this.v3.configuredViewport;
        this.agentClient.setViewport(dims.width, dims.height);
      } else {
        // For other clients, use actual window dimensions
        const page = await this.v3.context.awaitActivePage();
        const { w, h } = await page.mainFrame().evaluate<{
          w: number;
          h: number;
        }>("({ w: window.innerWidth, h: window.innerHeight })");
        if (w && h) this.agentClient.setViewport(w, h);
      }
    } catch {
      // Best-effort only
    }
  }

  /** Best-effort: pushes the active page URL to the agent client. */
  private async updateClientUrl(): Promise<void> {
    try {
      const page = await this.v3.context.awaitActivePage();
      const url = page.url();
      this.agentClient.setCurrentUrl(url);
    } catch {
      // Best-effort only
    }
  }

  /**
   * Takes a viewport screenshot and hands it to the agent client together
   * with the current URL.
   *
   * @returns Whatever the client's `captureScreenshot` returns, or `null`
   *   when capturing failed (the failure is logged, not thrown).
   */
  async captureAndSendScreenshot(): Promise<unknown> {
    this.logger({
      category: "agent",
      message: "Capturing screenshot",
      level: 1,
    });
    try {
      const page = await this.v3.context.awaitActivePage();
      const screenshotBuffer = await page.screenshot({ fullPage: false });
      const currentUrl = page.url();
      return await this.agentClient.captureScreenshot({
        base64Image: screenshotBuffer.toString("base64"),
        currentUrl,
      });
    } catch (e) {
      this.logger({
        category: "agent",
        message: `Error capturing screenshot: ${String((e as Error)?.message ?? e)}`,
        level: 0,
      });
      return null;
    }
  }

  /**
   * Reacts to a captcha solve outcome: on success arms the click guard (3
   * clicks) and notes the success to the client; on error disarms the guard
   * and notes the failure.
   */
  private handleCaptchaSolveResult(result?: {
    solved: boolean;
    errored: boolean;
  }): void {
    if (!result) return;
    if (result.solved) {
      this.captchaClickGuardRemaining = 3;
      this.agentClient.addContextNote(CAPTCHA_SOLVED_MSG);
      this.logger({
        category: "agent",
        message: "Captcha solved — continuing with task",
        level: 1,
      });
    }
    if (result.errored) {
      this.captchaClickGuardRemaining = 0;
      this.agentClient.addContextNote(CAPTCHA_ERRORED_MSG);
      this.logger({
        category: "agent",
        message: "Captcha solver failed or errored",
        level: 1,
      });
    }
  }

  /**
   * True when the click guard is armed and the action is a "click" whose
   * coordinates fall inside the bounding box of a captcha-looking element on
   * the page. Any evaluation failure yields false.
   */
  private async shouldSkipSolvedCaptchaInteraction(
    action: AgentAction,
  ): Promise<boolean> {
    if (this.captchaClickGuardRemaining <= 0) {
      return false;
    }
    if (action.type !== "click") {
      return false;
    }
    const x = action.x;
    const y = action.y;
    if (typeof x !== "number" || typeof y !== "number") {
      return false;
    }

    try {
      const page = await this.v3.context.awaitActivePage();
      const boxes = await page.evaluate<
        Array<{ left: number; top: number; right: number; bottom: number }>
      >(() => {
        const selectors = [
          'iframe[title*="reCAPTCHA"]',
          'iframe[src*="recaptcha"]',
          'iframe[src*="hcaptcha"]',
          'iframe[src*="turnstile"]',
          ".g-recaptcha",
          "[data-sitekey]",
          '[class*="captcha"]',
          '[id*="captcha"]',
        ];
        // An element can match several selectors; report each only once.
        const seen = new Set<Element>();
        const bounds: Array<{
          left: number;
          top: number;
          right: number;
          bottom: number;
        }> = [];
        for (const selector of selectors) {
          for (const element of document.querySelectorAll(selector)) {
            if (seen.has(element)) continue;
            seen.add(element);
            const rect = element.getBoundingClientRect();
            if (rect.width <= 0 || rect.height <= 0) continue;
            bounds.push({
              left: rect.left,
              top: rect.top,
              right: rect.right,
              bottom: rect.bottom,
            });
          }
        }
        return bounds;
      });

      return boxes.some(
        (box) =>
          x >= box.left && x <= box.right && y >= box.top && y <= box.bottom,
      );
    } catch {
      return false;
    }
  }

  /** Enables the cursor overlay on the active page; failures are ignored. */
  private async injectCursor(): Promise<void> {
    try {
      const page = await this.v3.context.awaitActivePage();
      await page.enableCursorOverlay();
    } catch {
      // Best-effort only
    }
  }
}
__internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js"; export { maybeRunShutdownSupervisorFromArgv as __internalMaybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js"; export type { ServerAgentCacheHandle } from "./cache/serverAgentCache.js"; export type { ChatMessage, ChatMessageContent, ChatMessageImageContent, ChatMessageTextContent, ChatCompletionOptions, LLMResponse, CreateChatCompletionOptions, LLMUsage, LLMParsedResponse, } from "./llm/LLMClient.js"; export type { StagehandZodSchema, StagehandZodObject, InferStagehandSchema, JsonSchemaDocument, } from "./zodCompat.js"; export type { JsonSchema, JsonSchemaProperty } from "../utils.js"; const StagehandDefault = { ...PublicApi, V3, Stagehand: V3, AnnotatedScreenshotText, LLMClient, AgentProvider, modelToAgentProviderMap, validateZodSchema, isRunningInBun, toGeminiSchema, getZodType, transformSchema, injectUrls, providerEnvVarMap, loadApiKeyFromEnv, trimTrailingTextNode, jsonSchemaToZod, isZod4Schema, isZod3Schema, toJsonSchema, connectToMCPServer, V3Evaluator, tool, getAISDKLanguageModel, __internalCreateInMemoryAgentCacheHandle, __internalMaybeRunShutdownSupervisorFromArgv: maybeRunShutdownSupervisorFromArgv, }; export default StagehandDefault; ================================================ FILE: packages/core/lib/v3/launch/browserbase.ts ================================================ import Browserbase from "@browserbasehq/sdk"; import { BrowserbaseSessionNotFoundError, StagehandInitError, } from "../types/public/sdkErrors.js"; import type { BrowserbaseSessionCreateParams } from "../types/public/api.js"; import { getEnvTimeoutMs, withTimeout } from "../timeoutConfig.js"; export async function createBrowserbaseSession( apiKey: string, projectId?: string, params?: BrowserbaseSessionCreateParams, resumeSessionId?: string, ): Promise<{ ws: string; sessionId: string; bb: Browserbase }> { const bb = new Browserbase({ apiKey }); const sessionCreateTimeoutMs = 
getEnvTimeoutMs( "BROWSERBASE_SESSION_CREATE_MAX_MS", ); // Resume an existing session if provided if (resumeSessionId) { const existing = (await withTimeout( bb.sessions.retrieve(resumeSessionId), sessionCreateTimeoutMs, "Browserbase session retrieve", )) as unknown as { id: string; connectUrl?: string; status?: string; }; if (!existing?.id) { throw new BrowserbaseSessionNotFoundError(); } const ws = existing.connectUrl; if (!ws) { throw new StagehandInitError( `Browserbase session resume missing connectUrl for ${resumeSessionId}`, ); } return { ws, sessionId: resumeSessionId, bb }; } // Create a new session with optional overrides and a default viewport const { projectId: overrideProjectId, browserSettings, userMetadata, ...rest } = params ?? {}; // satisfies check ensures our BrowserbaseSessionCreateParamsSchema stays in sync with SDK const resolvedProjectId = overrideProjectId ?? projectId; const createPayload = { ...(resolvedProjectId ? { projectId: resolvedProjectId } : {}), ...rest, browserSettings: { ...(browserSettings ?? {}), viewport: browserSettings?.viewport ?? { width: 1288, height: 711 }, }, userMetadata: { ...(userMetadata ?? 
{}), stagehand: "true", }, } satisfies Browserbase.Sessions.SessionCreateParams; const created = (await withTimeout( bb.sessions.create(createPayload), sessionCreateTimeoutMs, "Browserbase session create", )) as unknown as { id: string; connectUrl: string }; if (!created?.connectUrl || !created?.id) { throw new StagehandInitError( "Browserbase session creation returned an unexpected shape.", ); } return { ws: created.connectUrl, sessionId: created.id, bb }; } ================================================ FILE: packages/core/lib/v3/launch/local.ts ================================================ import { launch, LaunchedChrome } from "chrome-launcher"; import WebSocket from "ws"; import { ConnectionTimeoutError } from "../types/public/sdkErrors.js"; interface LaunchLocalOptions { chromePath?: string; chromeFlags?: string[]; headless?: boolean; userDataDir?: string; port?: number; connectTimeoutMs?: number; handleSIGINT?: boolean; } export async function launchLocalChrome( opts: LaunchLocalOptions, ): Promise<{ ws: string; chrome: LaunchedChrome }> { const connectTimeoutMs = opts.connectTimeoutMs ?? 15_000; const deadlineMs = Date.now() + connectTimeoutMs; const connectionPollInterval = 250; const maxConnectionRetries = Math.max( 1, Math.ceil(connectTimeoutMs / connectionPollInterval), ); const headless = opts.headless ?? false; const chromeFlags = [ headless ? "--headless=new" : undefined, "--remote-allow-origins=*", "--no-first-run", "--no-default-browser-check", "--disable-dev-shm-usage", "--site-per-process", ...(opts.chromeFlags ?? 
[]),
  ].filter((f): f is string => typeof f === "string");

  const chrome = await launch({
    chromePath: opts.chromePath,
    chromeFlags,
    port: opts.port,
    userDataDir: opts.userDataDir,
    handleSIGINT: opts.handleSIGINT,
    connectionPollInterval,
    maxConnectionRetries,
  });

  // Chrome is running from here on. If the debugger endpoint never becomes
  // reachable, tear the process down instead of leaving it orphaned, then
  // surface the original failure to the caller.
  try {
    const ws = await waitForWebSocketDebuggerUrl(chrome.port, deadlineMs);
    await waitForWebSocketReady(ws, deadlineMs);
    return { ws, chrome };
  } catch (error) {
    try {
      // Promise.resolve covers launcher versions where kill() is sync or async.
      await Promise.resolve(chrome.kill());
    } catch {
      // best-effort cleanup; the readiness error below is what matters
    }
    throw error;
  }
}

/**
 * Polls `http://127.0.0.1:<port>/json/version` until the response carries a
 * string `webSocketDebuggerUrl`, or the deadline passes.
 *
 * @param port - Local port to query.
 * @param deadlineMs - Absolute deadline, compared against `Date.now()`.
 * @returns The `webSocketDebuggerUrl` reported by the endpoint.
 * @throws ConnectionTimeoutError when the deadline passes first; the message
 *   includes the last HTTP status or fetch error seen, if any.
 */
async function waitForWebSocketDebuggerUrl(
  port: number,
  deadlineMs: number,
): Promise<string> {
  let lastErrMsg = "";
  while (Date.now() < deadlineMs) {
    // Bound every request so a stalled connection cannot outlive the
    // deadline; same 200ms floor / 2s cap used for the websocket probe.
    const remainingMs = Math.max(200, deadlineMs - Date.now());
    try {
      const resp = await fetch(`http://127.0.0.1:${port}/json/version`, {
        signal: AbortSignal.timeout(Math.min(2_000, remainingMs)),
      });
      if (resp.ok) {
        const json = (await resp.json()) as unknown;
        const url = (json as { webSocketDebuggerUrl?: string })
          .webSocketDebuggerUrl;
        if (typeof url === "string") return url;
      } else {
        lastErrMsg = `${resp.status} ${resp.statusText}`;
      }
    } catch (err) {
      lastErrMsg = err instanceof Error ? err.message : String(err);
    }
    await new Promise((r) => setTimeout(r, 250));
  }
  // The optional suffix already starts with a space, so none is added here.
  throw new ConnectionTimeoutError(
    `Timed out waiting for /json/version on port ${port}${
      lastErrMsg ? ` (last error: ${lastErrMsg})` : ""
    }`,
  );
}

/**
 * Repeatedly probes the websocket URL until a connection opens or the
 * deadline passes.
 *
 * @param wsUrl - Websocket URL to probe.
 * @param deadlineMs - Absolute deadline, compared against `Date.now()`.
 * @throws ConnectionTimeoutError when no probe succeeds before the deadline.
 */
async function waitForWebSocketReady(
  wsUrl: string,
  deadlineMs: number,
): Promise<void> {
  let lastErrMsg = "";
  while (Date.now() < deadlineMs) {
    // Each probe gets at least 200ms and at most 2s.
    const remainingMs = Math.max(200, deadlineMs - Date.now());
    try {
      await probeWebSocket(wsUrl, Math.min(2_000, remainingMs));
      return;
    } catch (error) {
      lastErrMsg = error instanceof Error ? error.message : String(error);
      await new Promise((r) => setTimeout(r, 100));
    }
  }
  throw new ConnectionTimeoutError(
    `Timed out waiting for CDP websocket to accept connections at ${wsUrl}${
      lastErrMsg ?
` (last error: ${lastErrMsg})` : "" }`, ); } function probeWebSocket(wsUrl: string, timeoutMs: number): Promise { return new Promise((resolve, reject) => { const ws = new WebSocket(wsUrl); let settled = false; const finish = (error?: unknown) => { if (settled) return; settled = true; clearTimeout(timer); try { ws.terminate(); } catch { // best-effort cleanup } if (error) { reject(error); return; } resolve(); }; const timer = setTimeout(() => { finish(new Error(`websocket probe timeout after ${timeoutMs}ms`)); }, timeoutMs); ws.once("open", () => finish()); ws.once("error", (error) => finish(error)); }); } ================================================ FILE: packages/core/lib/v3/llm/AnthropicClient.ts ================================================ import Anthropic, { ClientOptions } from "@anthropic-ai/sdk"; import { ImageBlockParam, MessageParam, TextBlockParam, Tool, } from "@anthropic-ai/sdk/resources"; import { LogLine } from "../types/public/logs.js"; import { AnthropicJsonSchemaObject, AvailableModel, } from "../types/public/model.js"; import { CreateChatCompletionOptions, LLMClient, LLMResponse, } from "./LLMClient.js"; import { CreateChatCompletionResponseError } from "../types/public/sdkErrors.js"; import { toJsonSchema } from "../zodCompat.js"; export class AnthropicClient extends LLMClient { public type = "anthropic" as const; private client: Anthropic; declare public clientOptions: ClientOptions; constructor({ modelName, clientOptions, userProvidedInstructions, }: { logger: (message: LogLine) => void; modelName: AvailableModel; clientOptions?: ClientOptions; userProvidedInstructions?: string; }) { super(modelName); this.client = new Anthropic(clientOptions); this.modelName = modelName; this.clientOptions = clientOptions; this.userProvidedInstructions = userProvidedInstructions; } async createChatCompletion({ options, retries, logger, }: CreateChatCompletionOptions): Promise { const optionsWithoutImage = { ...options }; delete 
optionsWithoutImage.image;

    logger({
      category: "anthropic",
      message: "creating chat completion",
      level: 2,
      auxiliary: {
        options: {
          value: JSON.stringify(optionsWithoutImage),
          type: "object",
        },
      },
    });

    // Anthropic takes the system prompt as a separate field. Use the first
    // system message that is either a plain string or an array containing no
    // image parts.
    const systemMessage = options.messages.find((msg) => {
      if (msg.role === "system") {
        if (typeof msg.content === "string") {
          return true;
        } else if (Array.isArray(msg.content)) {
          return msg.content.every((content) => content.type !== "image_url");
        }
      }
      return false;
    });

    const userMessages = options.messages.filter(
      (msg) => msg.role !== "system",
    );

    const formattedMessages: MessageParam[] = userMessages.map((msg) => {
      if (typeof msg.content === "string") {
        return {
          role: msg.role as "user" | "assistant", // system messages were filtered out above
          content: msg.content,
        };
      } else {
        return {
          role: msg.role as "user" | "assistant",
          content: msg.content.map((content) => {
            if ("image_url" in content) {
              // image_url.url may be a `data:<mime>;base64,<payload>` URI.
              // A base64 image source needs the bare payload and its real
              // media type, so strip a recognised prefix. Anything else
              // (e.g. already-bare base64) is passed through as JPEG, as
              // before.
              const imageUrl = content.image_url.url;
              const dataUriPrefix =
                /^data:(image\/(?:jpeg|png|gif|webp));base64,/.exec(imageUrl);
              const formattedContent: ImageBlockParam = {
                type: "image",
                source: {
                  type: "base64",
                  media_type: (dataUriPrefix?.[1] ?? "image/jpeg") as
                    | "image/jpeg"
                    | "image/png"
                    | "image/gif"
                    | "image/webp",
                  data: dataUriPrefix
                    ? imageUrl.slice(dataUriPrefix[0].length)
                    : imageUrl,
                },
              };

              return formattedContent;
            } else {
              return { type: "text", text: content.text };
            }
          }),
        };
      }
    });

    // A screenshot supplied via options is appended as its own user turn,
    // followed by its description when one is given.
    if (options.image) {
      const screenshotMessage: MessageParam = {
        role: "user",
        content: [
          {
            type: "image",
            source: {
              type: "base64",
              media_type: "image/jpeg",
              data: options.image.buffer.toString("base64"),
            },
          },
        ],
      };
      if (
        options.image.description &&
        Array.isArray(screenshotMessage.content)
      ) {
        screenshotMessage.content.push({
          type: "text",
          text: options.image.description,
        });
      }

      formattedMessages.push(screenshotMessage);
    }

    // Convert the caller's tool definitions into Anthropic's Tool shape.
    let anthropicTools: Tool[] = options.tools?.map((tool) => {
      return {
        name: tool.name,
        description: tool.description,
        input_schema: {
          type: "object",
          properties: tool.parameters.properties,
          required: tool.parameters.required,
        },
      };
    });

    let toolDefinition: Tool | undefined;

    // With a response model, structured output is requested through a
    // synthetic tool whose input schema is the model's JSON schema.
    if (options.response_model) {
      const jsonSchema = toJsonSchema(options.response_model.schema);
      const { properties: schemaProperties, required: schemaRequired
} = extractSchemaProperties(jsonSchema); toolDefinition = { name: "print_extracted_data", description: "Prints the extracted data based on the provided schema.", input_schema: { type: "object", properties: schemaProperties, required: schemaRequired, }, }; } if (toolDefinition) { anthropicTools = anthropicTools ?? []; anthropicTools.push(toolDefinition); } const response = await this.client.messages.create({ model: this.modelName, max_tokens: options.maxOutputTokens || 8192, messages: formattedMessages, tools: anthropicTools, system: systemMessage ? (systemMessage.content as string | TextBlockParam[]) // we can cast because we already filtered out image content : undefined, temperature: options.temperature, }); logger({ category: "anthropic", message: "response", level: 2, auxiliary: { response: { value: JSON.stringify(response), type: "object", }, requestId: { value: options.requestId, type: "string", }, }, }); // We'll compute usage data from the response const usageData = { prompt_tokens: response.usage.input_tokens, completion_tokens: response.usage.output_tokens, total_tokens: response.usage.input_tokens + response.usage.output_tokens, }; const transformedResponse: LLMResponse = { id: response.id, object: "chat.completion", created: Date.now(), model: response.model, choices: [ { index: 0, message: { role: "assistant", content: response.content.find((c) => c.type === "text")?.text || null, tool_calls: response.content .filter((c) => c.type === "tool_use") .map((toolUse) => ({ id: toolUse.id, type: "function", function: { name: toolUse.name, arguments: JSON.stringify(toolUse.input), }, })), }, finish_reason: response.stop_reason, }, ], usage: usageData, }; logger({ category: "anthropic", message: "transformed response", level: 2, auxiliary: { transformedResponse: { value: JSON.stringify(transformedResponse), type: "object", }, requestId: { value: options.requestId, type: "string", }, }, }); if (options.response_model) { const toolUse = response.content.find((c) 
=> c.type === "tool_use"); if (toolUse && "input" in toolUse) { const result = toolUse.input; const finalParsedResponse = { data: result, usage: usageData, } as unknown as T; return finalParsedResponse; } else { if (!retries || retries < 5) { return this.createChatCompletion({ options, logger, retries: (retries ?? 0) + 1, }); } logger({ category: "anthropic", message: "error creating chat completion", level: 0, auxiliary: { requestId: { value: options.requestId, type: "string", }, }, }); throw new CreateChatCompletionResponseError( "No tool use with input in response", ); } } // if the function was called with a response model, it would have returned earlier // so we can safely cast here to T, which defaults to AnthropicTransformedResponse return transformedResponse as T; } } const extractSchemaProperties = (jsonSchema: AnthropicJsonSchemaObject) => { const schemaRoot = jsonSchema.definitions?.MySchema || jsonSchema; return { properties: schemaRoot.properties, required: schemaRoot.required, }; }; ================================================ FILE: packages/core/lib/v3/llm/CerebrasClient.ts ================================================ import OpenAI from "openai"; import type { ClientOptions } from "openai"; import { LogLine } from "../types/public/logs.js"; import { AvailableModel } from "../types/public/model.js"; import { ChatMessage, CreateChatCompletionOptions, LLMClient, LLMResponse, } from "./LLMClient.js"; import { CreateChatCompletionResponseError } from "../types/public/sdkErrors.js"; import { toJsonSchema } from "../zodCompat.js"; export class CerebrasClient extends LLMClient { public type = "cerebras" as const; private client: OpenAI; declare public clientOptions: ClientOptions; public hasVision = false; constructor({ modelName, clientOptions, userProvidedInstructions, }: { logger: (message: LogLine) => void; modelName: AvailableModel; clientOptions?: ClientOptions; userProvidedInstructions?: string; }) { super(modelName, userProvidedInstructions); 
// Create OpenAI client with the base URL set to Cerebras API this.client = new OpenAI({ baseURL: "https://api.cerebras.ai/v1", apiKey: clientOptions?.apiKey || process.env.CEREBRAS_API_KEY, ...clientOptions, }); this.modelName = modelName; this.clientOptions = clientOptions; } async createChatCompletion({ options, retries, logger, }: CreateChatCompletionOptions): Promise { const optionsWithoutImage = { ...options }; delete optionsWithoutImage.image; logger({ category: "cerebras", message: "creating chat completion", level: 2, auxiliary: { options: { value: JSON.stringify(optionsWithoutImage), type: "object", }, }, }); // Format messages for Cerebras API (using OpenAI format) const formattedMessages = options.messages.map((msg: ChatMessage) => { const baseMessage = { content: typeof msg.content === "string" ? msg.content : Array.isArray(msg.content) && msg.content.length > 0 && "text" in msg.content[0] ? msg.content[0].text : "", }; // Cerebras only supports system, user, and assistant roles if (msg.role === "system") { return { ...baseMessage, role: "system" as const }; } else if (msg.role === "assistant") { return { ...baseMessage, role: "assistant" as const }; } else { // Default to user for any other role return { ...baseMessage, role: "user" as const }; } }); // Format tools if provided let tools = options.tools?.map((tool) => ({ type: "function" as const, function: { name: tool.name, description: tool.description, parameters: { type: "object", properties: tool.parameters.properties, required: tool.parameters.required, }, }, })); // Add response model as a tool if provided if (options.response_model) { const jsonSchema = toJsonSchema(options.response_model.schema) as { properties?: Record; required?: string[]; }; const schemaProperties = jsonSchema.properties || {}; const schemaRequired = jsonSchema.required || []; const responseTool = { type: "function" as const, function: { name: "print_extracted_data", description: "Prints the extracted data based on the 
provided schema.",
          parameters: {
            type: "object",
            properties: schemaProperties,
            required: schemaRequired,
          },
        },
      };

      tools = tools ? [...tools, responseTool] : [responseTool];
    }

    try {
      // Use OpenAI client with Cerebras API
      const apiResponse = await this.client.chat.completions.create({
        model: this.modelName.split("cerebras-")[1],
        messages: [
          ...formattedMessages,
          // Add explicit instruction to return JSON if we have a response model
          // NOTE(review): this stringifies the schema object itself rather than
          // the JSON schema derived for the tool definition — confirm intended.
          ...(options.response_model
            ? [
                {
                  role: "system" as const,
                  content: `IMPORTANT: Your response must be valid JSON that matches this schema: ${JSON.stringify(
                    options.response_model.schema,
                  )}`,
                },
              ]
            : []),
        ],
        // `??` rather than `||`: an explicit temperature of 0 must be honoured
        // instead of being replaced by the 0.7 default.
        temperature: options.temperature ?? 0.7,
        max_tokens: options.maxOutputTokens,
        tools: tools,
        tool_choice: options.tool_choice || "auto",
      });

      // Format the response to match the expected LLMResponse format
      const response: LLMResponse = {
        id: apiResponse.id,
        object: "chat.completion",
        created: Date.now(),
        model: this.modelName.split("cerebras-")[1],
        choices: [
          {
            index: 0,
            message: {
              role: "assistant",
              content: apiResponse.choices[0]?.message?.content || null,
              tool_calls: apiResponse.choices[0]?.message?.tool_calls || [],
            },
            finish_reason: apiResponse.choices[0]?.finish_reason || "stop",
          },
        ],
        usage: {
          prompt_tokens: apiResponse.usage?.prompt_tokens || 0,
          completion_tokens: apiResponse.usage?.completion_tokens || 0,
          total_tokens: apiResponse.usage?.total_tokens || 0,
        },
      };

      logger({
        category: "cerebras",
        message: "response",
        level: 2,
        auxiliary: {
          response: {
            value: JSON.stringify(response),
            type: "object",
          },
          requestId: {
            value: options.requestId,
            type: "string",
          },
        },
      });

      // If we have no response model, just return the entire LLMResponse
      if (!options.response_model) {
        return response as T;
      }

      // If we have a response model, parse JSON from tool calls or content.
      // Only the first tool call is inspected.
      const toolCall = response.choices[0]?.message?.tool_calls?.[0];
      if (toolCall?.function?.arguments) {
        try {
          const result = JSON.parse(toolCall.function.arguments);
          const
finalResponse = { data: result, usage: response.usage, }; return finalResponse as T; } catch (e) { logger({ category: "cerebras", message: "failed to parse tool call arguments as JSON, retrying", level: 0, auxiliary: { error: { value: e.message, type: "string", }, }, }); } } // If we have content but no tool calls, try to parse the content as JSON const content = response.choices[0]?.message?.content; if (content) { try { const jsonMatch = content.match(/\{[\s\S]*\}/); if (jsonMatch) { const result = JSON.parse(jsonMatch[0]); const finalResponse = { data: result, usage: response.usage, }; return finalResponse as T; } } catch (e) { logger({ category: "cerebras", message: "failed to parse content as JSON", level: 0, auxiliary: { error: { value: e.message, type: "string", }, }, }); } } // If we still haven't found valid JSON and have retries left, try again if (!retries || retries < 5) { return this.createChatCompletion({ options, logger, retries: (retries ?? 0) + 1, }); } throw new CreateChatCompletionResponseError("Invalid response schema"); } catch (error) { logger({ category: "cerebras", message: "error creating chat completion", level: 0, auxiliary: { error: { value: error.message, type: "string", }, requestId: { value: options.requestId, type: "string", }, }, }); throw error; } } } ================================================ FILE: packages/core/lib/v3/llm/GoogleClient.ts ================================================ import { GoogleGenAI, HarmCategory, HarmBlockThreshold, Content, Part, Tool, FunctionCall, Schema, Type, } from "@google/genai"; import { LogLine } from "../types/public/logs.js"; import { AvailableModel, ClientOptions } from "../types/public/model.js"; import { validateZodSchema, toGeminiSchema, loadApiKeyFromEnv, } from "../../utils.js"; import { ChatCompletionOptions, ChatMessage, CreateChatCompletionOptions, LLMClient, LLMResponse, AnnotatedScreenshotText, } from "./LLMClient.js"; import { CreateChatCompletionResponseError, 
StagehandError,
} from "../types/public/sdkErrors.js";

// Mapping from generic roles to Gemini roles
const roleMap: { [key in ChatMessage["role"]]: string } = {
  user: "user",
  assistant: "model",
  system: "user", // Gemini API prefers system instructions either via system_instruction or at the start of 'user' content
};

// Basic safety settings - adjust as needed
const safetySettings = [
  {
    category: HarmCategory.HARM_CATEGORY_HARASSMENT,
    threshold: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
  },
  {
    category: HarmCategory.HARM_CATEGORY_HATE_SPEECH,
    threshold: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
  },
  {
    category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    threshold: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
  },
  {
    category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    threshold: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
  },
];

/**
 * LLMClient implementation backed by the `@google/genai` SDK.
 */
export class GoogleClient extends LLMClient {
  public type = "google" as const;
  private client: GoogleGenAI;
  declare public clientOptions: ClientOptions;
  declare public hasVision: boolean;
  private logger: (message: LogLine) => void;

  /**
   * @param logger - Sink for log lines; also handed to the env-key loader.
   * @param modelName - Model identifier forwarded to the base class.
   * @param clientOptions - Optional client options. When omitted, or when it
   *   carries no `apiKey`, the key is loaded via `loadApiKeyFromEnv`.
   */
  constructor({
    logger,
    modelName,
    clientOptions,
  }: {
    logger: (message: LogLine) => void;
    modelName: AvailableModel;
    clientOptions?: ClientOptions; // Expecting { apiKey: string } here
  }) {
    super(modelName);
    // `clientOptions` is optional: fall back to an empty object so the
    // env-key fallback below cannot dereference undefined. A caller-supplied
    // object is still filled in place, as before.
    const resolvedOptions: ClientOptions = clientOptions ?? {};
    if (!resolvedOptions.apiKey) {
      // No explicit key: try the environment ("google_legacy" provider entry)
      resolvedOptions.apiKey = loadApiKeyFromEnv("google_legacy", logger);
    }
    this.clientOptions = resolvedOptions;
    this.client = new GoogleGenAI({ apiKey: resolvedOptions.apiKey });
    this.modelName = modelName;
    this.logger = logger;
    // Determine vision capability based on model name (adjust as needed)
    this.hasVision =
      modelName.includes("vision") || modelName.includes("gemini-1.5"); // Example logic
  }

  // Helper to convert project's ChatMessage[] to Gemini's Content[]
  private formatMessages(
    messages: ChatMessage[],
    image?: ChatCompletionOptions["image"],
  ): Content[]
{ const contents: Content[] = []; let systemInstruction: string | null = null; messages.forEach((msg, index) => { const role = roleMap[msg.role]; if (!role) { this.logger({ category: "google", message: `WARNING: Unsupported role: ${msg.role}`, level: 1, }); return; // Skip unsupported roles } // Handle system messages - prepend to the first user message or use system_instruction if available if (msg.role === "system") { if (typeof msg.content === "string") { systemInstruction = (systemInstruction ? systemInstruction + "\n\n" : "") + msg.content; } return; // Don't add system messages directly to contents yet } const parts: Part[] = []; if (Array.isArray(msg.content)) { msg.content.forEach((partContent) => { if (partContent.type === "text") { parts.push({ text: partContent.text }); } else if (partContent.type === "image_url") { if ("image_url" in partContent && partContent.image_url?.url) { // Assuming base64 data URI format: data:[];base64, const base64Data = partContent.image_url.url.split(",")[1]; const mimeTypeMatch = partContent.image_url.url.match( /^data:(image\/\w+);base64,/, ); if (base64Data && mimeTypeMatch) { parts.push({ inlineData: { mimeType: mimeTypeMatch[1], data: base64Data }, }); } else { this.logger({ category: "google", message: "WARNING: Could not parse image data URI format", level: 1, }); } } } }); } else if (typeof msg.content === "string") { parts.push({ text: msg.content }); } // Add image from options if this is the last message and it's a user message if (image && index === messages.length - 1 && msg.role === "user") { const imageDesc = image.description || AnnotatedScreenshotText; parts.push({ text: imageDesc }); // Add description first parts.push({ inlineData: { mimeType: "image/jpeg", // Assuming JPEG, adjust if needed data: image.buffer.toString("base64"), }, }); } // Apply system instruction to the first non-system message if needed if (systemInstruction && contents.length === 0 && role === "user") { const firstPartText = 
parts.find((p) => "text" in p); if (firstPartText && "text" in firstPartText) { firstPartText.text = `${systemInstruction}\n\n${firstPartText.text}`; } else { parts.unshift({ text: systemInstruction }); } systemInstruction = null; // Clear after applying } if (parts.length > 0) { contents.push({ role, parts }); } }); // If system instruction wasn't applied (e.g., no user messages followed it), add it as a final user message if (systemInstruction) { contents.unshift({ role: "user", parts: [{ text: systemInstruction }] }); } return contents; } // Helper to convert LLMTool[] to Gemini's Tool[] private formatTools( tools?: ChatCompletionOptions["tools"], ): Tool[] | undefined { if (!tools || tools.length === 0) { return undefined; } return [ { functionDeclarations: tools.map((tool) => { let parameters: Schema | undefined = undefined; if (tool.parameters) { parameters = { type: Type.OBJECT, properties: tool.parameters.properties as { [key: string]: Schema; }, required: tool.parameters.required as string[] | undefined, }; } return { name: tool.name, description: tool.description, parameters: parameters, }; }), }, ]; } async createChatCompletion({ // Ensure LLMResponse is compatible options, logger, retries = 3, }: CreateChatCompletionOptions): Promise { const { image, requestId, response_model, tools, temperature, top_p, maxOutputTokens, } = options; const formattedMessages = this.formatMessages(options.messages, image); const formattedTools = this.formatTools(tools); const generationConfig = { maxOutputTokens: maxOutputTokens, temperature: temperature, topP: top_p, responseMimeType: response_model ? "application/json" : undefined, responseSchema: response_model ? 
toGeminiSchema(response_model.schema) : undefined, }; logger({ category: "google", message: "creating chat completion", level: 2, auxiliary: { modelName: { value: this.modelName, type: "string" }, requestId: { value: requestId, type: "string" }, requestPayloadSummary: { value: `Model: ${this.modelName}, Messages: ${formattedMessages.length}, Config Keys: ${Object.keys(generationConfig).join(", ")}, Tools: ${formattedTools ? formattedTools.length : 0}, Safety Categories: ${safetySettings.map((s) => s.category).join(", ")}`, type: "string", }, }, }); // Construct the full request object const requestPayload = { model: this.modelName, contents: formattedMessages, config: { ...generationConfig, safetySettings: safetySettings, tools: formattedTools, }, }; // Log the full payload safely try { logger({ category: "google", message: "Full request payload", level: 2, auxiliary: { requestId: { value: requestId, type: "string" }, fullPayload: { value: JSON.stringify(requestPayload), type: "object", }, }, }); } catch (e) { logger({ category: "google", message: "Failed to stringify full request payload for logging", level: 0, auxiliary: { requestId: { value: requestId, type: "string" }, error: { value: e.message, type: "string" }, }, }); } try { const result = await this.client.models.generateContent(requestPayload); // Pass the constructed payload logger({ category: "google", message: "received response", level: 2, auxiliary: { requestId: { value: requestId, type: "string" }, response: { value: JSON.stringify(result), type: "object", }, }, }); const finishReason = result.candidates?.[0]?.finishReason || "unknown"; const toolCalls = result.functionCalls?.map( (fc: FunctionCall, index: number) => ({ id: `tool_call_${requestId}_${index}`, type: "function" as const, function: { name: fc.name, arguments: JSON.stringify(fc.args), }, }), ); let content: string | null = null; try { content = result.text; } catch (e) { logger({ category: "google", message: `Could not extract text 
content: ${e.message}`, level: 1, auxiliary: { requestId: { value: requestId, type: "string" } }, }); content = null; } // Construct LLMResponse shape const llmResponse: LLMResponse = { id: result.candidates?.[0]?.index?.toString() || requestId, object: "chat.completion", created: Math.floor(Date.now() / 1000), model: this.modelName, choices: [ { index: 0, message: { role: "assistant", content: content, tool_calls: toolCalls, }, finish_reason: finishReason, }, ], usage: { prompt_tokens: result.usageMetadata?.promptTokenCount || 0, completion_tokens: result.usageMetadata?.candidatesTokenCount || 0, total_tokens: result.usageMetadata?.totalTokenCount || 0, }, }; // Validate schema if response_model was provided if (response_model) { let parsedData; try { // Need to handle potential markdown fences if the model didn't follow instructions perfectly const potentialJson = content?.trim().replace(/^```json\n?|\n?```$/g, "") || "{}"; parsedData = JSON.parse(potentialJson); } catch (e) { logger({ category: "google", message: `Failed to parse JSON response: ${e.message}`, level: 0, auxiliary: { content: { value: content || "null", type: "string" }, }, }); if (retries > 0) { return this.createChatCompletion({ options, logger, retries: retries - 1, }); } throw new CreateChatCompletionResponseError( `Failed to parse JSON response: ${e.message}`, ); } try { validateZodSchema(response_model.schema, parsedData); } catch (err) { logger({ category: "google", message: "Response failed Zod schema validation", level: 0, }); if (retries > 0) { return this.createChatCompletion({ options, logger, retries: retries - 1, }); } throw err; } // If schema validation passes, structure the response for extraction use case const extractionResult = { data: parsedData, usage: llmResponse.usage, }; return extractionResult as T; } return llmResponse as T; } catch (error) { logger({ category: "google", message: `Error during Google AI chat completion: ${error.message}`, level: 0, auxiliary: { 
/**
 * LLM client for Groq-hosted models.
 *
 * Groq exposes an OpenAI-compatible endpoint, so this client reuses the
 * `openai` SDK pointed at the Groq base URL. Images are not supported
 * (`hasVision` is false); only text content is forwarded.
 */
export class GroqClient extends LLMClient {
  public type = "groq" as const;
  private client: OpenAI;
  declare public clientOptions: ClientOptions;
  public hasVision = false;

  /**
   * @param modelName - Stagehand model name, e.g. `groq-llama-3.3-70b-versatile`.
   * @param clientOptions - Options forwarded to the OpenAI SDK constructor.
   *   `baseURL` may be overridden here; `apiKey` falls back to `GROQ_API_KEY`.
   * @param userProvidedInstructions - Optional instructions stored on the base client.
   */
  constructor({
    modelName,
    clientOptions,
    userProvidedInstructions,
  }: {
    logger: (message: LogLine) => void;
    modelName: AvailableModel;
    clientOptions?: ClientOptions;
    userProvidedInstructions?: string;
  }) {
    super(modelName, userProvidedInstructions);

    // Create OpenAI client with the base URL set to Groq API.
    // `apiKey` is written AFTER the spread so that an explicit
    // `clientOptions.apiKey === undefined` cannot wipe out the env fallback.
    this.client = new OpenAI({
      baseURL: "https://api.groq.com/openai/v1",
      ...clientOptions,
      apiKey: clientOptions?.apiKey || process.env.GROQ_API_KEY,
    });

    this.modelName = modelName;
    this.clientOptions = clientOptions;
  }

  /**
   * Model identifier sent to the Groq API: the Stagehand name with its
   * leading `groq-` prefix removed. Names without the prefix are passed
   * through unchanged instead of collapsing to `undefined`.
   */
  private get groqModelId(): string {
    const prefix = "groq-";
    return this.modelName.startsWith(prefix)
      ? this.modelName.slice(prefix.length)
      : this.modelName;
  }

  /**
   * Sends a chat completion request to Groq.
   *
   * Without `options.response_model` the OpenAI-shaped response is returned.
   * With it, the schema is offered to the model as a `print_extracted_data`
   * tool and the result is returned as `{ data, usage }`, parsed from the
   * first tool call or, failing that, from the first `{...}` span in the
   * text content.
   *
   * @param options - Chat completion options.
   * @param retries - Number of re-attempts already made (counts UP; the
   *   request is re-issued while this is below 5).
   * @param logger - Log sink.
   * @throws CreateChatCompletionResponseError when no parseable JSON was
   *   produced after the retry budget is exhausted.
   * @throws Any error raised by the underlying SDK call (logged, then rethrown).
   */
  async createChatCompletion<T = LLMResponse>({
    options,
    retries,
    logger,
  }: CreateChatCompletionOptions): Promise<T> {
    // Never log the raw image buffer.
    const optionsWithoutImage = { ...options };
    delete optionsWithoutImage.image;

    logger({
      category: "groq",
      message: "creating chat completion",
      level: 2,
      auxiliary: {
        options: {
          value: JSON.stringify(optionsWithoutImage),
          type: "object",
        },
      },
    });

    // Format messages for Groq API (using OpenAI format).
    // For multi-part content only the first part's text is forwarded.
    const formattedMessages = options.messages.map((msg: ChatMessage) => {
      const baseMessage = {
        content:
          typeof msg.content === "string"
            ? msg.content
            : Array.isArray(msg.content) &&
                msg.content.length > 0 &&
                "text" in msg.content[0]
              ? msg.content[0].text
              : "",
      };

      // Groq supports system, user, and assistant roles
      if (msg.role === "system") {
        return { ...baseMessage, role: "system" as const };
      } else if (msg.role === "assistant") {
        return { ...baseMessage, role: "assistant" as const };
      } else {
        // Default to user for any other role
        return { ...baseMessage, role: "user" as const };
      }
    });

    // Format tools if provided
    let tools = options.tools?.map((tool) => ({
      type: "function" as const,
      function: {
        name: tool.name,
        description: tool.description,
        parameters: {
          type: "object",
          properties: tool.parameters.properties,
          required: tool.parameters.required,
        },
      },
    }));

    // Convert the response schema to JSON Schema once; it is used both for
    // the extraction tool and for the JSON-only instruction below.
    const responseSchema = options.response_model
      ? (toJsonSchema(options.response_model.schema) as {
          properties?: Record<string, unknown>;
          required?: string[];
        })
      : undefined;

    // Add response model as a tool if provided
    if (responseSchema) {
      const responseTool = {
        type: "function" as const,
        function: {
          name: "print_extracted_data",
          description:
            "Prints the extracted data based on the provided schema.",
          parameters: {
            type: "object",
            properties: responseSchema.properties || {},
            required: responseSchema.required || [],
          },
        },
      };

      tools = tools ? [...tools, responseTool] : [responseTool];
    }

    try {
      // Use OpenAI client with Groq API
      const apiResponse = await this.client.chat.completions.create({
        model: this.groqModelId,
        messages: [
          ...formattedMessages,
          // Add explicit instruction to return JSON if we have a response model.
          // The JSON Schema is embedded (not the raw Zod object, whose
          // serialisation only exposes library internals).
          ...(responseSchema
            ? [
                {
                  role: "system" as const,
                  content: `IMPORTANT: Your response must be valid JSON that matches this schema: ${JSON.stringify(
                    responseSchema,
                  )}`,
                },
              ]
            : []),
        ],
        // `??` so that an explicit temperature of 0 is honoured.
        temperature: options.temperature ?? 0.7,
        max_tokens: options.maxOutputTokens,
        tools: tools,
        tool_choice: options.tool_choice || "auto",
      });

      // Format the response to match the expected LLMResponse format
      const response: LLMResponse = {
        id: apiResponse.id,
        object: "chat.completion",
        // Unix seconds, matching the OpenAI response shape.
        created: apiResponse.created ?? Math.floor(Date.now() / 1000),
        model: this.groqModelId,
        choices: [
          {
            index: 0,
            message: {
              role: "assistant",
              content: apiResponse.choices[0]?.message?.content || null,
              tool_calls: apiResponse.choices[0]?.message?.tool_calls || [],
            },
            finish_reason: apiResponse.choices[0]?.finish_reason || "stop",
          },
        ],
        usage: {
          prompt_tokens: apiResponse.usage?.prompt_tokens || 0,
          completion_tokens: apiResponse.usage?.completion_tokens || 0,
          total_tokens: apiResponse.usage?.total_tokens || 0,
        },
      };

      logger({
        category: "groq",
        message: "response",
        level: 2,
        auxiliary: {
          response: {
            value: JSON.stringify(response),
            type: "object",
          },
          requestId: {
            value: options.requestId,
            type: "string",
          },
        },
      });

      // If there's no response model, return the entire response object
      if (!options.response_model) {
        return response as T;
      }

      // Otherwise, try parsing the JSON from the tool call or content
      const toolCall = response.choices[0]?.message?.tool_calls?.[0];
      if (toolCall?.function?.arguments) {
        try {
          const result = JSON.parse(toolCall.function.arguments);
          const finalResponse = {
            data: result,
            usage: response.usage,
          };
          return finalResponse as T;
        } catch (e) {
          logger({
            category: "groq",
            message: "failed to parse tool call arguments as JSON, retrying",
            level: 0,
            auxiliary: {
              error: {
                value: e instanceof Error ? e.message : String(e),
                type: "string",
              },
            },
          });
        }
      }

      // If we have content but no tool calls, try to parse the content as JSON
      const content = response.choices[0]?.message?.content;
      if (content) {
        try {
          // Take the widest `{ ... }` span in the content.
          const jsonMatch = content.match(/\{[\s\S]*\}/);
          if (jsonMatch) {
            const result = JSON.parse(jsonMatch[0]);
            const finalResponse = {
              data: result,
              usage: response.usage,
            };
            return finalResponse as T;
          }
        } catch (e) {
          logger({
            category: "groq",
            message: "failed to parse content as JSON",
            level: 0,
            auxiliary: {
              error: {
                value: e instanceof Error ? e.message : String(e),
                type: "string",
              },
            },
          });
        }
      }

      // If we still haven't found valid JSON and have retries left, try again
      const attemptsSoFar = retries ?? 0;
      if (attemptsSoFar < 5) {
        return this.createChatCompletion<T>({
          options,
          logger,
          retries: attemptsSoFar + 1,
        });
      }

      throw new CreateChatCompletionResponseError("Invalid response schema");
    } catch (error) {
      logger({
        category: "groq",
        message: "error creating chat completion",
        level: 0,
        auxiliary: {
          error: {
            value: error instanceof Error ? error.message : String(error),
            type: "string",
          },
          requestId: {
            value: options.requestId,
            type: "string",
          },
        },
      });
      throw error;
    }
  }
}
/**
 * Provider-agnostic options for a single chat completion request.
 * Each LLM client translates these into its own API's request shape.
 */
export interface ChatCompletionOptions {
  /** Conversation to send, in order. */
  messages: ChatMessage[];
  /** Sampling temperature. */
  temperature?: number;
  top_p?: number;
  frequency_penalty?: number;
  presence_penalty?: number;
  /** Optional screenshot to attach to the request, with an optional caption. */
  image?: {
    buffer: Buffer;
    description?: string;
  };
  /**
   * When set, the client is expected to return structured data matching
   * `schema` (as `{ data, usage }`) rather than the raw completion.
   */
  response_model?: {
    name: string;
    schema: StagehandZodSchema;
  };
  /** Tools the model may call. */
  tools?: LLMTool[];
  tool_choice?: "auto" | "none" | "required";
  /** Upper bound on generated tokens. */
  maxOutputTokens?: number;
  /** Correlation id included in log lines for this request. */
  requestId?: string;
}
/**
 * Base class for every LLM client.
 *
 * Subclasses implement `createChatCompletion`; the AI SDK helpers are
 * re-exposed as instance properties so callers can reach them through
 * whichever client they hold.
 */
export abstract class LLMClient {
  /** Provider discriminator; well-known values plus any custom string. */
  public type: "openai" | "anthropic" | "cerebras" | "groq" | (string & {});
  public modelName: AvailableModel | (string & {});
  public hasVision: boolean;
  public clientOptions: ClientOptions;
  public userProvidedInstructions?: string;

  /**
   * @param modelName - Model this client talks to.
   * @param userProvidedInstructions - Optional user instructions kept on the client.
   */
  constructor(modelName: AvailableModel, userProvidedInstructions?: string) {
    this.modelName = modelName;
    this.userProvidedInstructions = userProvidedInstructions;
  }

  // Overload 1: When response_model is provided, returns LLMParsedResponse<T>
  abstract createChatCompletion<T>(
    options: CreateChatCompletionOptions & {
      options: {
        response_model: { name: string; schema: StagehandZodSchema };
      };
    },
  ): Promise<LLMParsedResponse<T>>;

  // Overload 2: When response_model is not provided, returns T (defaults to LLMResponse)
  abstract createChatCompletion<T = LLMResponse>(
    options: CreateChatCompletionOptions,
  ): Promise<T>;

  // AI SDK entry points, exposed unchanged.
  public generateObject = generateObject;
  public generateText = generateText;
  public streamText = streamText;
  public streamObject = streamObject;
  public generateImage = experimental_generateImage;
  public embed = embed;
  public embedMany = embedMany;
  public transcribe = experimental_transcribe;
  public generateSpeech = experimental_generateSpeech;

  /** Optionally returns the underlying AI SDK language model, when the client wraps one. */
  getLanguageModel?(): LanguageModelV2;
}
/**
 * Default AI SDK provider instances, keyed by the provider prefix of a
 * `provider/model` name. `getAISDKLanguageModel` uses these when no client
 * options with a non-null value are supplied, and
 * `LLMProvider.getModelProvider` reports "aisdk" for any prefix found here.
 */
const AISDKProviders: Record = {
  openai,
  bedrock,
  anthropic,
  google,
  xai,
  azure,
  groq,
  cerebras,
  togetherai,
  mistral,
  deepseek,
  perplexity,
  ollama,
  vertex,
  gateway,
};
/**
 * Resolves an AI SDK language model for a `provider/model` name.
 *
 * When `clientOptions` carries at least one non-null value, a provider is
 * built from the matching factory in `AISDKProvidersWithAPIKey`; otherwise
 * the default provider instance from `AISDKProviders` is used.
 *
 * @param subProvider - Provider prefix (the part before the first "/").
 * @param subModelName - Model name (the part after the first "/").
 * @param clientOptions - Optional provider configuration.
 * @returns The language model produced by the provider for `subModelName`.
 * @throws UnsupportedAISDKModelProviderError when `subProvider` is not a
 *   registered provider key.
 */
export function getAISDKLanguageModel(
  subProvider: string,
  subModelName: string,
  clientOptions?: ClientOptions,
) {
  // Only own keys count as registered providers. A plain index lookup would
  // also resolve inherited members such as "constructor" or "toString" and
  // then invoke them as if they were providers.
  const isRegistered = (table: object): boolean =>
    Object.prototype.hasOwnProperty.call(table, subProvider);

  const hasValidOptions =
    clientOptions &&
    Object.values(clientOptions).some((v) => v !== undefined && v !== null);

  if (hasValidOptions) {
    const creator = isRegistered(AISDKProvidersWithAPIKey)
      ? AISDKProvidersWithAPIKey[subProvider]
      : undefined;
    if (!creator) {
      throw new UnsupportedAISDKModelProviderError(
        subProvider,
        Object.keys(AISDKProvidersWithAPIKey),
      );
    }
    const provider = creator(clientOptions);
    // Get the specific model from the provider
    return provider(subModelName);
  }

  const provider = isRegistered(AISDKProviders)
    ? AISDKProviders[subProvider]
    : undefined;
  if (!provider) {
    throw new UnsupportedAISDKModelProviderError(
      subProvider,
      Object.keys(AISDKProviders),
    );
  }
  return provider(subModelName);
}
subProvider === "vertex" && !options?.disableAPI && !options?.experimental ) { throw new ExperimentalNotConfiguredError("Vertex provider"); } const languageModel = getAISDKLanguageModel( subProvider, subModelName, clientOptions, ); return new AISdkClient({ model: languageModel, logger: this.logger, }); } // Model name doesn't include "/" - this format is deprecated const provider = modelToProviderMap[modelName]; if (!provider) { throw new UnsupportedModelError(Object.keys(modelToProviderMap)); } this.logger({ category: "llm", message: `Deprecation warning: Model format "${modelName}" is deprecated. Please use the provider/model format (e.g., "openai/gpt-5" or "anthropic/claude-sonnet-4").`, level: 0, }); const availableModel = modelName as AvailableModel; switch (provider) { case "openai": return new OpenAIClient({ logger: this.logger, modelName: availableModel, clientOptions, }); case "anthropic": return new AnthropicClient({ logger: this.logger, modelName: availableModel, clientOptions, }); case "cerebras": return new CerebrasClient({ logger: this.logger, modelName: availableModel, clientOptions, }); case "groq": return new GroqClient({ logger: this.logger, modelName: availableModel, clientOptions, }); case "google": return new GoogleClient({ logger: this.logger, modelName: availableModel, clientOptions, }); default: // This default case handles unknown providers that exist in modelToProviderMap // but aren't implemented in the switch. This is an internal consistency issue. 
/**
 * LLM client backed by the OpenAI Chat Completions API.
 *
 * Models whose name starts with `o1` or `o3` get special handling: sampling
 * options and `tool_choice` are dropped, every message is sent with the
 * `user` role, and tools / response schemas are described in an extra user
 * message instead of the native request fields.
 */
export class OpenAIClient extends LLMClient {
  public type = "openai" as const;
  private client: OpenAI;
  declare public clientOptions: ClientOptions;

  /**
   * @param modelName - OpenAI model name.
   * @param clientOptions - Options forwarded to the OpenAI SDK constructor.
   */
  constructor({
    modelName,
    clientOptions,
  }: {
    logger: (message: LogLine) => void;
    modelName: AvailableModel;
    clientOptions?: ClientOptions;
  }) {
    super(modelName);
    this.clientOptions = clientOptions;
    this.client = new OpenAI(clientOptions);
    this.modelName = modelName;
  }

  /**
   * Sends a chat completion request.
   *
   * @param options - Chat completion options. The caller's object and its
   *   `messages` array are not modified.
   * @param logger - Log sink.
   * @param retries - Remaining re-attempts (counts DOWN, default 3) for
   *   unparseable or schema-invalid responses.
   * @returns The raw completion, or `{ data, usage }` when
   *   `options.response_model` is set.
   * @throws StagehandError when both tools and a response model are
   *   requested for an o1/o3 model.
   * @throws CreateChatCompletionResponseError when the structured response
   *   is not valid JSON or fails schema validation after all retries.
   */
  async createChatCompletion<T = LLMResponse>({
    options: optionsInitial,
    logger,
    retries = 3,
  }: CreateChatCompletionOptions): Promise<T> {
    // Work on a private copy: messages are appended below, and retries must
    // start again from the caller's pristine options. Re-submitting the
    // mutated copy would duplicate the image/schema messages and lose the
    // o1 tool override.
    let options: Partial<ChatCompletionOptions> = {
      ...optionsInitial,
      messages: [...optionsInitial.messages],
    };
    const retry = (): Promise<T> =>
      this.createChatCompletion<T>({
        options: optionsInitial,
        logger,
        retries: retries - 1,
      });

    const isReasoningModel =
      this.modelName.startsWith("o1") || this.modelName.startsWith("o3");

    // O1 models do not support most of the options. So we override them.
    // For schema and tools, we add them as user messages.
    let isToolsOverridedForO1 = false;
    if (isReasoningModel) {
      /* eslint-disable */
      // Remove unsupported options (temperature included, so no later
      // temperature guard is needed for these models).
      const {
        tool_choice,
        top_p,
        frequency_penalty,
        presence_penalty,
        temperature,
        ...supportedOptions
      } = options;
      /* eslint-enable */
      options = supportedOptions;

      // These models only accept the user role.
      options.messages = options.messages.map((message) => ({
        ...message,
        role: "user",
      }));

      if (options.tools && options.response_model) {
        throw new StagehandError(
          "Cannot use both tool and response_model for o1 models",
        );
      }

      if (options.tools) {
        // Describe the tools in-prompt instead of using the native field.
        const { tools, ...rest } = options;
        options = rest;
        isToolsOverridedForO1 = true;
        options.messages.push({
          role: "user",
          content: `You have the following tools available to you:\n${JSON.stringify(tools)} Respond with the following zod schema format to use a method: { "name": "", "arguments": } Do not include any other text or formattings like \`\`\` in your response. Just the JSON object.`,
        });
      }
    }

    // `image` is pulled out here so the raw buffer is neither logged nor
    // spread into the request body.
    const { image, requestId, ...optionsWithoutImageAndRequestId } = options;

    logger({
      category: "openai",
      message: "creating chat completion",
      level: 2,
      auxiliary: {
        options: {
          value: JSON.stringify({
            ...optionsWithoutImageAndRequestId,
            requestId,
          }),
          type: "object",
        },
        modelName: {
          value: this.modelName,
          type: "string",
        },
      },
    });

    if (image) {
      // Attach the screenshot as a base64 data URL, followed by its caption.
      const screenshotMessage: ChatMessage = {
        role: "user",
        content: [
          {
            type: "image_url",
            image_url: {
              url: `data:image/jpeg;base64,${image.buffer.toString("base64")}`,
            },
          },
          ...(image.description
            ? [{ type: "text", text: image.description }]
            : []),
        ],
      };

      options.messages.push(screenshotMessage);
    }

    let responseFormat:
      | ChatCompletionCreateParamsNonStreaming["response_format"]
      | undefined;
    if (options.response_model) {
      // For O1 models, we need to add the schema as a user message.
      if (isReasoningModel) {
        try {
          const parsedSchema = JSON.stringify(
            toJsonSchema(options.response_model.schema),
          );
          options.messages.push({
            role: "user",
            content: `Respond in this zod schema format:\n${parsedSchema}\n Do not include any other text, formatting or markdown in your output. Do not include \`\`\` or \`\`\`json in your response. Only the JSON object itself.`,
          });
        } catch (error) {
          logger({
            category: "openai",
            message: "Failed to parse response model schema",
            level: 0,
          });

          if (retries > 0) {
            return retry();
          }

          throw error;
        }
      } else {
        responseFormat = {
          type: "json_schema",
          json_schema: {
            name: options.response_model.name,
            schema: toJsonSchema(options.response_model.schema),
          },
        };
      }
    }

    /* eslint-disable */
    // Remove unsupported options
    const { response_model, ...openAiOptions } = {
      ...optionsWithoutImageAndRequestId,
      model: this.modelName,
    };
    /* eslint-enable */

    logger({
      category: "openai",
      message: "creating chat completion",
      level: 2,
      auxiliary: {
        openAiOptions: {
          value: JSON.stringify(openAiOptions),
          type: "object",
        },
      },
    });

    const formattedMessages: ChatCompletionMessageParam[] =
      options.messages.map((message) => {
        if (Array.isArray(message.content)) {
          const contentParts = message.content.map((content) => {
            if ("image_url" in content) {
              const imageContent: ChatCompletionContentPartImage = {
                image_url: {
                  url: content.image_url.url,
                },
                type: "image_url",
              };
              return imageContent;
            } else {
              const textContent: ChatCompletionContentPartText = {
                text: content.text,
                type: "text",
              };
              return textContent;
            }
          });

          if (message.role === "system") {
            // System messages may only carry text parts.
            const formattedMessage: ChatCompletionSystemMessageParam = {
              ...message,
              role: "system",
              content: contentParts.filter(
                (content): content is ChatCompletionContentPartText =>
                  content.type === "text",
              ),
            };
            return formattedMessage;
          } else if (message.role === "user") {
            const formattedMessage: ChatCompletionUserMessageParam = {
              ...message,
              role: "user",
              content: contentParts,
            };
            return formattedMessage;
          } else {
            // Assistant messages may only carry text parts.
            const formattedMessage: ChatCompletionAssistantMessageParam = {
              ...message,
              role: "assistant",
              content: contentParts.filter(
                (content): content is ChatCompletionContentPartText =>
                  content.type === "text",
              ),
            };
            return formattedMessage;
          }
        }

        // NOTE(review): plain-string messages are always sent with the
        // "user" role regardless of message.role — confirm this is intended.
        const formattedMessage: ChatCompletionUserMessageParam = {
          role: "user",
          content: message.content,
        };

        return formattedMessage;
      });

    const body: ChatCompletionCreateParamsNonStreaming = {
      ...openAiOptions,
      model: this.modelName,
      messages: formattedMessages,
      response_format: responseFormat,
      stream: false,
      tools: options.tools?.map((tool) => ({
        function: {
          name: tool.name,
          description: tool.description,
          parameters: tool.parameters,
        },
        type: "function",
      })),
    };

    const response = await this.client.chat.completions.create(body);

    // For O1 models, we need to parse the tool call response manually and add it to the response.
    if (isToolsOverridedForO1) {
      try {
        const parsedContent = JSON.parse(response.choices[0].message.content);

        response.choices[0].message.tool_calls = [
          {
            function: {
              name: parsedContent["name"],
              arguments: JSON.stringify(parsedContent["arguments"]),
            },
            type: "function",
            id: "-1",
          },
        ];
        response.choices[0].message.content = null;
      } catch (error) {
        logger({
          category: "openai",
          message: "Failed to parse tool call response",
          level: 0,
          auxiliary: {
            error: {
              value: error instanceof Error ? error.message : String(error),
              type: "string",
            },
            content: {
              value: response.choices[0].message.content,
              type: "string",
            },
          },
        });

        if (retries > 0) {
          return retry();
        }

        throw error;
      }
    }

    logger({
      category: "openai",
      message: "response",
      level: 2,
      auxiliary: {
        response: {
          value: JSON.stringify(response),
          type: "object",
        },
        requestId: {
          value: requestId,
          type: "string",
        },
      },
    });

    if (options.response_model) {
      const extractedData = response.choices[0].message.content;

      // Malformed JSON gets the same retry / typed-error treatment as a
      // schema mismatch instead of escaping as a bare SyntaxError.
      let parsedData;
      try {
        parsedData = JSON.parse(extractedData);
      } catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        logger({
          category: "openai",
          message: `Failed to parse JSON response: ${reason}`,
          level: 0,
          auxiliary: {
            content: { value: extractedData || "null", type: "string" },
            requestId: { value: requestId, type: "string" },
          },
        });

        if (retries > 0) {
          return retry();
        }

        throw new CreateChatCompletionResponseError(
          `Failed to parse JSON response: ${reason}`,
        );
      }

      try {
        validateZodSchema(options.response_model.schema, parsedData);
      } catch (e) {
        logger({
          category: "openai",
          message: "Response failed Zod schema validation",
          level: 0,
        });
        if (retries > 0) {
          return retry();
        }

        if (e instanceof ZodSchemaValidationError) {
          logger({
            category: "openai",
            message: `Error during OpenAI chat completion: ${e.message}`,
            level: 0,
            auxiliary: {
              errorDetails: {
                value: `Message: ${e.message}${e.stack ? "\nStack: " + e.stack : ""}`,
                type: "string",
              },
              requestId: { value: requestId, type: "string" },
            },
          });
          throw new CreateChatCompletionResponseError(e.message);
        }
        throw e;
      }

      return {
        data: parsedData,
        usage: response.usage,
      } as T;
    }

    // if the function was called with a response model, it would have returned earlier
    // so we can safely cast here to T, which defaults to ChatCompletion
    return response as T;
  }
}
{ ...c, image_url: { url: "[IMAGE_REDACTED]" } } : c, ) : msg.content, })), }), type: "object", }, modelName: { value: this.model.modelId, type: "string", }, }, }); const formattedMessages: ModelMessage[] = options.messages.map( (message) => { if (Array.isArray(message.content)) { if (message.role === "system") { const systemMessage: CoreSystemMessage = { role: "system", content: message.content .map((c) => ("text" in c ? c.text : "")) .join("\n"), }; return systemMessage; } const contentParts = message.content.map((content) => { if ("image_url" in content) { const imageContent: ImagePart = { type: "image", image: content.image_url.url, }; return imageContent; } else { const textContent: TextPart = { type: "text", text: content.text, }; return textContent; } }); if (message.role === "user") { const userMessage: CoreUserMessage = { role: "user", content: contentParts, }; return userMessage; } else { const textOnlyParts = contentParts.map((part) => ({ type: "text" as const, text: part.type === "image" ? "[Image]" : part.text, })); const assistantMessage: CoreAssistantMessage = { role: "assistant", content: textOnlyParts, }; return assistantMessage; } } return { role: message.role, content: message.content, }; }, ); let objectResponse: Awaited>; const isGPT5 = this.model.modelId.includes("gpt-5"); const isCodex = this.model.modelId.includes("codex"); const usesLowReasoningEffort = (this.model.modelId.includes("gpt-5.1") || this.model.modelId.includes("gpt-5.2")) && !isCodex; // Kimi models only support temperature=1 const isKimi = this.model.modelId.includes("kimi"); const temperature = isKimi ? 1 : options.temperature; // Models that lack native structured-output support need a prompt-based // JSON fallback instead of response_format: { type: "json_schema" }. 
const PROMPT_JSON_FALLBACK_PATTERNS = ["deepseek", "kimi", "glm"]; const needsPromptJsonFallback = PROMPT_JSON_FALLBACK_PATTERNS.some((p) => this.model.modelId.includes(p), ); if (options.response_model) { // Log LLM request for generateObject (extract) const llmRequestId = uuidv7(); const promptSummary = extractLlmPromptSummary(options.messages, { hasSchema: true, }); FlowLogger.logLlmRequest({ requestId: llmRequestId, model: this.model.modelId, prompt: promptSummary, }); // For models that don't support native structured outputs, add a prompt instruction if (needsPromptJsonFallback) { const parsedSchema = JSON.stringify( toJsonSchema(options.response_model.schema), ); formattedMessages.push({ role: "user", content: `Respond in this zod schema format:\n${parsedSchema}\n You must respond in JSON format. respond WITH JSON. Do not include any other text, formatting or markdown in your output. Do not include \`\`\` or \`\`\`json in your response. Only the JSON object itself.`, }); } try { objectResponse = await generateObject({ model: this.model, messages: formattedMessages, schema: options.response_model.schema, temperature, providerOptions: isGPT5 ? { openai: { textVerbosity: isCodex ? "medium" : "low", // codex models only support 'medium' reasoningEffort: isCodex ? "medium" : usesLowReasoningEffort ? "low" : "minimal", }, } : undefined, }); } catch (err) { // Log error response to maintain request/response pairing FlowLogger.logLlmResponse({ requestId: llmRequestId, model: this.model.modelId, output: `[error: ${err instanceof Error ? err.message : "unknown"}]`, }); if (NoObjectGeneratedError.isInstance(err)) { this.logger?.({ category: "AISDK error", message: err.message, level: 0, auxiliary: { cause: { value: JSON.stringify(err.cause ?? {}), type: "object", }, text: { value: err.text ?? "", type: "string", }, response: { value: JSON.stringify(err.response ?? {}), type: "object", }, usage: { value: JSON.stringify(err.usage ?? 
{}), type: "object", }, finishReason: { value: err.finishReason ?? "unknown", type: "string", }, requestId: { value: options.requestId, type: "string", }, }, }); throw err; } throw err; } const result = { data: objectResponse.object, usage: { prompt_tokens: objectResponse.usage.inputTokens ?? 0, completion_tokens: objectResponse.usage.outputTokens ?? 0, reasoning_tokens: objectResponse.usage.reasoningTokens ?? 0, cached_input_tokens: objectResponse.usage.cachedInputTokens ?? 0, total_tokens: objectResponse.usage.totalTokens ?? 0, }, } as T; // Log LLM response for generateObject FlowLogger.logLlmResponse({ requestId: llmRequestId, model: this.model.modelId, output: JSON.stringify(objectResponse.object), inputTokens: objectResponse.usage.inputTokens, outputTokens: objectResponse.usage.outputTokens, }); this.logger?.({ category: "aisdk", message: "response", level: 1, auxiliary: { response: { value: JSON.stringify({ object: objectResponse.object, usage: objectResponse.usage, finishReason: objectResponse.finishReason, // Omit request and response properties that might contain images }), type: "object", }, requestId: { value: options.requestId, type: "string", }, }, }); return result; } const tools: ToolSet = {}; if (options.tools && options.tools.length > 0) { for (const tool of options.tools) { tools[tool.name] = { description: tool.description, inputSchema: tool.parameters, } as Tool; } } // Log LLM request for generateText (act/observe) const llmRequestId = uuidv7(); const toolCount = Object.keys(tools).length; const promptSummary = extractLlmPromptSummary(options.messages, { toolCount, }); FlowLogger.logLlmRequest({ requestId: llmRequestId, model: this.model.modelId, prompt: promptSummary, }); let textResponse: Awaited>; try { textResponse = await generateText({ model: this.model, messages: formattedMessages, tools: Object.keys(tools).length > 0 ? tools : undefined, toolChoice: Object.keys(tools).length > 0 ? options.tool_choice === "required" ? 
"required" : options.tool_choice === "none" ? "none" : "auto" : undefined, temperature, }); } catch (err) { // Log error response to maintain request/response pairing FlowLogger.logLlmResponse({ requestId: llmRequestId, model: this.model.modelId, output: `[error: ${err instanceof Error ? err.message : "unknown"}]`, }); throw err; } // Transform AI SDK response to match LLMResponse format expected by operator handler const transformedToolCalls = (textResponse.toolCalls || []).map( (toolCall) => ({ id: toolCall.toolCallId || `call_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`, type: "function", function: { name: toolCall.toolName, arguments: JSON.stringify(toolCall.input), }, }), ); const result = { id: `chatcmpl_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`, object: "chat.completion", created: Math.floor(Date.now() / 1000), model: this.model.modelId, choices: [ { index: 0, message: { role: "assistant", content: textResponse.text || null, tool_calls: transformedToolCalls, }, finish_reason: textResponse.finishReason || "stop", }, ], usage: { prompt_tokens: textResponse.usage.inputTokens ?? 0, completion_tokens: textResponse.usage.outputTokens ?? 0, reasoning_tokens: textResponse.usage.reasoningTokens ?? 0, cached_input_tokens: textResponse.usage.cachedInputTokens ?? 0, total_tokens: textResponse.usage.totalTokens ?? 0, }, } as T; // Log LLM response for generateText FlowLogger.logLlmResponse({ requestId: llmRequestId, model: this.model.modelId, output: textResponse.text || (transformedToolCalls.length > 0 ? 
`[${transformedToolCalls.length} tool calls]` : ""), inputTokens: textResponse.usage.inputTokens, outputTokens: textResponse.usage.outputTokens, }); this.logger?.({ category: "aisdk", message: "response", level: 2, auxiliary: { response: { value: JSON.stringify({ text: textResponse.text, usage: textResponse.usage, finishReason: textResponse.finishReason, // Omit request and response properties that might contain images }), type: "object", }, requestId: { value: options.requestId, type: "string", }, }, }); return result; } } ================================================ FILE: packages/core/lib/v3/logger.ts ================================================ import type { LogLine } from "./types/public/logs.js"; import { AsyncLocalStorage } from "node:async_hooks"; /** * Stagehand V3 Logging * * Design goals: * - Support concurrent V3 instances with independent logger configuration * - Each V3 instance has its own StagehandLogger (handles usePino, verbose, externalLogger) * - Provide AsyncLocalStorage-based routing for backward compatibility with handler code * - Prevent cross-talk between concurrent instances * * How it works: * - Each V3 instance creates a StagehandLogger in its constructor (per-instance config) * - bindInstanceLogger()/unbindInstanceLogger(): registers logger callback per instance ID * - withInstanceLogContext(): establishes AsyncLocalStorage context for an async operation * - v3Logger(): routes logs using AsyncLocalStorage with console fallback * * ⚠️ CONTEXT LOSS SCENARIOS: * 1. setTimeout/setInterval callbacks lose context (runs outside AsyncLocalStorage scope) * 2. Event emitters (EventEmitter.on) lose context (callback invoked outside scope) * 3. Fire-and-forget promises (void promise) lose context if they don't complete synchronously * 4. 
// Per-instance routing using AsyncLocalStorage.
// The store value is the instance id of the V3 instance that owns the
// currently running async operation.
const logContext = new AsyncLocalStorage<string>();
// Instance id -> logger callback registered for that instance.
const instanceLoggers = new Map<string, (line: LogLine) => void>();

/**
 * Registers (or replaces) the logger callback for an instance id.
 *
 * @param instanceId - Key later looked up from the async log context.
 * @param logger - Callback that receives log lines for that instance.
 */
export function bindInstanceLogger(
  instanceId: string,
  logger: (line: LogLine) => void,
): void {
  instanceLoggers.set(instanceId, logger);
}

/**
 * Removes the logger callback registered for an instance id, if any.
 *
 * @param instanceId - Key previously passed to `bindInstanceLogger`.
 */
export function unbindInstanceLogger(instanceId: string): void {
  instanceLoggers.delete(instanceId);
}

/**
 * Runs `fn` with `instanceId` as the AsyncLocalStorage store and returns
 * whatever `fn` returns.
 *
 * @param instanceId - Instance id visible via the log context inside `fn`.
 * @param fn - Function to run inside the context.
 * @returns The return value of `fn`.
 */
export function withInstanceLogContext<T>(instanceId: string, fn: () => T): T {
  return logContext.run(instanceId, fn);
}
/**
 * Builds a human-readable label for a server config, used in error messages.
 * Plain objects would otherwise stringify to "[object Object]".
 */
const describeServerConfig = (
  serverConfig: string | URL | StdioServerConfig | ConnectToMCPServerOptions,
): string => {
  if (typeof serverConfig === "string") return serverConfig;
  if (serverConfig instanceof URL) return serverConfig.toString();
  if ("command" in serverConfig) {
    return [serverConfig.command, ...(serverConfig.args ?? [])].join(" ");
  }
  // String() rather than .toString() so a missing serverUrl cannot throw here.
  return String(serverConfig.serverUrl);
};

/**
 * Connects an MCP client to a server and verifies the connection with a ping.
 *
 * @param serverConfig - One of: a URL (string or `URL`) for a streamable HTTP
 *   transport; a stdio config (`command`, optional `args`/`env`); or an options
 *   object carrying `serverUrl` plus optional client/request options.
 * @returns The connected client.
 * @throws MCPConnectionError when transport creation, connecting, or the
 *   verification ping fails. The error carries a readable description of the
 *   server config and the underlying error.
 */
export const connectToMCPServer = async (
  serverConfig: string | URL | StdioServerConfig | ConnectToMCPServerOptions,
): Promise<Client> => {
  const serverLabel = describeServerConfig(serverConfig);
  try {
    let transport;
    let clientOptions: ClientOptions | undefined;
    let requestOptions: StreamableHTTPClientTransportOptions | undefined;

    // Check if it's a stdio config (has 'command' property)
    if (typeof serverConfig === "object" && "command" in serverConfig) {
      transport = new StdioClientTransport(serverConfig);
    } else {
      // Handle URL-based connection
      let serverUrl: string | URL;
      if (typeof serverConfig === "string" || serverConfig instanceof URL) {
        serverUrl = serverConfig;
      } else {
        serverUrl = serverConfig.serverUrl;
        clientOptions = serverConfig.clientOptions;
        requestOptions = serverConfig.requestOptions;
      }
      transport = new StreamableHTTPClientTransport(
        new URL(serverUrl),
        requestOptions,
      );
    }

    const client = new Client({
      name: "Stagehand",
      version: "1.0.0",
      ...clientOptions,
    });

    await client.connect(transport);

    try {
      await client.ping();
    } catch (pingError) {
      // Best-effort close: a failure here must not mask the ping error.
      await client.close().catch(() => undefined);
      throw new MCPConnectionError(serverLabel, pingError);
    }

    return client;
  } catch (error) {
    // Handle any errors during transport/client creation or connection
    if (error instanceof MCPConnectionError) {
      throw error; // Re-throw our custom error
    }
    throw new MCPConnectionError(serverLabel, error);
  }
};
"node:module"; const PACKAGE_SEGMENT = "/packages/core/"; const EVAL_FRAMES = new Set(["[eval]", "[eval]-wrapper"]); const INTERNAL_FRAME_NAMES = new Set([ "readCallsites", "readCallsitePath", "resolveCallerFilePath", "getCurrentFilePath", "getCurrentDirPath", "getRepoRootDir", "getPackageRootDir", "createRequireFromCaller", "isMainModule", ]); const normalizePath = (value: string): string => { const input = value.startsWith("file://") ? fileURLToPath(value) : value; return path.resolve(input).replaceAll("\\", "/"); }; const readCallsites = (): NodeJS.CallSite[] => { const previousPrepare = Error.prepareStackTrace; try { Error.prepareStackTrace = (_, stack) => stack; return ( (new Error().stack as unknown as NodeJS.CallSite[] | undefined) ?? [] ); } finally { Error.prepareStackTrace = previousPrepare; } }; type CallSiteWithScriptName = NodeJS.CallSite & { getScriptNameOrSourceURL?: () => string | null; }; const readCallsitePath = (callsite: NodeJS.CallSite): string | null => { const callsiteWithScript = callsite as CallSiteWithScriptName; const rawPath = callsite.getFileName() ?? 
/**
 * Tells whether a stack frame belongs to one of this module's own helpers
 * (the names in INTERNAL_FRAME_NAMES). It checks the function name, then the
 * method name, then the rendered frame text.
 */
const isInternalCallsite = (callsite: NodeJS.CallSite): boolean => {
  const isListed = (name: string | null | undefined): boolean =>
    name ? INTERNAL_FRAME_NAMES.has(name) : false;

  if (isListed(callsite.getFunctionName())) return true;
  if (isListed(callsite.getMethodName())) return true;

  const rendered = callsite.toString();
  return [...INTERNAL_FRAME_NAMES].some(
    (frameName) =>
      rendered.includes(`${frameName} (`) ||
      rendered.includes(`.${frameName} (`),
  );
};

/**
 * Picks the file path of the calling code from the current stack.
 *
 * Frames without a usable path and frames from this module's helpers are
 * ignored. The first remaining frame whose path contains PACKAGE_SEGMENT wins;
 * failing that, the first remaining frame of any kind.
 *
 * @throws Error when no frame qualifies.
 */
const resolveCallerFilePath = (): string => {
  let firstInPackage: string | undefined;
  let firstElsewhere: string | undefined;

  for (const callsite of readCallsites()) {
    const filePath = readCallsitePath(callsite);
    if (!filePath) continue;
    if (isInternalCallsite(callsite)) continue;

    if (filePath.includes(PACKAGE_SEGMENT)) {
      if (firstInPackage === undefined) firstInPackage = filePath;
    } else if (firstElsewhere === undefined) {
      firstElsewhere = filePath;
    }
  }

  const chosen = firstInPackage || firstElsewhere;
  if (chosen) return chosen;
  throw new Error("Unable to resolve caller file path.");
};
/**
 * Shared cleanup logic for locally launched Chrome.
 *
 * Used by both `V3.close()` (normal shutdown) and the supervisor process
 * (crash cleanup). The caller provides a `killChrome` callback since the
 * kill mechanism differs: chrome-launcher's `chrome.kill()` in-process
 * vs raw `process.kill(pid)` from the supervisor.
 *
 * Both steps are best-effort: errors from `killChrome` and from removing the
 * profile directory are swallowed, so this function does not reject.
 *
 * @param opts.killChrome - Optional callback that terminates Chrome; awaited.
 * @param opts.userDataDir - Profile directory to remove.
 * @param opts.createdTempProfile - The directory is only removed when true.
 * @param opts.preserveUserDataDir - When true the directory is kept.
 */
export async function cleanupLocalBrowser(opts: {
  killChrome?: () => Promise<void> | void;
  userDataDir?: string;
  createdTempProfile?: boolean;
  preserveUserDataDir?: boolean;
}): Promise<void> {
  const { killChrome, userDataDir, createdTempProfile, preserveUserDataDir } =
    opts;

  if (killChrome) {
    try {
      await killChrome();
    } catch {
      // best-effort
    }
  }

  // Only delete a profile that was created as a temp profile and that the
  // caller did not ask to keep.
  if (createdTempProfile && !preserveUserDataDir && userDataDir) {
    try {
      fs.rmSync(userDataDir, { recursive: true, force: true });
    } catch {
      // ignore cleanup errors
    }
  }
}
// Best-effort two-phase kill: SIGTERM first, then SIGKILL after timeout.
// Treat only ESRCH as "already gone"; other errors should not imply dead.
/**
 * Terminates a process politely.
 *
 * Sends SIGTERM, polls every SIGKILL_POLL_MS for up to SIGKILL_TIMEOUT_MS, and
 * sends SIGKILL if the process is still considered alive at the deadline.
 * Returns immediately if the pid is already gone. Does not reject.
 *
 * @param pid - Process id to terminate.
 */
const politeKill = async (pid: number): Promise<void> => {
  // ESRCH = "No such process" (PID is already gone).
  const isNoSuchProcess = (error: unknown): boolean =>
    (error as NodeJS.ErrnoException).code === "ESRCH";

  // Signal 0 sends nothing; it only reports whether the pid can be signalled.
  const isAlive = (): boolean => {
    try {
      process.kill(pid, 0);
      return true;
    } catch (error) {
      return !isNoSuchProcess(error);
    }
  };

  if (!isAlive()) return;

  try {
    process.kill(pid, "SIGTERM");
  } catch (error) {
    // ESRCH = process already exited; no further action needed.
    if (isNoSuchProcess(error)) return;
  }

  const deadline = Date.now() + SIGKILL_TIMEOUT_MS;
  while (Date.now() < deadline) {
    await new Promise((resolve) => setTimeout(resolve, SIGKILL_POLL_MS));
    if (!isAlive()) return;
  }

  try {
    process.kill(pid, "SIGKILL");
  } catch {
    // best-effort
  }
};
/**
 * LOCAL cleanup: kill Chrome (unless polling already saw it exit) and let
 * `cleanupLocalBrowser` decide whether the profile directory is removed.
 *
 * @param cfg - Supervisor config of kind "LOCAL".
 * @param reason - Free-text reason, included in the stderr log line.
 */
const cleanupLocal = async (
  cfg: Extract<ShutdownSupervisorConfig, { kind: "LOCAL" }>,
  reason: string,
): Promise<void> => {
  // Mirrors the deletion condition in cleanupLocalBrowser; used for logging only.
  const deletingUserDataDir = Boolean(
    cfg.createdTempProfile && !cfg.preserveUserDataDir && cfg.userDataDir,
  );
  await cleanupLocalBrowser({
    // If polling already observed ESRCH, avoid a follow-up PID kill.
    // The PID could be reused by a different process before cleanup runs.
    killChrome:
      cfg.pid && !localPidKnownGone
        ? () => {
            console.error(
              `[shutdown-supervisor] Shutting down Chrome pid=${cfg.pid} ` +
                `(reason=${reason}, deletingUserDataDir=${deletingUserDataDir})`,
            );
            return politeKill(cfg.pid);
          }
        : undefined,
    userDataDir: cfg.userDataDir,
    createdTempProfile: cfg.createdTempProfile,
    preserveUserDataDir: cfg.preserveUserDataDir,
  });
};

/**
 * STAGEHAND_API cleanup: asks Browserbase to release the session.
 * Does nothing without both an API key and a session id; request errors are
 * swallowed (best-effort).
 *
 * @param cfg - Supervisor config of kind "STAGEHAND_API".
 * @param reason - Free-text reason, included in the stderr log line.
 */
const cleanupBrowserbase = async (
  cfg: Extract<ShutdownSupervisorConfig, { kind: "STAGEHAND_API" }>,
  reason: string,
): Promise<void> => {
  if (!cfg.apiKey || !cfg.sessionId) return;
  try {
    console.error(
      `[shutdown-supervisor] Ending Browserbase session ${cfg.sessionId} ` +
        `(reason=${reason})`,
    );
    const bb = new Browserbase({ apiKey: cfg.apiKey });
    await bb.sessions.update(cfg.sessionId, {
      status: "REQUEST_RELEASE",
      // projectId is only sent when configured.
      ...(cfg.projectId ? { projectId: cfg.projectId } : {}),
    } as Browserbase.Sessions.SessionUpdateParams);
  } catch {
    // best-effort cleanup
  }
};
/**
 * Extracts the supervisor config from command-line arguments.
 *
 * Requires both the bare `--supervisor` flag and a non-empty
 * `--supervisor-config=<json>` argument. The first argument with that prefix
 * is used.
 *
 * @param argv - Arguments to inspect; defaults to `process.argv.slice(2)`.
 * @returns The parsed JSON payload, or `null` when a flag is missing, the
 *   payload is empty, or the payload is not valid JSON.
 */
const parseConfigFromArgv = (
  argv: readonly string[] = process.argv.slice(2),
): ShutdownSupervisorConfig | null => {
  if (!argv.includes("--supervisor")) return null;

  const configFlag = "--supervisor-config=";
  const configArg = argv.find((candidate) => candidate.startsWith(configFlag));
  if (configArg === undefined) return null;

  const payload = configArg.slice(configFlag.length);
  if (payload === "") return null;

  let parsed: ShutdownSupervisorConfig | null;
  try {
    parsed = JSON.parse(payload) as ShutdownSupervisorConfig;
  } catch {
    parsed = null;
  }
  return parsed;
};
/**
 * Works out how to launch the supervisor process.
 *
 * - SEA runtime: re-exec the current binary with the supervisor arguments.
 * - Otherwise: run the CLI entrypoint with the current Node executable,
 *   adding `--import tsx` when only `supervisor.ts` (and no `supervisor.js`)
 *   sits next to this module.
 *
 * @returns The command and arguments, or `null` when the CLI entrypoint does
 *   not exist on disk.
 */
const resolveSupervisorCommand = (
  config: ShutdownSupervisorConfig,
): {
  command: string;
  args: string[];
} | null => {
  const supervisorArgs = ["--supervisor", serializeConfigArg(config)];
  const command = process.execPath;

  if (isSeaRuntime()) {
    return { command, args: supervisorArgs };
  }

  const cliPath = resolveCliPath();
  if (!fs.existsSync(cliPath)) {
    return null;
  }

  // Running from TypeScript sources: only the .ts supervisor is present.
  const hasTsSource = fs.existsSync(`${moduleDir}/supervisor.ts`);
  const runningFromSource =
    hasTsSource && !fs.existsSync(`${moduleDir}/supervisor.js`);

  const args = runningFromSource
    ? ["--import", "tsx", cliPath, ...supervisorArgs]
    : [cliPath, ...supervisorArgs];
  return { command, args };
};

// Single JSON arg keeps supervisor bootstrap parsing tiny and versionable.
/** Serializes the config, plus this process's pid as `parentPid`, into one CLI argument. */
const serializeConfigArg = (config: ShutdownSupervisorConfig): string => {
  const payload = JSON.stringify({
    ...config,
    parentPid: process.pid,
  });
  return `--supervisor-config=${payload}`;
};
Returns a handle that can * stop the supervisor during a normal shutdown. */ export function startShutdownSupervisor( config: ShutdownSupervisorConfig, opts?: { onError?: (error: Error, context: string) => void }, ): ShutdownSupervisorHandle | null { const resolved = resolveSupervisorCommand(config); if (!resolved) { opts?.onError?.( new ShutdownSupervisorResolveError( "Shutdown supervisor entry missing (expected Stagehand CLI entrypoint).", ), "resolve", ); return null; } const child = spawn(resolved.command, resolved.args, { // stdin is the parent lifeline. // Preserve supervisor stderr so crash-cleanup debug lines are visible. stdio: ["pipe", "ignore", "inherit"], detached: true, }); child.on("error", (error) => { opts?.onError?.( new ShutdownSupervisorSpawnError( `Shutdown supervisor failed to start: ${error.message}`, ), "spawn", ); }); try { child.unref(); const stdin = child.stdin as unknown as { unref?: () => void } | null; stdin?.unref?.(); } catch { // best-effort: avoid keeping the event loop alive } const stop = () => { // Normal close path: terminate supervisor directly. 
/**
 * Reads a positive timeout, in milliseconds, from an environment variable.
 *
 * Surrounding whitespace and a trailing "ms" suffix (any case) are ignored.
 *
 * @param name - Environment variable name.
 * @returns The parsed value, or `undefined` when the variable is unset, empty,
 *   not a finite number, or not greater than zero.
 */
export function getEnvTimeoutMs(name: string): number | undefined {
  const raw = process.env[name];
  if (!raw) return undefined;
  const normalized = raw.trim().replace(/ms$/i, "");
  const value = Number(normalized);
  if (!Number.isFinite(value) || value <= 0) return undefined;
  return value;
}

/**
 * Awaits `promise`, rejecting with a `TimeoutError` if it has not settled
 * within `timeoutMs`.
 *
 * @param promise - The operation to wait for. It is not cancelled on timeout.
 * @param timeoutMs - Timeout in milliseconds. `null`, `undefined`, non-finite
 *   and non-positive values disable the timeout.
 * @param operation - Operation name passed to `TimeoutError`.
 * @returns The resolved value of `promise`.
 * @throws TimeoutError on timeout; otherwise whatever `promise` rejects with.
 */
export async function withTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number | null | undefined,
  operation: string,
): Promise<T> {
  if (
    typeof timeoutMs !== "number" ||
    !Number.isFinite(timeoutMs) ||
    timeoutMs <= 0
  ) {
    return await promise;
  }

  let timeoutId: NodeJS.Timeout | undefined;
  // This promise only ever rejects, hence Promise<never>.
  const timeoutPromise = new Promise<never>((_, reject) => {
    timeoutId = setTimeout(() => {
      reject(new TimeoutError(operation, timeoutMs));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, timeoutPromise]);
  } finally {
    // Always clear the timer so it cannot keep the event loop alive.
    if (timeoutId) clearTimeout(timeoutId);
  }
}
/**
 * Describes one agent execution as seen by the agent cache: the instruction,
 * the start URL, the sanitized execute options, a config signature, and the
 * cache key for the entry.
 */
export type AgentCacheContext = {
  instruction: string;
  startUrl: string;
  options: SanitizedAgentExecuteOptions;
  configSignature: string;
  cacheKey: string;
  /** Variable keys used in this execution (for cache key) */
  variableKeys: string[];
  /** Variable values to substitute during replay */
  // NOTE(review): `Record` appears here without type arguments; confirm the
  // key/value types against the upstream file.
  variables?: Record;
};
/**
 * Replay step tagged `type: "keys"`; one member of the `AgentReplayStep` union.
 */
export interface AgentReplayKeysStep {
  type: "keys";
  instruction?: string;
  // NOTE(review): presumably `text` accompanies method "type" and `keys`
  // accompanies method "press", with `times` as a repeat count; confirm
  // against the replay code.
  playwrightArguments: {
    method: "type" | "press";
    text?: string;
    keys?: string;
    times?: number;
  };
}
/**
 * Parameters accepted by the extract handler.
 *
 * @typeParam T - Schema type describing the data to extract; carried by the
 *   optional `schema` field.
 */
export interface ExtractHandlerParams<T extends StagehandZodSchema> {
  instruction?: string;
  schema?: T;
  model?: ModelConfiguration;
  timeout?: number;
  selector?: string;
  page: Page;
}
from "./internal.js";
export * from "./evaluator.js";
export * from "./cache.js";
export * from "./agent.js";
export * from "./snapshot.js";

================================================
FILE: packages/core/lib/v3/types/private/internal.ts
================================================
import Browserbase from "@browserbasehq/sdk";
import { LaunchedChrome } from "chrome-launcher";

/**
 * Initialization state, discriminated on `kind`:
 * - `"UNINITIALIZED"` carries no data.
 * - `"LOCAL"` carries the launched Chrome handle, a `ws` string and optional
 *   user-data-dir bookkeeping flags.
 * - `"BROWSERBASE"` carries the Browserbase SDK client, a session id and a `ws` string.
 *
 * NOTE(review): `ws` is presumably the CDP WebSocket URL — confirm.
 */
export type InitState =
  | { kind: "UNINITIALIZED" }
  | {
      kind: "LOCAL";
      chrome: LaunchedChrome;
      ws: string;
      userDataDir?: string;
      createdTempProfile?: boolean;
      preserveUserDataDir?: boolean;
    }
  | { kind: "BROWSERBASE"; bb: Browserbase; sessionId: string; ws: string };

/** String made of two numbers joined by a hyphen, e.g. `"0-42"`. */
export type EncodedId = `${number}-${number}`;

/**
 * Represents a path through a Zod schema from the root object down to a
 * particular field. The `segments` array describes the chain of keys/indices.
 *
 * - **String** segments indicate object property names.
 * - **Number** segments indicate array indices.
 *
 * For example, `["users", 0, "homepage"]` might describe reaching
 * the `homepage` field in `schema.users[0].homepage`.
 */
export interface ZodPathSegments {
  /**
   * The ordered list of keys/indices leading from the schema root
   * to the targeted field.
   */
  segments: Array<string | number>;
}

/**
 * Source of an init script: a plain string, an object with optional
 * `path` and/or `content`, or a function that receives an `Arg`.
 *
 * @typeParam Arg - Type of the single argument passed to the function form.
 */
export type InitScriptSource<Arg> =
  | string
  | { path?: string; content?: string }
  | ((arg: Arg) => unknown);

================================================
FILE: packages/core/lib/v3/types/private/locator.ts
================================================
import { Buffer } from "buffer";

/** File payload: name, MIME type, contents as a `Buffer`, and a last-modified number. */
export interface NormalizedFilePayload {
  name: string;
  mimeType: string;
  buffer: Buffer;
  lastModified: number;
  /** Absolute path to the source file when provided by the caller. */
  absolutePath?: string;
}

================================================
FILE: packages/core/lib/v3/types/private/network.ts
================================================
import { Protocol } from "devtools-protocol";

/** Metadata tracked for each network request currently in-flight.
*/
export type NetworkRequestInfo = {
  sessionId: string;
  requestId: string;
  requestKey: string;
  frameId?: string;
  loaderId?: string;
  url?: string;
  timestamp: number;
  resourceType?: Protocol.Network.ResourceType;
  documentRequest: boolean;
};

/** Callback hooks consumers can implement to observe network transitions. */
export interface NetworkObserver {
  onRequestStarted(info: NetworkRequestInfo): void;
  onRequestFinished(info: NetworkRequestInfo): void;
  onRequestFailed(info: NetworkRequestInfo): void;
}

/** Options for the idle waiter helper. */
export type WaitForIdleOptions = {
  startTime?: number;
  timeoutMs: number;
  idleTimeMs?: number;
  /** Predicate over request metadata. NOTE(review): presumably selects which requests count toward idleness — confirm. */
  filter?: (info: NetworkRequestInfo) => boolean;
  totalBudgetMs?: number;
};

/** NOTE(review): presumably the default idle window in milliseconds (cf. `idleTimeMs`) — confirm. */
export const DEFAULT_IDLE_WAIT = 500;

/**
 * Set containing the "EventSource" and "WebSocket" resource types.
 * NOTE(review): presumably these are skipped by idle tracking — confirm at the use site.
 */
export const IGNORED_RESOURCE_TYPES = new Set<
  Protocol.Network.ResourceType | undefined
>(["EventSource", "WebSocket"]);

/** The handle returned by the network manager idle helper. */
export type WaitForIdleHandle = {
  // NOTE(review): the type argument was missing (bare `Promise` does not
  // type-check); `void` is assumed — confirm against the idle helper.
  promise: Promise<void>;
  dispose: () => void;
};

================================================
FILE: packages/core/lib/v3/types/private/shutdown.ts
================================================
/**
 * Internal-only types for the shutdown supervisor process.
 */
export type ShutdownSupervisorConfig =
  | {
      kind: "LOCAL";
      pid: number;
      userDataDir?: string;
      createdTempProfile?: boolean;
      preserveUserDataDir?: boolean;
    }
  | {
      kind: "STAGEHAND_API";
      sessionId: string;
      apiKey: string;
      projectId?: string;
    };

/** Handle exposing a single `stop` callback for the supervisor. */
export interface ShutdownSupervisorHandle {
  /** Best-effort signal to stop the supervisor process. */
  stop: () => void;
}

================================================
FILE: packages/core/lib/v3/types/private/shutdownErrors.ts
================================================
/**
 * Internal-only errors for the shutdown supervisor.
*/
export class ShutdownSupervisorError extends Error {
  constructor(message: string) {
    super(message);
    // Override the default "Error" name with this class's name.
    this.name = "ShutdownSupervisorError";
  }
}

/**
 * Subclass of ShutdownSupervisorError whose `name` is "ShutdownSupervisorResolveError".
 * NOTE(review): presumably thrown when the supervisor entry point cannot be resolved — confirm.
 */
export class ShutdownSupervisorResolveError extends ShutdownSupervisorError {
  constructor(message: string) {
    super(message);
    // The base constructor set the base name; replace it with this subclass's name.
    this.name = "ShutdownSupervisorResolveError";
  }
}

/**
 * Subclass of ShutdownSupervisorError whose `name` is "ShutdownSupervisorSpawnError".
 * NOTE(review): presumably thrown when the supervisor process cannot be spawned — confirm.
 */
export class ShutdownSupervisorSpawnError extends ShutdownSupervisorError {
  constructor(message: string) {
    super(message);
    // The base constructor set the base name; replace it with this subclass's name.
    this.name = "ShutdownSupervisorSpawnError";
  }
}

================================================
FILE: packages/core/lib/v3/types/private/snapshot.ts
================================================
/**
 * Options that control how hybrid snapshots and targeted scopes are captured.
 */
export type SnapshotOptions = {
  /**
   * Filter the snapshot to a specific element/subtree using a selector that can cross iframes.
   * Supports XPath (prefixed with `xpath=` or starting with `/`) and CSS with iframe hops via `>>`.
   */
  focusSelector?: string;
  /**
   * Pierce shadow DOM when calling DOM.getDocument. Defaults to true to retain the
   * existing behaviour.
   */
  pierceShadow?: boolean;
  /**
   * Toggle whether iframe subtrees are included in the merged snapshot. Defaults to true.
   */
  includeIframes?: boolean;
  /**
   * Optional feature flag that surfaces experimental traversal tweaks in the Accessibility layer.
   */
  experimental?: boolean;
};

/**
 * Hybrid snapshot payload consumed by act/extract/observe handlers.
 */
export type HybridSnapshot = {
  /** Merged outline across every frame. */
  combinedTree: string;
  /** EncodedId (frameOrdinal-backendNodeId) -> absolute XPath. */
  combinedXpathMap: Record<string, string>;
  /** EncodedId -> URL extracted from AX properties. */
  combinedUrlMap: Record<string, string>;
  /** Per-frame payloads expose the original relative data for debugging. */
  perFrame?: PerFrameSnapshot[];
};

/**
 * Snapshot data for a single frame: its id, its outline text and its two lookup maps.
 * NOTE(review): the maps are typed string -> string to mirror the combined maps
 * in HybridSnapshot — confirm against the code that builds them.
 */
export type PerFrameSnapshot = {
  frameId: string;
  outline: string;
  xpathMap: Record<string, string>;
  urlMap: Record<string, string>;
};

/**
 * Compact encoding of DOM data for an entire session.
Shared between capture * and focus helpers so DOM traversal can be unit tested in isolation. */ export type SessionDomIndex = { rootBackend: number; absByBe: Map; tagByBe: Map; scrollByBe: Map; docRootOf: Map; contentDocRootByIframe: Map; }; export type FrameDomMaps = { tagNameMap: Record; xpathMap: Record; scrollableMap: Record; urlMap: Record; }; export type ResolvedLocation = { frameId: string; backendNodeId: number; absoluteXPath: string; }; export type ResolvedFocusFrame = { targetFrameId: string; tailXPath: string; absPrefix: string; }; export type ResolvedCssFocus = { targetFrameId: string; tailSelector: string; absPrefix: string; }; export type Axis = "child" | "desc"; export type Step = { axis: Axis; raw: string; name: string; }; export type A11yNode = { role: string; name?: string; description?: string; value?: string | number | boolean; nodeId: string; backendDOMNodeId?: number; parentId?: string; childIds?: string[]; children?: A11yNode[]; encodedId?: string; }; export type A11yOptions = { focusSelector?: string; experimental: boolean; tagNameMap: Record; scrollableMap: Record; encode: (backendNodeId: number) => string; }; export type AccessibilityTreeResult = { outline: string; urlMap: Record; scopeApplied: boolean; }; export type FrameParentIndex = Map; /** * Shared frame metadata that every snapshot step needs. * - `rootId`: stable identifier for the main frame so we can detect root prefixes. * - `parentByFrame`: lookup table for iframe parentage (used by focus scoping and prefixing). * - `frames`: DFS-ordered frame ids so merging walks parents before children. 
*/ export type FrameContext = { rootId: string; parentByFrame: FrameParentIndex; frames: string[]; }; ================================================ FILE: packages/core/lib/v3/types/public/agent.ts ================================================ import type { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { ToolSet, ModelMessage, wrapLanguageModel, StreamTextResult, StepResult, PrepareStepFunction, GenerateTextOnStepFinishCallback, StreamTextOnStepFinishCallback, StreamTextOnErrorCallback, StreamTextOnChunkCallback, StreamTextOnFinishCallback, } from "ai"; import { LogLine } from "./logs.js"; import { ClientOptions } from "./model.js"; import { StagehandZodObject } from "../../zodCompat.js"; // Re-export ModelMessage for consumers who want to use it for conversation continuation export type { ModelMessage } from "ai"; // Re-export Tool type for consumers who want to define custom tools export type { Tool } from "ai"; import { Page as PlaywrightPage } from "playwright-core"; import { Page as PuppeteerPage } from "puppeteer-core"; import { Page as PatchrightPage } from "patchright-core"; import { Page } from "../../understudy/page.js"; // ============================================================================= // Variable Types // ============================================================================= /** * A variable value can be a simple primitive or a rich object with an optional description. * This unified type is shared across `act`, `agent.execute`, and other methods. 
* * @example Simple (backward-compatible): * ```typescript * variables: { username: "john@example.com" } * ``` * * @example Rich with description (useful for agents): * ```typescript * variables: { * username: { value: "john@example.com", description: "The login email" } * } * ``` */ export type VariableValue = | string | number | boolean | { value: string | number | boolean; description?: string }; /** * A collection of named variables for use in act, agent, and other methods. */ export type Variables = Record; export interface AgentContext { options: AgentExecuteOptionsBase; maxSteps: number; systemPrompt: string; allTools: ToolSet; messages: ModelMessage[]; wrappedModel: ReturnType; initialPageUrl: string; } export interface AgentState { collectedReasoning: string[]; actions: AgentAction[]; finalMessage: string; completed: boolean; currentPageUrl: string; } export interface AgentAction { type: string; reasoning?: string; taskCompleted?: boolean; action?: string; // Tool-specific fields timeMs?: number; // wait tool pageText?: string; // ariaTree tool pageUrl?: string; // ariaTree tool instruction?: string; // various tools [key: string]: unknown; } export interface AgentResult { success: boolean; message: string; actions: AgentAction[]; completed: boolean; metadata?: Record; usage?: { input_tokens: number; output_tokens: number; reasoning_tokens?: number; cached_input_tokens?: number; inference_time_ms: number; }; /** * The conversation messages from this execution. * Pass these to a subsequent execute() call via the `messages` option to continue the conversation. * @experimental */ messages?: ModelMessage[]; /** * Custom output data extracted based on the `output` schema provided in execute options. * Only populated if an `output` schema was provided. * @experimental */ output?: Record; } export type AgentStreamResult = StreamTextResult & { result: Promise; }; /** * Base callbacks shared between execute (non-streaming) and streaming modes. 
*/ export interface AgentCallbacks { /** * Optional function called before each step to modify settings. * You can change the model, tool choices, active tools, system prompt, * and input messages for each step. */ prepareStep?: PrepareStepFunction; /** * Callback called when each step (LLM call) is finished. * This is called for intermediate steps as well as the final step. */ onStepFinish?: | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback; } /** * Error message type for streaming-only callbacks used in non-streaming mode. * This provides a clear error message when users try to use streaming callbacks without stream: true. */ type StreamingCallbackNotAvailable = "This callback requires 'stream: true' in AgentConfig. Set stream: true to use streaming callbacks like onChunk, onFinish, onError, and onAbort."; /** * Error message for safety confirmation callback misuse. * Safety confirmations are only available for non-streaming CUA agent executions. */ type SafetyConfirmationCallbackNotAvailable = "Safety confirmation callbacks are only available via non-streaming AgentExecuteOptions.callbacks when using mode: 'cua'."; /** * Callbacks specific to the non-streaming execute method. */ export interface AgentExecuteCallbacks extends AgentCallbacks { /** * Callback called when each step (LLM call) is finished. */ onStepFinish?: GenerateTextOnStepFinishCallback; /** * Callback for handling safety confirmation requests from CUA providers. * Only available when running an agent configured with mode: "cua". */ onSafetyConfirmation?: SafetyConfirmationHandler; /** * NOT AVAILABLE in non-streaming mode. * This callback requires `stream: true` in AgentConfig. 
* * @example * ```typescript * // Enable streaming to use onChunk: * const agent = stagehand.agent({ stream: true }); * await agent.execute({ * instruction: "...", * callbacks: { onChunk: async (chunk) => console.log(chunk) } * }); * ``` */ onChunk?: StreamingCallbackNotAvailable; /** * NOT AVAILABLE in non-streaming mode. * This callback requires `stream: true` in AgentConfig. * * @example * ```typescript * // Enable streaming to use onFinish: * const agent = stagehand.agent({ stream: true }); * await agent.execute({ * instruction: "...", * callbacks: { onFinish: (event) => console.log("Done!", event) } * }); * ``` */ onFinish?: StreamingCallbackNotAvailable; /** * NOT AVAILABLE in non-streaming mode. * This callback requires `stream: true` in AgentConfig. * * @example * ```typescript * // Enable streaming to use onError: * const agent = stagehand.agent({ stream: true }); * await agent.execute({ * instruction: "...", * callbacks: { onError: ({ error }) => console.error(error) } * }); * ``` */ onError?: StreamingCallbackNotAvailable; /** * NOT AVAILABLE in non-streaming mode. * This callback requires `stream: true` in AgentConfig. * * @example * ```typescript * // Enable streaming to use onAbort: * const agent = stagehand.agent({ stream: true }); * await agent.execute({ * instruction: "...", * callbacks: { onAbort: (event) => console.log("Aborted", event.steps) } * }); * ``` */ onAbort?: StreamingCallbackNotAvailable; } /** * Callbacks specific to the streaming mode. */ export interface AgentStreamCallbacks extends AgentCallbacks { /** * Callback called when each step (LLM call) is finished during streaming. */ onStepFinish?: StreamTextOnStepFinishCallback; /** * Callback called when an error occurs during streaming. * Use this to log errors or handle error states. */ onError?: StreamTextOnErrorCallback; /** * Callback called for each chunk of the stream. * Stream processing will pause until the callback promise resolves. 
*/ onChunk?: StreamTextOnChunkCallback; /** * Callback called when the stream finishes. */ onFinish?: StreamTextOnFinishCallback; /** * Callback called when the stream is aborted. */ onAbort?: (event: { steps: Array>; }) => PromiseLike | void; /** * NOT AVAILABLE in streaming mode. * Safety confirmations currently require non-streaming execute() on CUA agents. */ onSafetyConfirmation?: SafetyConfirmationCallbackNotAvailable; } /** * Base options for agent execution (without callbacks). */ export interface AgentExecuteOptionsBase { instruction: string; maxSteps?: number; page?: PlaywrightPage | PuppeteerPage | PatchrightPage | Page; highlightCursor?: boolean; /** * Previous conversation messages to continue from. * Pass the `messages` from a previous AgentResult to continue that conversation. * @experimental */ messages?: ModelMessage[]; /** * An AbortSignal that can be used to cancel the agent execution. * When aborted, the agent will stop and return a partial result. * @experimental * * @example * ```typescript * const controller = new AbortController(); * setTimeout(() => controller.abort(), 30000); // 30 second timeout * * const result = await agent.execute({ * instruction: "...", * signal: controller.signal * }); * ``` */ signal?: AbortSignal; /** * Tools to exclude from this execution. * Pass an array of tool names to prevent the agent from using those tools. * * **Note:** Not supported in CUA mode (`mode: "cua"`). * * **Available tools by mode:** * * **DOM mode (default):** * - `act` - Perform semantic actions (click, type, etc.) 
* - `fillForm` - Fill form fields using DOM selectors * - `ariaTree` - Get accessibility tree of the page * - `extract` - Extract structured data from page * - `goto` - Navigate to a URL * - `scroll` - Scroll using semantic directions (up/down/left/right) * - `keys` - Press keyboard keys * - `navback` - Navigate back in history * - `screenshot` - Take a screenshot * - `think` - Agent reasoning/planning step * - `wait` - Wait for time or condition * - `done` - Mark task as complete * - `search` - Web search (requires useSearch: true and BROWSERBASE_API_KEY) * * **Hybrid mode:** * - `click` - Click at specific coordinates * - `type` - Type text at coordinates * - `dragAndDrop` - Drag from one point to another * - `clickAndHold` - Click and hold at coordinates * - `fillFormVision` - Fill forms using vision/coordinates * - `act` - Perform semantic actions * - `ariaTree` - Get accessibility tree * - `extract` - Extract data from page * - `goto` - Navigate to URL * - `scroll` - Scroll using coordinates * - `keys` - Press keyboard keys * - `navback` - Navigate back * - `screenshot` - Take screenshot * - `think` - Agent reasoning step * - `wait` - Wait for time/condition * - `done` - Mark task complete * - `search` - Web search (requires useSearch: true and BROWSERBASE_API_KEY) * * @experimental * @example * ```typescript * // Exclude screenshot and extract tools * const result = await agent.execute({ * instruction: "Click the submit button", * excludeTools: ["screenshot", "extract"] * }); * ``` */ excludeTools?: string[]; /** * A Zod schema defining custom output data to return when the task completes. * The agent will populate this data in the final done tool call. 
* * @experimental * @example * ```typescript * const result = await agent.execute({ * instruction: "Find the cheapest flight from NYC to LA", * output: z.object({ * price: z.string().describe("The price of the flight"), * airline: z.string().describe("The airline name"), * departureTime: z.string().describe("Departure time"), * }), * }); * * console.log(result.output); // { price: "$199", airline: "Delta", departureTime: "8:00 AM" } * ``` */ output?: StagehandZodObject; /** * Variables that the agent can use when filling forms or typing text. * The agent will see variable names and descriptions in the system prompt, * and can use them via `%variableName%` syntax in act/type/fillForm tool calls. * * Accepts both simple values and rich objects with descriptions (same type as `act`). * * **Note:** Not supported in CUA mode (`mode: "cua"`). Requires `experimental: true`. * * @experimental * @example * ```typescript * // Simple values * variables: { username: "john@example.com", password: "secret123" } * * // Rich values with descriptions (helps the agent understand context) * variables: { * username: { value: "john@example.com", description: "The login email" }, * password: { value: "secret123", description: "The login password" }, * } * ``` */ variables?: Variables; /** * Timeout in milliseconds for each agent tool call. * If a tool call exceeds this duration, it will be aborted and * reported back to the LLM as a timeout error so it can retry or adjust. * For tools that call v3 methods (act, extract, fillForm, ariaTree), the * timeout is also forwarded to the underlying v3 call for true cancellation. * @default 45000 (45 seconds) */ toolTimeout?: number; /** * Enable the web search tool powered by Browserbase Search API. * Requires a valid Browserbase API key (BROWSERBASE_API_KEY). * When set to true, the agent gains access to a `search` tool for web searches. 
* * @example * ```typescript * const result = await agent.execute({ * instruction: "Find the latest news about AI", * useSearch: true, * }); * ``` */ useSearch?: boolean; } /** * Options for non-streaming agent execution. * Only accepts AgentExecuteCallbacks (no streaming-specific callbacks like onChunk, onFinish). */ export interface AgentExecuteOptions extends AgentExecuteOptionsBase { /** * Callbacks for non-streaming agent execution. * For streaming callbacks (onChunk, onFinish, onError, onAbort), use stream: true in AgentConfig. */ callbacks?: AgentExecuteCallbacks; } /** * Options for streaming agent execution. * Accepts AgentStreamCallbacks including onChunk, onFinish, onError, and onAbort. */ export interface AgentStreamExecuteOptions extends AgentExecuteOptionsBase { /** * Callbacks for streaming agent execution. * Includes streaming-specific callbacks: onChunk, onFinish, onError, onAbort. */ callbacks?: AgentStreamCallbacks; } export type AgentType = | "openai" | "anthropic" | "google" | "microsoft" | "bedrock"; export const AVAILABLE_CUA_MODELS = [ "openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-opus-4-5-20251101", "anthropic/claude-opus-4-6", "anthropic/claude-sonnet-4-6", "anthropic/claude-haiku-4-5-20251001", "anthropic/claude-sonnet-4-20250514", "anthropic/claude-sonnet-4-5-20250929", "google/gemini-2.5-computer-use-preview-10-2025", "google/gemini-3-flash-preview", "google/gemini-3-pro-preview", "microsoft/fara-7b", ] as const; export type AvailableCuaModel = (typeof AVAILABLE_CUA_MODELS)[number]; export interface AgentExecutionOptions< TOptions extends AgentExecuteOptions = AgentExecuteOptions, > { options: TOptions; logger: (message: LogLine) => void; retries?: number; } export interface AgentHandlerOptions { modelName: string; clientOptions?: ClientOptions; userProvidedInstructions?: string; experimental?: boolean; } export interface ActionExecutionResult { success: boolean; error?: string; data?: 
unknown;
}

/**
 * Represents a safety check that requires user confirmation before proceeding.
 * These are issued by CUA providers (OpenAI, Google) when the agent attempts
 * potentially risky actions.
 */
export interface SafetyCheck {
  /** Unique identifier for this safety check */
  id: string;
  /** Code identifying the type of safety concern */
  code: string;
  /** Human-readable description of the safety concern */
  message: string;
}

/**
 * Response from the user for a safety confirmation request.
 */
export interface SafetyConfirmationResponse {
  /** Whether the user acknowledged/approved the safety checks */
  acknowledged: boolean;
}

/**
 * Callback for handling safety confirmation requests.
 * Called when the CUA provider issues safety checks that require user confirmation.
 * The callback should return a promise that resolves when the user has made a decision.
 *
 * @param safetyChecks - Array of safety checks requiring confirmation
 * @returns Promise resolving to the user's response
 *
 * @example
 * ```typescript
 * const agent = stagehand.agent({
 *   mode: "cua",
 * });
 * await agent.execute({
 *   instruction: "...",
 *   callbacks: {
 *     onSafetyConfirmation: async (checks) => {
 *       console.log("Safety checks:", checks);
 *       const userApproved = await showConfirmationDialog(checks);
 *       return { acknowledged: userApproved };
 *     },
 *   },
 * });
 * ```
 */
export type SafetyConfirmationHandler = (
  safetyChecks: SafetyCheck[],
) => Promise<SafetyConfirmationResponse>;

// Anthropic types:

/** A `"tool_use"` item: a ResponseItem that also carries a tool `name` and its `input`. */
export interface ToolUseItem extends ResponseItem {
  type: "tool_use";
  id: string; // This is the correct property name from Anthropic's API
  name: string; // Name of the tool being used
  // NOTE(review): the type arguments were missing (bare `Record` does not
  // type-check); string keys with unknown values are assumed — confirm.
  input: Record<string, unknown>;
}

/** A message with a `role` and either plain-string or block-array `content`. */
export interface AnthropicMessage {
  role: string;
  // NOTE(review): the element type was missing (bare `Array` does not
  // type-check); AnthropicContentBlock is assumed — confirm.
  content: string | Array<AnthropicContentBlock>;
}

/** Open-ended content block: a `type` tag plus arbitrary extra keys. */
export interface AnthropicContentBlock {
  type: string;
  [key: string]: unknown;
}

/** Content block whose `type` is `"text"` and which carries the `text` string. */
export interface AnthropicTextBlock extends AnthropicContentBlock {
  type: "text";
  text: string;
}

export interface AnthropicToolResult {
  type: "tool_result";
  tool_use_id: string;
  content:
string | Array; } // OpenAI types: export interface ResponseItem { type: string; id: string; [key: string]: unknown; } export interface ComputerCallItem extends ResponseItem { type: "computer_call"; call_id: string; action: { type: string; [key: string]: unknown; }; pending_safety_checks?: Array<{ id: string; code: string; message: string; }>; } export interface FunctionCallItem extends ResponseItem { type: "function_call"; call_id: string; name: string; arguments: string; } export type ResponseInputItem = | { role: string; content: string } | { type: "computer_call_output"; call_id: string; output: | { type: "input_image"; image_url: string; current_url?: string; error?: string; [key: string]: unknown; } | string; acknowledged_safety_checks?: Array<{ id: string; code: string; message: string; }>; } | { type: "function_call_output"; call_id: string; output: string; }; export interface AgentInstance { execute: ( instructionOrOptions: string | AgentExecuteOptions, ) => Promise; } export type AgentProviderType = AgentType; export type AgentModelConfig = { modelName: TModelName; } & Record; /** * Agent tool mode determines which set of tools are available to the agent. * - 'dom': Uses DOM-based tools (act, fillForm) - better for structured page interactions * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.) - better for visual/screenshot-based interactions * - 'cua': Uses Computer Use Agent (CUA) providers like Anthropic Claude or Google Gemini for screenshot-based automation */ export type AgentToolMode = "dom" | "hybrid" | "cua"; export type AgentConfig = { /** * Custom system prompt to provide to the agent. Overrides the default system prompt. */ systemPrompt?: string; /** * MCP integrations - Array of Client objects */ integrations?: (Client | string)[]; /** * Tools passed to the agent client */ tools?: ToolSet; /** * @deprecated Use `mode: "cua"` instead. This option will be removed in a future version. 
* Enables Computer Use Agent (CUA) mode. */ cua?: boolean; /** * The model to use for agent functionality */ model?: string | AgentModelConfig; /** * The model to use for tool execution (observe/act calls within agent tools). * If not specified, inherits from the main model configuration. * Format: "provider/model" (e.g., "openai/gpt-4o-mini", "google/gemini-2.0-flash-exp") */ executionModel?: string | AgentModelConfig; /** * Enable streaming mode for the agent. * When true, execute() returns AgentStreamResult with textStream for incremental output. * When false (default), execute() returns AgentResult after completion. */ stream?: boolean; /** * Tool mode for the agent. Determines which set of tools are available. * - 'dom' (default): Uses DOM-based tools (act, fillForm) for structured interactions * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, clickAndHold, fillFormVision) * for visual/screenshot-based interactions * - 'cua': Uses Computer Use Agent (CUA) providers for screenshot-based automation */ mode?: AgentToolMode; }; /** * Agent instance returned when stream: true is set in AgentConfig. * execute() returns a streaming result that can be consumed incrementally. * Accepts AgentStreamExecuteOptions with streaming-specific callbacks. */ export interface StreamingAgentInstance { execute: ( instructionOrOptions: string | AgentStreamExecuteOptions, ) => Promise; } /** * Agent instance returned when stream is false or not set in AgentConfig. * execute() returns a result after the agent completes. * Accepts AgentExecuteOptions with non-streaming callbacks only. */ export interface NonStreamingAgentInstance { execute: ( instructionOrOptions: string | AgentExecuteOptions, ) => Promise; } // ============================================================================= // Vision Action Tool Result Types // ============================================================================= /** * Content item type for toModelOutput return values. 
* Used in tool definitions to return text and/or media to the model. */ export type ModelOutputContentItem = | { type: "text"; text: string } | { type: "media"; mediaType: string; data: string }; export interface ClickToolResult { success: boolean; describe?: string; coordinates?: number[]; error?: string; screenshotBase64?: string; } export interface TypeToolResult { success: boolean; describe?: string; text?: string; error?: string; screenshotBase64?: string; } export interface DragAndDropToolResult { success: boolean; describe?: string; error?: string; screenshotBase64?: string; } export interface FillFormField { action: string; value: string; coordinates: { x: number; y: number }; } export interface FillFormVisionToolResult { success: boolean; playwrightArguments?: FillFormField[]; error?: string; screenshotBase64?: string; } export interface ScrollToolResult { success: boolean; message: string; scrolledPixels: number; error?: string; } export interface ScrollVisionToolResult extends ScrollToolResult { screenshotBase64?: string; } export interface WaitToolResult { success: boolean; waited: number; screenshotBase64?: string; error?: string; } ================================================ FILE: packages/core/lib/v3/types/public/api.ts ================================================ /** * Centralized Zod schemas for Stagehand Server API * * Naming conventions: * - `*RequestSchema` - Request body schemas (zod4), `*Request` is the inferred type * - `*ResultSchema` - Inner response data (unwrapped), `*Result` is the inferred type * - `*ResponseSchema` - Full response with success wrapper: { success: true, data: *Result }, `*Response` is the inferred type * * All TypeScript types are inferred from the Zod4 *Schemas using z.infer<> */ import { z } from "zod/v4"; import type Browserbase from "@browserbasehq/sdk"; // ============================================================================= // Shared Components // 
============================================================================= /** Browser launch options for local browsers */ export const LocalBrowserLaunchOptionsSchema = z .object({ args: z.array(z.string()).optional(), executablePath: z.string().optional(), port: z.number().optional(), userDataDir: z.string().optional(), preserveUserDataDir: z.boolean().optional(), headless: z.boolean().optional(), devtools: z.boolean().optional(), chromiumSandbox: z.boolean().optional(), ignoreDefaultArgs: z.union([z.boolean(), z.array(z.string())]).optional(), proxy: z .object({ server: z.string(), bypass: z.string().optional(), username: z.string().optional(), password: z.string().optional(), }) .optional(), locale: z.string().optional(), viewport: z.object({ width: z.number(), height: z.number() }).optional(), deviceScaleFactor: z.number().optional(), hasTouch: z.boolean().optional(), ignoreHTTPSErrors: z.boolean().optional(), cdpUrl: z.string().optional(), cdpHeaders: z.record(z.string(), z.string()).optional(), connectTimeoutMs: z.number().optional(), downloadsPath: z.string().optional(), acceptDownloads: z.boolean().optional(), }) .strict() .meta({ id: "LocalBrowserLaunchOptions" }); /** Detailed model configuration object */ export const ModelConfigObjectSchema = z .object({ provider: z .enum(["openai", "anthropic", "google", "microsoft", "bedrock"]) .optional() .meta({ description: "AI provider for the model (or provide a baseURL endpoint instead)", example: "openai", }), modelName: z.string().meta({ description: "Model name string with provider prefix (e.g., 'openai/gpt-5-nano')", example: "openai/gpt-5-nano", }), apiKey: z.string().optional().meta({ description: "API key for the model provider", example: "sk-some-openai-api-key", }), baseURL: z.string().url().optional().meta({ description: "Base URL for the model provider", example: "https://api.openai.com/v1", }), }) .meta({ id: "ModelConfigObject" }); /** Model configuration */ export const ModelConfigSchema = 
ModelConfigObjectSchema.meta({
  id: "ModelConfig",
});

/**
 * Action object returned by observe and used by act.
 *
 * `selector` and `description` are required; `backendNodeId`, `method` and
 * `arguments` are optional.
 */
export const ActionSchema = z
  .object({
    selector: z.string().meta({
      description: "CSS selector or XPath for the element",
      example: "[data-testid='submit-button']",
    }),
    description: z.string().meta({
      description: "Human-readable description of the action",
      example: "Click the submit button",
    }),
    backendNodeId: z.number().optional().meta({
      description: "Backend node ID for the element",
    }),
    method: z.string().optional().meta({
      description: "The method to execute (click, fill, etc.)",
      example: "click",
    }),
    arguments: z
      .array(z.string())
      .optional()
      .meta({
        description: "Arguments to pass to the method",
        example: ["Hello World"],
      }),
  })
  .meta({
    id: "Action",
    description: "Action object returned by observe and used by act",
  });

/** Session ID path parameter (strict object: only `id` is declared). */
export const SessionIdParamsSchema = z
  .object({
    id: z.string().meta({
      description: "Unique session identifier",
      example: "c4dbf3a9-9a58-4b22-8a1c-9f20f9f9e123",
    }),
  })
  .strict()
  .meta({ id: "SessionIdParams" });

/** Browser configuration for session start; every field is optional. */
export const BrowserConfigSchema = z
  .object({
    type: z.enum(["local", "browserbase"]).optional().meta({
      description: "Browser type to use",
      example: "local",
    }),
    cdpUrl: z.string().optional().meta({
      description:
        "Chrome DevTools Protocol URL for connecting to existing browser",
      example: "ws://localhost:9222",
    }),
    launchOptions: LocalBrowserLaunchOptionsSchema.optional(),
  })
  .meta({ id: "BrowserConfig" });

// =============================================================================
// Request Headers (operational only - auth headers are in security schemes)
// =============================================================================

/** Operational headers for all session requests (auth handled via security schemes) */
export const SessionHeadersSchema = z
  .object({
    // Header values are strings, so the flag is the literal "true"/"false".
    "x-stream-response": z.enum(["true", "false"]).optional().meta({
      description: "Whether to stream the response via SSE",
      example: "true",
    }),
  })
  .meta({ id: "SessionHeaders" });

// =============================================================================
// Response Wrapper Helper
// =============================================================================

/**
 * Wraps a result schema in the standard success response format.
 *
 * The type parameter is declared explicitly so the `data` field keeps the
 * concrete schema type of `resultSchema` in the inferred response type.
 *
 * @param resultSchema - Schema used for the `data` field of the response.
 * @param name - Value registered as the wrapper schema's `id` metadata.
 * @returns An object schema with a boolean `success` and `data: resultSchema`.
 */
const wrapResponse = <T extends z.ZodTypeAny>(resultSchema: T, name: string) =>
  z
    .object({
      success: z.boolean().meta({
        description: "Indicates whether the request was successful",
      }),
      data: resultSchema,
    })
    .meta({ id: name });

/** Standard error response: `success` is always the literal `false`. */
export const ErrorResponseSchema = z
  .object({
    success: z.literal(false),
    error: z.string(),
    code: z.string().optional(),
  })
  .strict()
  .meta({ id: "ErrorResponse" });

// =============================================================================
// Browserbase Session Create Params (zod+hints duplicated version of Browserbase.Sessions.SessionCreateParams)
// =============================================================================

/** Browserbase viewport configuration */
export const BrowserbaseViewportSchema = z
  .object({
    width: z.number().optional(),
    height: z.number().optional(),
  })
  .meta({ id: "BrowserbaseViewport" });

/** Browserbase fingerprint screen configuration */
export const BrowserbaseFingerprintScreenSchema = z
  .object({
    maxHeight: z.number().optional(),
    maxWidth: z.number().optional(),
    minHeight: z.number().optional(),
    minWidth: z.number().optional(),
  })
  .meta({ id: "BrowserbaseFingerprintScreen" });

/** Browserbase fingerprint configuration for stealth mode */
export const BrowserbaseFingerprintSchema = z
  .object({
    browsers: z
      .array(z.enum(["chrome", "edge", "firefox", "safari"]))
      .optional(),
    devices: z.array(z.enum(["desktop", "mobile"])).optional(),
    httpVersion: z.enum(["1", "2"]).optional(),
    locales: z.array(z.string()).optional(),
    operatingSystems: z
      .array(z.enum(["android", "ios", "linux", "macos", "windows"]))
      .optional(),
    screen:
BrowserbaseFingerprintScreenSchema.optional(),
  })
  .meta({ id: "BrowserbaseFingerprint" });

/** Browserbase context configuration for session persistence */
export const BrowserbaseContextSchema = z
  .object({
    id: z.string(),
    persist: z.boolean().optional(),
  })
  .meta({ id: "BrowserbaseContext" });

/** Browserbase browser settings for session creation */
export const BrowserbaseBrowserSettingsSchema = z
  .object({
    advancedStealth: z.boolean().optional(),
    blockAds: z.boolean().optional(),
    context: BrowserbaseContextSchema.optional(),
    extensionId: z.string().optional(),
    fingerprint: BrowserbaseFingerprintSchema.optional(),
    logSession: z.boolean().optional(),
    recordSession: z.boolean().optional(),
    solveCaptchas: z.boolean().optional(),
    viewport: BrowserbaseViewportSchema.optional(),
  })
  .meta({ id: "BrowserbaseBrowserSettings" });

/** Browserbase managed proxy geolocation configuration */
export const BrowserbaseProxyGeolocationSchema = z
  .object({
    country: z.string(),
    city: z.string().optional(),
    state: z.string().optional(),
  })
  .meta({ id: "BrowserbaseProxyGeolocation" });

/** Browserbase managed proxy configuration */
export const BrowserbaseProxyConfigSchema = z
  .object({
    type: z.literal("browserbase"),
    domainPattern: z.string().optional(),
    geolocation: BrowserbaseProxyGeolocationSchema.optional(),
  })
  .meta({ id: "BrowserbaseProxyConfig" });

/** External proxy configuration */
export const ExternalProxyConfigSchema = z
  .object({
    type: z.literal("external"),
    server: z.string(),
    domainPattern: z.string().optional(),
    username: z.string().optional(),
    password: z.string().optional(),
  })
  .meta({ id: "ExternalProxyConfig" });

/** Union of proxy configuration types, discriminated by the `type` literal */
export const ProxyConfigSchema = z
  .discriminatedUnion("type", [
    BrowserbaseProxyConfigSchema,
    ExternalProxyConfigSchema,
  ])
  .meta({ id: "ProxyConfig" });

/** Browserbase region identifier for multi-region support */
export const BrowserbaseRegionSchema = z
  .enum(["us-west-2", "us-east-1", "eu-central-1", "ap-southeast-1"])
  .meta({ id: "BrowserbaseRegion" });

/** Browserbase session creation parameters */
export const BrowserbaseSessionCreateParamsSchema = z
  .object({
    projectId: z.string().optional(),
    browserSettings: BrowserbaseBrowserSettingsSchema.optional(),
    extensionId: z.string().optional(),
    keepAlive: z.boolean().optional(),
    // Either a boolean or an explicit list of proxy configurations.
    proxies: z.union([z.boolean(), z.array(ProxyConfigSchema)]).optional(),
    region: BrowserbaseRegionSchema.optional(),
    timeout: z.number().optional(),
    userMetadata: z.record(z.string(), z.unknown()).optional(),
  })
  .meta({ id: "BrowserbaseSessionCreateParams" });

// =============================================================================
// Session Start
// =============================================================================

/** Request body for starting a session; only `modelName` is required. */
export const SessionStartRequestSchema = z
  .object({
    modelName: z.string().meta({
      description: "Model name to use for AI operations",
      example: "openai/gpt-4o",
    }),
    domSettleTimeoutMs: z.number().optional().meta({
      description: "Timeout in ms to wait for DOM to settle",
      example: 5000,
    }),
    verbose: z
      .union([z.literal(0), z.literal(1), z.literal(2)])
      .optional()
      .meta({
        description: "Logging verbosity level (0=quiet, 1=normal, 2=debug)",
        example: 1,
        // Rewrite the generated JSON Schema: drop any anyOf/allOf/oneOf
        // produced for the literal union and emit a plain numeric enum.
        override: ({ jsonSchema }: { jsonSchema: Record<string, unknown> }) => {
          delete jsonSchema.anyOf;
          delete jsonSchema.allOf;
          delete jsonSchema.oneOf;
          jsonSchema.type = "number";
          jsonSchema.enum = [0, 1, 2];
        },
      }),
    systemPrompt: z.string().optional().meta({
      description: "Custom system prompt for AI operations",
    }),
    browserbaseSessionCreateParams:
      BrowserbaseSessionCreateParamsSchema.optional(),
    browser: BrowserConfigSchema.optional(),
    selfHeal: z.boolean().optional().meta({
      description: "Enable self-healing for failed actions",
      example: true,
    }),
    browserbaseSessionID: z.string().optional().meta({
      description: "Existing Browserbase session ID to resume",
    }),
    // experimental is a V3 field but doesn't need to go over the wire - included because wire type imports options type
    experimental:
z.boolean().optional(),
    // V2 compatibility fields - only included because the server imports this type and supports V2
    // should never be used in v3 clients or v3-only server implementations
    waitForCaptchaSolves: z.boolean().optional().meta({
      description: "Wait for captcha solves (deprecated, v2 only)",
    }),
    actTimeoutMs: z.number().optional().meta({
      description: "Timeout in ms for act operations (deprecated, v2 only)",
    }),
  })
  .meta({ id: "SessionStartRequest" });

/**
 * Result payload of a session start.
 *
 * `cdpUrl` is nullish (may be a string, null, or absent); `sessionId` and
 * `available` are always required.
 */
export const SessionStartResultSchema = z
  .object({
    sessionId: z.string().meta({
      description: "Unique Browserbase session identifier",
      example: "c4dbf3a9-9a58-4b22-8a1c-9f20f9f9e123",
    }),
    cdpUrl: z.string().nullish().meta({
      description:
        "CDP WebSocket URL for connecting to the Browserbase cloud browser (present when available)",
      example: "wss://connect.browserbase.com/?signingKey=abc123",
    }),
    available: z.boolean(),
  })
  .meta({ id: "SessionStartResult" });

/** Session start response: the start result wrapped as `{ success, data }`. */
export const SessionStartResponseSchema = wrapResponse(
  SessionStartResultSchema,
  "SessionStartResponse",
);

// =============================================================================
// Session End
// =============================================================================

/** Session end request - no request body.
*/
export const SessionEndRequestSchema = z
  .object({})
  .strict()
  .optional()
  .meta({ id: "SessionEndRequest" });

/** Session end result: an empty strict object. */
export const SessionEndResultSchema = z
  .object({})
  .strict()
  .meta({ id: "SessionEndResult" });

/** Session end response - just success flag, no data wrapper */
export const SessionEndResponseSchema = z
  .object({
    success: z.boolean().meta({
      description: "Indicates whether the request was successful",
    }),
  })
  .strict()
  .meta({ id: "SessionEndResponse" });

// =============================================================================
// Act
// =============================================================================

/** Optional settings for an act request; the whole object may be omitted. */
export const ActOptionsSchema = z
  .object({
    model: z.union([ModelConfigSchema, z.string()]).optional().meta({
      description:
        "Model configuration object or model name string (e.g., 'openai/gpt-5-nano')",
    }),
    variables: z
      .record(z.string(), z.string())
      .optional()
      .meta({
        description: "Variables to substitute in the action instruction",
        example: { username: "john_doe" },
      }),
    timeout: z.number().optional().meta({
      description: "Timeout in ms for the action",
      example: 30000,
    }),
  })
  .optional()
  .meta({ id: "ActOptions" });

/** Act request body: `input` is either an instruction string or an Action. */
export const ActRequestSchema = z
  .object({
    input: z.string().or(ActionSchema).meta({
      description: "Natural language instruction or Action object",
      example: "Click the login button",
    }),
    options: ActOptionsSchema,
    frameId: z.string().nullish().meta({
      description: "Target frame ID for the action",
    }),
    streamResponse: z.boolean().optional().meta({
      description: "Whether to stream the response via SSE",
      example: true,
    }),
  })
  .meta({ id: "ActRequest" });

/** Inner act result data */
export const ActResultDataSchema = z
  .object({
    success: z.boolean().meta({
      description: "Whether the action completed successfully",
      example: true,
    }),
    message: z.string().meta({
      description: "Human-readable result message",
      example: "Successfully clicked the login button",
    }),
    actionDescription: z.string().meta({
      description: "Description of the action that was performed",
      example: "Clicked button with text 'Login'",
    }),
    actions: z.array(ActionSchema).meta({
      description: "List of actions that were executed",
    }),
  })
  .meta({ id: "ActResultData" });

/** Act result: the inner result data plus an optional tracking ID. */
export const ActResultSchema = z
  .object({
    result: ActResultDataSchema,
    actionId: z.string().optional().meta({
      description: "Action ID for tracking",
    }),
  })
  .meta({ id: "ActResult" });

/** Act response: the act result wrapped as `{ success, data }`. */
export const ActResponseSchema = wrapResponse(ActResultSchema, "ActResponse");

// =============================================================================
// Extract
// =============================================================================

/** Optional settings for an extract request; the whole object may be omitted. */
export const ExtractOptionsSchema = z
  .object({
    model: z.union([ModelConfigSchema, z.string()]).optional().meta({
      description:
        "Model configuration object or model name string (e.g., 'openai/gpt-5-nano')",
    }),
    timeout: z.number().optional().meta({
      description: "Timeout in ms for the extraction",
      example: 30000,
    }),
    selector: z.string().optional().meta({
      description: "CSS selector to scope extraction to a specific element",
      example: "#main-content",
    }),
  })
  .optional()
  .meta({ id: "ExtractOptions" });

/** Extract request body; both `instruction` and `schema` are optional. */
export const ExtractRequestSchema = z
  .object({
    instruction: z.string().optional().meta({
      description: "Natural language instruction for what to extract",
      example: "Extract all product names and prices from the page",
    }),
    schema: z.record(z.string(), z.unknown()).optional().meta({
      description: "JSON Schema defining the structure of data to extract",
    }),
    options: ExtractOptionsSchema,
    frameId: z.string().nullish().meta({
      description: "Target frame ID for the extraction",
    }),
    streamResponse: z.boolean().optional().meta({
      description: "Whether to stream the response via SSE",
      example: true,
    }),
  })
  .meta({ id: "ExtractRequest" });

/** Extract result: untyped extracted data plus an optional tracking ID. */
export const ExtractResultSchema = z
  .object({
    result: z.unknown().meta({
      description: "Extracted data matching the requested schema",
      // Tags the generated JSON Schema node with the `x-stainless-any` extension.
      override: ({ jsonSchema }: { jsonSchema: Record<string, unknown> }) => {
jsonSchema["x-stainless-any"] = true;
      },
    }),
    actionId: z.string().optional().meta({
      description: "Action ID for tracking",
    }),
  })
  .meta({ id: "ExtractResult" });

/** Extract response: the extract result wrapped as `{ success, data }`. */
export const ExtractResponseSchema = wrapResponse(
  ExtractResultSchema,
  "ExtractResponse",
);

// =============================================================================
// Observe
// =============================================================================

/** Optional settings for an observe request; the whole object may be omitted. */
export const ObserveOptionsSchema = z
  .object({
    model: z.union([ModelConfigSchema, z.string()]).optional().meta({
      description:
        "Model configuration object or model name string (e.g., 'openai/gpt-5-nano')",
    }),
    timeout: z.number().optional().meta({
      description: "Timeout in ms for the observation",
      example: 30000,
    }),
    selector: z.string().optional().meta({
      description: "CSS selector to scope observation to a specific element",
      example: "nav",
    }),
  })
  .optional()
  .meta({ id: "ObserveOptions" });

/** Observe request body; `instruction` is optional. */
export const ObserveRequestSchema = z
  .object({
    instruction: z.string().optional().meta({
      description: "Natural language instruction for what actions to find",
      example: "Find all clickable navigation links",
    }),
    options: ObserveOptionsSchema,
    frameId: z.string().nullish().meta({
      description: "Target frame ID for the observation",
    }),
    streamResponse: z.boolean().optional().meta({
      description: "Whether to stream the response via SSE",
      example: true,
    }),
  })
  .meta({ id: "ObserveRequest" });

/** Observe result: a list of Action objects plus an optional tracking ID. */
export const ObserveResultSchema = z
  .object({
    result: z.array(ActionSchema),
    actionId: z.string().optional().meta({
      description: "Action ID for tracking",
    }),
  })
  .meta({ id: "ObserveResult" });

/** Observe response: the observe result wrapped as `{ success, data }`. */
export const ObserveResponseSchema = wrapResponse(
  ObserveResultSchema,
  "ObserveResponse",
);

// =============================================================================
// Agent Execute
// =============================================================================

/** Agent configuration; every field is optional. */
export const AgentConfigSchema = z
  .object({
    provider: z // cloud accepts provider: at the top level for legacy reasons, in the future we should remove it
      .enum(["openai", "anthropic", "google", "microsoft", "bedrock"])
      .optional()
      .meta({
        description:
          "AI provider for the agent (legacy, use model: openai/gpt-5-nano instead)",
        example: "openai",
      }),
    model: z.union([ModelConfigSchema, z.string()]).optional().meta({
      description:
        "Model configuration object or model name string (e.g., 'openai/gpt-5-nano')",
    }),
    systemPrompt: z.string().optional().meta({
      description: "Custom system prompt for the agent",
    }),
    cua: z.boolean().optional().meta({
      description:
        "Deprecated. Use mode: 'cua' instead. If both are provided, mode takes precedence.",
      example: true,
    }),
    mode: z.enum(["dom", "hybrid", "cua"]).optional().meta({
      description:
        "Tool mode for the agent (dom, hybrid, cua). If set, overrides cua.",
      example: "cua",
    }),
    executionModel: z.union([ModelConfigSchema, z.string()]).optional().meta({
      description:
        "Model configuration object or model name string (e.g., 'openai/gpt-5-nano') for tool execution (observe/act calls within agent tools). If not specified, inherits from the main model configuration.",
    }),
  })
  .meta({ id: "AgentConfig" });

/**
 * Action taken by the agent during execution.
 *
 * Only `type` is required; `.passthrough()` lets keys beyond the declared
 * ones through validation.
 */
export const AgentActionSchema = z
  .object({
    type: z.string().meta({
      description: "Type of action taken",
      example: "click",
    }),
    reasoning: z.string().optional().meta({
      description: "Agent's reasoning for taking this action",
    }),
    taskCompleted: z.boolean().optional(),
    action: z.string().optional(),
    timeMs: z.number().optional().meta({
      description: "Time taken for this action in ms",
    }),
    pageText: z.string().optional(),
    pageUrl: z.string().optional(),
    instruction: z.string().optional(),
  })
  .passthrough()
  .meta({ id: "AgentAction" });

/** Token usage statistics for agent execution */
export const AgentUsageSchema = z
  .object({
    input_tokens: z.number().meta({ example: 1500 }),
    output_tokens: z.number().meta({ example: 250 }),
    reasoning_tokens: z.number().optional(),
    cached_input_tokens: z.number().optional(),
    inference_time_ms: z.number().meta({ example: 2500 }),
  })
  .meta({ id: "AgentUsage" });

/** Result data from agent execution */
export const AgentResultDataSchema = z
  .object({
    success: z.boolean().meta({
      description: "Whether the agent completed successfully",
      example: true,
    }),
    message: z.string().meta({
      description: "Summary of what the agent accomplished",
      example: "Successfully logged in and navigated to dashboard",
    }),
    actions: z.array(AgentActionSchema),
    completed: z.boolean().meta({
      description: "Whether the agent finished its task",
      example: true,
    }),
    metadata: z.record(z.string(), z.unknown()).optional(),
    usage: AgentUsageSchema.optional(),
  })
  .meta({ id: "AgentResultData" });

/** Cache entry for an agent run: a cache key plus an untyped entry payload. */
export const AgentCacheEntrySchema = z
  .object({
    cacheKey: z.string().meta({
      description:
        "Opaque cache identifier computed from instruction, URL, options, and config",
    }),
    entry: z.unknown().meta({
      description: "Serialized cache entry that can be written to disk",
    }),
  })
  .meta({ id: "AgentCacheEntry" });

export const
AgentExecuteOptionsSchema = z
  .object({
    instruction: z.string().meta({
      description: "Natural language instruction for the agent",
      example:
        "Log in with username 'demo' and password 'test123', then navigate to settings",
    }),
    maxSteps: z.number().optional().meta({
      description: "Maximum number of steps the agent can take",
      example: 20,
    }),
    highlightCursor: z.boolean().optional().meta({
      description: "Whether to visually highlight the cursor during execution",
      example: true,
    }),
    useSearch: z.boolean().optional().meta({
      description:
        "Whether to enable the web search tool powered by Browserbase Search API",
      example: true,
    }),
    toolTimeout: z.number().optional().meta({
      description: "Timeout in milliseconds for each agent tool call",
      example: 30000,
    }),
  })
  .meta({ id: "AgentExecuteOptions" });

/**
 * Agent execute request body.
 *
 * `agentConfig` and `executeOptions` are required objects; `frameId`,
 * `streamResponse` and `shouldCache` are optional.
 */
export const AgentExecuteRequestSchema = z
  .object({
    agentConfig: AgentConfigSchema,
    executeOptions: AgentExecuteOptionsSchema,
    frameId: z.string().nullish().meta({
      description: "Target frame ID for the agent",
    }),
    streamResponse: z.boolean().optional().meta({
      description: "Whether to stream the response via SSE",
      example: true,
    }),
    shouldCache: z.boolean().optional().meta({
      description:
        "If true, the server captures a cache entry and returns it to the client",
    }),
  })
  .meta({ id: "AgentExecuteRequest" });

/** Agent execute result: result data plus an optional cache entry. */
export const AgentExecuteResultSchema = z
  .object({
    result: AgentResultDataSchema,
    cacheEntry: AgentCacheEntrySchema.optional(),
  })
  .meta({ id: "AgentExecuteResult" });

/** Agent execute response: the result wrapped as `{ success, data }`. */
export const AgentExecuteResponseSchema = wrapResponse(
  AgentExecuteResultSchema,
  "AgentExecuteResponse",
);

// =============================================================================
// Navigate
// =============================================================================

/** Optional settings for a navigate request; the whole object may be omitted. */
export const NavigateOptionsSchema = z
  .object({
    referer: z.string().optional().meta({
      description: "Referer header to send with the request",
    }),
    timeout: z.number().optional().meta({
      description: "Timeout in ms for the navigation",
example: 30000,
    }),
    waitUntil: z
      .enum(["load", "domcontentloaded", "networkidle"])
      .optional()
      .meta({
        description: "When to consider navigation complete",
        example: "networkidle",
      }),
  })
  .optional()
  .meta({ id: "NavigateOptions" });

/** Navigate request body; only `url` is required. */
export const NavigateRequestSchema = z
  .object({
    url: z.string().meta({
      description: "URL to navigate to",
      example: "https://example.com",
    }),
    options: NavigateOptionsSchema,
    frameId: z.string().nullish().meta({
      description: "Target frame ID for the navigation",
    }),
    streamResponse: z.boolean().optional().meta({
      description: "Whether to stream the response via SSE",
      example: true,
    }),
  })
  .meta({ id: "NavigateRequest" });

/** Navigate result: an untyped, nullable response plus an optional tracking ID. */
export const NavigateResultSchema = z
  .object({
    // SerializableResponse from types/private/api.ts - no Zod schema available
    // as it wraps complex devtools-protocol types (Protocol.Network.Response)
    result: z
      .unknown()
      .nullable()
      .meta({
        description: "Navigation response (Playwright Response object or null)",
        // Tags the generated JSON Schema node with the `x-stainless-any` extension.
        override: ({ jsonSchema }: { jsonSchema: Record<string, unknown> }) => {
          jsonSchema["x-stainless-any"] = true;
        },
      }),
    actionId: z.string().optional().meta({
      description: "Action ID for tracking",
    }),
  })
  .meta({ id: "NavigateResult" });

/** Navigate response: the navigate result wrapped as `{ success, data }`. */
export const NavigateResponseSchema = wrapResponse(
  NavigateResultSchema,
  "NavigateResponse",
);

// =============================================================================
// Replay Metrics
// =============================================================================

/** Token usage for a single action */
export const TokenUsageSchema = z
  .object({
    inputTokens: z.number().optional(),
    outputTokens: z.number().optional(),
    timeMs: z.number().optional(),
    cost: z.number().optional(),
  })
  .meta({ id: "TokenUsage" });

/** Action entry in replay metrics */
export const ReplayActionSchema = z
  .object({
    method: z.string(),
    parameters: z.record(z.string(), z.unknown()),
    result: z.record(z.string(), z.unknown()),
    timestamp: z.number(),
    endTime: z.number().optional(),
    tokenUsage:
TokenUsageSchema.optional(),
  })
  .meta({ id: "ReplayAction" });

/** Page entry in replay metrics */
export const ReplayPageSchema = z
  .object({
    url: z.string(),
    timestamp: z.number(),
    duration: z.number(),
    actions: z.array(ReplayActionSchema),
  })
  .meta({ id: "ReplayPage" });

/** Inner result data for replay */
export const ReplayResultSchema = z
  .object({
    pages: z.array(ReplayPageSchema),
    clientLanguage: z.string().optional(),
  })
  .meta({ id: "ReplayResult" });

/** Replay response: the replay result wrapped as `{ success, data }`. */
export const ReplayResponseSchema = wrapResponse(
  ReplayResultSchema,
  "ReplayResponse",
);

// =============================================================================
// SSE Stream Events
// =============================================================================
// These schemas define the Server-Sent Events format for streaming responses.
// Streaming is enabled by setting the `x-stream-response: true` header.

/** Status values for SSE stream events */
export const StreamEventStatusSchema = z
  .enum(["starting", "connected", "running", "finished", "error"])
  .meta({
    id: "StreamEventStatus",
    description: "Current status of the streaming operation",
  });

/** Type discriminator for SSE stream events */
export const StreamEventTypeSchema = z.enum(["system", "log"]).meta({
  id: "StreamEventType",
  description: "Type of stream event - system events or log messages",
});

/** Data payload for system stream events */
export const StreamEventSystemDataSchema = z
  .object({
    status: StreamEventStatusSchema,
    result: z
      .unknown()
      .optional()
      .meta({
        description: "Operation result (present when status is 'finished')",
        // Tags the generated JSON Schema node with the `x-stainless-any` extension.
        override: ({ jsonSchema }: { jsonSchema: Record<string, unknown> }) => {
          jsonSchema["x-stainless-any"] = true;
        },
      }),
    error: z.string().optional().meta({
      description: "Error message (present when status is 'error')",
    }),
  })
  .meta({ id: "StreamEventSystemData" });

/** Data payload for log stream events */
export const StreamEventLogDataSchema = z
  .object({
    status: z.literal("running"),
    message: z.string().meta({
description: "Log message from the operation",
    }),
  })
  .meta({ id: "StreamEventLogData" });

/**
 * SSE stream event sent during streaming responses.
 *
 * IMPORTANT: Key ordering matters for Stainless SDK generation.
 * The `data` field MUST be serialized first, with `status` as the first key within it.
 * This allows Stainless to use `data_starts_with: '{"data":{"status":"finished"'` for event handling.
 *
 * Expected serialization order: {"data":{"status":...},"type":...,"id":...}
 */
export const StreamEventSchema = z
  .object({
    // Declared first on purpose - see the key-ordering note above.
    data: z.union([StreamEventSystemDataSchema, StreamEventLogDataSchema]),
    type: StreamEventTypeSchema,
    id: z.string().uuid().meta({
      description: "Unique identifier for this event",
      example: "c4dbf3a9-9a58-4b22-8a1c-9f20f9f9e123",
    }),
  })
  .meta({
    id: "StreamEvent",
    // NOTE(review): the text between "data:" and "\\n\\n" below looks like it
    // lost a placeholder for the event payload - confirm the intended wording.
    description:
      "Server-Sent Event emitted during streaming responses. Events are sent as `data: \\n\\n`. Key order: data (with status first), type, id.",
  });

// =============================================================================
// OpenAPI Components
// =============================================================================
// These objects are exported for use in gen-openapi.ts to configure the spec.
/**
 * OpenAPI security schemes for authentication.
 *
 * All three schemes are `apiKey` schemes carried in request headers.
 */
export const openApiSecuritySchemes = {
  BrowserbaseApiKey: {
    type: "apiKey",
    in: "header",
    name: "x-bb-api-key",
    description: "Browserbase API key for authentication",
  },
  BrowserbaseProjectId: {
    type: "apiKey",
    in: "header",
    name: "x-bb-project-id",
    description: "Browserbase project ID",
  },
  ModelApiKey: {
    type: "apiKey",
    in: "header",
    name: "x-model-api-key",
    description: "API key for the AI model provider (OpenAI, Anthropic, etc.)",
  },
} as const;

/**
 * OpenAPI links for session operations (used in SessionStart response).
 *
 * Every link feeds the `id` parameter from `data.sessionId` of the response body.
 */
export const openApiLinks = {
  SessionAct: {
    operationId: "SessionAct",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Perform an action on the session",
  },
  SessionExtract: {
    operationId: "SessionExtract",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Extract data from the session",
  },
  SessionObserve: {
    operationId: "SessionObserve",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Observe available actions on the session",
  },
  SessionNavigate: {
    operationId: "SessionNavigate",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Navigate to a URL in the session",
  },
  SessionAgentExecute: {
    operationId: "SessionAgentExecute",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Execute an agent on the session",
  },
  SessionReplay: {
    operationId: "SessionReplay",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "Replay session metrics",
  },
  SessionEnd: {
    operationId: "SessionEnd",
    parameters: { id: "$response.body#/data/sessionId" },
    description: "End the session and release resources",
  },
} as const;

/** OpenAPI operation metadata (operationId, summary, description) for each endpoint */
export const Operations = {
  SessionStart: {
    operationId: "SessionStart",
    summary: "Start a new browser session",
    description:
      "Creates a new browser session with the specified configuration. Returns a session ID used for all subsequent operations.",
  },
  SessionEnd: {
    operationId: "SessionEnd",
    summary: "End a browser session",
    description:
      "Terminates the browser session and releases all associated resources.",
  },
  SessionAct: {
    operationId: "SessionAct",
    summary: "Perform an action",
    description:
      "Executes a browser action using natural language instructions or a predefined Action object.",
  },
  SessionExtract: {
    operationId: "SessionExtract",
    summary: "Extract data from the page",
    description:
      "Extracts structured data from the current page using AI-powered analysis.",
  },
  SessionObserve: {
    operationId: "SessionObserve",
    summary: "Observe available actions",
    description:
      "Identifies and returns available actions on the current page that match the given instruction.",
  },
  SessionNavigate: {
    operationId: "SessionNavigate",
    summary: "Navigate to a URL",
    description: "Navigates the browser to the specified URL.",
  },
  SessionAgentExecute: {
    operationId: "SessionAgentExecute",
    summary: "Execute an AI agent",
    description:
      "Runs an autonomous AI agent that can perform complex multi-step browser tasks.",
  },
  SessionReplay: {
    operationId: "SessionReplay",
    summary: "Replay session metrics",
    description: "Retrieves replay metrics for a session.",
  },
} as const;

// =============================================================================
// Type Exports (inferred from schemas)
// =============================================================================
// Each alias is inferred from the schema of the same name, so the static type
// cannot drift from the runtime validator.

// Shared types
export type Action = z.infer<typeof ActionSchema>;
export type ModelConfig = z.infer<typeof ModelConfigSchema>;
export type BrowserConfig = z.infer<typeof BrowserConfigSchema>;
export type SessionIdParams = z.infer<typeof SessionIdParamsSchema>;

// Header types
export type SessionHeaders = z.infer<typeof SessionHeadersSchema>;

// Browserbase types
export type BrowserbaseViewport = z.infer<typeof BrowserbaseViewportSchema>;
export type BrowserbaseFingerprintScreen = z.infer<
  typeof BrowserbaseFingerprintScreenSchema
>;
export type BrowserbaseFingerprint = z.infer<
  typeof BrowserbaseFingerprintSchema
>;
export type BrowserbaseContext = z.infer<typeof BrowserbaseContextSchema>;
export type
BrowserbaseBrowserSettings = z.infer<
  typeof BrowserbaseBrowserSettingsSchema
>;
export type BrowserbaseProxyGeolocation = z.infer<
  typeof BrowserbaseProxyGeolocationSchema
>;
export type BrowserbaseProxyConfig = z.infer<
  typeof BrowserbaseProxyConfigSchema
>;
export type ExternalProxyConfig = z.infer<typeof ExternalProxyConfigSchema>;
export type BrowserbaseRegion = z.infer<typeof BrowserbaseRegionSchema>;
export type BrowserbaseSessionCreateParams = z.infer<
  typeof BrowserbaseSessionCreateParamsSchema
>;

// Type check: ensure our schema-derived type is assignable to the SDK type
// This will cause a compile error if our schema drifts from the SDK
// eslint-disable-next-line @typescript-eslint/no-unused-vars
type _BrowserbaseSessionCreateParamsCheck =
  BrowserbaseSessionCreateParams extends Browserbase.Sessions.SessionCreateParams
    ? true
    : never;

// /sessions/start
export type SessionStartRequest = z.infer<typeof SessionStartRequestSchema>;
export type SessionStartResult = z.infer<typeof SessionStartResultSchema>;
export type SessionStartResponse = z.infer<typeof SessionStartResponseSchema>;

// /sessions/{id}/end
export type SessionEndResult = z.infer<typeof SessionEndResultSchema>;
export type SessionEndResponse = z.infer<typeof SessionEndResponseSchema>;

// /sessions/{id}/act
export type ActRequest = z.infer<typeof ActRequestSchema>;
export type ActResultData = z.infer<typeof ActResultDataSchema>;
export type ActResult = z.infer<typeof ActResultSchema>;
export type ActResponse = z.infer<typeof ActResponseSchema>;

// /sessions/{id}/extract
export type ExtractRequest = z.infer<typeof ExtractRequestSchema>;
export type ExtractResult = z.infer<typeof ExtractResultSchema>;
export type ExtractResponse = z.infer<typeof ExtractResponseSchema>;

// /sessions/{id}/observe
export type ObserveRequest = z.infer<typeof ObserveRequestSchema>;
export type ObserveResult = z.infer<typeof ObserveResultSchema>;
export type ObserveResponse = z.infer<typeof ObserveResponseSchema>;

// /sessions/{id}/agentExecute
export type AgentAction = z.infer<typeof AgentActionSchema>;
export type AgentUsage = z.infer<typeof AgentUsageSchema>;
export type AgentResultData = z.infer<typeof AgentResultDataSchema>;
export type AgentExecuteRequest = z.infer<typeof AgentExecuteRequestSchema>;
export type AgentExecuteResult = z.infer<typeof AgentExecuteResultSchema>;
export type AgentExecuteResponse = z.infer<typeof AgentExecuteResponseSchema>;

// /sessions/{id}/navigate
export type NavigateRequest = z.infer<typeof NavigateRequestSchema>;
export type NavigateResult = z.infer<typeof NavigateResultSchema>;
export type NavigateResponse = z.infer<typeof NavigateResponseSchema>;

// /sessions/{id}/replay
export type TokenUsage = z.infer<typeof TokenUsageSchema>;
export type ReplayAction = z.infer<typeof ReplayActionSchema>;
export
/**
 * Root of the API error hierarchy. `name` is taken from the concrete class
 * so that every subclass reports its own class name without extra code.
 */
export class StagehandAPIError extends Error {
  constructor(message: string) {
    super(message);
    this.name = this.constructor.name;
  }
}

/**
 * API error whose message defaults to "Unauthorized request" when the caller
 * passes nothing (or an empty string).
 */
export class StagehandAPIUnauthorizedError extends StagehandAPIError {
  constructor(message?: string) {
    const text = message ? message : "Unauthorized request";
    super(text);
  }
}

// The next two classes only forward `message`, so they inherit the base
// constructor instead of re-declaring an identical one.

/** API error carrying a caller-supplied message. */
export class StagehandHttpError extends StagehandAPIError {}

/** API error carrying a caller-supplied message. */
export class StagehandServerError extends StagehandAPIError {}

/** API error with the fixed message "Response body is null". */
export class StagehandResponseBodyError extends StagehandAPIError {
  constructor() {
    super("Response body is null");
  }
}

/** API error carrying a caller-supplied message. */
export class StagehandResponseParseError extends StagehandAPIError {}
export type LogLevel = 0 | 1 | 2;

/**
 * Mapping between numeric log levels and their names
 *
 * 0 - error/warn - Critical issues or important warnings
 * 1 - info - Standard information messages
 * 2 - debug - Detailed information for debugging
 *
 * Typed as `Record<LogLevel, string>` so that adding a level to `LogLevel`
 * without naming it here is a compile error.
 */
export const LOG_LEVEL_NAMES: Record<LogLevel, string> = {
  0: "error",
  1: "info",
  2: "debug",
};
/**
 * Numeric counters grouped by prefix: `act`, `extract`, `observe` and
 * `agent`, followed by a `total` group. Every group has the same five
 * fields (prompt, completion, reasoning and cached-input tokens, plus
 * inference time).
 *
 * NOTE(review): the meaning of each counter is inferred from its name only;
 * `*InferenceTimeMs` is presumably milliseconds and `total*` presumably the
 * sum of the four groups — confirm against the code that fills this in.
 */
export interface StagehandMetrics {
  // act
  actPromptTokens: number;
  actCompletionTokens: number;
  actReasoningTokens: number;
  actCachedInputTokens: number;
  actInferenceTimeMs: number;
  // extract
  extractPromptTokens: number;
  extractCompletionTokens: number;
  extractReasoningTokens: number;
  extractCachedInputTokens: number;
  extractInferenceTimeMs: number;
  // observe
  observePromptTokens: number;
  observeCompletionTokens: number;
  observeReasoningTokens: number;
  observeCachedInputTokens: number;
  observeInferenceTimeMs: number;
  // agent
  agentPromptTokens: number;
  agentCompletionTokens: number;
  agentReasoningTokens: number;
  agentCachedInputTokens: number;
  agentInferenceTimeMs: number;
  // total
  totalPromptTokens: number;
  totalCompletionTokens: number;
  totalReasoningTokens: number;
  totalCachedInputTokens: number;
  totalInferenceTimeMs: number;
}
/**
 * Credentials object accepted under
 * `GoogleVertexProviderSettings.googleAuthOptions.credentials`.
 * Every field is optional.
 *
 * NOTE(review): the snake_case field names look like those of a Google
 * service-account JSON key file — confirm against the Google auth
 * documentation before relying on that.
 */
export interface GoogleServiceAccountCredentials {
  type?: string;
  project_id?: string;
  private_key_id?: string;
  private_key?: string;
  client_email?: string;
  client_id?: string;
  auth_uri?: string;
  token_uri?: string;
  auth_provider_x509_cert_url?: string;
  client_x509_cert_url?: string;
  universe_domain?: string;
}
"groq-llama-3.3-70b-versatile" | "groq-llama-3.3-70b-specdec" | "gemini-1.5-flash" | "gemini-1.5-pro" | "gemini-1.5-flash-8b" | "gemini-2.0-flash-lite" | "gemini-2.0-flash" | "gemini-2.5-flash-preview-04-17" | "gemini-2.5-pro-preview-03-25" | string; export type ModelProvider = | "openai" | "anthropic" | "cerebras" | "groq" | "google" | "aisdk"; export type ClientOptions = ( | OpenAIClientOptions | AnthropicClientOptions | GoogleVertexProviderSettings ) & { apiKey?: string; provider?: AgentProviderType; baseURL?: string; /** OpenAI organization ID */ organization?: string; /** Delay between agent actions in ms */ waitBetweenActions?: number; /** Anthropic thinking budget for extended thinking */ thinkingBudget?: number; /** Environment type for CUA agents (browser, mac, windows, ubuntu) */ environment?: string; /** Max images for Microsoft FARA agent */ maxImages?: number; /** Temperature for model inference */ temperature?: number; /** Custom headers sent with every request to the provider */ headers?: Record; }; export type ModelConfiguration = | AvailableModel | (ClientOptions & { modelName: AvailableModel }); ================================================ FILE: packages/core/lib/v3/types/public/options.ts ================================================ import { z } from "zod"; import { LLMClient } from "../../llm/LLMClient.js"; import { ModelConfiguration } from "./model.js"; import { LogLine } from "./logs.js"; import { type BrowserbaseSessionCreateParams, LocalBrowserLaunchOptionsSchema, } from "./api.js"; export type V3Env = "LOCAL" | "BROWSERBASE"; // Re-export for backwards compatibility (camelCase alias) export const localBrowserLaunchOptionsSchema = LocalBrowserLaunchOptionsSchema; export type LocalBrowserLaunchOptions = z.infer< typeof LocalBrowserLaunchOptionsSchema >; /** Constructor options for V3 */ export interface V3Options { env: V3Env; /** * Optional external session identifier to use for flow logging/event storage. 
* When omitted, Stagehand falls back to its internal instance id. * This currently ends up 1:1 with the Browserbase session id when one exists, * but callers should not rely on that remaining a permanent invariant. */ sessionId?: string; // Browserbase (required when env = "BROWSERBASE") apiKey?: string; projectId?: string; /** * Optional: fine-tune Browserbase session creation or resume an existing session. */ browserbaseSessionCreateParams?: BrowserbaseSessionCreateParams; browserbaseSessionID?: string; /** * Controls browser keepalive behavior. When set, it overrides any value in * browserbaseSessionCreateParams.keepAlive. */ keepAlive?: boolean; // Local Chromium (optional) localBrowserLaunchOptions?: LocalBrowserLaunchOptions; model?: ModelConfiguration; llmClient?: LLMClient; // allow user to pass their own systemPrompt?: string; logInferenceToFile?: boolean; experimental?: boolean; verbose?: 0 | 1 | 2; selfHeal?: boolean; // V2 compatibility fields - only included because the server imports this type and supports V2 waitForCaptchaSolves?: boolean; actTimeoutMs?: number; /** Disable pino logging backend (useful for tests or minimal environments). */ disablePino?: boolean; /** Optional external logger hook for integrating with host apps. */ logger?: (line: LogLine) => void; /** Directory used to persist cached actions for act(). */ cacheDir?: string; domSettleTimeout?: number; disableAPI?: boolean; /** * When true, enables server-side caching for API requests. * When false, disables server-side caching. * Defaults to true (caching enabled). * Can be overridden per-method in act(), extract(), and observe() options. 
*/ serverCache?: boolean; } ================================================ FILE: packages/core/lib/v3/types/public/page.ts ================================================ import { Page } from "../../understudy/page.js"; import { Page as PlaywrightPage } from "playwright-core"; import { Page as PatchrightPage } from "patchright-core"; import { Page as PuppeteerPage } from "puppeteer-core"; export type { PlaywrightPage, PatchrightPage, PuppeteerPage, Page }; export type AnyPage = PlaywrightPage | PuppeteerPage | PatchrightPage | Page; export { ConsoleMessage } from "../../understudy/consoleMessage.js"; export type { ConsoleListener } from "../../understudy/consoleMessage.js"; export type LoadState = "load" | "domcontentloaded" | "networkidle"; export { Response } from "../../understudy/response.js"; export type SnapshotResult = { formattedTree: string; xpathMap: Record; urlMap: Record; }; export type PageSnapshotOptions = { includeIframes?: boolean; }; ================================================ FILE: packages/core/lib/v3/types/public/screenshotTypes.ts ================================================ import type { Locator } from "../../understudy/locator.js"; export type ScreenshotAnimationsOption = "disabled" | "allow"; export type ScreenshotCaretOption = "hide" | "initial"; export type ScreenshotScaleOption = "css" | "device"; export interface ScreenshotClip { x: number; y: number; width: number; height: number; } export interface ScreenshotOptions { animations?: ScreenshotAnimationsOption; caret?: ScreenshotCaretOption; clip?: ScreenshotClip; fullPage?: boolean; mask?: Locator[]; maskColor?: string; omitBackground?: boolean; path?: string; quality?: number; scale?: ScreenshotScaleOption; style?: string; timeout?: number; type?: "png" | "jpeg"; } ================================================ FILE: packages/core/lib/v3/types/public/sdkErrors.ts ================================================ import { ZodError } from "zod"; // Avoid .js extension so 
/**
 * Base class for SDK errors.
 *
 * - `name` is set to the concrete class name.
 * - `cause` is only assigned when one is supplied, so it stays `undefined`
 *   otherwise.
 */
export class StagehandError extends Error {
  public readonly cause?: unknown;

  constructor(message: string, cause?: unknown) {
    super(message);
    this.name = this.constructor.name;
    if (cause !== undefined) {
      this.cause = cause;
    }
  }
}

/**
 * Turn an arbitrary thrown value into text for an error message.
 * Errors contribute their `message`, strings are used as-is, `undefined`
 * becomes "Unknown error", and anything else is JSON-encoded when possible.
 */
function describeUnknownError(error: unknown): string {
  if (error instanceof Error) return error.message;
  if (typeof error === "string") return error;
  if (error === undefined) return "Unknown error";
  try {
    const json = JSON.stringify(error);
    if (json !== undefined) return json;
  } catch {
    // Circular structures, BigInt, etc. fall through to String().
  }
  return String(error);
}

/**
 * Generic wrapper that adds the Stagehand version and support pointers to an
 * underlying failure.
 *
 * Previously `super()` was only reached when `error` was an `Error`, so
 * wrapping a thrown string, object or `undefined` crashed with a
 * ReferenceError. `super()` is now always called, and the wrapped value is
 * kept as `cause`.
 *
 * @param error - The value that was thrown; may be anything.
 */
export class StagehandDefaultError extends StagehandError {
  constructor(error?: unknown) {
    const detail = describeUnknownError(error);
    super(
      `\nHey! We're sorry you ran into an error. \nStagehand version: ${STAGEHAND_VERSION} \nIf you need help, please open a Github issue or reach out to us on Discord: https://stagehand.dev/discord\n\nFull error:\n${detail}`,
      error,
    );
  }
}

/**
 * Error for a feature used under the wrong environment.
 *
 * @param currentEnvironment - Environment currently configured.
 * @param requiredEnvironment - Environment the feature needs.
 * @param feature - Name of the feature being used.
 */
export class StagehandEnvironmentError extends StagehandError {
  constructor(
    currentEnvironment: string,
    requiredEnvironment: string,
    feature: string,
  ) {
    super(
      // Trailing space keeps the two sentences from running together.
      `You seem to be setting the current environment to ${currentEnvironment}. ` +
        `Ensure the environment is set to ${requiredEnvironment} if you want to use ${feature}.`,
    );
  }
}

/**
 * Error for a feature that needs an environment variable which is not set.
 *
 * @param missingEnvironmentVariable - Name of the missing variable.
 * @param feature - Name of the feature that needs it.
 */
export class MissingEnvironmentVariableError extends StagehandError {
  constructor(missingEnvironmentVariable: string, feature: string) {
    super(
      // Trailing space keeps the two sentences from running together.
      `${missingEnvironmentVariable} is required to use ${feature}. ` +
        `Please set ${missingEnvironmentVariable} in your environment.`,
    );
  }
}
/**
 * Error listing the model providers that may be used.
 *
 * @param supportedProviders - Provider names to show in the message.
 * @param feature - Optional feature name; when given, the message says the
 *   feature requires one of the providers.
 */
export class UnsupportedModelProviderError extends StagehandError {
  constructor(supportedProviders: string[], feature?: string) {
    // Join explicitly: interpolating the array directly yields "a,b,c".
    const providerList = supportedProviders.join(", ");
    super(
      feature
        ? `${feature} requires one of the following model providers: ${providerList}`
        : `please use one of the supported model providers: ${providerList}`,
    );
  }
}

/**
 * Error for a provider that cannot be used with aiSDK.
 *
 * @param provider - The provider that was requested.
 * @param supportedProviders - Provider names to show in the message.
 */
export class UnsupportedAISDKModelProviderError extends StagehandError {
  constructor(provider: string, supportedProviders: string[]) {
    // Join explicitly: interpolating the array directly yields "a,b,c".
    const providerList = supportedProviders.join(", ");
    super(
      `${provider} is not currently supported for aiSDK. please use one of the supported model providers: ${providerList}`,
    );
  }
}
/**
 * Error whose message reports a locator action, the selector it targeted and
 * the reason it failed.
 *
 * @param action - Text inserted after "Error " in the message.
 * @param selector - Selector of the element involved.
 * @param message - Failure reason.
 */
export class StagehandLocatorError extends StagehandError {
  constructor(action: string, selector: string, message: string) {
    const details = `Error ${action} Element with selector: ${selector} Reason: ${message}`;
    super(details);
  }
}

/**
 * Error whose message reports a failed click.
 * Note the argument order: the reason comes first, then the selector.
 *
 * @param message - Failure reason.
 * @param selector - Selector of the element involved.
 */
export class StagehandClickError extends StagehandError {
  constructor(message: string, selector: string) {
    const details = `Error Clicking Element with selector: ${selector} Reason: ${message}`;
    super(details);
  }
}
/**
 * Error reporting that no content frame could be obtained for a selector.
 *
 * @param selector - Selector included in the message.
 */
export class ContentFrameNotFoundError extends StagehandError {
  constructor(selector: string) {
    const details = `Unable to obtain a content frame for selector: ${selector}`;
    super(details);
  }
}

/**
 * Error reporting that an XPath did not resolve in the page or its frames.
 *
 * @param xpath - XPath included (quoted) in the message.
 */
export class XPathResolutionError extends StagehandError {
  constructor(xpath: string) {
    const details = `XPath "${xpath}" does not resolve in the current page or frames`;
    super(details);
  }
}
/**
 * Error for a failed connection to an MCP server.
 *
 * The underlying failure is exposed as `originalError` and also forwarded to
 * the base class as `cause`, matching the other errors in this file that
 * wrap an underlying failure.
 *
 * @param serverUrl - URL of the MCP server, included in the message.
 * @param originalError - The value thrown by the connection attempt.
 */
export class MCPConnectionError extends StagehandError {
  public readonly serverUrl: string;
  public readonly originalError: unknown;

  constructor(serverUrl: string, originalError: unknown) {
    const errorMessage =
      originalError instanceof Error
        ? originalError.message
        : String(originalError);

    super(
      `Failed to connect to MCP server at "${serverUrl}". ${errorMessage}. ` +
        `Please verify the server URL is correct and the server is running.`,
      originalError,
    );

    this.serverUrl = serverUrl;
    this.originalError = originalError;
  }
}
/**
 * Error for an operation that exceeded its time budget.
 *
 * @param operation - Label placed at the start of the message.
 * @param timeoutMs - Timeout value, printed with an "ms" suffix.
 */
export class TimeoutError extends StagehandError {
  constructor(operation: string, timeoutMs: number) {
    const details = `${operation} timed out after ${timeoutMs}ms`;
    super(details);
  }
}

/** `TimeoutError` labelled "act()"; `name` is pinned to the class name. */
export class ActTimeoutError extends TimeoutError {
  constructor(timeoutMs: number) {
    const operation = "act()";
    super(operation, timeoutMs);
    this.name = "ActTimeoutError";
  }
}

/** `TimeoutError` labelled "extract()"; `name` is pinned to the class name. */
export class ExtractTimeoutError extends TimeoutError {
  constructor(timeoutMs: number) {
    const operation = "extract()";
    super(operation, timeoutMs);
    this.name = "ExtractTimeoutError";
  }
}

/** `TimeoutError` labelled "observe()"; `name` is pinned to the class name. */
export class ObserveTimeoutError extends TimeoutError {
  constructor(timeoutMs: number) {
    const operation = "observe()";
    super(operation, timeoutMs);
    this.name = "ObserveTimeoutError";
  }
}
/**
 * Fetch and prune the accessibility tree for a frame, optionally scoping the
 * output to a selector root for faster targeted snapshots.
 *
 * @param session - CDP session used for every protocol call.
 * @param frameId - Frame to snapshot. When a frame-scoped request fails with
 *   a frame-scope error, the request is retried without the frame id.
 * @param opts - Snapshot options. `encode` turns a backend node id into the
 *   key used in the returned maps; `focusSelector` (XPath when it starts with
 *   `xpath=` or `/`, CSS otherwise) optionally limits the outline to the
 *   subtree under the matched element.
 * @returns The formatted outline, a map from encoded node id to URL, and
 *   whether the focus-selector scope was actually applied.
 */
export async function a11yForFrame(
  session: CDPSessionLike,
  frameId: string | undefined,
  opts: A11yOptions,
): Promise<AccessibilityTreeResult> {
  // Enabling the domains is best effort; failures are deliberately ignored.
  await session.send("Accessibility.enable").catch(() => {});
  await session.send("Runtime.enable").catch(() => {});
  await session.send("DOM.enable").catch(() => {});

  let nodes: Protocol.Accessibility.AXNode[] = [];
  try {
    const params = frameId ? ({ frameId } as Record<string, unknown>) : {};
    ({ nodes } = await session.send<{
      nodes: Protocol.Accessibility.AXNode[];
    }>("Accessibility.getFullAXTree", params));
  } catch (e) {
    const msg = String((e as Error)?.message ?? e ?? "");
    const isFrameScopeError =
      msg.includes("Frame with the given") ||
      msg.includes("does not belong to the target") ||
      msg.includes("is not found");
    // Only frame-scope failures of a frame-scoped request are retried.
    if (!isFrameScopeError || !frameId) throw e;
    ({ nodes } = await session.send<{
      nodes: Protocol.Accessibility.AXNode[];
    }>("Accessibility.getFullAXTree"));
  }

  // Collect URLs from the full (unscoped) node list.
  const urlMap: Record<string, string> = {};
  for (const n of nodes) {
    const be = n.backendDOMNodeId;
    if (typeof be !== "number") continue;
    const url = extractUrlFromAXNode(n);
    if (!url) continue;
    const enc = opts.encode(be);
    urlMap[enc] = url;
  }

  let scopeApplied = false;
  const nodesForOutline = await (async () => {
    const sel = opts.focusSelector?.trim();
    if (!sel) return nodes;
    try {
      const looksLikeXPath = /^xpath=/i.test(sel) || sel.startsWith("/");
      const objectId = looksLikeXPath
        ? await resolveObjectIdForXPath(session, sel, frameId)
        : await resolveObjectIdForCss(session, sel, frameId);
      if (!objectId) return nodes;

      const desc = await session.send<{ node?: { backendNodeId?: number } }>(
        "DOM.describeNode",
        { objectId },
      );
      const be = desc.node?.backendNodeId;
      if (typeof be !== "number") return nodes;

      const target = nodes.find((n) => n.backendDOMNodeId === be);
      if (!target) return nodes;
      scopeApplied = true;

      // Index nodes by id once so the walk below is linear instead of doing
      // a full scan per child id. The first node wins on duplicate ids, as
      // with Array.prototype.find.
      const nodeById = new Map<string, Protocol.Accessibility.AXNode>();
      for (const n of nodes) {
        if (!nodeById.has(n.nodeId)) nodeById.set(n.nodeId, n);
      }

      // Breadth-first walk from the target. A cursor over a growing array
      // avoids the O(n) cost of shift() on every step.
      const keep = new Set<string>([target.nodeId]);
      const pending: Protocol.Accessibility.AXNode[] = [target];
      for (let i = 0; i < pending.length; i++) {
        const cur = pending[i]!;
        for (const id of cur.childIds ?? []) {
          if (keep.has(id)) continue;
          keep.add(id);
          const child = nodeById.get(id);
          if (child) pending.push(child);
        }
      }

      // Detach the target from its parent so it becomes the outline root.
      return nodes
        .filter((n) => keep.has(n.nodeId))
        .map((n) =>
          n.nodeId === target.nodeId ? { ...n, parentId: undefined } : n,
        );
    } catch {
      // Any failure while scoping falls back to the full tree.
      return nodes;
    }
  })();

  const decorated = decorateRoles(nodesForOutline, opts);
  const { tree } = await buildHierarchicalTree(decorated, opts);
  const simplified = tree.map((n) => formatTreeLine(n)).join("\n");
  return { outline: simplified.trimEnd(), urlMap, scopeApplied };
}
`, ${role}` : ""}`; } return { role, name: n.name?.value, description: n.description?.value, value: n.value?.value, nodeId: n.nodeId, backendDOMNodeId: n.backendDOMNodeId, parentId: n.parentId, childIds: n.childIds, encodedId, }; }); } export async function buildHierarchicalTree( nodes: A11yNode[], opts: A11yOptions, ): Promise<{ tree: A11yNode[] }> { const nodeMap = new Map(); for (const n of nodes) { const keep = !!(n.name && n.name.trim()) || !!(n.childIds && n.childIds.length) || !isStructural(n.role); if (!keep) continue; nodeMap.set(n.nodeId, { ...n }); } for (const n of nodes) { if (!n.parentId) continue; const parent = nodeMap.get(n.parentId); const cur = nodeMap.get(n.nodeId); if (parent && cur) (parent.children ??= []).push(cur); } const roots = nodes .filter((n) => !n.parentId && nodeMap.has(n.nodeId)) .map((n) => nodeMap.get(n.nodeId)!) as A11yNode[]; const cleaned = (await Promise.all(roots.map(pruneStructuralSafe))).filter( Boolean, ) as A11yNode[]; return { tree: cleaned }; async function pruneStructuralSafe(node: A11yNode): Promise { if (+node.nodeId < 0) return null; const children = node.children ?? []; if (!children.length) { return isStructural(node.role) ? 
/**
 * Report whether a role carries no semantics of its own.
 *
 * The comparison is case-insensitive; `generic`, `none` and `InlineTextBox`
 * are the structural roles. A missing role is not structural.
 */
export function isStructural(role: string): boolean {
  switch (role?.toLowerCase()) {
    case "generic":
    case "none":
    case "inlinetextbox":
      return true;
    default:
      return false;
  }
}
/**
 * Compute the absolute XPath for the currently focused element.
 * - Detects which frame has focus by evaluating the `documentHasFocusStrict`
 *   helper in each frame (falls back to the main frame when none reports focus).
 * - Resolves the deepest active element via the `resolveDeepActiveElement` helper.
 * - Builds an absolute, cross-frame XPath by prefixing the XPaths of the
 *   iframe elements hosting the focused frame, outermost first.
 *
 * All CDP failures are treated as best-effort misses rather than errors.
 *
 * @param page Page whose frames are probed.
 * @returns The cross-frame XPath, or `null` when no active element or XPath
 *          could be resolved.
 */
export async function computeActiveElementXpath(
  page: Page,
): Promise<string | null> {
  // frameId → parent frameId (null for the root), used to walk hosts upward.
  const tree = page.getFullFrameTree();
  const parentByFrame = new Map<string, string | null>();
  (function index(n: Protocol.Page.FrameTree, parent: string | null) {
    parentByFrame.set(n.frame.id, parent);
    for (const c of n.childFrames ?? []) index(c, n.frame.id);
  })(tree, null);

  // 1) Find the first frame that reports focus.
  const frames = page.listAllFrameIds();
  let focusedFrameId: string | null = null;
  for (const fid of frames) {
    const sess = page.getSessionForFrame(fid);
    try {
      await sess.send("Runtime.enable").catch(() => {});
      const ctxId = await executionContexts
        .waitForMainWorld(sess, fid, 1000)
        .catch(() => {});
      const hasFocusExpr = buildA11yInvocation("documentHasFocusStrict", []);
      // Without a resolved context id the evaluation runs in the session's default context.
      const evalParams = ctxId
        ? { contextId: ctxId, expression: hasFocusExpr, returnByValue: true }
        : { expression: hasFocusExpr, returnByValue: true };
      const { result } = await sess.send<{ result?: { value?: unknown } }>(
        "Runtime.evaluate",
        evalParams,
      );
      if (result?.value === true) {
        focusedFrameId = fid;
        break;
      }
    } catch {
      // Probing is best-effort; move on to the next frame.
    }
  }
  if (!focusedFrameId) focusedFrameId = page.mainFrameId();

  // 2) Grab a remote handle to the deepest active element in that frame.
  const focusedSession = page.getSessionForFrame(focusedFrameId);
  let objectId: string | undefined;
  try {
    await focusedSession.send("Runtime.enable").catch(() => {});
    const ctxId = await executionContexts
      .waitForMainWorld(focusedSession, focusedFrameId, 1000)
      .catch(() => {});
    const activeExpr = buildA11yInvocation("resolveDeepActiveElement", []);
    const evalParams = ctxId
      ? { contextId: ctxId, expression: activeExpr, returnByValue: false }
      : { expression: activeExpr, returnByValue: false };
    const { result } = await focusedSession.send<{
      result?: { objectId?: string };
    }>("Runtime.evaluate", evalParams);
    objectId = result?.objectId as string | undefined;
  } catch {
    objectId = undefined;
  }
  if (!objectId) return null;

  // 3) Turn the handle into an XPath inside its own frame.
  let leafXPath: string | null = null;
  try {
    const { result } = await focusedSession.send<{
      result: { value?: string };
    }>("Runtime.callFunctionOn", {
      objectId,
      functionDeclaration: a11yScriptSources.nodeToAbsoluteXPath,
      returnByValue: true,
    });
    const xp = result?.value || "";
    leafXPath = typeof xp === "string" && xp ? xp : null;
  } catch {
    leafXPath = null;
  } finally {
    // Release the remote handle whether or not the XPath call succeeded.
    try {
      await focusedSession.send("Runtime.releaseObject", { objectId });
    } catch {
      // Releasing is best-effort.
    }
  }
  if (!leafXPath) return null;

  // 4) Prefix the XPaths of the hosting iframe elements up to the root.
  let prefix = "";
  let cur: string | null | undefined = focusedFrameId;
  while (cur) {
    const parent = parentByFrame.get(cur) ?? null;
    if (!parent) break;
    const parentSess = page.getSessionForFrame(parent);
    try {
      const { backendNodeId } = await parentSess.send<{
        backendNodeId?: number;
      }>("DOM.getFrameOwner", { frameId: cur });
      if (typeof backendNodeId === "number") {
        const xp = await absoluteXPathForBackendNode(parentSess, backendNodeId);
        // Hosts are discovered innermost-first, so each newly found (outer)
        // host must go in FRONT of the prefix collected so far.
        if (xp) prefix = prefix ? prefixXPath(xp, prefix) : normalizeXPath(xp);
      }
    } catch {
      // Host lookup is best-effort; continue with the next ancestor.
    }
    cur = parent;
  }

  return prefix ? prefixXPath(prefix, leafXPath) : normalizeXPath(leafXPath);
}
/**
 * Capture a hybrid DOM + Accessibility snapshot for the provided page.
 *
 * Stages, in order:
 *  1. Scoped fast-path: when `tryScopedSnapshot` yields a result it is
 *     returned as-is and nothing else runs.
 *  2. One DOM index per unique CDP session (`buildSessionIndexes`).
 *  3. Per-frame DOM maps and accessibility outlines (`collectPerFrameMaps`).
 *  4. Absolute iframe prefixes for every frame (`computeFramePrefixes`).
 *  5. Merge into the combined payload (`mergeFramesIntoSnapshot`).
 *
 * @param page    Page to snapshot.
 * @param options `pierceShadow` defaults to `true`; iframes are included
 *                unless `includeIframes` is exactly `false`.
 */
export async function captureHybridSnapshot(
  page: Page,
  options?: SnapshotOptions,
): Promise<HybridSnapshot> {
  const pierce = options?.pierceShadow ?? true;
  const wantIframes = options?.includeIframes !== false;
  const context = buildFrameContext(page);

  // Step 1: a selector-scoped snapshot short-circuits the full capture.
  const scoped = await tryScopedSnapshot(page, options, context, pierce);
  if (scoped) return scoped;

  // The root frame is always part of the capture, and always comes first
  // when it had to be added.
  const framesInScope = wantIframes ? [...context.frames] : [context.rootId];
  if (framesInScope.indexOf(context.rootId) === -1) {
    framesInScope.unshift(context.rootId);
  }

  // Steps 2 and 3.
  const sessionToIndex = await buildSessionIndexes(page, framesInScope, pierce);
  const perFrame = await collectPerFrameMaps(
    page,
    context,
    sessionToIndex,
    options,
    pierce,
    framesInScope,
  );

  // Step 4.
  const prefixes = await computeFramePrefixes(
    page,
    context,
    perFrame.perFrameMaps,
    framesInScope,
  );

  // Step 5.
  return mergeFramesIntoSnapshot(
    context,
    perFrame.perFrameMaps,
    perFrame.perFrameOutlines,
    prefixes.absPrefix,
    prefixes.iframeHostEncByChild,
    framesInScope,
  );
}
/**
 * Capture the page's frame topology once so later steps share one view of it.
 *
 * The result holds the main frame id, a flat frameId → parentId map (the
 * root maps to `null`) built from the page's protocol frame tree, and the
 * page's list of all frame ids.
 */
export function buildFrameContext(page: Page): FrameContext {
  const rootId = page.mainFrameId();
  const parentByFrame: FrameParentIndex = new Map();

  // Pre-order walk: a frame is registered before any of its descendants.
  const record = (
    node: Protocol.Page.FrameTree,
    parentId: string | null,
  ): void => {
    parentByFrame.set(node.frame.id, parentId);
    (node.childFrames ?? []).forEach((child) => record(child, node.frame.id));
  };
  record(page.asProtocolFrameTree(rootId), null);

  return { rootId, parentByFrame, frames: page.listAllFrameIds() };
}
Falling back to using full DOM`, level: 1, auxiliary: { arguments: { value: `selector: ${options?.focusSelector?.trim()}`, type: "string", }, }, }); }; try { let targetFrameId: string; let tailSelector: string | undefined; let absPrefix: string | undefined; const looksLikeXPath = /^xpath=/i.test(requestedFocus) || requestedFocus.startsWith("/"); if (looksLikeXPath) { const focus = normalizeXPath(requestedFocus); const hit = await resolveFocusFrameAndTail( page, focus, context.parentByFrame, context.rootId, ); targetFrameId = hit.targetFrameId; tailSelector = hit.tailXPath || undefined; absPrefix = hit.absPrefix; } else { const cssHit = await resolveCssFocusFrameAndTail( page, requestedFocus, context.parentByFrame, context.rootId, ); targetFrameId = cssHit.targetFrameId; tailSelector = cssHit.tailSelector || undefined; absPrefix = cssHit.absPrefix; } const owningSess = ownerSession(page, targetFrameId); const parentId = context.parentByFrame.get(targetFrameId); const sameSessionAsParent = !!parentId && ownerSession(page, parentId) === ownerSession(page, targetFrameId); const { tagNameMap, xpathMap, scrollableMap } = await domMapsForSession( owningSess, targetFrameId, pierce, (fid, be) => `${page.getOrdinal(fid)}-${be}`, sameSessionAsParent, ); const { outline, urlMap, scopeApplied } = await a11yForFrame( owningSess, targetFrameId, { focusSelector: tailSelector || undefined, tagNameMap, experimental: options?.experimental ?? false, scrollableMap, encode: (backendNodeId) => `${page.getOrdinal(targetFrameId)}-${backendNodeId}`, }, ); const scopedXpathMap: Record = {}; const abs = absPrefix ?? ""; const isRoot = !abs || abs === "/"; if (isRoot) { Object.assign(scopedXpathMap, xpathMap); } else { // Prefix relative XPaths so the scoped result matches the global encoding. 
/**
 * Step 2 – build one DOM index per unique CDP session.
 *
 * Frames are first grouped by the id of their owning session (sessions
 * without an id share the key `"root"`), keeping the first session seen per
 * key. `buildSessionDomIndex` then runs once per group, one after another.
 *
 * @param page   Page used to resolve each frame's owning session.
 * @param frames Frame ids that take part in the snapshot.
 * @param pierce Forwarded to `buildSessionDomIndex`.
 * @returns Map from session key to that session's DOM index.
 */
export async function buildSessionIndexes(
  page: Page,
  frames: string[],
  pierce: boolean,
): Promise<Map<string, SessionDomIndex>> {
  const uniqueSessions = new Map<string, CDPSessionLike>();
  frames.forEach((frameId) => {
    const session = ownerSession(page, frameId);
    const key = session.id ?? "root";
    if (!uniqueSessions.has(key)) uniqueSessions.set(key, session);
  });

  const indexes = new Map<string, SessionDomIndex>();
  for (const [key, session] of uniqueSessions) {
    indexes.set(key, await buildSessionDomIndex(session, pierce));
  }
  return indexes;
}
"root"; let idx = sessionToIndex.get(sid); if (!idx) { idx = await buildSessionDomIndex(sess, pierce); sessionToIndex.set(sid, idx); } const parentId = context.parentByFrame.get(frameId); const sameSessionAsParent = !!parentId && ownerSession(page, parentId) === sess; let docRootBe = idx.rootBackend; if (sameSessionAsParent) { try { const { backendNodeId } = await sess.send<{ backendNodeId?: number }>( "DOM.getFrameOwner", { frameId }, ); if (typeof backendNodeId === "number") { const cdBe = idx.contentDocRootByIframe.get(backendNodeId); if (typeof cdBe === "number") docRootBe = cdBe; } } catch { // } } const tagNameMap: Record = {}; const xpathMap: Record = {}; const scrollableMap: Record = {}; const enc = (be: number) => `${page.getOrdinal(frameId)}-${be}`; const baseAbs = idx.absByBe.get(docRootBe) ?? "/"; for (const [be, nodeAbs] of idx.absByBe.entries()) { const nodeDocRoot = idx.docRootOf.get(be); if (nodeDocRoot !== docRootBe) continue; // Translate absolute XPaths into document-relative ones for this frame. const rel = relativizeXPath(baseAbs, nodeAbs); const key = enc(be); xpathMap[key] = rel; const tag = idx.tagByBe.get(be); if (tag) tagNameMap[key] = tag; if (idx.scrollByBe.get(be)) scrollableMap[key] = true; } const { outline, urlMap } = await a11yForFrame(sess, frameId, { experimental: options?.experimental ?? false, tagNameMap, scrollableMap, encode: (backendNodeId) => `${page.getOrdinal(frameId)}-${backendNodeId}`, }); perFrameOutlines.push({ frameId, outline }); perFrameMaps.set(frameId, { tagNameMap, xpathMap, scrollableMap, urlMap }); } return { perFrameMaps, perFrameOutlines }; } /** * Step 4 – walk the frame tree (parent-first) to compute absolute prefixes for * every frame. The prefix is the absolute XPath of the iframe element hosting * the frame, so we can later convert relative XPaths into cross-frame ones. 
/**
 * Step 4 – compute, parent-first, the absolute XPath prefix of every frame.
 *
 * Starting at the root (prefix `""`), each in-scope child frame gets the
 * prefix of its parent extended by the XPath of the iframe element hosting
 * it, looked up in the parent's `xpathMap` under `<parentOrdinal>-<backendNodeId>`.
 * When the hosting element cannot be resolved, or its XPath is unknown, the
 * child simply inherits the parent's prefix.
 *
 * @param page         Page used for session and ordinal lookups.
 * @param context      Frame topology from `buildFrameContext`.
 * @param perFrameMaps Per-frame DOM maps from step 3.
 * @param frameIds     Frames taking part in the snapshot.
 * @returns `absPrefix` (frameId → prefix) and `iframeHostEncByChild`
 *          (child frameId → encoded id of its hosting iframe element; only
 *          set when the host's backend node id was resolved).
 */
export async function computeFramePrefixes(
  page: Page,
  context: FrameContext,
  perFrameMaps: Map<string, FrameDomMaps>,
  frameIds: string[],
): Promise<{
  absPrefix: Map<string, string>;
  iframeHostEncByChild: Map<string, string>;
}> {
  const inScope = new Set(frameIds);
  const absPrefix = new Map<string, string>();
  const iframeHostEncByChild = new Map<string, string>();
  absPrefix.set(context.rootId, "");

  // Best-effort: resolve the backend node id of the element hosting `childId`.
  const resolveOwnerBackendId = async (
    childId: string,
  ): Promise<number | undefined> => {
    const hostSession = parentSession(page, context.parentByFrame, childId);
    try {
      const reply = await hostSession.send<{ backendNodeId?: number }>(
        "DOM.getFrameOwner",
        { frameId: childId },
      );
      return reply.backendNodeId;
    } catch {
      return undefined;
    }
  };

  // Breadth-first walk; `pending` grows while it is being consumed.
  const pending: string[] = [];
  if (inScope.has(context.rootId)) pending.push(context.rootId);

  for (let head = 0; head < pending.length; head++) {
    const parentId = pending[head]!;
    const parentAbs = absPrefix.get(parentId)!;

    for (const childId of context.frames) {
      const isDirectChild = context.parentByFrame.get(childId) === parentId;
      if (!inScope.has(childId) || !isDirectChild) continue;
      pending.push(childId);

      const ownerBackendId = await resolveOwnerBackendId(childId);
      if (!ownerBackendId) {
        // OOPIFs resolved via a different session inherit the parent prefix.
        absPrefix.set(childId, parentAbs);
        continue;
      }

      const hostEnc = `${page.getOrdinal(parentId)}-${ownerBackendId}`;
      const hostXPath = perFrameMaps.get(parentId)?.xpathMap[hostEnc];
      absPrefix.set(
        childId,
        hostXPath ? prefixXPath(parentAbs || "/", hostXPath) : parentAbs,
      );
      iframeHostEncByChild.set(childId, hostEnc);
    }
  }

  return { absPrefix, iframeHostEncByChild };
}
/**
 * Step 5 – fold the per-frame results into the combined snapshot payload.
 *
 * - XPaths of frames whose prefix is empty or `/` are copied verbatim; all
 *   other frames have each XPath passed through `prefixXPath` with the
 *   frame's absolute prefix.
 * - URL maps are merged as-is.
 * - Each outline whose frame has a known iframe host is registered under the
 *   host's encoded id and handed to `injectSubtrees` together with the root
 *   outline (the root frame's outline, else the first outline, else `""`).
 *
 * Frames listed in `frameIds` without an entry in `perFrameMaps` contribute
 * nothing to the combined maps.
 */
export function mergeFramesIntoSnapshot(
  context: FrameContext,
  perFrameMaps: Map<string, FrameDomMaps>,
  perFrameOutlines: Array<{ frameId: string; outline: string }>,
  absPrefix: Map<string, string>,
  iframeHostEncByChild: Map<string, string>,
  frameIds: string[],
): HybridSnapshot {
  const combinedXpathMap: Record<string, string> = {};
  const combinedUrlMap: Record<string, string> = {};

  for (const frameId of frameIds) {
    const frameMaps = perFrameMaps.get(frameId);
    if (!frameMaps) continue;

    const prefix = absPrefix.get(frameId) ?? "";
    if (prefix === "" || prefix === "/") {
      Object.assign(combinedXpathMap, frameMaps.xpathMap);
    } else {
      Object.entries(frameMaps.xpathMap).forEach(([encId, relative]) => {
        combinedXpathMap[encId] = prefixXPath(prefix, relative);
      });
    }
    Object.assign(combinedUrlMap, frameMaps.urlMap);
  }

  // Keyed by the hosting iframe's encoded id so injectSubtrees can nest lines.
  const subtreeByHost = new Map<string, string>();
  let rootOutline: string | undefined;
  for (const entry of perFrameOutlines) {
    const hostEnc = iframeHostEncByChild.get(entry.frameId);
    if (hostEnc) subtreeByHost.set(hostEnc, entry.outline);
    // Remember only the first outline belonging to the root frame.
    if (rootOutline === undefined && entry.frameId === context.rootId) {
      rootOutline = entry.outline;
    }
  }
  const baseOutline = rootOutline ?? perFrameOutlines[0]?.outline ?? "";

  return {
    combinedTree: injectSubtrees(baseOutline, subtreeByHost),
    combinedXpathMap,
    combinedUrlMap,
    perFrame: perFrameOutlines.map((entry) => {
      const frameMaps = perFrameMaps.get(entry.frameId);
      return {
        frameId: entry.frameId,
        outline: entry.outline,
        xpathMap: frameMaps?.xpathMap ?? {},
        urlMap: frameMaps?.urlMap ?? {},
      };
    }),
  };
}
{ contextId: ctxId, expression: scrollExpr, returnByValue: true, } : { expression: scrollExpr, returnByValue: true }; const { result } = await curSession.send<{ result: { value?: { sx?: number; sy?: number } }; }>("Runtime.evaluate", evalParams); sx = Number(result?.value?.sx ?? 0); sy = Number(result?.value?.sy ?? 0); } catch { // } const xi = Math.max(0, Math.floor(curX + sx)); const yi = Math.max(0, Math.floor(curY + sy)); let res: { backendNodeId?: number; frameId?: string } | undefined; try { res = await curSession.send<{ backendNodeId?: number; frameId?: string; }>("DOM.getNodeForLocation", { x: xi, y: yi, includeUserAgentShadowDOM: false, ignorePointerEventsNone: false, }); } catch { return null; } const be = res?.backendNodeId; const reportedFrameId = res?.frameId; if ( typeof be === "number" && reportedFrameId && reportedFrameId !== curFrameId ) { const abs = await buildAbsoluteXPathFromChain( iframeChain, curSession, be, ); return abs ? { frameId: reportedFrameId, backendNodeId: be, absoluteXPath: abs } : null; } if (typeof be !== "number") return null; let matchedChild: string | undefined; for (const fid of listChildrenOf(parentByFrame, curFrameId)) { try { const { backendNodeId } = await curSession.send<{ backendNodeId?: number; }>("DOM.getFrameOwner", { frameId: fid }); if (backendNodeId === be) { matchedChild = fid; break; } } catch { continue; } } if (!matchedChild) { const abs = await buildAbsoluteXPathFromChain( iframeChain, curSession, be, ); return abs ? 
/**
 * Tell whether an error message carries the marker
 * `CBOR: stack limit exceeded`, the failure the depth-retry loops in this
 * module treat as "try again with a shallower depth".
 */
function isCborStackError(message: string): boolean {
  const marker = "CBOR: stack limit exceeded";
  return message.indexOf(marker) !== -1;
}
/**
 * Decide whether a node's children were truncated by CDP.
 *
 * True when the node declares more children (`childNodeCount`) than are
 * actually present in `children`; missing values count as zero.
 */
export function shouldExpandNode(node: Protocol.DOM.Node): boolean {
  const declared = node.childNodeCount ?? 0;
  const realized = node.children ? node.children.length : 0;
  return realized < declared;
}

/**
 * Copy the structural fields of an expanded node onto the shallow original.
 *
 * For `childNodeCount`, `children`, `shadowRoots` and `contentDocument` the
 * value from `source` wins unless it is null/undefined, in which case the
 * target keeps what it had. `target` is mutated in place.
 */
export function mergeDomNodes(
  target: Protocol.DOM.Node,
  source: Protocol.DOM.Node,
): void {
  Object.assign(target, {
    childNodeCount: source.childNodeCount ?? target.childNodeCount,
    children: source.children ?? target.children,
    shadowRoots: source.shadowRoots ?? target.shadowRoots,
    contentDocument: source.contentDocument ?? target.contentDocument,
  });
}

/**
 * List every node reachable in one hop: regular children, then shadow
 * roots, then the content document (when present).
 */
export function collectDomTraversalTargets(
  node: Protocol.DOM.Node,
): Protocol.DOM.Node[] {
  return [
    ...(node.children ? node.children : []),
    ...(node.shadowRoots ? node.shadowRoots : []),
    ...(node.contentDocument ? [node.contentDocument] : []),
  ];
}
/**
 * Fetch the session's DOM tree, retrying `DOM.getDocument` with each depth in
 * `DOM_DEPTH_ATTEMPTS` until one succeeds.
 *
 * Only failures recognised by `isCborStackError` trigger the next attempt;
 * any other error is rethrown unchanged. Whenever the successful depth is
 * not `-1`, the tree is passed through `hydrateDomTree` before it is returned.
 *
 * @throws StagehandDomProcessError when every depth failed with a CBOR stack error.
 */
export async function getDomTreeWithFallback(
  session: CDPSessionLike,
  pierce: boolean,
): Promise<Protocol.DOM.Node> {
  let lastCborFailure = "";

  for (let attempt = 0; attempt < DOM_DEPTH_ATTEMPTS.length; attempt++) {
    const depth = DOM_DEPTH_ATTEMPTS[attempt]!;
    try {
      const response = await session.send<{ root: Protocol.DOM.Node }>(
        "DOM.getDocument",
        { depth, pierce },
      );
      const tree = response.root;
      const isFullDepth = depth === -1;
      if (!isFullDepth) {
        // A depth-limited tree is missing branches; fill them in.
        await hydrateDomTree(session, tree, pierce);
      }
      return tree;
    } catch (err) {
      const text = err instanceof Error ? err.message : String(err);
      if (!isCborStackError(text)) throw err;
      lastCborFailure = text;
    }
  }

  if (lastCborFailure) {
    throw new StagehandDomProcessError(
      `CDP DOM.getDocument failed after adaptive depth retries: ${lastCborFailure}`,
    );
  }
  throw new StagehandDomProcessError(
    "CDP DOM.getDocument failed after adaptive depth retries.",
  );
}
[]; if (kids.length) { const segs = buildChildXPathSegments(kids); for (let i = kids.length - 1; i >= 0; i--) { const child = kids[i]!; const step = segs[i]!; stack.push({ node: child, xpath: joinXPath(xpath, step), }); } } for (const sr of node.shadowRoots ?? []) { stack.push({ node: sr, xpath: joinXPath(xpath, "//"), }); } } return { tagNameMap, xpathMap, scrollableMap }; } /** * Build an index of absolute XPath/tag metadata for an entire CDP session. * Once the index is cached, per-frame slices are derived without extra DOM * calls, which keeps snapshot capture linear in the number of frames. */ export async function buildSessionDomIndex( session: CDPSessionLike, pierce: boolean, ): Promise { await session.send("DOM.enable").catch(() => {}); const root = await getDomTreeWithFallback(session, pierce); const absByBe = new Map(); const tagByBe = new Map(); const scrollByBe = new Map(); const docRootOf = new Map(); const contentDocRootByIframe = new Map(); type Entry = { node: Protocol.DOM.Node; xp: string; docRootBe: number }; const rootBe = root.backendNodeId!; const stack: Entry[] = [{ node: root, xp: "/", docRootBe: rootBe }]; while (stack.length) { const { node, xp, docRootBe } = stack.pop()!; if (node.backendNodeId) { absByBe.set(node.backendNodeId, xp || "/"); tagByBe.set(node.backendNodeId, String(node.nodeName).toLowerCase()); if (node?.isScrollable === true) scrollByBe.set(node.backendNodeId, true); docRootOf.set(node.backendNodeId, docRootBe); } const kids = node.children ?? []; if (kids.length) { const segs = buildChildXPathSegments(kids); for (let i = kids.length - 1; i >= 0; i--) { const child = kids[i]!; const step = segs[i]!; stack.push({ node: child, xp: joinXPath(xp, step), docRootBe }); } } for (const sr of node.shadowRoots ?? 
/**
 * Relativize an absolute XPath against a document root's absolute path.
 *
 * Both inputs are first passed through `normalizeXPath`. The result is:
 * - `/` when the node IS the document root;
 * - the remainder after the base, always starting with `/` (or `//`), when
 *   the node lives below the base;
 * - the node's absolute path, unchanged, when it lives outside the document.
 *
 * The base only counts as a prefix when it ends on a step boundary, so a
 * base of `/a/div` does not match `/a/div2/span`.
 *
 * @param baseAbs Absolute XPath of the document root.
 * @param nodeAbs Absolute XPath of the node to relativize.
 */
export function relativizeXPath(baseAbs: string, nodeAbs: string): string {
  const base = normalizeXPath(baseAbs);
  const abs = normalizeXPath(nodeAbs);
  if (abs === base) return "/";
  if (!abs.startsWith(base)) return abs;

  const tail = abs.slice(base.length);
  if (tail.startsWith("/")) return tail;

  // An empty base, or one already ending in "/", ends on a step boundary,
  // so the tail is a genuine remainder that only lacks its leading slash.
  if (base === "" || base.endsWith("/")) return `/${tail}`;

  // Otherwise the base merely matched the beginning of a longer step name:
  // the node is not inside this document.
  return abs;
}
/**
 * Parse a cross-frame XPath into discrete steps. Each step tracks whether it
 * represents a descendant hop (“//”) or a single-child hop (“/”).
 *
 * A "/" only terminates a step when it appears outside a predicate. Slashes
 * inside `[...]` (including inside quoted strings within a predicate, such as
 * `a[@href='/frame/x']`) are kept as part of the step, so they can neither
 * split a step in two nor produce a fake `frame`/`iframe` step.
 *
 * @param path - XPath string; surrounding whitespace is ignored.
 * @returns The ordered steps. For each step `raw` is the trimmed step text
 *          and `name` is `raw` lower-cased with a trailing numeric index
 *          (e.g. `[2]`) removed. Empty steps are skipped.
 */
export function parseXPathToSteps(path: string): Step[] {
  const s = path.trim();
  const steps: Step[] = [];
  let i = 0;

  while (i < s.length) {
    // Consume the axis separator that introduces this step (if any).
    let axis: Axis = "child";
    if (s.startsWith("//", i)) {
      axis = "desc";
      i += 2;
    } else if (s[i] === "/") {
      i += 1;
    }

    // Scan to the next step separator, ignoring slashes inside predicates.
    const start = i;
    let depth = 0; // nesting level of `[...]`
    let quote = ""; // active quote character while inside a string literal
    for (; i < s.length; i++) {
      const ch = s[i]!;
      if (quote) {
        if (ch === quote) quote = "";
      } else if (depth > 0 && (ch === '"' || ch === "'")) {
        // String literals are only meaningful inside a predicate.
        quote = ch;
      } else if (ch === "[") {
        depth++;
      } else if (ch === "]") {
        if (depth > 0) depth--;
      } else if (ch === "/" && depth === 0) {
        break;
      }
    }

    const raw = s.slice(start, i).trim();
    // Nothing between two separators (e.g. "///"): the next iteration
    // consumes the pending slash, so the loop always makes progress.
    if (!raw) continue;

    const name = raw.replace(/\[\d+\]\s*$/u, "").toLowerCase();
    steps.push({ axis, raw, name });
  }

  return steps;
}
"//" : "/"; out += st.raw; } return out || "/"; } export const IFRAME_STEP_RE = /^i?frame(?:\[\d+])?$/i; /** * Given a cross-frame XPath, walk iframe steps to resolve: * - the target frameId (last iframe hop) * - the tail XPath (within the target frame) * - the absolute XPath prefix up to the iframe element hosting that frame */ export async function resolveFocusFrameAndTail( page: Page, absoluteXPath: string, parentByFrame: FrameParentIndex, rootId: string, ): Promise { const steps = parseXPathToSteps(absoluteXPath); let ctxFrameId = rootId; let buf: Step[] = []; let absPrefix = ""; const flushIntoChild = async (): Promise => { if (!buf.length) return; const selectorForIframe = buildXPathFromSteps(buf); const parentSess = page.getSessionForFrame(ctxFrameId); const objectId = await resolveObjectIdForXPath( parentSess, selectorForIframe, ctxFrameId, ); if (!objectId) throw new StagehandIframeError( selectorForIframe, "Failed to resolve iframe element by XPath", ); try { await parentSess.send("DOM.enable").catch(() => {}); const desc = await parentSess.send( "DOM.describeNode", { objectId }, ); const iframeBackendNodeId = desc.node.backendNodeId; let childFrameId: string | undefined; for (const fid of listChildrenOf(parentByFrame, ctxFrameId)) { try { const { backendNodeId } = await parentSess.send<{ backendNodeId: number; }>("DOM.getFrameOwner", { frameId: fid }); if (backendNodeId === iframeBackendNodeId) { childFrameId = fid; break; } } catch { continue; } } if (!childFrameId) throw new StagehandIframeError( selectorForIframe, "Could not map iframe to child frameId", ); absPrefix = prefixXPath(absPrefix || "/", selectorForIframe); ctxFrameId = childFrameId; } finally { await parentSess .send("Runtime.releaseObject", { objectId }) .catch(() => {}); } buf = []; }; for (const st of steps) { buf.push(st); if (IFRAME_STEP_RE.test(st.name)) { await flushIntoChild(); } } const tailXPath = buildXPathFromSteps(buf); return { targetFrameId: ctxFrameId, tailXPath, absPrefix 
/**
 * Resolve focus frame and tail CSS selector using '>>' to hop iframes.
 *
 * The selector is split on ">>"; every part except the last is treated as a
 * selector for an iframe element in the current frame, and the last part is
 * the selector to apply inside the final frame.
 *
 * @param page - Page used to obtain the CDP session for each frame.
 * @param rawSelector - CSS selector, optionally containing ">>" iframe hops.
 * @param parentByFrame - Frame parent index used to enumerate the child
 *                        frames of the frame currently being searched.
 * @param rootId - Frame id in which resolution starts.
 * @returns The frame id reached after all hops, the tail selector ("*" when
 *          the selector has no non-empty parts) and `absPrefix`, which this
 *          function always leaves as the empty string.
 * @throws StagehandIframeError when a hop's selector does not resolve to an
 *         element, or the resolved element cannot be matched to a child frame.
 */
export async function resolveCssFocusFrameAndTail(
  page: Page,
  rawSelector: string,
  parentByFrame: FrameParentIndex,
  rootId: string,
): Promise {
  // Split into hops, dropping empty segments (e.g. leading/trailing ">>").
  const parts = rawSelector
    .split(">>")
    .map((s) => s.trim())
    .filter(Boolean);
  let ctxFrameId = rootId;
  const absPrefix = "";

  // Every part but the last names an iframe element to descend into.
  for (let i = 0; i < Math.max(0, parts.length - 1); i++) {
    const parentSess = page.getSessionForFrame(ctxFrameId);
    const objectId = await resolveObjectIdForCss(
      parentSess,
      parts[i]!,
      ctxFrameId,
    );
    if (!objectId)
      throw new StagehandIframeError(
        parts[i]!,
        "Failed to resolve iframe via CSS hop",
      );
    try {
      // Best-effort: a failure to enable the DOM domain is ignored.
      await parentSess.send("DOM.enable").catch(() => {});
      const desc = await parentSess.send(
        "DOM.describeNode",
        { objectId },
      );
      const iframeBackendNodeId = desc.node.backendNodeId;

      // Find the child frame whose owner element is the resolved iframe.
      let childFrameId: string | undefined;
      for (const fid of listChildrenOf(parentByFrame, ctxFrameId)) {
        try {
          const { backendNodeId } = await parentSess.send<{
            backendNodeId: number;
          }>("DOM.getFrameOwner", { frameId: fid });
          if (backendNodeId === iframeBackendNodeId) {
            childFrameId = fid;
            break;
          }
        } catch {
          // A candidate frame whose owner lookup fails is simply skipped.
          continue;
        }
      }
      if (!childFrameId)
        throw new StagehandIframeError(
          parts[i]!,
          "Could not map CSS iframe hop to child frameId",
        );
      ctxFrameId = childFrameId;
    } finally {
      // Always release the remote handle, even when the hop failed.
      await parentSess
        .send("Runtime.releaseObject", { objectId })
        .catch(() => {});
    }
  }

  // With no parts at all, fall back to matching any element.
  const tailSelector = parts[parts.length - 1] ?? "*";
  return { targetFrameId: ctxFrameId, tailSelector, absPrefix };
}
undefined, ); } } catch { contextId = undefined; } const expr = buildLocatorInvocation("resolveXPathMainWorld", [ JSON.stringify(xpath), "0", ]); const { result, exceptionDetails } = await session.send<{ result: { objectId?: string | undefined }; exceptionDetails?: Protocol.Runtime.ExceptionDetails; }>("Runtime.evaluate", { expression: expr, returnByValue: false, contextId, awaitPromise: true, }); if (exceptionDetails) return null; return result?.objectId ?? null; } /** Resolve a CSS selector (supports '>>' within the same frame only) to a Runtime objectId. */ export async function resolveObjectIdForCss( session: CDPSessionLike, selector: string, frameId?: string, ): Promise { let contextId: number | undefined; try { if (frameId) { contextId = await executionContexts .waitForMainWorld(session, frameId, 800) .catch( () => executionContexts.getMainWorld(session, frameId) ?? undefined, ); } } catch { contextId = undefined; } const primaryExpr = buildLocatorInvocation("resolveCssSelector", [ JSON.stringify(selector), "0", ]); const fallbackExpr = buildLocatorInvocation("resolveCssSelectorPierce", [ JSON.stringify(selector), "0", ]); const evaluate = async (expression: string): Promise => { const { result, exceptionDetails } = await session.send<{ result: { objectId?: string | undefined }; exceptionDetails?: Protocol.Runtime.ExceptionDetails; }>("Runtime.evaluate", { expression, returnByValue: false, contextId, awaitPromise: true, }); if (exceptionDetails) return null; return result?.objectId ?? 
/**
 * List the ids of all frames whose recorded parent is `parentId`.
 *
 * @param parentByFrame - Index mapping each frame id to its parent frame id.
 * @param parentId - Frame id whose direct children are wanted.
 * @returns Child frame ids, in the iteration order of `parentByFrame`.
 */
export function listChildrenOf(
  parentByFrame: FrameParentIndex,
  parentId: string,
): string[] {
  return Array.from(parentByFrame.entries())
    .filter(([, parent]) => parent === parentId)
    .map(([frameId]) => frameId);
}
null; if (!parentId) { return page.getSessionForFrame(frameId); } return page.getSessionForFrame(parentId); } ================================================ FILE: packages/core/lib/v3/understudy/a11y/snapshot/treeFormatUtils.ts ================================================ import type { A11yNode } from "../../../types/private/snapshot.js"; /** * Render a formatted outline (with encoded ids) for the accessibility tree. * Keeps indentation logic shared between modules so unit tests can cover these * pure formatting helpers without a full snapshot pipeline. */ export function formatTreeLine(node: A11yNode, level = 0): string { const indent = " ".repeat(level); const labelId = node.encodedId ?? node.nodeId; const label = `[${labelId}] ${node.role}${node.name ? `: ${cleanText(node.name)}` : ""}`; const kids = node.children?.map((c) => formatTreeLine(c, level + 1)).join("\n") ?? ""; return kids ? `${indent}${label}\n${kids}` : `${indent}${label}`; } /** * Inject each child frame outline under the parent's iframe node line. * Keys in `idToTree` are the parent's iframe encoded ids. */ export function injectSubtrees( rootOutline: string, idToTree: Map, ): string { type Frame = { lines: string[]; i: number }; const out: string[] = []; const visited = new Set(); const stack: Frame[] = [{ lines: rootOutline.split("\n"), i: 0 }]; while (stack.length) { const top = stack[stack.length - 1]; if (top.i >= top.lines.length) { stack.pop(); continue; } const raw = top.lines[top.i++]; out.push(raw); const indent = raw.match(/^(\s*)/)?.[1] ?? 
/**
 * Return the lines that appear in `nextTree` but not in `prevTree`.
 *
 * Lines are compared after trimming, so indentation differences alone do not
 * count as a change; blank lines are ignored. The surviving lines keep their
 * relative indentation but are shifted left so that the least-indented one
 * starts at column 0.
 *
 * @param prevTree - Previous outline (falsy values are treated as empty).
 * @param nextTree - New outline (falsy values are treated as empty).
 * @returns The added lines joined with "\n", or "" when nothing was added.
 */
export function diffCombinedTrees(prevTree: string, nextTree: string): string {
  // Collect every non-blank line of the previous tree, ignoring indentation.
  const known = new Set<string>();
  for (const rawLine of (prevTree || "").split("\n")) {
    const trimmed = rawLine.trim();
    if (trimmed) known.add(trimmed);
  }

  // Keep the original (indented) text of each line that is new.
  const fresh = (nextTree || "").split("\n").filter((rawLine) => {
    const trimmed = rawLine.trim();
    return trimmed.length > 0 && !known.has(trimmed);
  });
  if (fresh.length === 0) return "";

  // Smallest amount of leading whitespace across the new lines.
  const shift = fresh.reduce(
    (smallest, rawLine) =>
      Math.min(smallest, rawLine.length - rawLine.trimStart().length),
    Infinity,
  );

  return fresh.map((rawLine) => rawLine.slice(shift)).join("\n");
}
/**
 * Collapse all whitespace runs in a string to a single space without trimming.
 * Exported for pruning routines that need the same normalization.
 *
 * Leading and trailing whitespace is therefore reduced to one space each
 * rather than removed.
 *
 * @param s - Text to normalize.
 * @returns `s` with every run of whitespace characters replaced by " ".
 */
export function normaliseSpaces(s: string): string {
  // One pass over the string; avoids running a regex test per character and
  // building the result through repeated concatenation.
  return s.replace(/\s+/g, " ");
}
/**
 * Resolve a backend node to an absolute XPath within the provided session.
 * The CDP Runtime is used so we can invoke a small helper that walks the DOM.
 *
 * The node is first resolved to a remote object, the
 * `nodeToAbsoluteXPath` script is called on it, and the remote object is then
 * released. Any failure along the way yields `null`; this function does not
 * throw.
 *
 * @param session - CDP session that owns the node.
 * @param backendNodeId - Backend node id to resolve.
 * @returns The non-empty XPath string reported by the helper, otherwise `null`.
 */
export async function absoluteXPathForBackendNode(
  session: CDPSessionLike,
  backendNodeId: number,
): Promise<string | null> {
  let objectId: string | undefined;
  try {
    const { object } = await session.send<{ object: { objectId?: string } }>(
      "DOM.resolveNode",
      { backendNodeId },
    );
    objectId = object?.objectId;
    if (!objectId) return null;

    const { result } = await session.send<{ result: { value?: string } }>(
      "Runtime.callFunctionOn",
      {
        objectId,
        functionDeclaration: a11yScriptSources.nodeToAbsoluteXPath,
        returnByValue: true,
      },
    );
    return typeof result?.value === "string" && result.value
      ? result.value
      : null;
  } catch {
    return null;
  } finally {
    // Release the handle on every path. Previously a failing
    // Runtime.callFunctionOn skipped the release and leaked the remote object.
    if (objectId) {
      try {
        await session.send("Runtime.releaseObject", { objectId });
      } catch {
        // Best-effort cleanup: a failed release must not mask the result.
      }
    }
  }
}
/**
 * Join two XPath fragments while preserving special shadow-root hops.
 *
 * A `step` of "//" marks a shadow-root hop: the result is `base` extended so
 * that it ends in a double slash. Any other step is appended with a single
 * separating slash, unless `base` already ends in "//", in which case the
 * step is appended directly.
 *
 * @param base - XPath built so far; "" and "/" both mean the root.
 * @param step - Next step, "//" for a shadow hop, or "" for no step.
 * @returns The combined XPath.
 */
export function joinXPath(base: string, step: string): string {
  const atRoot = !base || base === "/";

  if (step === "//") {
    if (atRoot) return "//";
    // Pad with just enough slashes for `base` to end in a hop marker.
    const padding = base.endsWith("/") ? "/" : "//";
    return base + padding;
  }

  if (atRoot) {
    return step ? "/" + step : "/";
  }

  // Directly after a shadow hop the separator is already in place.
  if (base.endsWith("//")) {
    return base + step;
  }

  return step ? base + "/" + step : base;
}
export function buildA11yInvocation(
  name: A11yScriptName,
  args: string[],
): string {
  // Arguments are already source-text expressions; they are only joined here.
  const argList = args.join(", ");
  // Global reference under which the requested helper is looked up.
  const helperRef = a11yScriptGlobalRefs[name];
  // The bootstrap source is emitted first, then the helper is called and its
  // value returned from the self-invoking wrapper.
  return `(() => { ${a11yScriptBootstrap}; return ${helperRef}(${argList}); })()`;
}

(event: string, handler: (params: P) => void): void; off

(event: string, handler: (params: P) => void): void; close(): Promise; readonly id: string | null; } type Inflight = { resolve: (v: unknown) => void; reject: (e: Error) => void; sessionId?: string | null; method: string; params?: object; stack?: string; ts: number; flowLoggerContext?: FlowLoggerContext | null; // Snapshot of the flow context captured when the request was sent; response handling re-enters this if ALS is gone. cdpCallEvent?: Pick | null; // The emitted CdpCallEvent identity; later response/error events attach under this exact parent. }; type EventHandler = (params: unknown) => void; type SessionDispatchWaiter = { sessionId: string; method: string; match?: (params?: object) => boolean; resolve: () => void; reject: (error: Error) => void; }; type RawMessage = | { id: number; result?: unknown; error?: { code: number; message: string; data?: unknown }; sessionId?: string; } | { method: string; params?: unknown; sessionId?: string }; export class CdpConnection implements CDPSessionLike { private ws: WebSocket; private nextId = 1; private inflight = new Map(); // Outstanding request records; `_sendViaSession()` inserts and `onMessage()` removes/resolves them. private latestCdpCallEvent = new Map< // Most recent CDP call per session/root; `_sendViaSession()` refreshes it and later unsolicited messages reuse it as their parent anchor. string | null, { flowLoggerContext: FlowLoggerContext; // Flow context captured when the latest call on this session/root was emitted. cdpCallEvent: Pick; // Identity of that latest call event; unsolicited messages reuse it as their parent. 
} >(); private eventHandlers = new Map>(); private sessions = new Map(); /** Maps sessionId -> targetId (1:1 mapping) */ private sessionToTarget = new Map(); private sessionDispatchWaiters = new Set(); public readonly id: string | null = null; // root private transportCloseHandlers = new Set<(why: string) => void>(); public flowLoggerContext?: FlowLoggerContext; // Instance-owned fallback flow context; V3 sets this once and later sends/callbacks re-enter it when ALS is absent. public onTransportClosed(handler: (why: string) => void): void { this.transportCloseHandlers.add(handler); } public offTransportClosed(handler: (why: string) => void): void { this.transportCloseHandlers.delete(handler); } private emitTransportClosed(why: string) { for (const h of this.transportCloseHandlers) { try { h(why); } catch { // } } } private constructor(ws: WebSocket) { this.ws = ws; this.ws.on("close", (code, reason) => { // Reason is a Buffer in ws; stringify defensively const why = `socket-close code=${code} reason=${String(reason || "")}`; this.rejectAllInflight(why); this.emitTransportClosed(why); }); this.ws.on("error", (err) => { const why = `socket-error ${err?.message ?? 
String(err)}`; this.rejectAllInflight(why); this.emitTransportClosed(why); }); this.ws.on("message", (data) => this.onMessage(data.toString())); } static async connect( wsUrl: string, options?: { headers?: Record }, ): Promise { // Include User-Agent header for server-side observability and version tracking // Merge user-provided headers, letting them override defaults const headers = { "User-Agent": `Stagehand/${STAGEHAND_VERSION}`, ...options?.headers, }; const ws = new WebSocket(wsUrl, { headers }); await new Promise((resolve, reject) => { ws.once("open", () => resolve()); ws.once("error", (e) => reject(e)); }); return new CdpConnection(ws); } async enableAutoAttach(): Promise { await this.send("Target.setAutoAttach", { autoAttach: true, flatten: true, waitForDebuggerOnStart: true, }); await this.send("Target.setDiscoverTargets", { discover: true }); } async send(method: string, params?: object): Promise { const id = this.nextId++; const payload = { id, method, params }; const stack = new Error().stack?.split("\n").slice(1, 4).join("\n"); const flowLoggerContext = FlowLogger.resolveContext(this.flowLoggerContext); const cdpCallEvent = flowLoggerContext ? FlowLogger.logCdpCallEvent(flowLoggerContext, { method, params, targetId: null, }) : null; if (flowLoggerContext && cdpCallEvent) { this.latestCdpCallEvent.set(null, { flowLoggerContext, cdpCallEvent, }); } const p = new Promise((resolve, reject) => { this.inflight.set(id, { resolve, reject, sessionId: null, method, params, stack, ts: Date.now(), flowLoggerContext, cdpCallEvent, }); }); // Prevent unhandledRejection if a session detaches before the caller awaits. void p.catch(() => {}); this.ws.send(JSON.stringify(payload)); return p; } on

(event: string, handler: (params: P) => void): void { const set = this.eventHandlers.get(event) ?? new Set(); set.add(handler as EventHandler); this.eventHandlers.set(event, set); } off

(event: string, handler: (params: P) => void): void { const set = this.eventHandlers.get(event); if (set) set.delete(handler as EventHandler); } async close(): Promise { if (this.ws.readyState === WebSocket.CLOSED) return; await new Promise((resolve) => { this.ws.once("close", () => resolve()); this.ws.close(); }); } private rejectAllInflight(why: string): void { for (const [id, entry] of this.inflight.entries()) { entry.reject(new CdpConnectionClosedError(why)); this.inflight.delete(id); } this.latestCdpCallEvent.clear(); for (const waiter of Array.from(this.sessionDispatchWaiters)) { waiter.reject(new CdpConnectionClosedError(why)); } } getSession(sessionId: string): CdpSession | undefined { return this.sessions.get(sessionId); } waitForSessionDispatch( sessionId: string, method: string, match?: (params?: object) => boolean, ): Promise { return new Promise((resolve, reject) => { const waiter: SessionDispatchWaiter = { sessionId, method, match, resolve: () => { this.sessionDispatchWaiters.delete(waiter); resolve(); }, reject: (error: Error) => { this.sessionDispatchWaiters.delete(waiter); reject(error); }, }; this.sessionDispatchWaiters.add(waiter); }); } async attachToTarget(targetId: string): Promise { const { sessionId } = (await this.send<{ sessionId: string }>( "Target.attachToTarget", { targetId, flatten: true }, )) as { sessionId: string }; let session = this.sessions.get(sessionId); if (!session) { session = new CdpSession(this, sessionId); this.sessions.set(sessionId, session); } this.sessionToTarget.set(sessionId, targetId); return session; } async getTargets(): Promise { const res = await this.send<{ targetInfos: Protocol.Target.TargetInfo[]; }>("Target.getTargets"); return res.targetInfos; } private onMessage(json: string): void { const msg = JSON.parse(json) as RawMessage; if ("id" in msg) { const rec = this.inflight.get(msg.id); if (!rec) return; this.inflight.delete(msg.id); if ("error" in msg && msg.error) { // Response/error events only make sense 
if the original send captured // both a flow context to re-enter and the emitted CdpCallEvent to hang // the terminal edge under. if (rec.flowLoggerContext && rec.cdpCallEvent) { let targetId: string | null; if (rec.sessionId) { const mappedTargetId = this.sessionToTarget.get(rec.sessionId); if (mappedTargetId) { targetId = mappedTargetId; } else { targetId = rec.sessionId; } } else { targetId = null; } FlowLogger.logCdpResponseEvent( rec.flowLoggerContext, rec.cdpCallEvent, { method: rec.method, error: `${msg.error.code} ${msg.error.message}`, targetId, }, ); } rec.reject(new Error(`${msg.error.code} ${msg.error.message}`)); } else { // Successful responses reuse the same cached call context so the // response lands under the exact CdpCallEvent emitted at send time. if (rec.flowLoggerContext && rec.cdpCallEvent) { let targetId: string | null; if (rec.sessionId) { const mappedTargetId = this.sessionToTarget.get(rec.sessionId); if (mappedTargetId) { targetId = mappedTargetId; } else { targetId = rec.sessionId; } } else { targetId = null; } FlowLogger.logCdpResponseEvent( rec.flowLoggerContext, rec.cdpCallEvent, { method: rec.method, result: (msg as { result?: unknown }).result, targetId, }, ); } rec.resolve((msg as { result?: unknown }).result); } return; } if ("method" in msg) { if (msg.method === "Target.attachedToTarget") { const p = (msg as { params: Protocol.Target.AttachedToTargetEvent }) .params; if (!this.sessions.has(p.sessionId)) { this.sessions.set(p.sessionId, new CdpSession(this, p.sessionId)); } this.sessionToTarget.set(p.sessionId, p.targetInfo.targetId); } else if (msg.method === "Target.detachedFromTarget") { const p = (msg as { params: Protocol.Target.DetachedFromTargetEvent }) .params; for (const [id, entry] of this.inflight.entries()) { if (entry.sessionId === p.sessionId) { entry.reject( new PageNotFoundError( `target closed before CDP response (sessionId=${p.sessionId}, targetId=${p.targetId})`, ), ); this.inflight.delete(id); } } for (const 
waiter of Array.from(this.sessionDispatchWaiters)) { if (waiter.sessionId === p.sessionId) { waiter.reject( new PageNotFoundError( `target closed before CDP send (sessionId=${p.sessionId}, targetId=${p.targetId})`, ), ); } } this.sessions.delete(p.sessionId); this.sessionToTarget.delete(p.sessionId); this.latestCdpCallEvent.delete(p.sessionId); } else if (msg.method === "Target.targetDestroyed") { const p = (msg as { params: { targetId: string } }).params; // Remove any session mapping for this target for (const [sessionId, targetId] of this.sessionToTarget.entries()) { if (targetId === p.targetId) { this.sessionToTarget.delete(sessionId); this.latestCdpCallEvent.delete(sessionId); break; } } } const { method, params, sessionId } = msg; const latestCdpCallEvent = this.latestCdpCallEvent.get(sessionId ?? null) ?? (sessionId ? this.latestCdpCallEvent.get(null) : null); let targetId: string | null; if (sessionId) { const mappedTargetId = this.sessionToTarget.get(sessionId); if (mappedTargetId) { targetId = mappedTargetId; } else { targetId = sessionId; } } else { targetId = null; } // Unsolicited protocol messages are attached under the most recent call on // that session/root when one is known, so later callbacks still show up // in the same flow subtree. if (latestCdpCallEvent) { FlowLogger.logCdpMessageEvent( latestCdpCallEvent.flowLoggerContext, latestCdpCallEvent.cdpCallEvent, { method, params, targetId, }, ); } const dispatch = () => { if (sessionId) { const session = this.sessions.get(sessionId); session?.dispatch(method, params); // Forward target lifecycle events to root listeners as well. // Some browsers emit these via a parent session rather than the root // connection; fan-out keeps target tracking consistent. 
if (method.startsWith("Target.")) { const handlers = this.eventHandlers.get(method); if (handlers) for (const h of handlers) h(params); } return; } const handlers = this.eventHandlers.get(method); if (handlers) for (const h of handlers) h(params); }; if (latestCdpCallEvent) { FlowLogger.withContext(latestCdpCallEvent.flowLoggerContext, dispatch); } else { dispatch(); } } } _sendViaSession( sessionId: string, method: string, params?: object, ): Promise { const id = this.nextId++; const payload = { id, method, params, sessionId }; const stack = new Error().stack?.split("\n").slice(1, 4).join("\n"); const flowLoggerContext = FlowLogger.resolveContext(this.flowLoggerContext); let targetId: string | null; const mappedTargetId = this.sessionToTarget.get(sessionId); if (mappedTargetId) { targetId = mappedTargetId; } else { targetId = null; } const cdpCallEvent = flowLoggerContext ? FlowLogger.logCdpCallEvent(flowLoggerContext, { method, params, targetId, }) : null; if (flowLoggerContext && cdpCallEvent) { this.latestCdpCallEvent.set(sessionId, { flowLoggerContext, cdpCallEvent, }); } const p = new Promise((resolve, reject) => { this.inflight.set(id, { resolve, reject, sessionId, method, params, stack, ts: Date.now(), flowLoggerContext, cdpCallEvent, }); }); // Prevent unhandledRejection if a session detaches before the caller awaits. void p.catch(() => {}); for (const waiter of Array.from(this.sessionDispatchWaiters)) { if (waiter.sessionId !== sessionId) continue; if (waiter.method !== method) continue; if (waiter.match && !waiter.match(params)) continue; waiter.resolve(); break; } this.ws.send(JSON.stringify(payload)); return p; } _onSessionEvent( sessionId: string, event: string, handler: EventHandler, ): void { const key = `${sessionId}:${event}`; const set = this.eventHandlers.get(key) ?? 
new Set(); set.add(handler); this.eventHandlers.set(key, set); } _offSessionEvent( sessionId: string, event: string, handler: EventHandler, ): void { const key = `${sessionId}:${event}`; const set = this.eventHandlers.get(key); if (set) set.delete(handler); } _dispatchToSession(sessionId: string, event: string, params: unknown): void { const key = `${sessionId}:${event}`; const handlers = this.eventHandlers.get(key); if (handlers) for (const h of handlers) h(params); } } export class CdpSession implements CDPSessionLike { constructor( private readonly root: CdpConnection, public readonly id: string, ) {} send(method: string, params?: object): Promise { return this.root._sendViaSession(this.id, method, params); } on

(event: string, handler: (params: P) => void): void { this.root._onSessionEvent(this.id, event, handler as EventHandler); } off

/**
 * Render a CDP remote object as display text.
 *
 * When the object carries a `value` property, that value decides the output:
 * `undefined` gives "", strings are returned unchanged, and anything else is
 * JSON-encoded (falling back to `String(value)` if encoding throws).
 * Without a `value` property the first non-empty of `unserializableValue`,
 * `description` and `type` is used, or "" when none is set.
 *
 * @param obj - Remote object to format; a missing object formats as "".
 * @returns The text representation described above.
 */
function formatRemoteObject(obj: RemoteObject | undefined): string {
  if (!obj) return "";

  if (!("value" in obj)) {
    return obj.unserializableValue || obj.description || (obj.type ?? "");
  }

  const { value } = obj;
  switch (typeof value) {
    case "undefined":
      return "";
    case "string":
      return value;
    default:
      try {
        return JSON.stringify(value);
      } catch {
        // e.g. circular structures cannot be JSON-encoded
        return String(value);
      }
  }
}
[...this.event.args] : []; } location(): { url?: string; lineNumber?: number; columnNumber?: number } { const frame = this.event.stackTrace?.callFrames?.[0]; return { url: frame?.url, lineNumber: frame?.lineNumber, columnNumber: frame?.columnNumber, }; } page(): Page | undefined { return this.pageRef; } timestamp(): number | undefined { return this.event.timestamp; } raw(): Protocol.Runtime.ConsoleAPICalledEvent { return this.event; } toString(): string { return this.text(); } } ================================================ FILE: packages/core/lib/v3/understudy/context.ts ================================================ // lib/v3/understudy/context.ts import type { Protocol } from "devtools-protocol"; import { v3Logger } from "../logger.js"; import { CdpConnection, CDPSessionLike } from "./cdp.js"; import { Page } from "./page.js"; import { installV3PiercerIntoSession } from "./piercer.js"; import { v3ScriptContent } from "../dom/build/scriptV3Content.js"; import { executionContexts } from "./executionContextRegistry.js"; import type { StagehandAPIClient } from "../api.js"; import { LocalBrowserLaunchOptions } from "../types/public/index.js"; import { InitScriptSource } from "../types/private/index.js"; import { normalizeInitScriptSource } from "./initScripts.js"; import { TimeoutError, CookieSetError, PageNotFoundError, StagehandSetExtraHTTPHeadersError, } from "../types/public/sdkErrors.js"; import { getEnvTimeoutMs, withTimeout } from "../timeoutConfig.js"; import { filterCookies, normalizeCookieParams, cookieMatchesFilter, toCdpCookieParam, } from "./cookies.js"; import { Cookie, ClearCookieOptions, CookieParam, } from "../types/public/context.js"; type TargetId = string; type SessionId = string; type TargetType = "page" | "iframe" | string; /** * Returns true when the target's URL points to a document with a real, * pierceable HTML DOM. 
We allowlist the small set of schemes that carry
 * web content rather than trying to blacklist every internal browser scheme
 * (chrome://, chrome-extension://, devtools://, brave://, edge://, …).
 */
function hasInjectableDOM(url: string | undefined): boolean {
  // A missing or empty URL counts as injectable.
  if (!url) return true;

  // Exact matches for the blank/srcdoc documents.
  if (url === "about:blank" || url === "about:srcdoc") return true;

  // Everything else is accepted purely by prefix.
  const allowedPrefixes = [
    "about:blank#",
    "http://",
    "https://",
    "data:",
    "blob:",
    "file://",
    "filesystem:",
  ];
  for (const prefix of allowedPrefixes) {
    if (url.startsWith(prefix)) return true;
  }
  return false;
}

/**
 * True for targets that are not a "page"/"iframe", or whose URL fails
 * `hasInjectableDOM`.
 */
function isNonWebTarget(info: Protocol.Target.TargetInfo): boolean {
  const isFrameLikeTarget = info.type === "page" || info.type === "iframe";
  if (!isFrameLikeTarget) return true;
  return !hasInjectableDOM(info.url);
}

/**
 * True when the target has type "page" and its `subtype` is not "iframe".
 */
function isTopLevelPage(info: Protocol.Target.TargetInfo): boolean {
  if (info.type !== "page") return false;
  const { subtype } = info as Protocol.Target.TargetInfo & {
    subtype?: string;
  };
  return subtype !== "iframe";
}

const DEFAULT_FIRST_TOP_LEVEL_PAGE_TIMEOUT_MS = 5000;
const CI_FIRST_TOP_LEVEL_PAGE_TIMEOUT_MS = 30000;
const FIRST_TOP_LEVEL_PAGE_TIMEOUT_ENV =
  "STAGEHAND_FIRST_TOP_LEVEL_PAGE_TIMEOUT_MS";
const WAIT_FOR_FIRST_TOP_LEVEL_PAGE_OPERATION =
  "waitForFirstTopLevelPage (no top-level Page)";

/**
 * Timeout used while waiting for the first top-level page: the value
 * `getEnvTimeoutMs` yields for the override variable when it is non-nullish,
 * otherwise the CI default when `process.env.CI` is truthy, otherwise the
 * regular default.
 */
function getFirstTopLevelPageTimeoutMs(): number {
  const override = getEnvTimeoutMs(FIRST_TOP_LEVEL_PAGE_TIMEOUT_ENV);
  const fallback = process.env.CI
    ? CI_FIRST_TOP_LEVEL_PAGE_TIMEOUT_MS
    : DEFAULT_FIRST_TOP_LEVEL_PAGE_TIMEOUT_MS;
  return override ?? fallback;
}

/**
 * V3Context
 *
 * Owns the root CDP connection and wires Target/Page events into Page.
 * Maintains one Page per top-level target, adopts OOPIF child sessions into the owner Page,
 * and tracks target→page and (root) frame→target mappings for lookups.
 *
 * IMPORTANT: FrameId → session ownership is managed inside Page (via its FrameRegistry).
 * Context never “guesses” owners; it simply forwards events (with the emitting session)
 * so Page can record the correct owner at event time.
*/ export class V3Context { private constructor( readonly conn: CdpConnection, private readonly env: "LOCAL" | "BROWSERBASE" = "LOCAL", private readonly apiClient: StagehandAPIClient | null = null, private readonly localBrowserLaunchOptions: LocalBrowserLaunchOptions | null = null, ) {} private readonly _piercerInstalled = new Set(); // Timestamp for most recent popup/open signal private _lastPopupSignalAt = 0; private readonly _targetSessionListeners = new Set(); private readonly _sessionInit = new Set(); private pagesByTarget = new Map(); private mainFrameToTarget = new Map(); private sessionOwnerPage = new Map(); private frameOwnerPage = new Map(); private pendingOopifByMainFrame = new Map(); private createdAtByTarget = new Map(); private typeByTarget = new Map(); private _pageOrder: TargetId[] = []; private pendingCreatedTargetUrl = new Map(); private readonly initScripts: string[] = []; private extraHttpHeaders: Record | null = null; private installTargetSessionListeners(session: CDPSessionLike): void { const sessionId = session.id; if (!sessionId) return; if (this._targetSessionListeners.has(sessionId)) return; this._targetSessionListeners.add(sessionId); session.on( "Target.attachedToTarget", (evt) => { void this.onAttachedToTarget(evt.targetInfo, evt.sessionId); }, ); session.on( "Target.detachedFromTarget", (evt) => { this.onDetachedFromTarget(evt.sessionId, evt.targetId ?? null); }, ); session.on( "Target.targetDestroyed", (evt) => { this.cleanupByTarget(evt.targetId); }, ); } /** * Create a Context for a given CDP websocket URL and bootstrap target wiring. */ static async create( wsUrl: string, opts?: { env?: "LOCAL" | "BROWSERBASE"; apiClient?: StagehandAPIClient | null; localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null; cdpHeaders?: Record; }, ): Promise { const connectTask = async () => { const conn = await CdpConnection.connect(wsUrl, { headers: opts?.cdpHeaders, }); const ctx = new V3Context( conn, opts?.env ?? "LOCAL", opts?.apiClient ?? 
null, opts?.localBrowserLaunchOptions ?? null, ); await ctx.bootstrap(); await ctx.ensureFirstTopLevelPage(getFirstTopLevelPageTimeoutMs()); return ctx; }; const cdpTimeoutMs = opts?.env === "BROWSERBASE" ? getEnvTimeoutMs("BROWSERBASE_CDP_CONNECT_MAX_MS") : undefined; if (cdpTimeoutMs) { let timedOut = false; const connectPromise = connectTask(); const guarded = withTimeout( connectPromise, cdpTimeoutMs, "Browserbase CDP connect", ).catch((err) => { timedOut = true; throw err; }); connectPromise .then((ctx) => { if (timedOut) void ctx.close(); }) .catch(() => {}); return await guarded; } return await connectTask(); } private hasTopLevelPage(): boolean { for (const [targetId, targetType] of this.typeByTarget) { if (targetType === "page" && this.pagesByTarget.has(targetId)) { return true; } } return false; } private async ensureFirstTopLevelPage(timeoutMs: number): Promise { if (this.hasTopLevelPage()) return; try { await this.waitForFirstTopLevelPage(timeoutMs); return; } catch (err) { if (!(err instanceof TimeoutError)) { throw err; } v3Logger({ category: "ctx", message: "No open browser pages found after connect; creating an initial about:blank page", level: 1, }); } await this.newPage("about:blank"); } /** * Wait until at least one top-level Page has been created and registered. * We poll internal maps that bootstrap/onAttachedToTarget populate. */ private async waitForFirstTopLevelPage(timeoutMs: number): Promise { const deadline = Date.now() + timeoutMs; while (Date.now() < deadline) { // A top-level Page is present if typeByTarget has an entry "page" // and pagesByTarget has the corresponding Page object. 
for (const [tid, ttype] of this.typeByTarget) { if (ttype === "page") { const p = this.pagesByTarget.get(tid); if (p) return; } } await new Promise((r) => setTimeout(r, 25)); } throw new TimeoutError(WAIT_FOR_FIRST_TOP_LEVEL_PAGE_OPERATION, timeoutMs); } private async waitForInitialTopLevelTargets( targetIds: TargetId[], timeoutMs = 3000, ): Promise { if (!targetIds.length) return; const pending = new Set(targetIds); const deadline = Date.now() + timeoutMs; while (pending.size && Date.now() < deadline) { for (const tid of Array.from(pending)) { if (this.pagesByTarget.has(tid)) { pending.delete(tid); } } if (!pending.size) return; await new Promise((r) => setTimeout(r, 25)); } if (pending.size) { v3Logger({ category: "ctx", message: "Timed out waiting for existing top-level targets to attach", level: 2, auxiliary: { remainingTargets: { value: JSON.stringify(Array.from(pending)), type: "object", }, }, }); } } private async ensurePiercer(session: CDPSessionLike): Promise { const id = session.id ?? ""; if (this._piercerInstalled.has(id)) return true; const installed = await installV3PiercerIntoSession(session); if (installed) { this._piercerInstalled.add(id); } return installed; } /** Mark a page target as the most-recent one (active). */ private _pushActive(tid: TargetId): void { // remove prior entry if any const i = this._pageOrder.indexOf(tid); if (i !== -1) this._pageOrder.splice(i, 1); this._pageOrder.push(tid); } /** Remove a page target from the recency list (used on close). */ private _removeFromOrder(tid: TargetId): void { const i = this._pageOrder.indexOf(tid); if (i !== -1) this._pageOrder.splice(i, 1); } /** Return the current active Page (most-recent page that still exists). 
*/ public activePage(): Page | undefined { // prune any stale ids from the tail for (let i = this._pageOrder.length - 1; i >= 0; i--) { const tid = this._pageOrder[i]!; const p = this.pagesByTarget.get(tid); if (p) return p; // stale — remove and continue this._pageOrder.splice(i, 1); } // fallback: pick the newest by createdAt if order is empty let newestTid: TargetId | undefined; let newestTs = -1; for (const [tid] of this.pagesByTarget) { const ts = this.createdAtByTarget.get(tid) ?? 0; if (ts > newestTs) { newestTs = ts; newestTid = tid; } } return newestTid ? this.pagesByTarget.get(newestTid) : undefined; } /** Explicitly mark a known Page as the most-recent active page (and focus it). */ public setActivePage(page: Page): void { let targetId = page.targetId(); if (this.pagesByTarget.get(targetId) !== page) { const lookup = this.findTargetIdByPage(page); if (!lookup) { v3Logger({ category: "ctx", message: "setActivePage called with unknown Page", level: 2, auxiliary: { targetId: { value: String(targetId), type: "string" }, }, }); return; } targetId = lookup; } this._pushActive(targetId); // Bring the tab to the foreground in headful Chrome (best effort). 
void this.conn.send("Target.activateTarget", { targetId }).catch(() => {}); } public async addInitScript( script: InitScriptSource, arg?: Arg, ): Promise { const source = await normalizeInitScriptSource(script, arg); if (this.initScripts.includes(source)) return; this.initScripts.push(source); const pages = this.pages(); await Promise.all(pages.map((page) => page.registerInitScript(source))); } public async setExtraHTTPHeaders( headers: Record, ): Promise { const nextHeaders = { ...headers }; this.extraHttpHeaders = nextHeaders; const sessions: CDPSessionLike[] = []; for (const sessionId of this._sessionInit) { const session = this.conn.getSession(sessionId); if (session) sessions.push(session); } if (!sessions.length) return; const results = await Promise.allSettled( sessions.map(async (session) => { await session.send("Network.enable"); await session.send("Network.setExtraHTTPHeaders", { headers: nextHeaders, }); }), ); const failures = results .map((result, index) => ({ result, session: sessions[index] })) .filter( ( entry, ): entry is { result: PromiseRejectedResult; session: CDPSessionLike; } => entry.result.status === "rejected", ) .map((entry) => { const reason = entry.result.reason as Error; const sid = entry.session.id ?? "unknown"; const message = reason?.message ?? String(reason); return `session=${sid} error=${message}`; }); if (failures.length) { throw new StagehandSetExtraHTTPHeadersError(failures); } } /** * Return top-level `Page`s (oldest → newest). OOPIF targets are not included. */ pages(): Page[] { const rows: Array<{ tid: TargetId; page: Page; created: number }> = []; for (const [tid, page] of this.pagesByTarget) { if (this.typeByTarget.get(tid) === "page") { rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 }); } } rows.sort((a, b) => a.created - b.created); return rows.map((r) => r.page); } private async applyInitScriptsToPage( page: Page, opts?: { seedOnly?: boolean }, ): Promise { if (opts?.seedOnly) { for (const source of this.initScripts) { page.seedInitScript(source); } return; } for (const source of this.initScripts) { await page.registerInitScript(source); } } /** * Resolve an owning `Page` by the **top-level main frame id**. * Note: child (OOPIF) roots are intentionally not present in this mapping. */ resolvePageByMainFrameId(frameId: string): Page | undefined { const targetId = this.mainFrameToTarget.get(frameId); return targetId ? this.pagesByTarget.get(targetId) : undefined; } /** * Serialize the full frame tree for a given top-level main frame id. */ async getFullFrameTreeByMainFrameId( rootMainFrameId: string, ): Promise { const owner = this.resolvePageByMainFrameId(rootMainFrameId); if (!owner) throw new PageNotFoundError(`mainFrameId=${rootMainFrameId}`); return owner.asProtocolFrameTree(rootMainFrameId); } /** * Create a new top-level page (tab) with the given URL and return its Page object. * Waits until the target is attached and registered. */ public async newPage(url = "about:blank"): Promise { const targetUrl = String(url ?? "about:blank"); const { targetId } = await this.conn.send<{ targetId: string }>( "Target.createTarget", // Create at about:blank so init scripts can install before first real navigation. { url: "about:blank" }, ); this.pendingCreatedTargetUrl.set(targetId, "about:blank"); // Best-effort bring-to-front await this.conn.send("Target.activateTarget", { targetId }).catch(() => {}); const deadline = Date.now() + 5000; while (Date.now() < deadline) { const page = this.pagesByTarget.get(targetId); if (page) { // we created at about:blank; navigate only after attach so init scripts run // on the first real document. Fire-and-forget so newPage() resolves on attach. 
if (targetUrl !== "about:blank") { // Seed requested URL into the page cache before navigation events arrive. page.seedCurrentUrl(targetUrl); void page .sendCDP("Page.navigate", { url: targetUrl }) .catch(() => {}); } return page; } await new Promise((r) => setTimeout(r, 25)); } throw new TimeoutError(`newPage: target not attached (${targetId})`, 5000); } /** * Close CDP and clear all mappings. Best-effort cleanup. */ async close(): Promise { await this.conn.close(); this.pagesByTarget.clear(); this.mainFrameToTarget.clear(); this.sessionOwnerPage.clear(); this.frameOwnerPage.clear(); this.pendingOopifByMainFrame.clear(); this.createdAtByTarget.clear(); this.typeByTarget.clear(); this.pendingCreatedTargetUrl.clear(); } /** * Bootstrap target lifecycle: * - Attach to existing targets. * - Handle auto-attach events. * - Clean up on detach/destroy. */ private async bootstrap(): Promise { // Live attach via auto-attach (normal path) this.conn.on( "Target.attachedToTarget", async (evt) => { await this.onAttachedToTarget(evt.targetInfo, evt.sessionId); }, ); // Live detach (clean up session from owner page & frame graph) this.conn.on( "Target.detachedFromTarget", (evt) => { this.onDetachedFromTarget(evt.sessionId, evt.targetId ?? null); }, ); // Destroyed targets (fallback cleanup by targetId) this.conn.on( "Target.targetDestroyed", (evt) => { this.cleanupByTarget(evt.targetId); }, ); this.conn.on( "Target.targetCreated", async (evt) => { const info = evt.targetInfo; // Note popups to help activePage settle const ti = info; if (info.type === "page" && (ti?.openerId || ti?.openerFrameId)) { this._notePopupSignal(); } }, ); // Only enable auto-attach after listeners are ready so replayed targets are captured. 
await this.conn.enableAutoAttach(); const targets = await this.conn.getTargets(); for (const t of targets) { if (t.attached) continue; // auto-attach already handled this target try { await this.conn.attachToTarget(t.targetId); } catch { // ignore attach race } } const topLevelTargetIds = targets .filter((t) => isTopLevelPage(t)) .map((t) => t.targetId); await this.waitForInitialTopLevelTargets(topLevelTargetIds); } /** * Handle a newly attached target (top-level or potential OOPIF): * - Enable Page domain and lifecycle events. * - If top-level → create Page, wire listeners, resume. * - Else → probe child root frame id via `Page.getFrameTree` and adopt immediately * if the parent is known; otherwise stage until parent `frameAttached`. * - Resume the target only after listeners are wired. */ private async onAttachedToTarget( info: Protocol.Target.TargetInfo, sessionId: SessionId, ): Promise { // Skip non-web targets (workers, chrome extensions, background pages, etc.). // They still need to be resumed so we don't leave them paused by // waitForDebuggerOnStart, but injecting the piercer into these targets // can throw or corrupt their internal state (e.g. Chrome's PDF viewer). if (isNonWebTarget(info)) { const session = this.conn.getSession(sessionId); if (session) { await session.send("Runtime.runIfWaitingForDebugger").catch(() => {}); } return; } const session = this.conn.getSession(sessionId); if (!session) return; // Init guard if (this._sessionInit.has(sessionId)) return; this._sessionInit.add(sessionId); this.installTargetSessionListeners(session); // Register for Runtime events before enabling it so we don't miss initial contexts. executionContexts.attachSession(session); // Ensure we only resume once even if multiple code paths hit finally. let resumed = false; const resume = async (): Promise => { if (resumed) return; resumed = true; // waitForDebuggerOnStart pauses new targets; resume once we've done // any "must happen before first document" work. 
await session.send("Runtime.runIfWaitingForDebugger").catch(() => {}); }; // Attach lifecycle (per target session): // 1) while paused, enable domains + child auto-attach and register init scripts; // 2) resume target execution; // 3) build/adopt Page ownership and frame bridges. // Some CDP backends defer *.enable() responses until after resume, so we // cannot await those responses before resuming. Instead we: // - wait for transport-level dispatch of required pre-resume commands; // - then dispatch resume; // - then await responses. const queuePreResume = ( method: string, params?: object, match?: (sentParams?: object) => boolean, ) => { const dispatched = this.conn .waitForSessionDispatch(sessionId, method, match) .then(() => true) .catch(() => false); const response = session .send(method, params) .then(() => true) .catch(() => false); return { dispatched, response }; }; const initScriptOps: Array<{ dispatched: Promise; response: Promise; }> = []; // Pre-resume ordering matters: // - enable domains; // - enable child auto-attach with waitForDebuggerOnStart; // - register init scripts. // Commands are sent in-order on the same session before resume. const corePreResumeOps = [ queuePreResume("Page.enable"), queuePreResume("Runtime.enable"), queuePreResume("Target.setAutoAttach", { autoAttach: true, waitForDebuggerOnStart: true, flatten: true, }), ]; const headerPreResumeOps: Array<{ dispatched: Promise; response: Promise; }> = []; if (this.extraHttpHeaders) { const headers = { ...this.extraHttpHeaders }; headerPreResumeOps.push(queuePreResume("Network.enable")); headerPreResumeOps.push( queuePreResume("Network.setExtraHTTPHeaders", { headers }), ); } // Send init scripts only after auto-attach has been queued. 
if (this.initScripts.length) { for (const source of this.initScripts) { initScriptOps.push( queuePreResume( "Page.addScriptToEvaluateOnNewDocument", { source, runImmediately: true, }, (sentParams) => (sentParams as { source?: string } | undefined)?.source === source, ), ); } } const piercerPreloadOp = queuePreResume( "Page.addScriptToEvaluateOnNewDocument", { source: v3ScriptContent, runImmediately: true, }, (sentParams) => (sentParams as { source?: string } | undefined)?.source === v3ScriptContent, ); const preResumeDispatched = ( await Promise.all([ ...corePreResumeOps.map((op) => op.dispatched), ...headerPreResumeOps.map((op) => op.dispatched), ...initScriptOps.map((op) => op.dispatched), piercerPreloadOp.dispatched, ]) ).every(Boolean); // Dispatch resume only after pre-resume setup has actually been sent. const resumeOp = queuePreResume("Runtime.runIfWaitingForDebugger"); const [resumedDispatched, resumedOk] = await Promise.all([ resumeOp.dispatched, resumeOp.response, ]); const [ coreResults, headerResults, initScriptResults, piercerPreRegistered, ] = await Promise.all([ Promise.all(corePreResumeOps.map((op) => op.response)), Promise.all(headerPreResumeOps.map((op) => op.response)), Promise.all(initScriptOps.map((op) => op.response)), piercerPreloadOp.response, ]); // Header propagation is independent of init-script determinism but still // part of pre-resume attach setup; awaited above for ordering/lifecycle. void headerResults; if (!preResumeDispatched || !resumedDispatched || !resumedOk) { // Short-lived child targets can detach before resume is acknowledged. // Keep this noisy only for top-level pages where missing attach is fatal. 
if (isTopLevelPage(info)) { v3Logger({ category: "ctx", message: "Failed target pre-resume setup ordering", level: 2, auxiliary: { targetId: { value: String(info.targetId), type: "string" }, targetType: { value: String(info.type), type: "string" }, preResumeDispatched: { value: String(preResumeDispatched), type: "string", }, resumedDispatched: { value: String(resumedDispatched), type: "string", }, resumedOk: { value: String(resumedOk), type: "string" }, }, }); } return; } resumed = true; const scriptsInstalled = coreResults.every(Boolean) && initScriptResults.every(Boolean); try { // Best-effort lifecycle events; do not block top-level page registration // on this optional signal stream. void session .send("Page.setLifecycleEventsEnabled", { enabled: true }) .catch(() => {}); // Top-level handling if (isTopLevelPage(info)) { let page: Page | null = null; let createError: unknown; // Deterministic contract: never drop a newly attached top-level target // because an arbitrary local timeout fired. We wait for Page.create and // let it finish regardless of CDP call latency. try { page = await Page.create( this.conn, session, info.targetId, this.apiClient, this.localBrowserLaunchOptions, this.env === "BROWSERBASE", ); } catch (error) { createError = error; } if (!page) { v3Logger({ category: "ctx", message: "Failed to create top-level Page", level: 2, auxiliary: { targetId: { value: String(info.targetId), type: "string" }, targetType: { value: String(info.type), type: "string" }, targetUrl: { value: String(info.url ?? ""), type: "string" }, error: { value: String( createError instanceof Error ? 
createError.message : createError, ), type: "string", }, }, }); return; } this.wireSessionToOwnerPage(sessionId, page); this.pagesByTarget.set(info.targetId, page); this.mainFrameToTarget.set(page.mainFrameId(), info.targetId); this.sessionOwnerPage.set(sessionId, page); this.frameOwnerPage.set(page.mainFrameId(), page); this.typeByTarget.set(info.targetId, "page"); if (!this.createdAtByTarget.has(info.targetId)) { this.createdAtByTarget.set(info.targetId, Date.now()); } const pendingSeedUrl = this.pendingCreatedTargetUrl.get(info.targetId); this.pendingCreatedTargetUrl.delete(info.targetId); page.seedCurrentUrl(pendingSeedUrl ?? info.url ?? ""); this._pushActive(info.targetId); this.installFrameEventBridges(sessionId, page); if (piercerPreRegistered) { this._piercerInstalled.add(sessionId); } // If we already installed scripts at the session level, only seed the // Page's registry to avoid double-installing DOMContentLoaded handlers. await this.applyInitScriptsToPage(page, { seedOnly: scriptsInstalled, }); if (!piercerPreRegistered) { void this.ensurePiercer(session).catch(() => {}); } return; } const piercerReady = await this.ensurePiercer(session).catch(() => false); if (!piercerReady) return; // Child (iframe / OOPIF) try { const { frameTree } = await session.send( "Page.getFrameTree", ); const childMainId = frameTree.frame.id; // Try to find owner Page now (it may already have the node in its tree) let owner = this.frameOwnerPage.get(childMainId); if (!owner) { for (const p of this.pagesByTarget.values()) { const tree = p.asProtocolFrameTree(p.mainFrameId()); const has = (function find(n: Protocol.Page.FrameTree): boolean { if (n.frame.id === childMainId) return true; for (const c of n.childFrames ?? 
[]) if (find(c)) return true; return false; })(tree); if (has) { owner = p; break; } } } if (owner) { owner.adoptOopifSession(session, childMainId); this.sessionOwnerPage.set(sessionId, owner); this.installFrameEventBridges(sessionId, owner); // Prime the execution-context registry so later lookups succeed even if // the frame navigates before we issue a command. void executionContexts .waitForMainWorld(session, childMainId) .catch(() => {}); } else { this.pendingOopifByMainFrame.set(childMainId, sessionId); } } catch { // page.getFrameTree failed. Most likely was an ad iframe // that opened & closed before we could attach. ignore } } finally { await resume(); } } /** * Detach handler: * - Remove child session ownership and prune its subtree. * - If a top-level target, cleanup its `Page` and mappings. * - Drop any staged child for this session. */ private onDetachedFromTarget( sessionId: SessionId, targetId: string | null, ): void { const owner = this.sessionOwnerPage.get(sessionId); if (owner) { owner.detachOopifSession(sessionId); this.sessionOwnerPage.delete(sessionId); } if (targetId && this.pagesByTarget.has(targetId)) { this.cleanupByTarget(targetId); } for (const [fid, sid] of Array.from( this.pendingOopifByMainFrame.entries(), )) { if (sid === sessionId) this.pendingOopifByMainFrame.delete(fid); } this._targetSessionListeners.delete(sessionId); this._sessionInit.delete(sessionId); this._piercerInstalled.delete(sessionId); } /** * Cleanup a top-level Page by target id, removing its root and staged children. 
*/ private cleanupByTarget(targetId: TargetId): void { const page = this.pagesByTarget.get(targetId); if (!page) return; const mainId = page.mainFrameId(); this.mainFrameToTarget.delete(mainId); this.frameOwnerPage.delete(mainId); for (const [sid, p] of Array.from(this.sessionOwnerPage.entries())) { if (p === page) this.sessionOwnerPage.delete(sid); } for (const [fid] of Array.from(this.pendingOopifByMainFrame.entries())) { const owner = this.frameOwnerPage.get(fid); if (!owner || owner === page) this.pendingOopifByMainFrame.delete(fid); } this._removeFromOrder(targetId); this.pagesByTarget.delete(targetId); this.createdAtByTarget.delete(targetId); this.typeByTarget.delete(targetId); this.pendingCreatedTargetUrl.delete(targetId); } /** * Wire Page-domain frame events for a session into the owning Page & mappings. * We forward the *emitting session* with every event so Page can stamp ownership precisely. */ private installFrameEventBridges(sessionId: SessionId, owner: Page): void { const session = this.conn.getSession(sessionId); if (!session) return; session.on( "Page.frameAttached", (evt) => { const { frameId, parentFrameId } = evt; owner.onFrameAttached(frameId, parentFrameId ?? null, session); // If we were waiting for this id (OOPIF child), adopt now. const pendingChildSessionId = this.pendingOopifByMainFrame.get(frameId); if (pendingChildSessionId) { const child = this.conn.getSession(pendingChildSessionId); if (child) { owner.adoptOopifSession(child, frameId); this.sessionOwnerPage.set(child.id, owner); // Wire bridges for the child so its Page events keep flowing. this.installFrameEventBridges(pendingChildSessionId, owner); } this.pendingOopifByMainFrame.delete(frameId); } // Track Page ownership for quick reverse lookups (debug helpers). 
this.frameOwnerPage.set(frameId, owner); // Root handoff: keep mainFrameToTarget aligned for the page if (!parentFrameId) { const newRoot = owner.mainFrameId(); const topTargetId = this.findTargetIdByPage(owner); if (topTargetId) { this.mainFrameToTarget.set(newRoot, topTargetId); } this.frameOwnerPage.set(newRoot, owner); } }, ); session.on( "Page.frameDetached", (evt) => { owner.onFrameDetached(evt.frameId, evt.reason ?? "remove"); if (evt.reason !== "swap") { this.frameOwnerPage.delete(evt.frameId); } }, ); session.on( "Page.frameNavigated", (evt) => { owner.onFrameNavigated(evt.frame, session); }, ); session.on( "Page.navigatedWithinDocument", (evt) => { owner.onNavigatedWithinDocument(evt.frameId, evt.url, session); }, ); // Observe window.open to anticipate default page changes session.on("Page.windowOpen", () => { this._notePopupSignal(); }); } /** * Register that a session belongs to a Page (used by event routing). */ private wireSessionToOwnerPage(sessionId: SessionId, owner: Page): void { this.sessionOwnerPage.set(sessionId, owner); } /** * Utility: reverse-lookup the top-level target id that owns a given Page. */ private findTargetIdByPage(page: Page): TargetId | undefined { for (const [tid, p] of this.pagesByTarget) { if (p === page) return tid; } return undefined; } private _notePopupSignal(): void { this._lastPopupSignalAt = Date.now(); } /** * Await the current active page, waiting briefly if a popup/open was just triggered. * Normal path returns immediately; popup path waits up to timeoutMs for the new page. */ async awaitActivePage(timeoutMs?: number): Promise { const defaultTimeout = this.env === "BROWSERBASE" ? 4000 : 2000; timeoutMs = timeoutMs ?? defaultTimeout; // If a popup was just triggered, Chrome (especially on Browserbase) // may briefly pause new targets at document start ("waiting for debugger"). const recentWindowMs = this.env === "BROWSERBASE" ? 
1000 : 300; const now = Date.now(); const hasRecentPopup = now - this._lastPopupSignalAt <= recentWindowMs; const immediate = this.activePage(); if (!hasRecentPopup && immediate) return immediate; const deadline = now + timeoutMs; while (Date.now() < deadline) { // Prefer most-recent by createdAt let newestTid: TargetId | undefined; let newestTs = -1; for (const [tid] of this.pagesByTarget) { const ts = this.createdAtByTarget.get(tid) ?? 0; if (ts > newestTs) { newestTs = ts; newestTid = tid; } } if (newestTid) { const p = this.pagesByTarget.get(newestTid); if (p && newestTs >= this._lastPopupSignalAt) return p; } await new Promise((r) => setTimeout(r, 25)); } if (immediate) return immediate; throw new PageNotFoundError("awaitActivePage: no page available"); } /** * Get all browser cookies, optionally filtered by URL(s). * * When `urls` is omitted or empty every cookie in the browser context is * returned. When one or more URLs are supplied only cookies whose * domain/path/secure attributes match are included. */ async cookies(urls?: string | string[]): Promise { const urlList = !urls ? [] : typeof urls === "string" ? [urls] : urls; const { cookies } = await this.conn.send<{ cookies: Protocol.Network.Cookie[]; }>("Storage.getCookies"); const mapped: Cookie[] = cookies.map((c) => ({ name: c.name, value: c.value, domain: c.domain, path: c.path, expires: c.expires, httpOnly: c.httpOnly, secure: c.secure, sameSite: (c.sameSite as Cookie["sameSite"]) ?? "Lax", })); return filterCookies(mapped, urlList); } /** * Add one or more cookies to the browser context. * * Each cookie must specify either a `url` (from which domain/path/secure are * derived) or an explicit `domain` + `path` pair. * * We surface CDP errors if the browser rejects a cookie. 
*/ async addCookies(cookies: CookieParam[]): Promise { const normalized = normalizeCookieParams(cookies); if (!normalized.length) return; const cdpCookies = normalized.map(toCdpCookieParam); try { await this.conn.send("Storage.setCookies", { cookies: cdpCookies }); } catch (err) { const detail = err instanceof Error ? err.message : String(err); const names = normalized.map((c) => `"${c.name}"`).join(", "); throw new CookieSetError( `Failed to set cookies [${names}] — ` + `the browser rejected the batch. Check that the domain, path, and secure/sameSite values are valid.` + (detail ? ` (CDP error: ${detail})` : ""), ); } } /** * Clear cookies from the browser context. * * - Called with no arguments: clears **all** cookies atomically via * `Storage.clearCookies`. * - Called with filter options: fetches all cookies, clears everything, * then re-adds only the cookies that do NOT match the filter via * `Storage.setCookies`. This is necessary on the browser endpoint because * the Storage domain does not support targeted deletes. */ async clearCookies(options?: ClearCookieOptions): Promise { const hasFilter = options?.name !== undefined || options?.domain !== undefined || options?.path !== undefined; if (!hasFilter) { // Atomic single-call wipe — no race condition, no O(N) roundtrips. await this.conn.send("Storage.clearCookies"); return; } const current = await this.cookies(); const toKeep = current.filter((c) => !cookieMatchesFilter(c, options!)); if (toKeep.length === current.length) return; // Storage domain doesn't support targeted deletes on the browser endpoint. // Clear everything, then re-add only the cookies we're keeping. await this.conn.send("Storage.clearCookies"); if (toKeep.length) { try { await this.conn.send("Storage.setCookies", { cookies: toKeep.map(toCdpCookieParam), }); } catch (err) { const detail = err instanceof Error ? 
/**
 * Keep only the cookies that would apply to at least one of `urls`.
 *
 * A cookie applies to a URL when all of these hold:
 *  - the URL's hostname equals the cookie domain or is a sub-domain of it;
 *  - the URL's path starts with the cookie path on a "/" boundary
 *    (cookie path "/foo" covers "/foo" and "/foo/bar" but not "/foobar");
 *  - a `secure` cookie is only matched by an `https:` URL or a loopback host.
 *
 * @param cookies Candidate cookies.
 * @param urls URLs to match against; an empty list returns `cookies` as is.
 * @throws CookieValidationError when one of `urls` cannot be parsed.
 */
export function filterCookies(cookies: Cookie[], urls: string[]): Cookie[] {
  if (urls.length === 0) return cookies;

  // Parse every URL up front so an invalid one fails before any filtering.
  const targets: URL[] = [];
  for (const raw of urls) {
    let target: URL;
    try {
      target = new URL(raw);
    } catch {
      throw new CookieValidationError(
        `Invalid URL passed to cookies(): "${raw}"`,
      );
    }
    targets.push(target);
  }

  const appliesTo = (cookie: Cookie, target: URL): boolean => {
    // Compare with a leading dot on both sides so "example.com" matches
    // itself and "www.example.com" but not "notexample.com".
    const cookieDomain = cookie.domain.startsWith(".")
      ? cookie.domain
      : "." + cookie.domain;
    if (!("." + target.hostname).endsWith(cookieDomain)) return false;

    const requestPath = target.pathname;
    const cookiePath = cookie.path;
    if (!requestPath.startsWith(cookiePath)) return false;
    const endsMidSegment =
      cookiePath.length < requestPath.length &&
      !cookiePath.endsWith("/") &&
      requestPath[cookiePath.length] !== "/";
    if (endsMidSegment) return false;

    if (!cookie.secure) return true;
    const loopbackHosts = ["localhost", "127.0.0.1", "[::1]"];
    return (
      target.protocol === "https:" || loopbackHosts.includes(target.hostname)
    );
  };

  return cookies.filter((cookie) =>
    targets.some((target) => appliesTo(cookie, target)),
  );
}
* - When `url` is provided, derives `domain`, `path`, and `secure` from it. * - Validates that `sameSite: "None"` is paired with `secure: true` * (browsers silently reject this — we throw early with a clear message). */ export function normalizeCookieParams(cookies: CookieParam[]): CookieParam[] { return cookies.map((c) => { if (!c.url && !(c.domain && c.path)) { throw new CookieValidationError( `Cookie "${c.name}" must have a url or a domain/path pair`, ); } if (c.url && c.domain) { throw new CookieValidationError( `Cookie "${c.name}" should have either url or domain, not both`, ); } if (c.url && c.path) { throw new CookieValidationError( `Cookie "${c.name}" should have either url or path, not both`, ); } if (c.expires !== undefined && c.expires < 0 && c.expires !== -1) { throw new CookieValidationError( `Cookie "${c.name}" has an invalid expires value; use -1 for session cookies or a positive unix timestamp`, ); } const copy = { ...c }; if (copy.url) { if (copy.url === "about:blank") { throw new CookieValidationError( `Blank page cannot have cookie "${c.name}"`, ); } if (copy.url.startsWith("data:")) { throw new CookieValidationError( `Data URL page cannot have cookie "${c.name}"`, ); } let url: URL; try { url = new URL(copy.url); } catch { throw new CookieValidationError( `Cookie "${c.name}" has an invalid url: "${copy.url}"`, ); } copy.domain = url.hostname; copy.path = url.pathname.substring(0, url.pathname.lastIndexOf("/") + 1); copy.secure = url.protocol === "https:"; delete copy.url; } // Browsers silently reject SameSite=None cookies that aren't Secure. // Catch this early with a clear error instead of a silent CDP failure. // Use !copy.secure to catch both explicit false AND undefined (omitted), // since CDP defaults secure to false when omitted. if (copy.sameSite === "None" && !copy.secure) { throw new CookieValidationError( `Cookie "${c.name}" has sameSite: "None" without secure: true. 
/**
 * Split an XPath into location steps, remembering for each step whether it
 * was introduced by `/` (child axis) or `//` (descendant axis).
 *
 * A `/` only terminates a step when it sits outside every `[...]` predicate
 * and outside quoted string literals, so steps such as
 * `iframe[@src="/embed/x"]` or `a[contains(@href, '/docs/')]` stay intact
 * instead of being torn apart at the slashes inside the predicate.
 *
 * @param path XPath without the `xpath=` prefix; surrounding whitespace is
 *   ignored.
 * @returns One entry per non-empty step. `raw` keeps the step text including
 *   predicates; `name` is `raw` lower-cased with a trailing numeric index
 *   such as `[2]` removed.
 */
function parseXPath(path: string): Step[] {
  const s = path.trim();
  const steps: Step[] = [];
  let i = 0;

  while (i < s.length) {
    let axis: Axis = "child";
    if (s.startsWith("//", i)) {
      axis = "desc";
      i += 2;
    } else if (s.charAt(i) === "/") {
      i += 1;
    }

    const start = i;
    let depth = 0; // nesting level of [...] predicates
    let quote: string | null = null; // active string delimiter, if any
    while (i < s.length) {
      const ch = s.charAt(i);
      if (quote !== null) {
        if (ch === quote) quote = null;
      } else if (ch === '"' || ch === "'") {
        quote = ch;
      } else if (ch === "[") {
        depth += 1;
      } else if (ch === "]") {
        // Tolerate stray closers rather than going negative.
        if (depth > 0) depth -= 1;
      } else if (ch === "/" && depth === 0) {
        break;
      }
      i += 1;
    }

    const raw = s.slice(start, i).trim();
    if (!raw) continue; // e.g. a trailing or repeated slash
    const name = raw.replace(/\[\d+\]\s*$/u, "").toLowerCase();
    steps.push({ axis, raw, name });
  }

  return steps;
}

/**
 * Inverse of `parseXPath`: join steps back into an XPath string, emitting
 * `//` before descendant-axis steps and `/` before child-axis steps.
 * Predicates are preserved because `raw` is used verbatim.
 *
 * @returns The rebuilt XPath, or "/" when `steps` is empty.
 */
function buildXPathFromSteps(steps: ReadonlyArray<Step>): string {
  let out = "";
  for (const st of steps) {
    out += st.axis === "desc" ? "//" : "/";
    out += st.raw; // keep predicates intact
  }
  return out || "/";
}
/**
 * DeepLocatorDelegate: a Locator look-alike that defers resolution.
 *
 * Nothing is resolved at construction time. Every action re-runs
 * `resolveLocatorWithHops` for the stored selector and then narrows the
 * result with `.nth(nthIndex)` before forwarding the call, which allows
 * await-free chaining such as:
 *   page.deepLocator('iframe#ifrA >> #btn').click()
 */
export class DeepLocatorDelegate {
  constructor(
    private readonly page: Page,
    private readonly root: Frame,
    private readonly selector: string,
    private readonly nthIndex: number = 0,
  ) {}

  /** Resolve the concrete Locator for this delegate (done on every call). */
  private async real(): Promise<Locator> {
    const resolved = await resolveLocatorWithHops(
      this.page,
      this.root,
      this.selector,
    );
    return resolved.nth(this.nthIndex);
  }

  // ---- Locator API: each method resolves, then forwards unchanged ----

  async click(options?: {
    button?: "left" | "right" | "middle";
    clickCount?: number;
  }) {
    const target = await this.real();
    return target.click(options);
  }

  async count() {
    const target = await this.real();
    return target.count();
  }

  async hover() {
    const target = await this.real();
    return target.hover();
  }

  async fill(value: string) {
    const target = await this.real();
    return target.fill(value);
  }

  async type(text: string, options?: { delay?: number }) {
    const target = await this.real();
    return target.type(text, options);
  }

  async selectOption(values: string | string[]) {
    const target = await this.real();
    return target.selectOption(values);
  }

  async scrollTo(percent: number | string) {
    const target = await this.real();
    return target.scrollTo(percent);
  }

  async isVisible() {
    const target = await this.real();
    return target.isVisible();
  }

  async isChecked() {
    const target = await this.real();
    return target.isChecked();
  }

  async inputValue() {
    const target = await this.real();
    return target.inputValue();
  }

  async textContent() {
    const target = await this.real();
    return target.textContent();
  }

  async innerHtml() {
    const target = await this.real();
    return target.innerHtml();
  }

  async innerText() {
    const target = await this.real();
    return target.innerText();
  }

  async centroid() {
    const target = await this.real();
    return target.centroid();
  }

  async backendNodeId() {
    const target = await this.real();
    return target.backendNodeId();
  }

  async highlight(options?: {
    durationMs?: number;
    borderColor?: { r: number; g: number; b: number; a?: number };
    contentColor?: { r: number; g: number; b: number; a?: number };
  }) {
    const target = await this.real();
    return target.highlight(options);
  }

  async sendClickEvent(options?: {
    bubbles?: boolean;
    cancelable?: boolean;
    composed?: boolean;
    detail?: number;
  }) {
    const target = await this.real();
    return target.sendClickEvent(options);
  }

  async setInputFiles(
    files:
      | string
      | string[]
      | {
          name: string;
          mimeType: string;
          buffer: ArrayBuffer | Uint8Array | Buffer | string;
        }
      | Array<{
          name: string;
          mimeType: string;
          buffer: ArrayBuffer | Uint8Array | Buffer | string;
        }>,
  ) {
    const target = await this.real();
    return target.setInputFiles(files);
  }

  /** Shorthand for `nth(0)`. */
  first() {
    return this.nth(0);
  }

  /**
   * Select the match at `index` (fractions are floored).
   *
   * @returns This delegate when the index is unchanged, otherwise a new one.
   * @throws StagehandInvalidArgumentError for negative or non-finite input.
   */
  nth(index: number): DeepLocatorDelegate {
    const requested = Number(index);
    const usable = Number.isFinite(requested) && requested >= 0;
    if (!usable) {
      throw new StagehandInvalidArgumentError(
        "deepLocator().nth() expects a non-negative index",
      );
    }

    const position = Math.floor(requested);
    return position === this.nthIndex
      ? this
      : new DeepLocatorDelegate(this.page, this.root, this.selector, position);
  }
}
/**
 * Tracks, per CDP session, which execution context is the main world of
 * each frame. Two mirrored maps are kept (frame -> context and
 * context -> frame) so destroy events, which only carry a context id, can
 * be mapped back to their frame.
 */
export class ExecutionContextRegistry {
  private readonly byFrame = new WeakMap<
    CDPSessionLike,
    Map<FrameId, ExecId>
  >();
  private readonly byExec = new WeakMap<CDPSessionLike, Map<ExecId, FrameId>>();

  /** Wire listeners for this session. Call BEFORE Runtime.enable. */
  attachSession(session: CDPSessionLike): void {
    session.on(
      "Runtime.executionContextCreated",
      (evt: Protocol.Runtime.ExecutionContextCreatedEvent): void => {
        const { frameId, isDefault } = ExecutionContextRegistry.auxOf(evt);
        // Only default (main-world) contexts tied to a frame are recorded.
        if (isDefault !== true || typeof frameId !== "string") return;
        this.register(session, frameId as FrameId, evt.context.id);
      },
    );

    session.on(
      "Runtime.executionContextDestroyed",
      (evt: Protocol.Runtime.ExecutionContextDestroyedEvent): void => {
        const contextToFrame = this.byExec.get(session);
        const frameToContext = this.byFrame.get(session);
        if (!contextToFrame || !frameToContext) return;

        const destroyedId = evt.executionContextId;
        const frameId = contextToFrame.get(destroyedId);
        if (!frameId) return;

        contextToFrame.delete(destroyedId);
        // A newer context may already have replaced this one for the frame.
        if (frameToContext.get(frameId) === destroyedId) {
          frameToContext.delete(frameId);
        }
      },
    );

    session.on("Runtime.executionContextsCleared", (): void => {
      this.byFrame.delete(session);
      this.byExec.delete(session);
    });
  }

  /** Main-world context id recorded for `frameId` on `session`, or null. */
  getMainWorld(session: CDPSessionLike, frameId: FrameId): ExecId | null {
    const perSession = this.byFrame.get(session);
    return perSession?.get(frameId) ?? null;
  }

  /**
   * Resolve the main-world context id for a frame, waiting if necessary.
   *
   * Checks the registry, then sends `Runtime.enable` (errors ignored) and
   * checks again, and finally listens for a matching
   * `Runtime.executionContextCreated` event.
   *
   * @param timeoutMs How long to listen before giving up (default 800).
   * @throws Error "main world not ready for frame <id>" on timeout.
   */
  async waitForMainWorld(
    session: CDPSessionLike,
    frameId: FrameId,
    timeoutMs: number = 800,
  ): Promise<ExecId> {
    const known = this.getMainWorld(session, frameId);
    if (known) return known;

    await session.send("Runtime.enable").catch(() => {});
    const knownAfterEnable = this.getMainWorld(session, frameId);
    if (knownAfterEnable) return knownAfterEnable;

    return await new Promise<ExecId>((resolve, reject) => {
      let settled = false;
      let timer: ReturnType<typeof setTimeout> | undefined;

      const onCreated = (
        evt: Protocol.Runtime.ExecutionContextCreatedEvent,
      ): void => {
        const aux = ExecutionContextRegistry.auxOf(evt);
        if (aux.isDefault !== true || aux.frameId !== frameId) return;

        this.register(session, frameId, evt.context.id);
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        session.off("Runtime.executionContextCreated", onCreated);
        resolve(evt.context.id);
      };

      timer = setTimeout(() => {
        if (settled) return;
        settled = true;
        session.off("Runtime.executionContextCreated", onCreated);
        reject(new Error(`main world not ready for frame ${frameId}`));
      }, timeoutMs);

      session.on("Runtime.executionContextCreated", onCreated);
    });
  }

  /** Read the `auxData` fields used to recognise a frame's main world. */
  private static auxOf(evt: Protocol.Runtime.ExecutionContextCreatedEvent): {
    frameId?: string;
    isDefault?: boolean;
  } {
    return (evt.context.auxData ?? {}) as {
      frameId?: string;
      isDefault?: boolean;
    };
  }

  /** Record `frameId <-> ctxId` in both directions for `session`. */
  private register(
    session: CDPSessionLike,
    frameId: FrameId,
    ctxId: ExecId,
  ): void {
    const frameToContext =
      this.byFrame.get(session) ?? new Map<FrameId, ExecId>();
    if (!this.byFrame.has(session)) this.byFrame.set(session, frameToContext);

    const contextToFrame =
      this.byExec.get(session) ?? new Map<ExecId, FrameId>();
    if (!this.byExec.has(session)) this.byExec.set(session, contextToFrame);

    frameToContext.set(frameId, ctxId);
    contextToFrame.set(ctxId, frameId);
  }
}

export const executionContexts = new ExecutionContextRegistry();
/**
 * Normalize user-provided setInputFiles arguments into in-memory payloads.
 *
 * - String entries are treated as file paths. Relative paths are resolved
 *   against `opts.baseDir` (default: `process.cwd()`), must point at a
 *   regular file, and are read fully into memory.
 * - Object entries carrying a `buffer` property are treated as in-memory
 *   payloads; their buffer is converted with `toBuffer`.
 *
 * @param files A path, a payload, an array of either, or null/undefined.
 * @param opts.baseDir Directory used to resolve relative paths.
 * @returns One normalized payload per input entry, in input order; an empty
 *   array for null/undefined/empty input.
 * @throws StagehandInvalidArgumentError for a missing file, a path that is
 *   not a regular file, an unsupported buffer type, or an entry that is
 *   neither a path nor a payload object.
 */
export async function normalizeInputFiles(
  files: SetInputFilesArgument,
  opts: { baseDir?: string } = {},
): Promise<NormalizedFilePayload[]> {
  if (files === null || files === undefined) return [];

  const flattened = Array.isArray(files)
    ? (files as Array<string | SetInputFilePayload>)
    : [files];
  if (!flattened.length) return [];

  const baseDir = opts.baseDir ?? process.cwd();
  const normalized: NormalizedFilePayload[] = [];

  for (const entry of flattened) {
    if (typeof entry === "string") {
      const absolutePath = path.isAbsolute(entry)
        ? entry
        : path.resolve(baseDir, entry);
      const stat = await statFile(absolutePath);
      if (!stat.isFile()) {
        throw new StagehandInvalidArgumentError(
          `setInputFiles(): expected a file but received directory or special entry at ${absolutePath}`,
        );
      }
      const buffer = await fs.readFile(absolutePath);
      normalized.push({
        name: path.basename(absolutePath) || "upload.bin",
        mimeType: DEFAULT_MIME_TYPE,
        buffer,
        // Fall back to "now" when the filesystem reports no mtime (0).
        lastModified: stat.mtimeMs || Date.now(),
        absolutePath,
      });
      continue;
    }

    if (entry && typeof entry === "object" && "buffer" in entry) {
      const payload = entry as SetInputFilePayload;
      const buffer = toBuffer(payload.buffer);
      normalized.push({
        name: payload.name || "upload.bin",
        mimeType: payload.mimeType || DEFAULT_MIME_TYPE,
        buffer,
        lastModified:
          typeof payload.lastModified === "number"
            ? payload.lastModified
            : Date.now(),
      });
      continue;
    }

    throw new StagehandInvalidArgumentError(
      "setInputFiles(): expected file path(s) or payload object(s)",
    );
  }

  return normalized;
}

/**
 * `fs.stat` that reports a missing path (ENOENT) as a
 * StagehandInvalidArgumentError; every other error is rethrown unchanged.
 */
async function statFile(absolutePath: string): Promise<Stats> {
  try {
    return await fs.stat(absolutePath);
  } catch (error) {
    const code = (error as NodeJS.ErrnoException)?.code;
    if (code === "ENOENT") {
      throw new StagehandInvalidArgumentError(
        `setInputFiles(): file not found at ${absolutePath}`,
      );
    }
    throw error;
  }
}

/**
 * Convert a binary-ish value into a Node Buffer.
 *
 * - A Buffer is returned as is (no copy).
 * - A Uint8Array, any other ArrayBuffer view (DataView, Uint16Array, ...)
 *   or a raw ArrayBuffer is copied; for views only the bytes the view
 *   covers (`byteOffset` .. `byteOffset + byteLength`) are taken.
 * - A string is encoded with Buffer's default encoding (UTF-8).
 *
 * @throws StagehandInvalidArgumentError for any other input.
 */
export function toBuffer(
  data: ArrayBuffer | ArrayBufferView | Buffer | string,
): Buffer {
  if (Buffer.isBuffer(data)) return data;
  if (data instanceof Uint8Array) return Buffer.from(data);
  if (typeof data === "string") return Buffer.from(data);
  if (ArrayBuffer.isView(data)) {
    // Wrap the viewed byte range first so Buffer.from copies raw bytes
    // instead of converting typed-array elements one by one.
    return Buffer.from(
      new Uint8Array(data.buffer, data.byteOffset, data.byteLength),
    );
  }
  if (data instanceof ArrayBuffer) return Buffer.from(new Uint8Array(data));
  throw new StagehandInvalidArgumentError(
    "Unsupported file payload buffer type",
  );
}
/**
 * Frame
 *
 * A thin, session-bound handle to a specific DOM frame (by frameId).
 * All CDP calls in this class go through `this.session`, which MUST be the
 * owning session for `this.frameId`. Page is responsible for constructing
 * Frames with the correct session.
 */
export class Frame implements FrameManager {
  /** Owning CDP session id (useful for logs); null when the session has none. */
  public readonly sessionId: string | null;

  constructor(
    public session: CDPSessionLike,
    public frameId: string,
    public pageId: string,
    private readonly remoteBrowser: boolean,
  ) {
    this.sessionId = this.session.id ?? null;
  }

  /** True when the controlled browser runs on a different machine. */
  public isBrowserRemote(): boolean {
    return this.remoteBrowser;
  }

  /** DOM.getNodeForLocation → DOM.describeNode */
  async getNodeAtLocation(x: number, y: number): Promise<Protocol.DOM.Node> {
    await this.session.send("DOM.enable");
    const { backendNodeId } = await this.session.send<{
      backendNodeId: Protocol.DOM.BackendNodeId;
    }>("DOM.getNodeForLocation", {
      x,
      y,
      includeUserAgentShadowDOM: true,
      ignorePointerEventsNone: false,
    });
    const { node } = await this.session.send<{
      node: Protocol.DOM.Node;
    }>("DOM.describeNode", { backendNodeId });
    return node;
  }

  /**
   * CSS selector → DOM.querySelector → DOM.getBoxModel.
   *
   * @returns The first content-quad point (`x`, `y`) together with the box
   *   model's `width` and `height`.
   */
  async getLocationForSelector(
    selector: string,
  ): Promise<{ x: number; y: number; width: number; height: number }> {
    await this.session.send("DOM.enable");
    const { root } = await this.session.send<{ root: Protocol.DOM.Node }>(
      "DOM.getDocument",
    );
    const { nodeId } = await this.session.send<{ nodeId: Protocol.DOM.NodeId }>(
      "DOM.querySelector",
      { nodeId: root.nodeId, selector },
    );
    const { model } = await this.session.send<{ model: Protocol.DOM.BoxModel }>(
      "DOM.getBoxModel",
      { nodeId },
    );
    const x = model.content[0];
    const y = model.content[1];
    const width = model.width;
    const height = model.height;
    return { x, y, width, height };
  }

  /**
   * Accessibility.getFullAXTree for this frame.
   *
   * If the frame-scoped request fails with a frame-scope error, the request
   * is retried without `frameId`. With `withFrames`, the nodes of each direct
   * child frame (one level, not recursive) are appended to the result.
   */
  async getAccessibilityTree(
    withFrames = false,
  ): Promise<Protocol.Accessibility.AXNode[]> {
    await this.session.send("Accessibility.enable");
    let nodes: Protocol.Accessibility.AXNode[];
    try {
      ({ nodes } = await this.session.send<{
        nodes: Protocol.Accessibility.AXNode[];
      }>("Accessibility.getFullAXTree", { frameId: this.frameId }));
    } catch (e) {
      const msg = String((e as Error)?.message ?? e ?? "");
      const isFrameScopeError =
        msg.includes("Frame with the given") ||
        msg.includes("does not belong to the target") ||
        msg.includes("is not found");
      if (!isFrameScopeError) throw e;
      // Retry unscoped: on OOPIF sessions, returns the child doc's AX tree.
      ({ nodes } = await this.session.send<{
        nodes: Protocol.Accessibility.AXNode[];
      }>("Accessibility.getFullAXTree"));
    }

    if (!withFrames) return nodes;

    const children = await this.childFrames();
    for (const child of children) {
      const childNodes = await child.getAccessibilityTree(false);
      // Append one by one: spreading a very large array into push() can
      // exceed the engine's argument limit and throw a RangeError.
      for (const node of childNodes) nodes.push(node);
    }
    return nodes;
  }

  /**
   * Evaluate a function or expression in this frame's main world.
   * - If a string is provided, treated as a JS expression.
   * - If a function is provided, it is stringified and invoked with the
   *   optional argument (passed through JSON).
   *
   * If the context id is rejected with "Cannot find context with specified
   * id", the evaluation is retried once with a freshly resolved id.
   *
   * @throws StagehandEvalError when the evaluated code throws. The message
   *   is the CDP exception text followed by the exception description when
   *   one is available.
   */
  async evaluate<R = unknown, Arg = unknown>(
    pageFunctionOrExpression: string | ((arg: Arg) => R | Promise<R>),
    arg?: Arg,
  ): Promise<R> {
    await this.session.send("Runtime.enable").catch(() => {});
    const contextId = await this.getMainWorldExecutionContextId();

    let expression: string;
    if (typeof pageFunctionOrExpression === "string") {
      expression = pageFunctionOrExpression;
    } else {
      const fnSrc = pageFunctionOrExpression.toString();
      const argJson = JSON.stringify(arg);
      // Kept multi-line so a trailing `//` comment in the function source
      // cannot swallow the statements that follow it.
      expression = `(() => {
        const __fn = ${fnSrc};
        const __arg = ${argJson};
        try {
          const __res = __fn(__arg);
          return Promise.resolve(__res).then(v => {
            try { return JSON.parse(JSON.stringify(v)); } catch { return v; }
          });
        } catch (e) { throw e; }
      })()`;
    }

    let res: Protocol.Runtime.EvaluateResponse;
    try {
      res = await this.runEvaluate(expression, contextId);
    } catch (error) {
      // Execution contexts can be recreated between context lookup and
      // Runtime.evaluate during popup/navigate churn. Retry once with a fresh id.
      const msg = error instanceof Error ? error.message : String(error);
      if (!msg.includes("Cannot find context with specified id")) throw error;
      const freshContextId = await this.getMainWorldExecutionContextId();
      res = await this.runEvaluate(expression, freshContextId);
    }

    if (res.exceptionDetails) {
      // `text` alone is usually just "Uncaught"; the useful part lives on
      // the exception object, so report both when present.
      const { text, exception } = res.exceptionDetails;
      const parts = [text, exception?.description].filter(
        (part): part is string => typeof part === "string" && part.length > 0,
      );
      throw new StagehandEvalError(
        parts.length > 0 ? parts.join(": ") : "Evaluation failed",
      );
    }
    return res.result.value as R;
  }

  /**
   * Page.captureScreenshot (frame-scoped session).
   *
   * `scale` is clamped to [0.1, 2]. With a `clip` it is applied as the
   * clip's scale (default 1); without a clip it is sent as a top-level
   * `scale` parameter only when it differs from 1. `quality` is rounded,
   * clamped to [0, 100] and only sent for JPEG.
   */
  async screenshot(options?: {
    fullPage?: boolean;
    clip?: { x: number; y: number; width: number; height: number };
    type?: "png" | "jpeg";
    quality?: number;
    scale?: number;
  }): Promise<Buffer> {
    await this.session.send("Page.enable");

    const format = options?.type ?? "png";
    const params: Protocol.Page.CaptureScreenshotRequest & { scale?: number } =
      {
        format,
        fromSurface: true,
        captureBeyondViewport: options?.fullPage,
      };

    const clampScale = (value: number): number =>
      Math.min(2, Math.max(0.1, value));
    const normalizedScale =
      typeof options?.scale === "number"
        ? clampScale(options.scale)
        : undefined;

    if (options?.clip) {
      params.clip = {
        x: options.clip.x,
        y: options.clip.y,
        width: options.clip.width,
        height: options.clip.height,
        scale: normalizedScale ?? 1,
      };
    } else if (normalizedScale !== undefined && normalizedScale !== 1) {
      params.scale = normalizedScale;
    }

    if (format === "jpeg" && typeof options?.quality === "number") {
      const q = Math.round(options.quality);
      params.quality = Math.min(100, Math.max(0, q));
    }

    const { data } =
      await this.session.send<Protocol.Page.CaptureScreenshotResponse>(
        "Page.captureScreenshot",
        params,
      );
    return Buffer.from(data, "base64");
  }

  /** Direct child frames via Page.getFrameTree (same session as this frame). */
  async childFrames(): Promise<Frame[]> {
    const { frameTree } = await this.session.send<{
      frameTree: Protocol.Page.FrameTree;
    }>("Page.getFrameTree");

    const frames: Frame[] = [];
    const collect = (tree: Protocol.Page.FrameTree) => {
      if (tree.frame.parentId === this.frameId) {
        frames.push(
          new Frame(
            this.session,
            tree.frame.id,
            this.pageId,
            this.remoteBrowser,
          ),
        );
      }
      tree.childFrames?.forEach(collect);
    };
    collect(frameTree);
    return frames;
  }

  /**
   * Wait for a lifecycle state (load/domcontentloaded/networkidle).
   *
   * Resolves on the first `Page.lifecycleEvent` for this frame whose name
   * matches `state` case-insensitively.
   *
   * @throws Error when no matching event arrives within `timeoutMs`.
   */
  async waitForLoadState(
    state: "load" | "domcontentloaded" | "networkidle" = "load",
    timeoutMs: number = 15_000,
  ): Promise<void> {
    await this.session.send("Page.enable");
    const targetState = state.toLowerCase();
    const timeout = Math.max(0, timeoutMs);

    await new Promise<void>((resolve, reject) => {
      let done = false;
      let timer: ReturnType<typeof setTimeout> | null = null;

      const finish = () => {
        if (done) return;
        done = true;
        this.session.off("Page.lifecycleEvent", handler);
        if (timer) {
          clearTimeout(timer);
          timer = null;
        }
        resolve();
      };

      const handler = (evt: Protocol.Page.LifecycleEventEvent) => {
        const sameFrame = evt.frameId === this.frameId;
        // need to normalize here because CDP lifecycle names look like 'DOMContentLoaded'
        // but we accept 'domcontentloaded'
        const lifecycleName = String(evt.name ?? "").toLowerCase();
        if (sameFrame && lifecycleName === targetState) {
          finish();
        }
      };

      this.session.on("Page.lifecycleEvent", handler);

      timer = setTimeout(() => {
        if (done) return;
        done = true;
        this.session.off("Page.lifecycleEvent", handler);
        reject(
          new Error(
            `waitForLoadState(${state}) timed out after ${timeout}ms for frame ${this.frameId}`,
          ),
        );
      }, timeout);
    });
  }

  /** Create a Locator bound to this frame for `selector`. */
  locator(
    selector: string,
    options?: { deep?: boolean; depth?: number },
  ): Locator {
    return new Locator(this, selector, options);
  }

  /** Run `Runtime.evaluate` by value in the given context, awaiting promises. */
  private runEvaluate(
    expression: string,
    contextId: Protocol.Runtime.ExecutionContextId,
  ): Promise<Protocol.Runtime.EvaluateResponse> {
    return this.session.send<Protocol.Runtime.EvaluateResponse>(
      "Runtime.evaluate",
      {
        expression,
        contextId,
        awaitPromise: true,
        returnByValue: true,
      },
    );
  }

  /** Resolve the main-world execution context id for this frame. */
  private async getMainWorldExecutionContextId(): Promise<Protocol.Runtime.ExecutionContextId> {
    return executionContexts.waitForMainWorld(this.session, this.frameId, 1000);
  }
}
/**
 * Resolve this FrameLocator chain to a concrete Frame.
 *
 * The parent chain is resolved first (falling back to the configured root
 * frame, then the page's main frame). The iframe element is located in the
 * parent frame and described to obtain its backend node id; that id is then
 * compared with the frame owner of each direct child frame known to the
 * page. The remote object handle for the iframe element is always released.
 *
 * @throws ContentFrameNotFoundError when no child frame is owned by the
 *   matched element.
 */
async resolveFrame(): Promise<Frame> {
  let parentFrame: Frame;
  if (this.parent) {
    parentFrame = await this.parent.resolveFrame();
  } else {
    parentFrame = this.root ?? this.page.mainFrame();
  }

  const parentSession = parentFrame.session;
  // Resolve the iframe element inside the parent frame
  const iframeElement = parentFrame.locator(this.selector);
  const { objectId } = await iframeElement.resolveNode();

  try {
    await parentSession.send("DOM.enable").catch(() => {});
    const described =
      await parentSession.send<Protocol.DOM.DescribeNodeResponse>(
        "DOM.describeNode",
        { objectId },
      );
    const wantedBackendNodeId = described.node.backendNodeId;

    // Direct child frames of the parent, as known to the Page's registry
    const candidateIds = await listDirectChildFrameIdsFromRegistry(
      this.page,
      parentFrame.frameId,
      1000,
    );

    for (const candidateId of candidateIds) {
      try {
        const owner = await parentSession.send<{
          backendNodeId: Protocol.DOM.BackendNodeId;
          nodeId?: Protocol.DOM.NodeId;
        }>("DOM.getFrameOwner", {
          frameId: candidateId as Protocol.Page.FrameId,
        });
        if (owner.backendNodeId !== wantedBackendNodeId) continue;

        // Ensure child frame is ready (handles OOPIF adoption or same-process)
        await ensureChildFrameReady(this.page, parentFrame, candidateId, 1200);
        return this.page.frameForId(candidateId);
      } catch {
        // ignore and try next
      }
    }

    throw new ContentFrameNotFoundError(this.selector);
  } finally {
    await parentSession
      .send("Runtime.releaseObject", { objectId })
      .catch(() => {});
  }
}
/**
 * Lazily-resolving stand-in for a Locator inside a FrameLocator.
 *
 * Each call resolves the frame again, builds a locator for the stored
 * selector, narrows it with `.nth()` only when an index was chosen
 * (the default index of -1 means "no narrowing"), and forwards the call.
 */
class LocatorDelegate {
  constructor(
    private readonly fl: FrameLocator,
    private readonly sel: string,
    private readonly nthIndex: number = -1,
  ) {}

  /** Resolve the frame and return the concrete Locator for this delegate. */
  private async real(): Promise<Locator> {
    const frame = await this.fl.resolveFrame();
    const whole = frame.locator(this.sel);
    return this.nthIndex >= 0 ? whole.nth(this.nthIndex) : whole;
  }

  // ---- Locator API: each method resolves, then forwards unchanged ----

  async click(options?: {
    button?: "left" | "right" | "middle";
    clickCount?: number;
  }) {
    const target = await this.real();
    return target.click(options);
  }

  async hover() {
    const target = await this.real();
    return target.hover();
  }

  async fill(value: string) {
    const target = await this.real();
    return target.fill(value);
  }

  async type(text: string, options?: { delay?: number }) {
    const target = await this.real();
    return target.type(text, options);
  }

  async selectOption(values: string | string[]) {
    const target = await this.real();
    return target.selectOption(values);
  }

  async scrollTo(percent: number | string) {
    const target = await this.real();
    return target.scrollTo(percent);
  }

  async isVisible() {
    const target = await this.real();
    return target.isVisible();
  }

  async isChecked() {
    const target = await this.real();
    return target.isChecked();
  }

  async inputValue() {
    const target = await this.real();
    return target.inputValue();
  }

  async textContent() {
    const target = await this.real();
    return target.textContent();
  }

  async innerHtml() {
    const target = await this.real();
    return target.innerHtml();
  }

  async innerText() {
    const target = await this.real();
    return target.innerText();
  }

  async count() {
    const target = await this.real();
    return target.count();
  }

  /** Shorthand for `nth(0)`. */
  first(): LocatorDelegate {
    return this.nth(0);
  }

  /**
   * Select the match at `index` (fractions are floored).
   *
   * @returns This delegate when the index is unchanged, otherwise a new one.
   * @throws StagehandInvalidArgumentError for negative or non-finite input.
   */
  nth(index: number): LocatorDelegate {
    const requested = Number(index);
    const usable = Number.isFinite(requested) && requested >= 0;
    if (!usable) {
      throw new StagehandInvalidArgumentError(
        "locator().nth() expects a non-negative index",
      );
    }

    const position = Math.floor(requested);
    return position === this.nthIndex
      ? this
      : new LocatorDelegate(this.fl, this.sel, position);
  }
}
*/
export function frameLocatorFromFrame(
  page: Page,
  root: Frame,
  selector: string,
): FrameLocator {
  return new FrameLocator(page, selector, undefined, root);
}

/**
 * Poll the page's frame tree until `parentFrameId` reports at least one direct
 * child frame, or until `timeoutMs` has elapsed.
 *
 * @param page - Page whose `getFullFrameTree()` snapshot is polled.
 * @param parentFrameId - Frame id whose direct children are wanted.
 * @param timeoutMs - Polling budget in milliseconds; polls are 50 ms apart.
 * @returns The child frame ids, or an empty array when none were seen before
 *   the deadline.
 */
async function listDirectChildFrameIdsFromRegistry(
  page: Page,
  parentFrameId: string,
  timeoutMs: number,
): Promise<string[]> {
  const deadline = Date.now() + timeoutMs;
  while (true) {
    let ids: string[] = [];
    try {
      const node = findFrameNode(page.getFullFrameTree(), parentFrameId);
      ids = node?.childFrames?.map((c) => c.frame.id as string) ?? [];
    } catch {
      // A failed snapshot is treated the same as "no children yet".
    }
    // The deadline is checked outside the try block so that a snapshot that
    // keeps throwing cannot keep this loop spinning forever.
    if (ids.length > 0 || Date.now() >= deadline) return ids;
    await new Promise<void>((resolve) => setTimeout(resolve, 50));
  }
}

/**
 * Depth-first search for the node whose `frame.id` equals `targetId`.
 *
 * @param tree - Root of the (sub)tree to search.
 * @param targetId - Frame id to look for.
 * @returns The matching node, or `undefined` when the id is not in the tree.
 */
function findFrameNode(
  tree: Protocol.Page.FrameTree,
  targetId: string,
): Protocol.Page.FrameTree | undefined {
  if (tree.frame.id === targetId) return tree;
  for (const child of tree.childFrames ?? []) {
    const hit = findFrameNode(child, targetId);
    if (hit) return hit;
  }
  return undefined;
}

/**
 * Ensure we can evaluate in the child frame with minimal delay.
 * - If the child is same-process: parent session owns it and main world appears quickly.
 * - If OOPIF and adoption not finished: wait briefly for ownership change, then main world.
 *
 * Best effort: the function resolves once a main world is seen, once the wait
 * on the new owner settles (successfully or not), or once the budget runs out.
 * It never rejects because the child failed to become ready.
 *
 * @param page - Page used to look up which session owns `childFrameId`.
 * @param parentFrame - Frame whose session currently hosts the iframe element.
 * @param childFrameId - Id of the child frame to wait for.
 * @param budgetMs - Maximum polling time in milliseconds (negative counts as 0).
 */
async function ensureChildFrameReady(
  page: Page,
  parentFrame: Frame,
  childFrameId: string,
  budgetMs: number,
): Promise<void> {
  const parentSession = parentFrame.session;
  const deadline = Date.now() + Math.max(0, budgetMs);

  // If already owned by a different session (OOPIF adopted), wait briefly there.
  const owner = page.getSessionForFrame(childFrameId);
  if (owner && owner !== parentSession) {
    try {
      await executionContexts.waitForMainWorld(owner, childFrameId, 600);
    } catch {
      // best effort
    }
    return;
  }

  const hasMainWorldOnParent = (): boolean => {
    try {
      return (
        executionContexts.getMainWorld(parentSession, childFrameId) !== null
      );
    } catch {
      return false;
    }
  };

  if (hasMainWorldOnParent()) return;

  await parentSession
    .send("Page.setLifecycleEventsEnabled", { enabled: true })
    .catch(() => {});
  await parentSession.send("Runtime.enable").catch(() => {});

  // Lifecycle events after which the child is worth re-checking. Both
  // spellings of the network-idle event are accepted.
  const readyEvents = new Set<string>([
    "DOMContentLoaded",
    "load",
    "networkIdle",
    "networkidle",
  ]);

  await new Promise<void>((resolve) => {
    let done = false;
    let handedOff = false;
    let timer: ReturnType<typeof setTimeout> | undefined;

    const finish = (): void => {
      if (done) return;
      done = true;
      // Drop the pending poll so it does not linger after we are done.
      if (timer !== undefined) clearTimeout(timer);
      parentSession.off("Page.lifecycleEvent", onLifecycle);
      resolve();
    };

    /**
     * If another session has taken ownership of the child frame, wait for its
     * main world there and finish when that wait settles.
     * Returns true when such a wait is (already) in flight.
     */
    const handOffToNewOwner = (): boolean => {
      if (handedOff) return true;
      try {
        const nowOwner = page.getSessionForFrame(childFrameId);
        if (!nowOwner || nowOwner === parentSession) return false;
        const left = Math.max(150, deadline - Date.now());
        // `then(finish, finish)` settles on both outcomes; a bare
        // `.finally(finish)` would leave a rejected wait unhandled.
        executionContexts
          .waitForMainWorld(nowOwner, childFrameId, left)
          .then(finish, finish);
        handedOff = true;
        return true;
      } catch {
        // ignore: keep polling until the deadline
        return false;
      }
    };

    const onLifecycle = (evt: Protocol.Page.LifecycleEventEvent): void => {
      if (evt.frameId !== childFrameId || !readyEvents.has(evt.name)) return;
      if (hasMainWorldOnParent()) {
        finish();
        return;
      }
      handOffToNewOwner();
    };

    parentSession.on("Page.lifecycleEvent", onLifecycle);

    // Poll every 50 ms as a fallback in case no lifecycle event arrives.
    const tick = (): void => {
      if (done) return;
      if (hasMainWorldOnParent()) {
        finish();
        return;
      }
      if (handOffToNewOwner()) return;
      if (Date.now() >= deadline) {
        finish();
        return;
      }
      timer = setTimeout(tick, 50);
    };

    tick();
  });
}

================================================
FILE: packages/core/lib/v3/understudy/frameRegistry.ts
================================================
// lib/v3/understudy/frameRegistry.ts
import type { Protocol } from
"devtools-protocol"; /** * FrameRegistry * * Purpose: * A single, authoritative source of truth for **both**: * 1) Frame topology (parent/children, current main/root id, last-seen CDP `Frame`) * 2) Frame → Session ownership (which CDP session owns a given frameId) * 3) Optional iframe-owner metadata (backendNodeId of the `), ); // Wait for iframe to load await new Promise((resolve) => setTimeout(resolve, 500)); // Count buttons in main frame only const mainFrameCount = await page.mainFrame().locator("button").count(); expect(mainFrameCount).toBe(2); // Should only find buttons in main frame }); test("count() works with frameLocator for iframe content", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent(` `), ); // Wait for iframe to load await new Promise((resolve) => setTimeout(resolve, 500)); // Count buttons in iframe using frameLocator const iframeLocator = page.frameLocator("#test-iframe"); const iframeCount = await iframeLocator.locator("button").count(); expect(iframeCount).toBe(3); // Should find 3 buttons in iframe }); test("count() with nested iframes", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent(`

Main Frame
`), ); // Wait for all iframes to load await new Promise((resolve) => setTimeout(resolve, 800)); // Count at each level const mainCount = await page.mainFrame().locator(".level-0").count(); expect(mainCount).toBe(1); const frame1Count = await page .frameLocator("#frame1") .locator(".level-1") .count(); expect(frame1Count).toBe(1); const frame2Count = await page .frameLocator("#frame1") .frameLocator("#frame2") .locator(".level-2") .count(); expect(frame2Count).toBe(2); }); test("count() with same selector in multiple contexts", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent(` Main 1 Main 2 `), ); // Wait for iframes to load await new Promise((resolve) => setTimeout(resolve, 500)); // Count in each context const mainCount = await page.mainFrame().locator(".item").count(); const frame1Count = await page .frameLocator("#frame1") .locator(".item") .count(); const frame2Count = await page .frameLocator("#frame2") .locator(".item") .count(); expect(mainCount).toBe(2); // Main frame items only expect(frame1Count).toBe(1); // Frame 1 items only expect(frame2Count).toBe(3); // Frame 2 items only }); test("count() returns 0 for non-existent iframe", async () => { const page = v3.context.pages()[0]; await page.goto("data:text/html,
No iframes here
"); try { const frameLocator = page.frameLocator("#non-existent"); await frameLocator.locator("button").count(); // If we get here, the test should fail expect(true).toBe(false); } catch (error) { // Expected behavior - frameLocator should throw when iframe doesn't exist expect(error.message).toContain( "Could not find an element for the given xPath(s):", ); } }); }); ================================================ FILE: packages/core/tests/integration/locator-count.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { closeV3 } from "./testUtils.js"; test.describe("Locator count() method tests", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3DynamicTestConfig); await v3.init(); }); test.afterEach(async () => { await closeV3(v3); }); test("count() returns correct number for CSS selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html,
1
2
3
4", ); const locator = page.mainFrame().locator(".test"); const count = await locator.count(); expect(count).toBe(3); }); test("count() returns 0 for non-matching selectors", async () => { const page = v3.context.pages()[0]; await page.goto("data:text/html,
Test
"); const locator = page.mainFrame().locator(".non-existent"); const count = await locator.count(); expect(count).toBe(0); }); test("count() works with XPath selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html,", ); const locator = page.mainFrame().locator("//button"); const count = await locator.count(); expect(count).toBe(3); }); test("count() works with text selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html,
Click me
Don't click me", ); const locator = page.mainFrame().locator("text=Click me"); const count = await locator.count(); // Case-insensitive substring match: also matches "Don't click me" expect(count).toBe(3); }); test("count() handles shadow DOM elements", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); // Wait a bit for shadow DOM to be attached await new Promise((resolve) => setTimeout(resolve, 100)); const locator = page.mainFrame().locator("button"); const count = await locator.count(); expect(count).toBe(2); }); test("count() works with complex CSS selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html,
12
3
", ); const locator = page.mainFrame().locator(".container .item"); const count = await locator.count(); expect(count).toBe(2); }); }); ================================================ FILE: packages/core/tests/integration/locator-fill.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { StagehandLocatorError } from "../../lib/v3/types/public/sdkErrors.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Locator.fill()", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch((e) => { void e; }); }); test("fills date inputs via value setter even when beforeinput blocks insertText", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const dateInput = page.mainFrame().locator("xpath=/html/body/input"); await dateInput.fill("2026-01-01"); const value = await dateInput.inputValue(); expect(value).toBe("2026-01-01"); }); test("xpath case: throws StagehandLocatorError when fill encounters an exception", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); await page.waitForSelector("xpath=/html/body/input"); await page.evaluate(() => { const input = document.querySelector("input"); Object.defineProperty(input, "isConnected", { get() { throw new Error("boom"); }, }); }); const dateInput = page.mainFrame().locator("xpath=/html/body/input"); let error: unknown; try { await dateInput.fill("2026-01-01"); } catch (err) { error = err; } expect(error).toBeInstanceOf(StagehandLocatorError); if (error instanceof Error) { // Log the message so it's visible in test output. 
expect(error.message).toContain("Error Filling Element"); expect(error.message).toContain("selector: xpath=/html/body/input"); expect(error.message).toContain("boom"); } }); test("css selector case: throws StagehandLocatorError when fill encounters an exception", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); await page.waitForSelector("#date"); // Override in main world await page.evaluate(() => { const input = document.querySelector("input"); Object.defineProperty(input, "isConnected", { get() { throw new Error("boom"); }, configurable: true, }); }); // Also override in the isolated world that CSS selectors use const frameId = page.mainFrameId(); const { executionContextId } = await page.sendCDP<{ executionContextId: number; }>("Page.createIsolatedWorld", { frameId, worldName: "v3-world", }); await page.sendCDP("Runtime.evaluate", { expression: `(() => { const input = document.querySelector('input'); if (input) { Object.defineProperty(input, 'isConnected', { get() { throw new Error("boom"); }, configurable: true }); } })()`, contextId: executionContextId, }); const dateInput = page.mainFrame().locator("#date"); let error: unknown; try { await dateInput.fill("2026-01-01"); } catch (err) { error = err; } expect(error).toBeInstanceOf(StagehandLocatorError); if (error instanceof Error) { expect(error.message).toContain("Error Filling Element"); expect(error.message).toContain("selector: #date"); expect(error.message).toContain("boom"); } }); }); ================================================ FILE: packages/core/tests/integration/locator-input-methods.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Locator input methods (fill, type, hover, isVisible, isChecked)", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); 
await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch((e) => { void e; }); }); test("Locator.fill() sets input value directly", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
`, ), ); const input = page.mainFrame().locator("#name"); await input.fill("Hello World"); const value = await input.inputValue(); expect(value).toBe("Hello World"); }); test("Locator.type() types text character by character", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const input = page.mainFrame().locator("#search"); await input.type("test123", { delay: 10 }); const value = await input.inputValue(); expect(value).toBe("test123"); }); test("Locator.hover() moves mouse to element center", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const btn = page.mainFrame().locator("#btn"); await btn.hover(); const hovered = await page.mainFrame().evaluate(() => { const b = document.getElementById("btn") as HTMLButtonElement | null; return b?.dataset.hovered === "true"; }); expect(hovered).toBe(true); }); test("Locator.isVisible() returns true for visible elements", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
I am visible
I am transparent
Zero size
`, ), ); const visible = await page.mainFrame().locator("#visible").isVisible(); expect(visible).toBe(true); const hidden = await page.mainFrame().locator("#hidden").isVisible(); expect(hidden).toBe(false); const invisible = await page.mainFrame().locator("#invisible").isVisible(); expect(invisible).toBe(false); const transparent = await page .mainFrame() .locator("#transparent") .isVisible(); expect(transparent).toBe(false); const zeroSize = await page.mainFrame().locator("#zero-size").isVisible(); expect(zeroSize).toBe(false); }); test("Locator.isChecked() detects checkbox state", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const checked = await page.mainFrame().locator("#checked").isChecked(); expect(checked).toBe(true); const unchecked = await page.mainFrame().locator("#unchecked").isChecked(); expect(unchecked).toBe(false); const radioSelected = await page .mainFrame() .locator("#radio-selected") .isChecked(); expect(radioSelected).toBe(true); const radioUnselected = await page .mainFrame() .locator("#radio-unselected") .isChecked(); expect(radioUnselected).toBe(false); }); test("Locator.fill() on textarea", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const ta = page.mainFrame().locator("#ta"); await ta.fill("Multi\nline\ntext"); const value = await ta.inputValue(); expect(value).toBe("Multi\nline\ntext"); }); test("Locator.fill() clears and sets new value", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const inp = page.mainFrame().locator("#inp"); let value = await inp.inputValue(); expect(value).toBe("initial"); await inp.fill("replaced"); value = await inp.inputValue(); expect(value).toBe("replaced"); }); }); ================================================ FILE: packages/core/tests/integration/locator-nth.spec.ts 
================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { closeV3 } from "./testUtils.js"; test.describe("Locator nth() method tests", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3DynamicTestConfig); await v3.init(); }); test.afterEach(async () => { await closeV3(v3); }); test("nth() returns correct element for CSS selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
1
' + '
2
' + '
3
' + '4', ), ); // Test nth() with CSS selectors const locator0 = page.mainFrame().locator(".test").nth(0); const text0 = await locator0.textContent(); expect(text0).toBe("1"); const locator1 = page.mainFrame().locator(".test").nth(1); const text1 = await locator1.textContent(); expect(text1).toBe("2"); const locator2 = page.mainFrame().locator(".test").nth(2); const text2 = await locator2.textContent(); expect(text2).toBe("3"); }); test("nth() returns correct element for XPath selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + '' + '', ), ); // Test nth() with XPath selectors const locator0 = page.mainFrame().locator("//button").nth(0); const text0 = await locator0.textContent(); expect(text0).toBe("Button 1"); const locator1 = page.mainFrame().locator("//button").nth(1); const text1 = await locator1.textContent(); expect(text1).toBe("Button 2"); const locator2 = page.mainFrame().locator("//button").nth(2); const text2 = await locator2.textContent(); expect(text2).toBe("Button 3"); }); test("nth() returns correct element for text selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Click me
' + '' + 'Click me', ), ); // Test nth() with text selectors const locator0 = page.mainFrame().locator("text=Click me").nth(0); const text0 = await locator0.textContent(); expect(text0).toBe("Click me"); const locator1 = page.mainFrame().locator("text=Click me").nth(1); const text1 = await locator1.textContent(); expect(text1).toBe("Click me"); const locator2 = page.mainFrame().locator("text=Click me").nth(2); const text2 = await locator2.textContent(); expect(text2).toBe("Click me"); }); test("nth() with shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); // Wait a bit for shadow DOM to be attached await new Promise((resolve) => setTimeout(resolve, 100)); // Test nth() with shadow DOM elements const locator0 = page.mainFrame().locator("button").nth(0); const text0 = await locator0.textContent(); expect(text0).toBe("Shadow Button 1"); const locator1 = page.mainFrame().locator("button").nth(1); const text1 = await locator1.textContent(); expect(text1).toBe("Shadow Button 2"); const locator2 = page.mainFrame().locator("button").nth(2); const text2 = await locator2.textContent(); expect(text2).toBe("Shadow Button 3"); }); test("nth() with out of bounds index throws error", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
1
' + '
2
', ), ); // Test with out of bounds index - should throw an error const locator = page.mainFrame().locator(".test").nth(5); let error = null; try { await locator.textContent(); } catch (e) { error = e; } expect(error).not.toBeNull(); }); test("nth() works with complex CSS selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + '1' + '2' + "
" + "
" + '3' + "
", ), ); // Test nth() with complex CSS selectors const locator0 = page.mainFrame().locator(".container .item").nth(0); const text0 = await locator0.textContent(); expect(text0).toBe("1"); const locator1 = page.mainFrame().locator(".container .item").nth(1); const text1 = await locator1.textContent(); expect(text1).toBe("2"); }); test("nth() can be chained with other locator methods", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
First
' + '
Second
' + '
Third
', ), ); // Test that nth() returns a Locator that can be used for other actions const locator = page.mainFrame().locator(".test").nth(1); const text = await locator.textContent(); expect(text).toBe("Second"); // Verify it's visible const isVisible = await locator.isVisible(); expect(isVisible).toBe(true); }); test("nth(0) is equivalent to first()", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
First
' + '
Second
' + '
Third
', ), ); // Verify nth(0) returns the same element as first() const nthLocator = page.mainFrame().locator(".test").nth(0); const nthText = await nthLocator.textContent(); const firstLocator = page.mainFrame().locator(".test").first(); const firstText = await firstLocator.textContent(); expect(nthText).toBe(firstText); expect(nthText).toBe("First"); }); test("nth() works correctly with iframe selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + '' + '' + "", ), ); // Wait for iframe to load await new Promise((resolve) => setTimeout(resolve, 100)); // Test that nth() works correctly with buttons in the main frame const mainLocator0 = page.mainFrame().locator("button").nth(0); const mainText0 = await mainLocator0.textContent(); expect(mainText0).toBe("Main Button 1"); const mainLocator1 = page.mainFrame().locator("button").nth(1); const mainText1 = await mainLocator1.textContent(); expect(mainText1).toBe("Main Button 2"); }); }); ================================================ FILE: packages/core/tests/integration/locator-select-option.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Locator.selectOption() method", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch((e) => { void e; // ignore cleanup errors }); }); test("selectOption() selects single option by value", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#fruit"); const selected = await select.selectOption("banana"); expect(selected).toEqual(["banana"]); const value = await page.mainFrame().evaluate(() => { const s = document.getElementById("fruit") as HTMLSelectElement | 
null; return s?.value; }); expect(value).toBe("banana"); }); test("selectOption() selects option by label/text", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#country"); const selected = await select.selectOption("United Kingdom"); expect(selected).toEqual(["uk"]); }); test("selectOption() selects multiple options in multiple select", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#colors"); const selected = await select.selectOption(["red", "blue"]); expect(selected.sort()).toEqual(["blue", "red"]); const values = await page.mainFrame().evaluate(() => { const s = document.getElementById("colors") as HTMLSelectElement | null; return Array.from(s?.selectedOptions ?? []).map((o) => o.value); }); expect(values.sort()).toEqual(["blue", "red"]); }); test("selectOption() deselects previous option on single select", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#size"); let value = await page.mainFrame().evaluate(() => { const s = document.getElementById("size") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe("m"); await select.selectOption("l"); value = await page.mainFrame().evaluate(() => { const s = document.getElementById("size") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe("l"); }); test("selectOption() triggers change event", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
`, ), ); const select = page.mainFrame().locator("#opt"); await select.selectOption("b"); const output = await page.mainFrame().evaluate(() => { const out = document.getElementById("out"); return out?.textContent; }); expect(output).toBe("changed-b"); }); test("selectOption() with optgroup structure", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#grouped"); await select.selectOption("celery"); const value = await page.mainFrame().evaluate(() => { const s = document.getElementById("grouped") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe("celery"); }); test("selectOption() returns array of selected values", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#multi"); const selected = await select.selectOption(["1", "3"]); expect(selected).toContain("1"); expect(selected).toContain("3"); expect(selected.length).toBe(2); }); test("selectOption() with empty string value", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#opt"); const selected = await select.selectOption(""); expect(selected).toEqual([""]); const value = await page.mainFrame().evaluate(() => { const s = document.getElementById("opt") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe(""); }); test("selectOption() with numeric values", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#nums"); await select.selectOption("10"); const value = await page.mainFrame().evaluate(() => { const s = document.getElementById("nums") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe("10"); }); test("selectOption() with disabled 
option", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); const select = page.mainFrame().locator("#mixed"); // Should still select disabled option if explicitly requested await select.selectOption("b"); const value = await page.mainFrame().evaluate(() => { const s = document.getElementById("mixed") as HTMLSelectElement | null; return s?.value; }); expect(value).toBe("b"); }); }); ================================================ FILE: packages/core/tests/integration/logger-initialization.spec.ts ================================================ import { test, expect } from "@playwright/test"; import { bindInstanceLogger, unbindInstanceLogger, withInstanceLogContext, v3Logger, } from "../../lib/v3/logger.js"; import type { LogLine } from "../../lib/v3/types/public/logs.js"; test.describe("V3 Logger Instance Routing", () => { test.afterEach(() => { // Clean up is handled by unbindInstanceLogger calls in tests }); test("bindInstanceLogger routes logs to correct instance", () => { const instanceId = "test-instance-001"; const capturedLogs: LogLine[] = []; bindInstanceLogger(instanceId, (line) => { capturedLogs.push(line); }); try { // Log within context withInstanceLogContext(instanceId, () => { v3Logger({ category: "test", message: "Test message for instance", level: 1, }); }); // Should have captured the log expect(capturedLogs.length).toBe(1); expect(capturedLogs[0].message).toBe("Test message for instance"); } finally { unbindInstanceLogger(instanceId); } }); test("unbindInstanceLogger stops routing", () => { const instanceId = "test-instance-002"; const capturedLogs: LogLine[] = []; const consoleOutput: string[] = []; const originalConsoleLog = console.log; try { console.log = (msg: string) => { consoleOutput.push(msg); }; bindInstanceLogger(instanceId, (line) => { capturedLogs.push(line); }); // Unbind immediately unbindInstanceLogger(instanceId); // Log - should fall back to console 
withInstanceLogContext(instanceId, () => { v3Logger({ category: "test", message: "After unbind", level: 1, }); }); // Should not have captured via instance logger expect(capturedLogs.length).toBe(0); // But should have logged to console expect(consoleOutput.length).toBeGreaterThan(0); } finally { console.log = originalConsoleLog; unbindInstanceLogger(instanceId); } }); test("multiple instances have isolated log routing", () => { const instance1Id = "test-instance-1"; const instance2Id = "test-instance-2"; const instance1Logs: LogLine[] = []; const instance2Logs: LogLine[] = []; bindInstanceLogger(instance1Id, (line) => instance1Logs.push(line)); bindInstanceLogger(instance2Id, (line) => instance2Logs.push(line)); try { // Log from instance 1 withInstanceLogContext(instance1Id, () => { v3Logger({ category: "test", message: "From instance 1", level: 1, }); }); // Log from instance 2 withInstanceLogContext(instance2Id, () => { v3Logger({ category: "test", message: "From instance 2", level: 1, }); }); // Each instance should have only its own log expect(instance1Logs.length).toBe(1); expect(instance2Logs.length).toBe(1); expect(instance1Logs[0].message).toBe("From instance 1"); expect(instance2Logs[0].message).toBe("From instance 2"); } finally { unbindInstanceLogger(instance1Id); unbindInstanceLogger(instance2Id); } }); test("v3Logger falls back to console when no instance context", () => { const capturedLogs: string[] = []; const originalConsoleLog = console.log; try { console.log = (msg: string) => { capturedLogs.push(msg); }; // Log without any instance context v3Logger({ category: "test", message: "Console fallback log", level: 1, }); // Should have used console logger expect(capturedLogs.length).toBeGreaterThan(0); const logOutput = capturedLogs.join("\n"); expect(logOutput).toContain("Console fallback log"); } finally { console.log = originalConsoleLog; } }); test("v3Logger falls back to console when instance logger throws", () => { const instanceId = 
"failing-instance"; const capturedConsoleLogs: string[] = []; const originalConsoleLog = console.log; try { console.log = (msg: string) => { capturedConsoleLogs.push(msg); }; // Bind a logger that throws bindInstanceLogger(instanceId, () => { throw new Error("Instance logger failed"); }); // Should fall back to console without throwing withInstanceLogContext(instanceId, () => { expect(() => { v3Logger({ category: "test", message: "Test with failing instance logger", level: 1, }); }).not.toThrow(); }); // Console should have received the log as fallback expect(capturedConsoleLogs.length).toBeGreaterThan(0); const logOutput = capturedConsoleLogs.join("\n"); expect(logOutput).toContain("Test with failing instance logger"); } finally { console.log = originalConsoleLog; unbindInstanceLogger(instanceId); } }); test("withInstanceLogContext nests properly", () => { const outerInstanceId = "outer-instance"; const innerInstanceId = "inner-instance"; const outerLogs: LogLine[] = []; const innerLogs: LogLine[] = []; bindInstanceLogger(outerInstanceId, (line) => outerLogs.push(line)); bindInstanceLogger(innerInstanceId, (line) => innerLogs.push(line)); try { withInstanceLogContext(outerInstanceId, () => { v3Logger({ category: "test", message: "Outer context", level: 1, }); withInstanceLogContext(innerInstanceId, () => { v3Logger({ category: "test", message: "Inner context", level: 1, }); }); v3Logger({ category: "test", message: "Back to outer context", level: 1, }); }); // Outer instance should have 2 logs expect(outerLogs.length).toBe(2); expect(outerLogs[0].message).toBe("Outer context"); expect(outerLogs[1].message).toBe("Back to outer context"); // Inner instance should have 1 log expect(innerLogs.length).toBe(1); expect(innerLogs[0].message).toBe("Inner context"); } finally { unbindInstanceLogger(outerInstanceId); unbindInstanceLogger(innerInstanceId); } }); test("withInstanceLogContext returns function result", () => { const instanceId = "return-test-instance"; 
bindInstanceLogger(instanceId, () => {}); try { const result = withInstanceLogContext(instanceId, () => { return { success: true, value: 42 }; }); expect(result).toEqual({ success: true, value: 42 }); } finally { unbindInstanceLogger(instanceId); } }); test("withInstanceLogContext works with async functions", async () => { const instanceId = "async-test-instance"; const capturedLogs: LogLine[] = []; bindInstanceLogger(instanceId, (line) => capturedLogs.push(line)); try { const asyncResult = await withInstanceLogContext(instanceId, async () => { v3Logger({ category: "test", message: "Log from async context", level: 1, }); await new Promise((resolve) => setTimeout(resolve, 10)); v3Logger({ category: "test", message: "Log after await", level: 1, }); return "async result"; }); expect(asyncResult).toBe("async result"); expect(capturedLogs.length).toBe(2); expect(capturedLogs[0].message).toBe("Log from async context"); expect(capturedLogs[1].message).toBe("Log after await"); } finally { unbindInstanceLogger(instanceId); } }); test("console fallback formats different log levels correctly", () => { const consoleOutput: { level: string; msg: string }[] = []; const originalConsoleLog = console.log; const originalConsoleError = console.error; const originalConsoleDebug = console.debug; try { console.log = (msg: string) => { consoleOutput.push({ level: "log", msg }); }; console.error = (msg: string) => { consoleOutput.push({ level: "error", msg }); }; console.debug = (msg: string) => { consoleOutput.push({ level: "debug", msg }); }; // Test error level (0) v3Logger({ category: "test", message: "Error message", level: 0, }); // Test info level (1) v3Logger({ category: "test", message: "Info message", level: 1, }); // Test debug level (2) v3Logger({ category: "test", message: "Debug message", level: 2, }); expect(consoleOutput.length).toBe(3); expect(consoleOutput[0].level).toBe("error"); expect(consoleOutput[0].msg).toContain("ERROR"); 
expect(consoleOutput[0].msg).toContain("Error message");

      expect(consoleOutput[1].level).toBe("log");
      expect(consoleOutput[1].msg).toContain("INFO");
      expect(consoleOutput[1].msg).toContain("Info message");

      expect(consoleOutput[2].level).toBe("debug");
      expect(consoleOutput[2].msg).toContain("DEBUG");
      expect(consoleOutput[2].msg).toContain("Debug message");
    } finally {
      // Always restore the real console methods, even if an assertion failed.
      console.log = originalConsoleLog;
      console.error = originalConsoleError;
      console.debug = originalConsoleDebug;
    }
  });

  test("console fallback formats auxiliary data", () => {
    const printed: string[] = [];
    const originalConsoleLog = console.log;

    try {
      console.log = (msg: string) => {
        printed.push(msg);
      };

      // No instance context is active here, so the line goes to the console.
      v3Logger({
        category: "test",
        message: "Message with auxiliary",
        level: 1,
        auxiliary: {
          stringValue: { value: "test", type: "string" },
          integerValue: { value: "42", type: "integer" },
          objectValue: {
            value: JSON.stringify({ nested: "data" }),
            type: "object",
          },
        },
      });

      expect(printed.length).toBe(1);
      const rendered = printed[0];
      // The message and every auxiliary key must appear in the rendered line.
      expect(rendered).toContain("Message with auxiliary");
      expect(rendered).toContain("stringValue");
      expect(rendered).toContain("integerValue");
      expect(rendered).toContain("objectValue");
    } finally {
      console.log = originalConsoleLog;
    }
  });

  test("concurrent instances don't interfere", () => {
    const instances = Array.from({ length: 10 }, (_, i) => `instance-${i}`);
    // Captured lines per instance id. The explicit type arguments keep
    // `get(id)` typed as `LogLine[] | undefined` instead of `any`.
    const logsByInstance = new Map<string, LogLine[]>();

    // Bind all instances, each with its own capture array
    instances.forEach((id) => {
      const logs: LogLine[] = [];
      logsByInstance.set(id, logs);
      bindInstanceLogger(id, (line) => logs.push(line));
    });

    try {
      // Log exactly once from inside each instance's context
      instances.forEach((id, index) => {
        withInstanceLogContext(id, () => {
          v3Logger({
            category: "test",
            message: `Message from ${id}`,
            level: 1,
            auxiliary: {
              index: { value: String(index), type: "integer" },
            },
          });
        });
      });

      // Verify each instance received only its own log
      instances.forEach((id) => {
        const logs = logsByInstance.get(id)!;
        expect(logs.length).toBe(1);
expect(logs[0].message).toBe(`Message from ${id}`);
      });
    } finally {
      instances.forEach((id) => unbindInstanceLogger(id));
    }
  });
});

test.describe("V3 Logger with External Logger (Production Pattern)", () => {
  test.afterEach(() => {
    // Clean up instance loggers
    // (currently a no-op: the tests below unbind in their own `finally`)
  });

  test("external logger receives all logs from v3Logger", () => {
    const boundId = "v3-instance-with-external";
    const received: LogLine[] = [];

    // Stand-in for a user-supplied logger passed to the V3 constructor.
    const externalLogger = (line: LogLine) => {
      received.push(line);
    };

    bindInstanceLogger(boundId, externalLogger);

    try {
      withInstanceLogContext(boundId, () => {
        // One line per level: 0, 1 and 2.
        v3Logger({
          category: "a11y/snapshot",
          message: "Capturing hybrid snapshot",
          level: 0,
        });

        v3Logger({
          category: "handlers/act",
          message: "Executing action",
          level: 1,
          auxiliary: {
            action: { value: "click", type: "string" },
          },
        });

        v3Logger({
          category: "debug",
          message: "Debug details",
          level: 2,
        });
      });

      // Every line, regardless of level, must arrive in call order.
      expect(received.length).toBe(3);
      expect(received[0].message).toBe("Capturing hybrid snapshot");
      expect(received[1].message).toBe("Executing action");
      expect(received[2].message).toBe("Debug details");
    } finally {
      unbindInstanceLogger(boundId);
    }
  });

  test("StagehandLogger wrapper forwards to external logger", () => {
    const instanceId = "v3-with-stagehand-wrapper";
    const externalLogs: LogLine[] = [];

    // Mimics the shape of an object with a `log` method that forwards each
    // line to an external sink.
    const mockStagehandLogger = {
      log: (line: LogLine) => {
        externalLogs.push(line);
      },
    };

    bindInstanceLogger(instanceId, (line) => mockStagehandLogger.log(line));

    try {
      withInstanceLogContext(instanceId, () => {
        v3Logger({
          category: "test",
          message: "Log through StagehandLogger wrapper",
          level: 1,
        });
      });

      expect(externalLogs.length).toBe(1);
      expect(externalLogs[0].message).toBe(
        "Log through StagehandLogger wrapper",
      );
    } finally {
unbindInstanceLogger(instanceId);
    }
  });

  test("multiple V3 instances with different external loggers", () => {
    const firstId = "v3-instance-1";
    const secondId = "v3-instance-2";
    const firstSink: LogLine[] = [];
    const secondSink: LogLine[] = [];

    // Two instances, each bound to its own capture array.
    bindInstanceLogger(firstId, (line) => firstSink.push(line));
    bindInstanceLogger(secondId, (line) => secondSink.push(line));

    try {
      // Log once from the first instance's context
      withInstanceLogContext(firstId, () => {
        v3Logger({
          category: "instance1",
          message: "Instance 1 activity",
          level: 1,
        });
      });

      // Log once from the second instance's context
      withInstanceLogContext(secondId, () => {
        v3Logger({
          category: "instance2",
          message: "Instance 2 activity",
          level: 1,
        });
      });

      // Neither sink may contain the other instance's line.
      expect(firstSink.length).toBe(1);
      expect(secondSink.length).toBe(1);
      expect(firstSink[0].message).toBe("Instance 1 activity");
      expect(secondSink[0].message).toBe("Instance 2 activity");
    } finally {
      unbindInstanceLogger(firstId);
      unbindInstanceLogger(secondId);
    }
  });

  test("external logger receives logs with auxiliary data preserved", () => {
    const instanceId = "v3-with-auxiliary";
    const externalLogs: LogLine[] = [];

    bindInstanceLogger(instanceId, (line) => externalLogs.push(line));

    try {
      withInstanceLogContext(instanceId, () => {
        v3Logger({
          category: "extract",
          message: "Extracting data",
          level: 1,
          auxiliary: {
            selector: { value: "xpath=/html/body", type: "string" },
            timeout: { value: "5000", type: "integer" },
            retries: { value: "3", type: "integer" },
            metadata: {
              value: JSON.stringify({ key: "value" }),
              type: "object",
            },
          },
        });
      });

      expect(externalLogs.length).toBe(1);
      const log = externalLogs[0];

      // Auxiliary entries must come through with their values intact.
      expect(log.auxiliary).toBeDefined();
      expect(log.auxiliary?.selector?.value).toBe("xpath=/html/body");
      expect(log.auxiliary?.timeout?.value).toBe("5000");
      expect(log.auxiliary?.retries?.value).toBe("3");
expect(log.auxiliary?.metadata?.type).toBe("object");
    } finally {
      unbindInstanceLogger(instanceId);
    }
  });

  test("external logger handles rapid concurrent logs", () => {
    const burstId = "v3-rapid-logs";
    const collected: LogLine[] = [];

    bindInstanceLogger(burstId, (line) => collected.push(line));

    try {
      withInstanceLogContext(burstId, () => {
        // Fire 50 back-to-back lines, like a burst during snapshot capture.
        for (let n = 0; n < 50; n++) {
          v3Logger({
            category: "perf",
            message: `Operation ${n}`,
            level: 2,
            auxiliary: {
              iteration: { value: String(n), type: "integer" },
            },
          });
        }
      });

      // Nothing dropped, and first/last are in order.
      expect(collected.length).toBe(50);
      expect(collected[0].message).toBe("Operation 0");
      expect(collected[49].message).toBe("Operation 49");
    } finally {
      unbindInstanceLogger(burstId);
    }
  });

  test("external logger can filter by log level", () => {
    const filteredId = "v3-with-filtering";
    const errors: LogLine[] = [];

    // External logger that keeps level-0 lines and ignores everything else.
    const errorsOnly = (line: LogLine) => {
      if (line.level === 0) {
        errors.push(line);
      }
    };

    bindInstanceLogger(filteredId, errorsOnly);

    try {
      withInstanceLogContext(filteredId, () => {
        v3Logger({
          category: "test",
          message: "Info message",
          level: 1,
        });

        v3Logger({
          category: "test",
          message: "Error message",
          level: 0,
        });

        v3Logger({
          category: "test",
          message: "Debug message",
          level: 2,
        });

        v3Logger({
          category: "test",
          message: "Another error",
          level: 0,
        });
      });

      // Of the four lines logged, only the two level-0 ones were kept.
      expect(errors.length).toBe(2);
      expect(errors[0].message).toBe("Error message");
      expect(errors[1].message).toBe("Another error");
    } finally {
      unbindInstanceLogger(filteredId);
    }
  });

  test("external logger persists across async operations", async () => {
    const instanceId = "v3-async-ops";
    const externalLogs: LogLine[] = [];

    bindInstanceLogger(instanceId, (line) => externalLogs.push(line));

    try {
      await withInstanceLogContext(instanceId, async () => {
        v3Logger({
          category: "async",
          message:
"Before async operation",
          level: 1,
        });

        // Cross a timer boundary before logging again.
        await new Promise((resolve) => setTimeout(resolve, 50));

        v3Logger({
          category: "async",
          message: "After async operation",
          level: 1,
        });

        // Two logs issued from promise continuations running in parallel.
        await Promise.all([
          Promise.resolve().then(() =>
            v3Logger({
              category: "async",
              message: "Parallel operation 1",
              level: 1,
            }),
          ),
          Promise.resolve().then(() =>
            v3Logger({
              category: "async",
              message: "Parallel operation 2",
              level: 1,
            }),
          ),
        ]);
      });

      // All logs should be captured despite async boundaries
      expect(externalLogs.length).toBe(4);
      expect(externalLogs[0].message).toBe("Before async operation");
      expect(externalLogs[1].message).toBe("After async operation");
    } finally {
      unbindInstanceLogger(instanceId);
    }
  });
});

================================================
FILE: packages/core/tests/integration/multi-instance-logger.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { getV3DynamicTestConfig } from "./v3.dynamic.config.js";
import type { LogLine } from "../../lib/v3/types/public/logs.js";
import { closeV3 } from "./testUtils.js";

test.describe("V3 Multi-Instance Logger Isolation", () => {
  // Run tests serially to avoid resource exhaustion from creating many Chrome instances
  test.describe.configure({ mode: "serial" });

  // Increase timeout for stress tests that create/destroy multiple instances
  test.setTimeout(120_000);

  test("multiple V3 instances can be created concurrently without logger conflicts", async () => {
    const instanceCount = 5;
    const instances: V3[] = [];
    // Captured lines per instance, keyed by creation index. `Map` is generic
    // and must be given both type arguments to type-check.
    const instanceLogs: Map<number, LogLine[]> = new Map();

    try {
      // Create multiple instances with individual loggers
      const creationPromises = Array.from({ length: instanceCount }, (_, i) => {
        const logs: LogLine[] = [];
        instanceLogs.set(i, logs);

        const config = getV3DynamicTestConfig({
          verbose: 2,
          disablePino: true,
          logger: (line: LogLine) => {
            // Tag every captured line with the owning instance's index.
            logs.push({
              ...line,
              auxiliary: {
                ...line.auxiliary,
                index: { value: String(i), type: "integer" },
              },
});
          },
        });

        const v3 = new V3(config);
        instances.push(v3);
        return v3.init();
      });

      // All instances should initialize successfully
      await Promise.all(creationPromises);

      // Each instance should be initialized
      expect(instances.length).toBe(instanceCount);
      for (const instance of instances) {
        expect(instance.context).toBeDefined();
      }

      // Perform operations that generate logs
      await Promise.all(
        instances.map(async (instance) => {
          const page = await instance.context.awaitActivePage();
          await page.goto("about:blank");
        }),
      );

      // Each instance should have logged to its own logger
      for (let i = 0; i < instanceCount; i++) {
        const logs = instanceLogs.get(i)!;
        // Each instance should have some logs
        expect(logs.length).toBeGreaterThan(0);

        // Logs should not contain data from other instances
        // (though this is harder to verify without more specific markers)
        const hasOwnLogs = logs.some(
          (log) =>
            log.auxiliary?.index?.value === String(i) ||
            log.category === "init",
        );
        expect(hasOwnLogs).toBe(true);
      }
    } finally {
      // Clean up all instances
      await Promise.all(instances.map((instance) => closeV3(instance)));
    }
  });

  test("V3 instances with external loggers don't leak logs to each other", async () => {
    const instance1Logs: LogLine[] = [];
    const instance2Logs: LogLine[] = [];

    const v3Instance1 = new V3(
      getV3DynamicTestConfig({
        verbose: 2,
        disablePino: true,
        logger: (line: LogLine) => instance1Logs.push(line),
      }),
    );

    const v3Instance2 = new V3(
      getV3DynamicTestConfig({
        verbose: 2,
        disablePino: true,
        logger: (line: LogLine) => instance2Logs.push(line),
      }),
    );

    try {
      // Initialize both instances
      await Promise.all([v3Instance1.init(), v3Instance2.init()]);

      // Perform operations on each instance
      const page1 = await v3Instance1.context.awaitActivePage();
      await page1.goto("about:blank");

      const page2 = await v3Instance2.context.awaitActivePage();
      // NOTE(review): the markup inside this data URL was stripped from the
      // text this file was recovered from, leaving only "Instance 2" and a
      // string literal broken across lines. The tags below are a
      // reconstruction - confirm against the upstream file.
      await page2.goto(
        "data:text/html,<html><body><h1>Instance 2</h1></body></html>",
      );

      // Both instances should have logs
      expect(instance1Logs.length).toBeGreaterThan(0);
      expect(instance2Logs.length).toBeGreaterThan(0);

      // Logs should be distinct (no exact duplicates)
      // This is a weak check, but verifies basic isolation
      const instance1Messages = new Set(instance1Logs.map((l) => l.message));
      const instance2Messages = new Set(instance2Logs.map((l) => l.message));

      // At least some messages should be unique to each instance
      // (This might not always be true for very generic messages like "init",
      // but serves as a smoke test)
      const allMessages = new Set([...instance1Messages, ...instance2Messages]);
      expect(allMessages.size).toBeGreaterThanOrEqual(
        Math.max(instance1Messages.size, instance2Messages.size),
      );
    } finally {
      await Promise.all([closeV3(v3Instance1), closeV3(v3Instance2)]);
    }
  });

  test("V3 instances without external loggers use shared global logger", async () => {
    // Create instances without external loggers
    const v3Instance1 = new V3(
      getV3DynamicTestConfig({
        verbose: 1,
        disablePino: true,
      }),
    );

    const v3Instance2 = new V3(
      getV3DynamicTestConfig({
        verbose: 1,
        disablePino: true,
      }),
    );

    try {
      // Initialize both instances concurrently
      await Promise.all([v3Instance1.init(), v3Instance2.init()]);

      // Both should work fine
      expect(v3Instance1.context).toBeDefined();
      expect(v3Instance2.context).toBeDefined();

      // Perform basic operations to ensure logging doesn't cause issues
      const page1 = await v3Instance1.context.awaitActivePage();
      const page2 = await v3Instance2.context.awaitActivePage();

      await Promise.all([page1.goto("about:blank"), page2.goto("about:blank")]);

      // Both should still be operational
      expect(page1.url()).toContain("about:blank");
      expect(page2.url()).toContain("about:blank");
    } finally {
      await Promise.all([closeV3(v3Instance1), closeV3(v3Instance2)]);
    }
  });

  test("rapidly creating and destroying instances doesn't cause logger issues", async () => {
    const iterations = 5;
    const results: boolean[] = [];

    for (let i = 0; i < iterations;
i++) {
      const logs: LogLine[] = [];

      const v3 = new V3(
        getV3DynamicTestConfig({
          verbose: 1, // Capture INFO logs for verification
          disablePino: true,
          logger: (line: LogLine) => logs.push(line),
        }),
      );

      try {
        await v3.init();
        const page = await v3.context.awaitActivePage();
        await page.goto("about:blank");

        // Reaching this point means init + navigation succeeded this round.
        results.push(true);

        // Verify some logs were captured
        expect(logs.length).toBeGreaterThan(0);
      } finally {
        // Tear the instance down before the next iteration creates another.
        await closeV3(v3);
      }
    }

    // All iterations should succeed
    expect(results.length).toBe(iterations);
    expect(results.every((r) => r === true)).toBe(true);
  });

  test("concurrent instance creation with mixed logger configurations", async () => {
    const liveInstances: V3[] = [];

    const mixedConfigs = [
      // With Pino disabled
      getV3DynamicTestConfig({ verbose: 1, disablePino: true }),
      // With external logger
      getV3DynamicTestConfig({
        verbose: 2,
        disablePino: true,
        //eslint-disable-next-line @typescript-eslint/no-unused-vars
        logger: (_line: LogLine) => {
          // External logger
        },
      }),
      // Without external logger
      getV3DynamicTestConfig({ verbose: 0, disablePino: true }),
      // High verbosity
      getV3DynamicTestConfig({ verbose: 2, disablePino: true }),
    ];

    try {
      // Construct every instance first, then await all the inits together.
      const pendingInits = mixedConfigs.map((config) => {
        const v3 = new V3(config);
        liveInstances.push(v3);
        return v3.init();
      });

      await Promise.all(pendingInits);

      // All should be initialized successfully
      expect(liveInstances.length).toBe(mixedConfigs.length);
      for (const instance of liveInstances) {
        expect(instance.context).toBeDefined();
      }

      // All should be able to perform operations
      await Promise.all(
        liveInstances.map(async (instance) => {
          const page = await instance.context.awaitActivePage();
          await page.goto("about:blank");
          expect(page.url()).toContain("about:blank");
        }),
      );
    } finally {
      await Promise.all(liveInstances.map((instance) => closeV3(instance)));
    }
  });

  test("V3 instance logger is properly cleaned up on close", async () => {
    const logs: LogLine[] = [];

    const v3 = new V3(
      getV3DynamicTestConfig({
        verbose: 2,
        disablePino:
true,
        logger: (line: LogLine) => logs.push(line),
      }),
    );

    // Close in `finally` so a failed init or assertion cannot leave the
    // instance open; the other tests in this file follow the same pattern.
    try {
      await v3.init();

      const initialLogCount = logs.length;
      expect(initialLogCount).toBeGreaterThan(0);
    } finally {
      await closeV3(v3);
    }

    // After close, the instance should not generate new logs
    // (This is hard to test directly, but we can verify the instance is closed)
    expect(v3["state"].kind).toBe("UNINITIALIZED");
  });

  test("logger works correctly across instance lifecycle", async () => {
    const logs: LogLine[] = [];

    const v3 = new V3(
      getV3DynamicTestConfig({
        verbose: 2,
        disablePino: true,
        logger: (line: LogLine) => logs.push(line),
      }),
    );

    try {
      // Before init
      expect(logs.length).toBe(0);

      // After init
      await v3.init();
      const afterInitCount = logs.length;
      expect(afterInitCount).toBeGreaterThan(0);

      // During operation
      const page = await v3.context.awaitActivePage();
      // NOTE(review): the markup inside this data URL was stripped from the
      // text this file was recovered from, leaving only "Test". The tags
      // below are a reconstruction - confirm against the upstream file.
      await page.goto("data:text/html,<html><body><h1>Test</h1></body></html>");

      const afterOperationCount = logs.length;
      expect(afterOperationCount).toBeGreaterThanOrEqual(afterInitCount);

      // Verify log structure
      const initLogs = logs.filter((log) => log.category === "init");
      expect(initLogs.length).toBeGreaterThan(0);

      // All logs should have required fields
      for (const log of logs) {
        expect(log.category).toBeDefined();
        expect(log.message).toBeDefined();
        expect(typeof log.level).toBe("number");
      }
    } finally {
      await closeV3(v3);
    }
  });

  test("multiple instances can navigate concurrently without logger interference", async () => {
    const instanceCount = 3;
    const instances: V3[] = [];
    // Captured lines per instance, keyed by creation index. `Map` is generic
    // and must be given both type arguments to type-check.
    const instanceLogs: Map<number, LogLine[]> = new Map();

    try {
      // Create instances (sequentially: each init is awaited in turn)
      for (let i = 0; i < instanceCount; i++) {
        const logs: LogLine[] = [];
        instanceLogs.set(i, logs);

        const v3 = new V3(
          getV3DynamicTestConfig({
            verbose: 1,
            disablePino: true,
            logger: (line: LogLine) => logs.push(line),
          }),
        );

        instances.push(v3);
        await v3.init();
      }

      // Navigate all instances concurrently to different URLs.
      // NOTE(review): the markup in these data URLs was stripped from the text
      // this file was recovered from, leaving only "Page N". The tags below
      // are a reconstruction - confirm against the upstream file. The
      // assertion further down only needs the URL to contain "Page N".
      const urls = [
        "data:text/html,<html><body><h1>Page 1</h1></body></html>",
        "data:text/html,<html><body><h1>Page 2</h1></body></html>",
        "data:text/html,<html><body><h1>Page 3</h1></body></html>",
      ];

      await Promise.all(
        instances.map(async (instance, i) => {
          const page = await instance.context.awaitActivePage();
          await page.goto(urls[i]);
        }),
      );

      // Verify each instance navigated to the correct URL
      for (let i = 0; i < instanceCount; i++) {
        const page = await instances[i].context.awaitActivePage();
        expect(page.url()).toContain(`Page ${i + 1}`);
      }

      // Each instance should have its own logs
      for (let i = 0; i < instanceCount; i++) {
        const logs = instanceLogs.get(i)!;
        expect(logs.length).toBeGreaterThan(0);
      }
    } finally {
      await Promise.all(instances.map((instance) => closeV3(instance)));
    }
  });
});

================================================
FILE: packages/core/tests/integration/nested-div.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { captureHybridSnapshot } from "../../lib/v3/understudy/a11y/snapshot/index.js";
import { v3TestConfig } from "./v3.config.js";

test.describe("tests captureHybridSnapshot() does not break due to -32000 Failed to convert response to JSON: CBOR: stack limit exceeded", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    // Best-effort teardown: tolerate a missing instance or a failing close.
    await v3?.close?.().catch(() => {});
  });

  test("captureHybridSnapshot does not throw", async () => {
    const page = v3.context.pages()[0];
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/nested-div/",
    );
    // The snapshot promise must resolve with a defined value.
    await expect(captureHybridSnapshot(page)).resolves.toBeDefined();
  });
});

================================================
FILE: packages/core/tests/integration/page-addInitScript.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";
import { V3Context } from "../../lib/v3/understudy/context.js";

const EXAMPLE_URL = "https://example.com";

test.describe("page.addInitScript", ()
=> {
  let v3: V3;
  let ctx: V3Context;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
    ctx = v3.context;
  });

  test.afterEach(async () => {
    // Best-effort teardown: tolerate a missing instance or a failing close.
    await v3?.close?.().catch(() => {});
  });

  test("runs scripts on real network navigations", async () => {
    const activePage = await ctx.awaitActivePage();

    // Register a function-form init script that sets a marker on `window`.
    await activePage.addInitScript(() => {
      (window as unknown as { __fromPageInit?: string }).__fromPageInit =
        "page-level";
    });

    await activePage.goto(EXAMPLE_URL, { waitUntil: "domcontentloaded" });

    // Read the marker back from the navigated document.
    const marker = await activePage.evaluate(() => {
      return (window as unknown as { __fromPageInit?: string }).__fromPageInit;
    });
    expect(marker).toBe("page-level");
  });

  test("scopes scripts to the page only", async () => {
    const first = await ctx.awaitActivePage();

    // String-form init script: stamps <html> with a data attribute, waiting
    // for DOMContentLoaded if the document is still loading.
    await first.addInitScript(` (function () { function markScope() { var root = document.documentElement; if (!root) return; root.dataset.scopeWitness = "page-one"; } if (document.readyState === "loading") { document.addEventListener("DOMContentLoaded", markScope, { once: true, }); } else { markScope(); } })(); `);

    await first.goto(`${EXAMPLE_URL}/?page=one`, {
      waitUntil: "domcontentloaded",
    });

    // A second page on which no init script was registered.
    const second = await ctx.newPage();
    await second.goto(`${EXAMPLE_URL}/?page=two`, {
      waitUntil: "domcontentloaded",
    });

    const firstValue = await first.evaluate(() => {
      return document.documentElement.dataset.scopeWitness ?? "missing";
    });
    const secondValue = await second.evaluate(() => {
      return document.documentElement.dataset.scopeWitness ??
"missing";
    });

    // Only the page that registered the script carries the stamp.
    expect(firstValue).toBe("page-one");
    expect(secondValue).toBe("missing");
  });

  test("supports passing arguments to function sources", async () => {
    const page = await ctx.awaitActivePage();
    const payload = { greeting: "hi", nested: { count: 1 } };

    // Init script taking one argument: it serialises that argument onto a
    // data attribute of <html>, deferring until DOMContentLoaded if needed.
    const initPayload = ((arg) => {
      function setPayload() {
        const root = document.documentElement;
        if (!root) return;
        root.dataset.pageInitPayload = JSON.stringify(arg);
      }
      if (document.readyState === "loading") {
        document.addEventListener("DOMContentLoaded", setPayload, {
          once: true,
        });
      } else {
        setPayload();
      }
    }) as (arg: typeof payload) => void;

    await page.addInitScript(initPayload, payload);

    await page.goto(`${EXAMPLE_URL}/?page=payload`, {
      waitUntil: "domcontentloaded",
    });

    // Parse the attribute back and compare it structurally with the input.
    const roundTripped = await page.evaluate(() => {
      const raw = document.documentElement.dataset.pageInitPayload;
      return raw ? JSON.parse(raw) : undefined;
    });
    expect(roundTripped).toEqual(payload);
  });
});

================================================
FILE: packages/core/tests/integration/page-console.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";

test.describe("Page console events", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    // Best-effort teardown: tolerate a missing instance or a failing close.
    await v3?.close?.().catch(() => {});
  });

  test("captures console messages emitted by the page", async () => {
    // The target comes from the environment and defaults to "local".
    const browserTarget = (
      process.env.STAGEHAND_BROWSER_TARGET ?? "local"
    ).toLowerCase();
    const isBrowserbase = browserTarget === "browserbase";
    if (isBrowserbase) {
      console.warn(
        "[page-console] TODO: re-enable once BB cloud browsers support Runtime.consoleAPICalled events again. 
See https://browserbase.slack.com/archives/C06U6CM7YS1/p1769483322836589",
      );
      test.skip(
        true,
        "TODO: re-enable once BB cloud browsers support Runtime.consoleAPICalled events again.",
      );
    }

    const page = v3.context.pages()[0];
    const received: Array<{ type: string; text: string }> = [];

    // Record the type and text of every console message the page emits.
    page.on("console", (message) => {
      received.push({ type: message.type(), text: message.text() });
    });

    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-hn/",
    );

    // Emit one log and one error from inside the page.
    await page.evaluate(() => {
      console.log("stagehand console", { ok: true });
      console.error("stagehand console error");
    });

    // Poll every 50ms until `predicate` holds or the deadline passes.
    // Returns silently on timeout; the assertions below report any failure.
    const waitForConsole = async (
      predicate: () => boolean,
      timeoutMs = 2000,
    ) => {
      const deadline = Date.now() + timeoutMs;
      while (Date.now() < deadline) {
        if (predicate()) return;
        await new Promise((resolve) => setTimeout(resolve, 50));
      }
    };

    const sawLog = () => received.some((m) => m.type === "log");
    const sawError = () =>
      received.some((m) => m.type === "error" && m.text.includes("error"));

    await waitForConsole(() => sawLog() && sawError(), 5000);

    expect(received.length).toBeGreaterThanOrEqual(2);
    expect(sawLog()).toBeTruthy();
    expect(sawError()).toBeTruthy();
  });
});

================================================
FILE: packages/core/tests/integration/page-drag-and-drop.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";

test.describe("Page.dragAndDrop() - dragging elements", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    // Best-effort teardown: tolerate a missing instance or a failing close.
    await v3?.close?.().catch(() => {});
  });

  test("drags and drops element to target zone", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
Drag Me
Drop Here
Status: Waiting
`),
    );

    // Locate both elements in the main frame.
    const mainFrame = page.frames()[0];
    const sourceBox = await mainFrame.getLocationForSelector("#source");
    const zoneBox = await mainFrame.getLocationForSelector("#dropZone");

    // Drag from the centre of the source to the centre of the drop zone.
    const startX = sourceBox.x + sourceBox.width / 2;
    const startY = sourceBox.y + sourceBox.height / 2;
    const endX = zoneBox.x + zoneBox.width / 2;
    const endY = zoneBox.y + zoneBox.height / 2;

    await page.dragAndDrop(startX, startY, endX, endY);

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    // The page is expected to report the drop in its #result element.
    const resultText = await page.evaluate(
      () => document.getElementById("result").textContent,
    );
    expect(resultText).toContain("DROP SUCCESSFUL");
  });

  test("drag and drop with steps parameter", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
Not dropped
`),
    );

    const mainFrame = page.frames()[0];
    const boxRect = await mainFrame.getLocationForSelector("#box");
    const targetRect = await mainFrame.getLocationForSelector("#target");

    // Centre-to-centre drag coordinates.
    const startX = boxRect.x + boxRect.width / 2;
    const startY = boxRect.y + boxRect.height / 2;
    const endX = targetRect.x + targetRect.width / 2;
    const endY = targetRect.y + targetRect.height / 2;

    // Pass the `steps` option to exercise a multi-step drag.
    await page.dragAndDrop(startX, startY, endX, endY, { steps: 5 });

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    const status = await page.evaluate(
      () => document.getElementById("status").textContent,
    );
    expect(status).toContain("Dropped");
  });

  test("drag and drop with delay between steps", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
false
`),
    );

    const mainFrame = page.frames()[0];
    const itemRect = await mainFrame.getLocationForSelector("#dragItem");
    const areaRect = await mainFrame.getLocationForSelector("#dropArea");

    // Centre-to-centre drag coordinates.
    const startX = itemRect.x + itemRect.width / 2;
    const startY = itemRect.y + itemRect.height / 2;
    const endX = areaRect.x + areaRect.width / 2;
    const endY = areaRect.y + areaRect.height / 2;

    // Pass both `steps` and `delay` to exercise a paced drag.
    await page.dragAndDrop(startX, startY, endX, endY, { steps: 3, delay: 50 });

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    // The page is expected to flip #complete to the text "true".
    const isComplete = await page.evaluate(
      () => document.getElementById("complete").textContent === "true",
    );
    expect(isComplete).toBe(true);
  });

  test("drag and drop returns xpath when requested", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
`),
    );

    const mainFrame = page.frames()[0];
    const sourceRect = await mainFrame.getLocationForSelector("#source");
    const targetRect = await mainFrame.getLocationForSelector("#target");

    // Centre-to-centre drag coordinates.
    const startX = sourceRect.x + sourceRect.width / 2;
    const startY = sourceRect.y + sourceRect.height / 2;
    const endX = targetRect.x + targetRect.width / 2;
    const endY = targetRect.y + targetRect.height / 2;

    const [fromXpath, toXpath] = await page.dragAndDrop(
      startX,
      startY,
      endX,
      endY,
      {
        returnXpath: true,
      },
    );

    // With `returnXpath`, both ends of the drag come back as non-empty strings.
    expect(typeof fromXpath).toBe("string");
    expect(typeof toXpath).toBe("string");
    expect(fromXpath.length).toBeGreaterThan(0);
    expect(toXpath.length).toBeGreaterThan(0);
  });

  test("drag and drop without returnXpath returns empty strings", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
`),
    );

    const mainFrame = page.frames()[0];
    const firstRect = await mainFrame.getLocationForSelector("#item1");
    const secondRect = await mainFrame.getLocationForSelector("#item2");

    // Centre-to-centre drag coordinates.
    const startX = firstRect.x + firstRect.width / 2;
    const startY = firstRect.y + firstRect.height / 2;
    const endX = secondRect.x + secondRect.width / 2;
    const endY = secondRect.y + secondRect.height / 2;

    // No options object at all: `returnXpath` is left unset.
    const [fromXpath, toXpath] = await page.dragAndDrop(
      startX,
      startY,
      endX,
      endY,
    );

    // Without `returnXpath`, both entries are expected to be empty strings.
    expect(fromXpath).toBe("");
    expect(toXpath).toBe("");
  });

  test("drag and drop with different mouse buttons", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
none
`),
    );

    const mainFrame = page.frames()[0];
    const sourceRect = await mainFrame.getLocationForSelector("#source");
    const targetRect = await mainFrame.getLocationForSelector("#target");

    // Centre-to-centre drag coordinates.
    const startX = sourceRect.x + sourceRect.width / 2;
    const startY = sourceRect.y + sourceRect.height / 2;
    const endX = targetRect.x + targetRect.width / 2;
    const endY = targetRect.y + targetRect.height / 2;

    // Explicitly request the left button.
    await page.dragAndDrop(startX, startY, endX, endY, { button: "left" });

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    // The page is expected to write the button it observed into #buttonUsed.
    const buttonUsed = await page.evaluate(
      () => document.getElementById("buttonUsed").textContent,
    );
    expect(buttonUsed).toBe("left");
  });

  test("multiple sequential drag and drops", async () => {
    const page = v3.context.pages()[0];

    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
Item 1
Item 2
Drops: 0
`),
    );

    // --- First drag: #item1 -> #zone1 ---
    const firstItem = await page.frames()[0].getLocationForSelector("#item1");
    const firstZone = await page.frames()[0].getLocationForSelector("#zone1");

    const firstStartX = firstItem.x + firstItem.width / 2;
    const firstStartY = firstItem.y + firstItem.height / 2;
    const firstEndX = firstZone.x + firstZone.width / 2;
    const firstEndY = firstZone.y + firstZone.height / 2;

    await page.dragAndDrop(firstStartX, firstStartY, firstEndX, firstEndY);

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    const afterFirstDrop = await page.evaluate(
      () => document.getElementById("log").textContent,
    );
    expect(afterFirstDrop).toContain("Drops: 1");

    // --- Second drag: #item2 -> #zone2 (located only after the first drop) ---
    const secondItem = await page.frames()[0].getLocationForSelector("#item2");
    const secondZone = await page.frames()[0].getLocationForSelector("#zone2");

    const secondStartX = secondItem.x + secondItem.width / 2;
    const secondStartY = secondItem.y + secondItem.height / 2;
    const secondEndX = secondZone.x + secondZone.width / 2;
    const secondEndY = secondZone.y + secondZone.height / 2;

    await page.dragAndDrop(secondStartX, secondStartY, secondEndX, secondEndY);

    // Give the page 100ms to process the resulting events.
    await page.evaluate(() => new Promise((r) => setTimeout(r, 100)));

    const afterSecondDrop = await page.evaluate(
      () => document.getElementById("log").textContent,
    );
    expect(afterSecondDrop).toContain("Drops: 2");
  });
});

================================================
FILE: packages/core/tests/integration/page-extra-http-headers.spec.ts
================================================
import { test, expect } from "@playwright/test";
import type { Protocol } from "devtools-protocol";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";
import { closeV3 } from "./testUtils.js";

const TEST_URL =
  "https://browserbase.github.io/stagehand-eval-sites/sites/example/";

test.describe("page.setExtraHTTPHeaders", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
await closeV3(v3); }); test("applies headers to navigation requests", async () => { const ctx = v3.context; const page = await ctx.awaitActivePage(); await page.setExtraHTTPHeaders({ "x-page-header": "from-page" }); const internal = page as unknown as { mainSession: { send: (method: string, params?: unknown) => Promise; on: (event: string, handler: (params: unknown) => void) => void; off: (event: string, handler: (params: unknown) => void) => void; }; }; await internal.mainSession.send("Network.enable"); const requestPromise = new Promise( (resolve, reject) => { const timeout = setTimeout(() => { internal.mainSession.off("Network.requestWillBeSent", handler); reject(new Error("Timed out waiting for request")); }, 5000); const handler = (evt: Protocol.Network.RequestWillBeSentEvent) => { if (evt.type !== "Document") return; const url = String(evt.request?.url ?? ""); if (!url.startsWith(TEST_URL)) return; clearTimeout(timeout); internal.mainSession.off("Network.requestWillBeSent", handler); resolve(evt); }; internal.mainSession.on("Network.requestWillBeSent", handler); }, ); await page.goto(TEST_URL, { waitUntil: "domcontentloaded" }); const request = await requestPromise; const headers = Object.fromEntries( Object.entries(request.request.headers ?? 
{}).map(([key, value]) => [ key.toLowerCase(), String(value), ]), ); expect(headers["x-page-header"]).toBe("from-page"); }); test("updated headers replace previous ones", async () => { const ctx = v3.context; const page = await ctx.awaitActivePage(); const internal = page as unknown as { mainSession: { send: (method: string, params?: unknown) => Promise; on: (event: string, handler: (params: unknown) => void) => void; off: (event: string, handler: (params: unknown) => void) => void; }; }; await internal.mainSession.send("Network.enable"); // Set initial headers and navigate await page.setExtraHTTPHeaders({ "x-first": "yes" }); await page.goto(TEST_URL, { waitUntil: "domcontentloaded" }); // Update headers await page.setExtraHTTPHeaders({ "x-second": "yes" }); const requestPromise = new Promise( (resolve, reject) => { const timeout = setTimeout(() => { internal.mainSession.off("Network.requestWillBeSent", handler); reject(new Error("Timed out waiting for request")); }, 5000); const handler = (evt: Protocol.Network.RequestWillBeSentEvent) => { if (evt.type !== "Document") return; const url = String(evt.request?.url ?? ""); if (!url.startsWith(TEST_URL)) return; clearTimeout(timeout); internal.mainSession.off("Network.requestWillBeSent", handler); resolve(evt); }; internal.mainSession.on("Network.requestWillBeSent", handler); }, ); await page.goto(TEST_URL, { waitUntil: "domcontentloaded" }); const request = await requestPromise; const headers = Object.fromEntries( Object.entries(request.request.headers ?? 
{}).map(([key, value]) => [ key.toLowerCase(), String(value), ]), ); expect(headers["x-second"]).toBe("yes"); expect(headers["x-first"]).toBeUndefined(); }); }); ================================================ FILE: packages/core/tests/integration/page-goto-response.spec.ts ================================================ import { test, expect } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Page.goto() response surface", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch(() => {}); }); test("returns a response object for network navigations", async () => { const page = v3.context.pages()[0]; const response = await page.goto("https://example.com"); expect(response).not.toBeNull(); expect(response!.status()).toBe(200); expect(response!.ok()).toBeTruthy(); const headers = await response.headersArray(); expect(headers.length).toBeGreaterThan(0); const body = await response.text(); expect(body).toContain("Example Domain"); const finished = await response.finished(); expect(finished).toBeNull(); }); test("falls back to null for data URLs", async () => { const page = v3.context.pages()[0]; const response = await page.goto( "data:text/html,inline", ); expect(response).toBeNull(); }); }); ================================================ FILE: packages/core/tests/integration/page-hover.spec.ts ================================================ import { test, expect } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Page.hover() - mouse hover at coordinates", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch(() => {}); }); test("hover triggers mouseover event at coordinates", async () => { const page = 
v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Hover Me
`, ), ); // Check initial state let hovered = await page.evaluate(() => { const el = document.getElementById("target"); return el?.dataset.hovered === "true"; }); expect(hovered).toBe(false); // Hover at coordinates within the target element (200, 200 is center of the div) await page.hover(200, 200); // Verify mouseover was triggered hovered = await page.evaluate(() => { const el = document.getElementById("target"); return el?.dataset.hovered === "true"; }); expect(hovered).toBe(true); }); test("hover moves mouse without clicking", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( ` `, ), ); // Hover over the button await page.hover(200, 150); // Check that hover happened but click did not const state = await page.evaluate(() => { const btn = document.getElementById("btn"); return { hovered: btn?.dataset.hovered === "true", clicked: btn?.dataset.clicked === "true", }; }); expect(state.hovered).toBe(true); expect(state.clicked).toBe(false); }); test("hover returns xpath when requested", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Target element

Content below

`, ), ); // Hover at coordinate (550, 50) which should be directly over the target div const xpath = await page.hover(550, 50, { returnXpath: true }); // Should return a non-empty xpath string for the element at that coordinate expect(typeof xpath).toBe("string"); expect(xpath.length).toBeGreaterThan(0); // Xpath should reference the div expect(xpath.toLowerCase()).toMatch(/div|target/); }); test("hover without returnXpath returns empty string", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Content
`, ), ); // Hover without returnXpath const result = await page.hover(50, 50); // Should return empty string expect(result).toBe(""); }); test("hover triggers CSS :hover styles", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Hover to change color
`, ), ); // Get initial background color let bgColor = await page.evaluate(() => { const el = document.getElementById("hoverable"); return getComputedStyle(el!).backgroundColor; }); expect(bgColor).toBe("rgb(255, 0, 0)"); // red // Hover over the element await page.hover(200, 200); // Check that CSS :hover state is applied bgColor = await page.evaluate(() => { const el = document.getElementById("hoverable"); return getComputedStyle(el!).backgroundColor; }); expect(bgColor).toBe("rgb(0, 128, 0)"); // green }); test("multiple hovers move the mouse correctly", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Box 1
Box 2
`,
      ),
    );

    // Hover over box1
    await page.hover(50, 50);

    let state = await page.evaluate(() => ({
      box1: document.getElementById("box1")?.dataset.hovered === "true",
      box2: document.getElementById("box2")?.dataset.hovered === "true",
    }));
    expect(state.box1).toBe(true);
    expect(state.box2).toBe(false);

    // Move hover to box2
    await page.hover(250, 50);

    state = await page.evaluate(() => ({
      box1: document.getElementById("box1")?.dataset.hovered === "true",
      box2: document.getElementById("box2")?.dataset.hovered === "true",
    }));
    expect(state.box1).toBe(false);
    expect(state.box2).toBe(true);
  });
});

================================================
FILE: packages/core/tests/integration/page-screenshot.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { promises as fs } from "fs";
import * as os from "os";
import * as path from "path";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";
import { Frame } from "../../lib/v3/understudy/frame.js";

/** Resolves after `ms` milliseconds. */
const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

test.describe("Page.screenshot options", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await v3?.close?.().catch(() => {});
  });

  test("rejects clip combined with fullPage", async () => {
    const page = v3.context.pages()[0];
    await page.goto("data:text/html,test");

    await expect(
      page.screenshot({
        fullPage: true,
        clip: { x: 0, y: 0, width: 100, height: 100 },
      }),
    ).rejects.toThrow(/clip and fullPage/);
  });

  test("rejects unsupported image type", async () => {
    const page = v3.context.pages()[0];
    await page.goto("data:text/html,noop");

    await expect(
      page.screenshot({
        // @ts-expect-error intentional invalid type for runtime validation
        type: "webp",
      }),
    ).rejects.toThrow(/unsupported image type/);
  });

  test("rejects jpeg quality for png screenshots", async () => {
    const page = v3.context.pages()[0];
    await page.goto("data:text/html,noop");

    await expect(page.screenshot({ type: "png", quality: 50 })).rejects.toThrow(
      /quality option is only valid/,
    );
  });

  test("honours timeout option", async () => {
    const page = v3.context.pages()[0];
    await page.goto("data:text/html,noop");

    const mainFrame = page.mainFrame();
    const originalScreenshot = mainFrame.screenshot.bind(mainFrame);

    // Replace the frame's screenshot with a stub that takes 50ms, longer than
    // the 10ms timeout requested below.
    (
      mainFrame as typeof mainFrame & {
        screenshot: typeof mainFrame.screenshot;
      }
    ).screenshot = async () => {
      await wait(50);
      return Buffer.from("late");
    };

    try {
      await expect(page.screenshot({ timeout: 10 })).rejects.toThrow(
        /timed out|timeout/i,
      );
    } finally {
      // Always put the real implementation back.
      (
        mainFrame as typeof mainFrame & {
          screenshot: typeof mainFrame.screenshot;
        }
      ).screenshot = originalScreenshot;
    }
  });

  test("applies advanced options and cleans up overlays", async () => {
    const page = v3.context.pages()[0];
    const screenshotTimeout = process.env.CI ? 15000 : 5000;
    const testStart = Date.now();
    console.log(
      `[screenshot-test] start ${new Date(testStart).toISOString()} timeout=${screenshotTimeout}`,
    );

    // NOTE(review): this fixture contains no markup here, yet the assertions
    // below expect two rects for `.mask-target` — confirm the intended HTML.
    const html = `
`;

    await page.goto("data:text/html," + encodeURIComponent(html));
    console.log(`[screenshot-test] page loaded in ${Date.now() - testStart}ms`);

    const maskLocator = page.locator(".mask-target");
    const tempPath = path.join(
      os.tmpdir(),
      `stagehand-screenshot-${Date.now()}-${Math.random().toString(36).slice(2)}.jpeg`,
    );
    console.log(`[screenshot-test] tempPath=${tempPath}`);

    const targetId = page.targetId();
    const screenshotCalls: Array<{
      frameId: string;
      options: Parameters<Frame["screenshot"]>[0];
    }> = [];
    const evaluateCalls: Array<{ frameId: string; arg: unknown }> = [];

    const originalScreenshot = Frame.prototype.screenshot;
    const originalEvaluate = Frame.prototype.evaluate;

    // Hook Frame.screenshot so we can assert which options reach CDP without writing real data.
    Frame.prototype.screenshot = async function screenshotSpy(options) {
      const frame = this as Frame;
      if (frame.pageId === targetId) {
        screenshotCalls.push({ frameId: frame.frameId, options });
        return Buffer.from("stub-image");
      }
      // Frames of other pages keep the real behaviour.
      return originalScreenshot.call(this, options);
    };

    // Spy on Frame.evaluate to capture the arguments used to inject CSS/masks.
    Frame.prototype.evaluate = async function evaluateSpy(expression, arg?) {
      const frame = this as Frame;
      if (frame.pageId === targetId) {
        evaluateCalls.push({ frameId: frame.frameId, arg });
      }
      return originalEvaluate.call(this, expression as never, arg);
    } as Frame["evaluate"];

    const internalPage = page as unknown as {
      mainSession: {
        send: (method: string, params?: unknown) => Promise<unknown>;
      };
    };
    const sendCalls: Array<{ method: string; params: unknown }> = [];
    const originalSend = internalPage.mainSession.send.bind(
      internalPage.mainSession,
    ) as (method: string, params?: unknown) => Promise<unknown>;

    // Capture background overrides so we can confirm omitBackground toggles on/off.
    internalPage.mainSession.send = async (
      method: string,
      params?: unknown,
    ) => {
      sendCalls.push({ method, params });
      return originalSend(method, params);
    };

    try {
      const maskCount = await maskLocator.count();
      console.log(`[screenshot-test] maskLocator.count=${maskCount}`);
      const buffer = await page.screenshot({
        animations: "disabled",
        caret: "hide",
        clip: { x: 0, y: 0, width: 200, height: 200 },
        mask: [maskLocator],
        maskColor: "rgba(255, 0, 0, 0.4)",
        omitBackground: true,
        path: tempPath,
        quality: 80,
        scale: "css",
        style: "body { border: 3px solid black; }",
        timeout: screenshotTimeout,
        type: "jpeg",
      });
      console.log(
        `[screenshot-test] screenshot returned bytes=${buffer.length} elapsed=${Date.now() - testStart}ms`,
      );

      expect(Buffer.isBuffer(buffer)).toBeTruthy();
      expect(screenshotCalls.length).toBeGreaterThanOrEqual(1);
      console.log(
        `[screenshot-test] screenshotCalls=${screenshotCalls.length} evaluateCalls=${evaluateCalls.length} sendCalls=${sendCalls.length}`,
      );

      const recorded = screenshotCalls[0]?.options ?? {};
      expect(recorded).toMatchObject({ type: "jpeg", quality: 80 });
      expect(recorded?.clip).toMatchObject({
        x: 0,
        y: 0,
        width: 200,
        height: 200,
      });
      if (typeof recorded?.scale === "number") {
        expect(recorded.scale).toBeGreaterThan(0);
        expect(recorded.scale).toBeLessThanOrEqual(2);
      }

      // The `path` option should have produced a file on disk.
      await fs.stat(tempPath);

      // Overlays injected for the screenshot must be gone afterwards.
      const maskNodes = await page.evaluate(
        () => document.querySelectorAll("[data-stagehand-mask]").length,
      );
      expect(maskNodes).toBe(0);

      const styleNodes = await page.evaluate(
        () => document.querySelectorAll("[data-stagehand-style]").length,
      );
      expect(styleNodes).toBe(0);

      const backgroundCalls = sendCalls.filter(
        (call) => call.method === "Emulation.setDefaultBackgroundColorOverride",
      );
      expect(backgroundCalls.length).toBeGreaterThan(1);
      // At least one call carries a `color` ...
      expect(
        backgroundCalls.some(
          (call) =>
            call.params &&
            typeof call.params === "object" &&
            "color" in (call.params as Record<string, unknown>),
        ),
      ).toBeTruthy();
      // ... and at least one call has no params or an empty params object.
      expect(
        backgroundCalls.some(
          (call) =>
            !call.params ||
            Object.keys(call.params as Record<string, unknown>).length === 0,
        ),
      ).toBeTruthy();

      const cssArgs = evaluateCalls
        .map((entry) => {
          const value = entry.arg as { css?: string } | undefined;
          return value?.css ?? null;
        })
        .filter((css): css is string => typeof css === "string");
      const tokens = evaluateCalls
        .map((entry) => {
          const arg = entry.arg as { token?: string } | undefined;
          return arg?.token ?? null;
        })
        .filter((token): token is string => typeof token === "string");
      // Tokens include which helper injected the style (animations/caret/custom).
      expect(tokens.some((token) => token.includes("animations"))).toBeTruthy();
      expect(tokens.some((token) => token.includes("caret"))).toBeTruthy();
      expect(tokens.some((token) => token.includes("custom"))).toBeTruthy();
      // Custom style should bubble through so we check the actual CSS text.
      expect(
        cssArgs.some((css) => css.includes("border: 3px solid black")),
      ).toBeTruthy();

      const maskCalls = evaluateCalls.filter((entry) => {
        const arg = entry.arg;
        return (
          arg &&
          typeof arg === "object" &&
          "rects" in (arg as Record<string, unknown>)
        );
      });
      expect(maskCalls.length).toBeGreaterThan(0);
      const rects = (maskCalls[0]?.arg as { rects?: unknown } | undefined)
        ?.rects;
      expect(Array.isArray(rects)).toBeTruthy();
      expect((rects as unknown[]).length).toBe(2);
    } finally {
      // Undo every monkey-patch and remove the temp file, even on failure.
      Frame.prototype.screenshot = originalScreenshot;
      Frame.prototype.evaluate = originalEvaluate;
      internalPage.mainSession.send = originalSend;
      await fs.unlink(tempPath).catch(() => {});
    }
  });

  test("masks elements inside dialog top layer", async () => {
    const page = v3.context.pages()[0];

    const html = ` `;

    await page.goto("data:text/html," + encodeURIComponent(html));

    const targetId = page.targetId();
    const originalScreenshot = Frame.prototype.screenshot;
    let dialogMaskCount = 0;

    // Count mask nodes inside the open dialog at the moment the screenshot
    // would be taken, then return a stub image instead of capturing.
    Frame.prototype.screenshot = async function screenshotSpy(options) {
      const frame = this as Frame;
      if (frame.pageId === targetId) {
        dialogMaskCount = await frame.evaluate(() => {
          const dialog = document.querySelector("dialog[open]");
          if (!dialog) return 0;
          return dialog.querySelectorAll("[data-stagehand-mask]").length;
        });
        return Buffer.from("stub-image");
      }
      return originalScreenshot.call(this, options);
    };

    try {
      await page.screenshot({
        mask: [page.locator("#dialog-input")],
      });
      expect(dialogMaskCount).toBeGreaterThan(0);
    } finally {
      Frame.prototype.screenshot = originalScreenshot;
    }
  });
});

================================================
FILE: packages/core/tests/integration/page-scroll.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";

test.describe("Page.scroll() - mouse wheel scrolling", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3TestConfig);
    await v3.init();
  });
test.afterEach(async () => { await v3?.close?.().catch(() => {}); }); test("scrolls page vertically with positive deltaY", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Section 1
Section 2
Section 3
Section 4
Section 5
`, ), ); // Get initial scroll position let scrollY = await page.evaluate(() => window.scrollY); expect(scrollY).toBe(0); // Scroll down (positive deltaY) await page.scroll(640, 400, 0, 300); // Wait for scroll to complete await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); // Check that we've scrolled down scrollY = await page.evaluate(() => window.scrollY); expect(scrollY).toBeGreaterThan(0); }); test("scrolls page horizontally with positive deltaX", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Section 1
Section 2
Section 3
Section 4
Section 5
`, ), ); let scrollX = await page.evaluate(() => window.scrollX); expect(scrollX).toBe(0); // Scroll right (positive deltaX) await page.scroll(640, 400, 300, 0); // Wait for scroll to complete await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); // Check that we've scrolled right scrollX = await page.evaluate(() => window.scrollX); expect(scrollX).toBeGreaterThan(0); }); test("scrolls in both directions simultaneously", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Diagonal content
`, ), ); // Scroll both horizontally and vertically await page.scroll(640, 400, 200, 200); // Wait for scroll to complete await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); // Check both directions changed const scrollPos = await page.evaluate(() => ({ x: window.scrollX, y: window.scrollY, })); expect(scrollPos.x).toBeGreaterThan(0); expect(scrollPos.y).toBeGreaterThan(0); }); test("scrolls at specific coordinate on page", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Top
Middle
Bottom
`, ), ); // Scroll from specific coordinates await page.scroll(640, 400, 0, 400); // Wait for scroll to complete await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); // Verify scroll happened const scrollY = await page.evaluate(() => window.scrollY); expect(scrollY).toBeGreaterThan(0); }); test("scrolls with large deltaY values", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Section 1
Section 2
Section 3
Section 4
Section 5
`, ), ); // Scroll with large delta await page.scroll(640, 400, 0, 1000); // Wait for scroll to complete await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); // Should scroll significantly const scrollY = await page.evaluate(() => window.scrollY); expect(scrollY).toBeGreaterThan(500); }); test("negative deltaY scrolls up", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Top
Middle 1
Middle 2
Bottom
`, ), ); // First scroll down await page.scroll(640, 400, 0, 500); await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); let scrollY = await page.evaluate(() => window.scrollY); const scrolledDown = scrollY; expect(scrolledDown).toBeGreaterThan(0); // Now scroll up (negative delta) await page.scroll(640, 400, 0, -300); await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); scrollY = await page.evaluate(() => window.scrollY); expect(scrollY).toBeLessThan(scrolledDown); }); test("scroll returns xpath when requested", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Target element

Content below

`, ), ); // Scroll at coordinate (550, 50) which should be directly over the target div // div spans: left 400-700px, top 0-100px // coordinate 550,50 is within that range const xpath = await page.scroll(550, 50, 0, 200, { returnXpath: true }); // Should return a non-empty xpath string for the element at that coordinate expect(typeof xpath).toBe("string"); expect(xpath.length).toBeGreaterThan(0); // Xpath should reference the div or contain "target" expect(xpath.toLowerCase()).toMatch(/div|target/); }); test("scroll without returnXpath returns empty string", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Content
`, ), ); // Scroll without returnXpath const result = await page.scroll(640, 400, 0, 200); // Should return empty string expect(result).toBe(""); }); test("multiple sequential scrolls accumulate", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( `
Section 1
Section 2
Section 3
Section 4
`, ), ); // First scroll await page.scroll(640, 400, 0, 200); await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); const after1 = await page.evaluate(() => window.scrollY); expect(after1).toBeGreaterThan(0); // Second scroll await page.scroll(640, 400, 0, 200); await page.evaluate(() => new Promise((r) => setTimeout(r, 200))); const after2 = await page.evaluate(() => window.scrollY); expect(after2).toBeGreaterThan(after1); }); }); ================================================ FILE: packages/core/tests/integration/page-send-cdp.spec.ts ================================================ import { test, expect } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3TestConfig } from "./v3.config.js"; test.describe("Page sendCDP method", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch(() => {}); }); test("sends CDP commands and requires domain to be enabled first", async () => { const page = v3.context.pages()[0]; await page.goto("https://example.com"); // Try to add a virtual authenticator without enabling WebAuthn first // This should fail because the domain needs to be enabled await expect( page.sendCDP("WebAuthn.addVirtualAuthenticator", { options: { protocol: "ctap2", transport: "usb", hasResidentKey: false, hasUserVerification: false, isUserVerified: false, }, }), ).rejects.toThrow(); // Enable the WebAuthn domain await page.sendCDP("WebAuthn.enable"); // Now adding a virtual authenticator should succeed const result = await page.sendCDP<{ authenticatorId: string }>( "WebAuthn.addVirtualAuthenticator", { options: { protocol: "ctap2", transport: "usb", hasResidentKey: false, hasUserVerification: false, isUserVerified: false, }, }, ); // Verify we got an authenticator ID back expect(result).toHaveProperty("authenticatorId"); expect(typeof result.authenticatorId).toBe("string"); 
expect(result.authenticatorId.length).toBeGreaterThan(0);
  });
});

================================================
FILE: packages/core/tests/integration/perform-understudy-method.spec.ts
================================================
import { expect, test } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3DynamicTestConfig } from "./v3.dynamic.config.js";
import { performUnderstudyMethod } from "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js";
import { closeV3 } from "./testUtils.js";

// Each test drives one performUnderstudyMethod action against an eval site
// and then checks the page for the effect of that action.
test.describe("tests performUnderstudyMethod", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3DynamicTestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await closeV3(v3);
  });

  /** Navigates the first page of the context to `url` and returns it. */
  const openFirstPage = async (url: string) => {
    const firstPage = v3.context.pages()[0];
    await firstPage.goto(url);
    return firstPage;
  };

  test("tests that clicking works", async () => {
    const page = await openFirstPage(
      "https://browserbase.github.io/stagehand-eval-sites/sites/no-js-click/",
    );

    await performUnderstudyMethod(
      page,
      page.mainFrame(),
      "click",
      "/html/body/button",
      [],
      30000,
    );

    // The success message becomes visible once the button was clicked.
    expect(await page.locator("#success-msg").isVisible()).toBe(true);
  });

  test("fill sets input value", async () => {
    const inputXpath = "/html/body/main/form/div[1]/input";
    const page = await openFirstPage(
      "https://browserbase.github.io/stagehand-eval-sites/sites/login/",
    );

    await performUnderstudyMethod(
      page,
      page.mainFrame(),
      "fill",
      inputXpath,
      ["Alice"],
      30000,
    );

    const filledValue = await page.locator(inputXpath).inputValue();
    expect(filledValue).toBe("Alice");
  });

  test("tests that key presses work", async () => {
    const page = await openFirstPage(
      "https://browserbase.github.io/stagehand-eval-sites/sites/key-press/",
    );

    await performUnderstudyMethod(
      page,
      page.mainFrame(),
      "press",
      "xpath=/html",
      ["Enter"],
      30000,
    );

    const heading = await page.locator("/html/body/div/div/h1").textContent();
    expect(heading).toContain("Enter");
  });

  test("tests select option from a dropdown", async () => {
    const page = await openFirstPage(
      "https://browserbase.github.io/stagehand-eval-sites/sites/nested-dropdown/",
    );

    await performUnderstudyMethod(
      page,
      page.mainFrame(),
      "selectOptionFromDropdown",
      "xpath=//*[@id='licenseType']",
      ["Smog Check Technician"],
      30000,
    );

    const selectedLabel = await page
      .locator("#licenseType >> option:checked")
      .textContent();
    expect(selectedLabel).toBe("Smog Check Technician");
  });

  test("tests drag & drop works (start xpath & end xpath)", async () => {
    const page = await openFirstPage(
      "https://browserbase.github.io/stagehand-eval-sites/sites/drag-drop/",
    );

    await performUnderstudyMethod(
      page,
      page.mainFrame(),
      "dragAndDrop",
      "xpath=/html/body/div/section[1]/div[1]/div[1]", // start xpath
      ["/html/body/div/section[2]/div/div[1]"], // end xpath
      30000,
    );

    const dropped = await page
      .locator("/html/body/div/section[2]/div/div[1]/div")
      .textContent();
    expect(dropped).toBe("TEXT: Hello from text");
  });
});

================================================
FILE: packages/core/tests/integration/setinputfiles.spec.ts
================================================
import { expect, test } from "@playwright/test";
import { Buffer } from "buffer";
import { promises as fs } from "fs";
import path from "path";
import crypto from "crypto";
import type { Page as V3Page } from "../../lib/v3/understudy/page.js";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";

const FILE_UPLOAD_IFRAME_URL =
  "https://browserbase.github.io/stagehand-eval-sites/sites/file-uploads-iframe/";
const FILE_UPLOAD_V2_URL =
  "https://browserbase.github.io/stagehand-eval-sites/sites/file-uploads-2/";
const RESUME_INPUT = "#resumeUpload";
const RESUME_SUCCESS = "#resumeSuccess";
const IMAGES_INPUT = "#imagesUpload";
const IMAGES_SUCCESS = "#imagesSuccess";
const AUDIO_INPUT = "#audioUpload";
const AUDIO_SUCCESS = "#audioSuccess";
const IFRAME_UPLOAD_INPUT = "/html/body/div/iframe/html/body/div/div[1]/input";
const IFRAME_SUCCESS = "body > div > iframe >> html > body > div > div:nth-of-type(2)"; test.describe("tests setInputFiles()", () => { let v3: V3; const fixtures: string[] = []; test.beforeEach(async () => { v3 = new V3(v3TestConfig); await v3.init(); }); test.afterEach(async () => { await v3?.close?.().catch(() => {}); await Promise.all( fixtures.splice(0).map((file) => fs.unlink(file).catch(() => {})), ); }); const createFixture = async ( namePrefix: string, contents: string, ext = ".txt", ): Promise => { const normalizedExt = ext.startsWith(".") ? ext : `.${ext}`; const filename = `${namePrefix}-${crypto.randomBytes(4).toString("hex")}${normalizedExt}`; const filePath = path.resolve(process.cwd(), filename); await fs.writeFile(filePath, contents, "utf-8"); fixtures.push(filePath); return filePath; }; const expectUploadSuccess = async ( page: V3Page, successSelector: string, expectedText: string, ) => { await expect .poll( () => page.evaluate((selector) => { const el = document.querySelector(selector); if (!el) return ""; const display = window.getComputedStyle(el).display; if (display === "none") return ""; return el.textContent ?? ""; }, successSelector), { message: `wait for success message at ${successSelector}` }, ) .toContain(expectedText); }; const getInputFileCount = async (page: V3Page, inputSelector: string) => { return await page.evaluate((selector) => { const el = document.querySelector(selector); if (!(el instanceof HTMLInputElement)) return 0; return el.files?.length ?? 0; }, inputSelector); }; const expectFileCount = async ( page: V3Page, inputSelector: string, expected: number, ) => { await expect .poll(() => getInputFileCount(page, inputSelector), { message: `wait for file count on ${inputSelector}`, }) .toBe(expected); }; test("deepLocator uploads and validates within iframe", async () => { const page = v3.context.pages()[0]; await page.goto(FILE_UPLOAD_IFRAME_URL); const fixture = await createFixture( "iframe-upload", "

iframe upload

", ".txt", ); await page .deepLocator(IFRAME_UPLOAD_INPUT) .setInputFiles(path.relative(process.cwd(), fixture)); const successLocator = page.deepLocator(IFRAME_SUCCESS); await expect .poll(async () => (await successLocator.textContent()) ?? "", { message: "wait for iframe upload success", }) .toContain("file uploaded successfully"); }); test("locator uploads resume via relative path string", async () => { const page = v3.context.pages()[0]; await page.goto(FILE_UPLOAD_V2_URL); const fixture = await createFixture("resume", "

resume

", ".pdf"); await page .locator(RESUME_INPUT) .setInputFiles(path.relative(process.cwd(), fixture)); await expectUploadSuccess(page, RESUME_SUCCESS, "Resume uploaded!"); await expectFileCount(page, RESUME_INPUT, 1); }); test("locator uploads multiple images via absolute paths", async () => { const page = v3.context.pages()[0]; await page.goto(FILE_UPLOAD_V2_URL); const first = await createFixture("image-a", "

A

", ".png"); const second = await createFixture("image-b", "

B

", ".jpeg"); await page.locator(IMAGES_INPUT).setInputFiles([first, second]); await expectUploadSuccess(page, IMAGES_SUCCESS, "Images uploaded!"); await expectFileCount(page, IMAGES_INPUT, 2); }); test("locator uploads audio via payload object", async () => { const page = v3.context.pages()[0]; await page.goto(FILE_UPLOAD_V2_URL); await page.locator(AUDIO_INPUT).setInputFiles({ name: "voice-sample.mp3", mimeType: "audio/mpeg", buffer: Buffer.from("fake audio bytes", "utf-8"), }); await expectUploadSuccess(page, AUDIO_SUCCESS, "Audio file uploaded!"); await expectFileCount(page, AUDIO_INPUT, 1); }); test("locator uploads multiple payload objects to images input", async () => { const page = v3.context.pages()[0]; await page.goto(FILE_UPLOAD_V2_URL); await page.locator(IMAGES_INPUT).setInputFiles([ { name: "payload-a.png", mimeType: "image/png", buffer: Buffer.from("payload-a", "utf-8"), }, { name: "payload-b.png", mimeType: "image/png", buffer: Buffer.from("payload-b", "utf-8"), }, ]); await expectUploadSuccess(page, IMAGES_SUCCESS, "Images uploaded!"); await expectFileCount(page, IMAGES_INPUT, 2); }); }); ================================================ FILE: packages/core/tests/integration/shadow-iframe-oopif.spec.ts ================================================ import { test, expect } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import puppeteer from "puppeteer-core"; import { chromium as playwrightChromium } from "playwright-core"; import { chromium as patchrightChromium } from "patchright-core"; import { Action } from "../../lib/v3/types/public/methods.js"; import { AnyPage } from "../../lib/v3/types/public/page.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { closeV3 } from "./testUtils.js"; /** * IMPORTANT: * - We create a single V3 instance/test to avoid cross-test state. Increase parallelism later if needed. * - We assert an *effect* when feasible (e.g. input value). 
For pure clicks we assert no thrown error. */

/** One scenario: the page to open, the action to replay, and the text expected afterwards. */
type Case = {
  title: string;
  url: string;
  action: Action;
  expectedSubstrings: string[]; // check v3.extract().pageText contains these
};

/** Client library used to obtain the page handle passed to act()/extract(). */
type Framework = "v3" | "puppeteer" | "playwright" | "patchright";

/**
 * Navigates to `c.url` with the requested framework, replays `c.action`
 * through `v3.act`, then asserts that every entry of `c.expectedSubstrings`
 * occurs in the `pageText` returned by `v3.extract`.
 *
 * Non-v3 frameworks attach to the browser exposed by `v3.connectURL()`; the
 * handle they create is closed in `finally`, whether or not the assertions pass.
 */
async function runCase(v3: V3, c: Case, framework: Framework): Promise<void> {
  // Only set by the branches that open an additional client connection.
  let cleanup: (() => Promise<void> | void) | null = null;

  // Acquire the correct page for the requested framework
  let page: AnyPage | undefined;
  switch (framework) {
    case "v3": {
      const v3Page = v3.context.pages()[0];
      await v3Page.goto(c.url, { waitUntil: "networkidle" });
      page = v3Page;
      break;
    }
    case "puppeteer": {
      const browser = await puppeteer.connect({
        browserWSEndpoint: v3.connectURL(),
        defaultViewport: null,
      });
      const pages = await browser.pages();
      const puppeteerPage = pages[0];
      await puppeteerPage.goto(c.url, { waitUntil: "networkidle0" });
      page = puppeteerPage;
      cleanup = async () => {
        try {
          await browser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
    case "playwright": {
      const pwBrowser = await playwrightChromium.connectOverCDP(
        v3.connectURL(),
      );
      const pwContext = pwBrowser.contexts()[0];
      const pwPage = pwContext.pages()[0];
      await pwPage.goto(c.url, { waitUntil: "networkidle" as never });
      page = pwPage as unknown as AnyPage;
      cleanup = async () => {
        try {
          await pwBrowser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
    case "patchright": {
      const prBrowser = await patchrightChromium.connectOverCDP(
        v3.connectURL(),
      );
      const prContext = prBrowser.contexts()[0];
      const prPage = prContext.pages()[0];
      await prPage.goto(c.url, { waitUntil: "networkidle" as never });
      page = prPage as unknown as AnyPage;
      cleanup = async () => {
        try {
          await prBrowser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
  }

  try {
    if (!page) throw new Error("Missing page for selected framework");
    await v3.act(c.action, { page });

    // Post-action extraction; verify expected text appears
    const extraction = await v3.extract({ page });
    const text = extraction.pageText ?? "";
    for (const s of c.expectedSubstrings) {
      expect(
        text.includes(s),
        `expected pageText to include substring: ${s}`,
      ).toBeTruthy();
    }
  } finally {
    await cleanup?.();
  }
}

const cases: Case[] = [
  {
    title: "Closed shadow root inside OOPIF",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/closed-shadow-root-in-oopif/",
    action: {
      selector:
        "xpath=/html/body/main/section/iframe/html/body/shadow-demo//div/button",
      method: "click",
      arguments: [""],
      description: "click button inside closed shadow root in OOPIF",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
  {
    title: "Open shadow root inside OOPIF",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/open-shadow-root-in-oopif/",
    action: {
      selector:
        "xpath=/html/body/main/section/iframe/html/body/shadow-demo//div/button",
      method: "click",
      arguments: [""],
      description: "",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
  {
    title: "OOPIF inside open shadow root",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-open-shadow-dom/",
    action: {
      selector:
        "xpath=/html/body/shadow-host//section/iframe/html/body/main/section[1]/form/div/div[1]/input",
      method: "fill",
      arguments: ["nunya"],
      description: "",
    },
    expectedSubstrings: ["nunya"],
  },
  {
    title: "OOPIF inside closed shadow root",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-closed-shadow-dom/",
    action: {
      selector:
        "xpath=/html/body/shadow-host//section/iframe/html/body/main/section[1]/form/div/div[1]/input",
      method: "fill",
      arguments: ["nunya"],
      description: "fill input inside OOPIF",
    },
    expectedSubstrings: ["nunya"],
  },
];

test.describe.parallel("Stagehand v3: shadow <-> iframe OOPIF scenarios", () => {
  let v3: V3;

  // A fresh V3 instance per test keeps scenarios isolated from each other.
  test.beforeEach(async () => {
    v3 = new V3(v3DynamicTestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await closeV3(v3);
  });

  const frameworks: Framework[] = [
    "v3",
    "playwright",
    "puppeteer",
    "patchright",
  ];

  // One generated test per (framework, case) pair.
  for (const fw of frameworks) {
    for (const c of
cases) {
      test(`[${fw}] ${c.title}`, async () => {
        await runCase(v3, c, fw);
      });
    }
  }
});

================================================
FILE: packages/core/tests/integration/shadow-iframe-spif.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import puppeteer from "puppeteer-core";
import { chromium as playwrightChromium } from "playwright-core";
import { chromium as patchrightChromium } from "patchright-core";
import { Action } from "../../lib/v3/types/public/methods.js";
import { AnyPage } from "../../lib/v3/types/public/page.js";
import { v3DynamicTestConfig } from "./v3.dynamic.config.js";
import { closeV3 } from "./testUtils.js";

/**
 * IMPORTANT:
 * - We create a single V3 instance/test to avoid cross-test state. Increase parallelism later if needed.
 * - We assert an *effect* when feasible (e.g. input value). For pure clicks we assert no thrown error.
 */

/** One scenario: the page to open, the action to replay, and the text expected afterwards. */
type Case = {
  title: string;
  url: string;
  action: Action;
  expectedSubstrings: string[]; // check v3.extract().pageText contains these
};

/** Client library used to obtain the page handle passed to act()/extract(). */
type Framework = "v3" | "puppeteer" | "playwright" | "patchright";

/**
 * Navigates to `c.url` with the requested framework, replays `c.action`
 * through `v3.act`, then asserts that every entry of `c.expectedSubstrings`
 * occurs in the `pageText` returned by `v3.extract`.
 *
 * Non-v3 frameworks attach to the browser exposed by `v3.connectURL()`; the
 * handle they create is closed in `finally`, whether or not the assertions pass.
 */
async function runCase(v3: V3, c: Case, framework: Framework): Promise<void> {
  // Only set by the branches that open an additional client connection.
  let cleanup: (() => Promise<void> | void) | null = null;

  // Acquire the correct page for the requested framework
  let page: AnyPage | undefined;
  switch (framework) {
    case "v3": {
      const v3Page = v3.context.pages()[0];
      await v3Page.goto(c.url, { waitUntil: "networkidle" });
      page = v3Page;
      break;
    }
    case "puppeteer": {
      const browser = await puppeteer.connect({
        browserWSEndpoint: v3.connectURL(),
        defaultViewport: null,
      });
      const pages = await browser.pages();
      const puppeteerPage = pages[0];
      await puppeteerPage.goto(c.url, { waitUntil: "networkidle0" });
      page = puppeteerPage;
      cleanup = async () => {
        try {
          await browser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
    case "playwright": {
      const pwBrowser = await playwrightChromium.connectOverCDP(
        v3.connectURL(),
      );
      const pwContext = pwBrowser.contexts()[0];
      const pwPage = pwContext.pages()[0];
      await pwPage.goto(c.url, { waitUntil: "networkidle" as never });
      page = pwPage as unknown as AnyPage;
      cleanup = async () => {
        try {
          await pwBrowser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
    case "patchright": {
      const prBrowser = await patchrightChromium.connectOverCDP(
        v3.connectURL(),
      );
      const prContext = prBrowser.contexts()[0];
      const prPage = prContext.pages()[0];
      await prPage.goto(c.url, { waitUntil: "networkidle" as never });
      page = prPage as unknown as AnyPage;
      cleanup = async () => {
        try {
          await prBrowser.close();
        } catch {
          // ignore
        }
      };
      break;
    }
  }

  try {
    if (!page) throw new Error("Missing page for selected framework");
    await v3.act(c.action, { page });

    // Post-action extraction; verify expected text appears
    const extraction = await v3.extract({ page });
    const text = extraction.pageText ?? "";
    for (const s of c.expectedSubstrings) {
      expect(
        text.includes(s),
        `expected pageText to include substring: ${s}`,
      ).toBeTruthy();
    }
  } finally {
    await cleanup?.();
  }
}

const cases: Case[] = [
  {
    title: "Open shadow root inside SPIF",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/open-shadow-root-in-spif/",
    action: {
      selector:
        "xpath=/html/body/main/section/iframe/html/body/shadow-demo//div/button",
      method: "click",
      arguments: [""],
      description: "",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
  {
    title: "Closed shadow root inside SPIF",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/closed-shadow-dom-in-spif/",
    action: {
      selector: "xpath=/html/body/div/iframe/html/body/shadow-demo//div/button",
      method: "click",
      arguments: [""],
      description: "",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
  {
    title: "SPIF inside closed shadow root",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/spif-in-closed-shadow-dom/",
    action: {
      selector: "xpath=/html/body/shadow-host//div/iframe/html/body/button",
      method: "click",
      arguments: [""],
      description: "",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
  {
    title: "SPIF inside open shadow root",
    url: "https://browserbase.github.io/stagehand-eval-sites/sites/spif-in-open-shadow-dom/",
    action: {
      selector: "xpath=/html/body/shadow-host//div/iframe/html/body/button",
      method: "click",
      arguments: [""],
      description: "click button inside SPIF under open shadow",
    },
    expectedSubstrings: ["button successfully clicked"],
  },
];

test.describe.parallel("Stagehand v3: shadow <-> iframe SPIF scenarios", () => {
  let v3: V3;

  // A fresh V3 instance per test keeps scenarios isolated from each other.
  test.beforeEach(async () => {
    v3 = new V3(v3DynamicTestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await closeV3(v3);
  });

  const frameworks: Framework[] = [
    "v3",
    "playwright",
    "puppeteer",
    "patchright",
  ];

  // One generated test per (framework, case) pair.
  for (const fw of frameworks) {
    for (const c of cases) {
      test(`[${fw}] ${c.title}`, async () => {
        await runCase(v3, c, fw);
      });
    }
  }
});

================================================
FILE: packages/core/tests/integration/testUtils.ts
================================================
import type { V3 } from "../../lib/v3/v3.js";
import type {
  LanguageModelV2,
  LanguageModelV2CallOptions,
  LanguageModelV2Content,
  LanguageModelV2FinishReason,
  LanguageModelV2Usage,
} from "@ai-sdk/provider";
import { AISdkClient } from "../../lib/v3/llm/aisdk.js";

/**
 * Races a promise against a timeout.
 * Resolves to the promise value or "timeout" if the deadline expires.
*/
export function raceTimeout<T>(
  promise: Promise<T>,
  ms: number,
): Promise<T | "timeout"> {
  let timer: ReturnType<typeof setTimeout>;
  const timeout = new Promise<"timeout">((resolve) => {
    timer = setTimeout(() => resolve("timeout"), ms);
  });
  // Always clear the timer so a settled race does not keep a pending timeout around.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

/** Upper bound, in milliseconds, for each shutdown step performed by closeV3. */
const CLOSE_TIMEOUT_MS = 5_000;

/**
 * Waits until `promise` settles or `timeoutMs` elapses, whichever comes first.
 * Never rejects: a rejection of `promise` is swallowed, and a timeout simply
 * resolves.
 */
async function settleWithTimeout(
  promise: Promise<unknown>,
  timeoutMs: number,
): Promise<void> {
  let timeoutId: NodeJS.Timeout | undefined;
  const timeout = new Promise<void>((resolve) => {
    timeoutId = setTimeout(resolve, timeoutMs);
  });
  try {
    await Promise.race([promise.catch(() => {}), timeout]);
  } finally {
    if (timeoutId) clearTimeout(timeoutId);
  }
}

/**
 * Best-effort teardown of a V3 instance for test hooks.
 *
 * Does nothing when `v3` is null/undefined. When `v3.isBrowserbase` is set, a
 * `Browser.close` command is sent over the context connection first. Both that
 * command and `v3.close()` are bounded by CLOSE_TIMEOUT_MS and never throw.
 */
export async function closeV3(v3?: V3 | null): Promise<void> {
  if (!v3) return;

  const isBrowserbase = v3.isBrowserbase;
  if (isBrowserbase) {
    try {
      await settleWithTimeout(
        v3.context.conn.send("Browser.close"),
        CLOSE_TIMEOUT_MS,
      );
    } catch {
      // best-effort cleanup
    }
  }

  await settleWithTimeout(v3.close(), CLOSE_TIMEOUT_MS);
}

/** Buckets a JSON-mode model call can fall into (see resolveJsonResponseKey). */
type JsonResponseKey =
  | "act"
  | "Observation"
  | "Metadata"
  | "Extraction"
  | "default";

/** A canned JSON payload, or a function that builds one from the call options. */
type JsonResponseValue =
  | Record<string, unknown>
  | ((options: LanguageModelV2CallOptions) => Record<string, unknown>);

/** Either one reusable response or a queue of responses. */
type JsonResponseScript = JsonResponseValue | JsonResponseValue[];

/** A canned non-JSON generate result, or a function that builds one from the call options. */
type GenerateResponseValue =
  | {
      content: LanguageModelV2Content[];
      finishReason?: LanguageModelV2FinishReason;
      usage?: Partial<LanguageModelV2Usage>;
    }
  | ((options: LanguageModelV2CallOptions) => {
      content: LanguageModelV2Content[];
      finishReason?: LanguageModelV2FinishReason;
      usage?: Partial<LanguageModelV2Usage>;
    });

/** Language model that additionally records the options of every doGenerate call. */
type ScriptedLanguageModel = LanguageModelV2 & {
  doGenerateCalls: LanguageModelV2CallOptions[];
};

type ScriptedGenerateResult = {
  content: LanguageModelV2Content[];
  finishReason?: LanguageModelV2FinishReason;
  usage?: Partial<LanguageModelV2Usage>;
};

/** Usage figures reported when a scripted response does not supply its own. */
const DEFAULT_USAGE: LanguageModelV2Usage = {
  inputTokens: 1,
  outputTokens: 1,
  totalTokens: 2,
  reasoningTokens: 0,
  cachedInputTokens: 0,
};

/** Overlays the provided usage fields on top of DEFAULT_USAGE. */
const mergeUsage = (
  usage?: Partial<LanguageModelV2Usage>,
): LanguageModelV2Usage => ({
  ...DEFAULT_USAGE,
  ...(usage ??
{}),
});

/**
 * Returns the next value of a script.
 *
 * - A non-array `value` is returned as-is, or `fallback` when it is nullish.
 * - An array with more than one entry is consumed from the front (the array is
 *   mutated via `shift`).
 * - An array with at most one entry is left untouched, so its last entry is
 *   returned again on every later call; an empty array yields `fallback`.
 */
function consumeScriptValue<T>(value: T | T[] | undefined, fallback: T): T {
  if (!Array.isArray(value)) {
    return value ?? fallback;
  }
  if (value.length <= 1) {
    return value[0] ?? fallback;
  }
  return value.shift() ?? fallback;
}

/**
 * Classifies a model call by the property names of its JSON response schema.
 * Calls without a JSON response format map to "default"; JSON calls that match
 * none of the known shapes map to "Extraction".
 */
function resolveJsonResponseKey(
  options: LanguageModelV2CallOptions,
): JsonResponseKey {
  const responseFormat = options.responseFormat;
  if (!responseFormat || responseFormat.type !== "json") {
    return "default";
  }

  const schema = responseFormat.schema as {
    type?: string;
    properties?: Record<string, unknown>;
  };
  const properties = schema?.properties ?? {};

  if ("elementId" in properties && "twoStep" in properties) {
    return "act";
  }
  if ("elements" in properties) {
    return "Observation";
  }
  if ("completed" in properties && "progress" in properties) {
    return "Metadata";
  }
  return "Extraction";
}

/**
 * Flattens a prompt into plain text: string contents are taken verbatim, and
 * for structured contents only non-empty "text" parts are kept. Pieces are
 * joined with newlines.
 */
export function promptToText(
  prompt: LanguageModelV2CallOptions["prompt"],
): string {
  return (prompt ?? [])
    .flatMap((message) => {
      if (typeof message.content === "string") {
        return [message.content];
      }
      return (message.content ?? [])
        .map((part) => (part.type === "text" ? part.text : ""))
        .filter((text): text is string => text.length > 0);
    })
    .join("\n");
}

/** Returns every `<digits>-<digits>` token found in the prompt text, in order. */
function findEncodedIds(options: LanguageModelV2CallOptions): string[] {
  return [...promptToText(options.prompt).matchAll(/\b\d+-\d+\b/g)].map(
    (match) => match[0],
  );
}

/**
 * Returns the first `<digits>-<digits>` token on a prompt line that contains
 * `text`.
 *
 * Lines that contain `text` but carry no id are skipped, so an earlier mention
 * of the text without an id does not mask a later line that has one.
 *
 * @throws Error when no line contains both `text` and an encoded id.
 */
export function findEncodedIdForText(
  options: LanguageModelV2CallOptions,
  text: string,
): string {
  const lines = promptToText(options.prompt).split("\n");
  for (const line of lines) {
    if (!line.includes(text)) continue;
    const match = line.match(/\b\d+-\d+\b/);
    if (match) {
      return match[0];
    }
  }
  throw new Error(`Could not find encoded id for text: ${text}`);
}

/**
 * Returns the last encoded id that appears anywhere in the prompt.
 *
 * @throws Error when the prompt contains no encoded id.
 */
export function findLastEncodedId(options: LanguageModelV2CallOptions): string {
  const matches = findEncodedIds(options);
  if (matches.length === 0) {
    throw new Error("Could not find any encoded ids in the prompt.");
  }
  return matches[matches.length - 1];
}

/**
 * Builds a generate result consisting of a single tool call. `input` is
 * serialized with JSON.stringify; the id defaults to `<toolName>-1`.
 */
export function toolCallResponse(
  toolName: string,
  input: Record<string, unknown>,
  toolCallId = `${toolName}-1`,
): {
  content: LanguageModelV2Content[];
  finishReason: LanguageModelV2FinishReason;
  usage: LanguageModelV2Usage;
} {
  return {
    content: [
      {
        type: "tool-call",
        toolCallId,
        toolName,
        input: JSON.stringify(input),
      },
    ],
    finishReason: "tool-calls",
    usage: DEFAULT_USAGE,
  };
}

/** Shorthand for a tool call to the "done" tool. */
export function doneToolResponse(
  reasoning = "done",
  taskComplete = true,
  toolCallId = "done-1",
): {
  content: LanguageModelV2Content[];
  finishReason: LanguageModelV2FinishReason;
  usage: LanguageModelV2Usage;
} {
  return toolCallResponse("done", { reasoning, taskComplete }, toolCallId);
}

/**
 * Completes a scripted result: finishReason defaults to "stop", usage is
 * merged over DEFAULT_USAGE, and an empty warnings list is attached.
 */
function createGenerateResult(result: ScriptedGenerateResult): {
  content: LanguageModelV2Content[];
  finishReason: LanguageModelV2FinishReason;
  usage: LanguageModelV2Usage;
  warnings: [];
} {
  return {
    content: result.content,
    finishReason: result.finishReason ??
"stop",
    usage: mergeUsage(result.usage),
    warnings: [],
  };
}

/**
 * Creates an AISdkClient backed by an in-memory model that replays scripted
 * responses instead of calling a provider.
 *
 * - JSON-mode calls are classified with resolveJsonResponseKey and answered
 *   from `jsonResponses[key]`, falling back to `jsonResponses.default`, then
 *   to `{}`.
 * - All other calls are answered from `generateResponses`, falling back to a
 *   plain "done" text result.
 * - Array scripts are copied up front, so the caller's arrays are not mutated
 *   while responses are consumed.
 * - Every call's options are appended to the model's `doGenerateCalls`.
 * - Streaming is not supported and throws.
 */
export function createScriptedAisdkTestLlmClient(options?: {
  modelId?: string;
  jsonResponses?: Partial<Record<JsonResponseKey, JsonResponseScript>>;
  generateResponses?: GenerateResponseValue[];
}): AISdkClient {
  const jsonResponses = Object.fromEntries(
    Object.entries(options?.jsonResponses ?? {}).map(([key, value]) => [
      key,
      Array.isArray(value) ? [...value] : value,
    ]),
  ) as Partial<Record<JsonResponseKey, JsonResponseScript>>;
  const generateResponses = [...(options?.generateResponses ?? [])];

  const model: ScriptedLanguageModel = {
    provider: "mock",
    modelId: options?.modelId ?? "mock/stagehand-flow-logger",
    specificationVersion: "v2",
    supportedUrls: {},
    doGenerateCalls: [],
    doGenerate: async (callOptions) => {
      model.doGenerateCalls.push(callOptions);

      if (callOptions.responseFormat?.type === "json") {
        const key = resolveJsonResponseKey(callOptions);
        // First pick the script for this key (or the default script) ...
        const responseScripts = consumeScriptValue<
          JsonResponseScript | undefined
        >(jsonResponses[key], jsonResponses.default);
        // ... then take the next response from it when it is still a queue.
        const responseScript = consumeScriptValue<
          JsonResponseValue | undefined
        >(responseScripts, undefined);
        const response =
          typeof responseScript === "function"
            ? responseScript(callOptions)
            : (responseScript ?? {});
        return createGenerateResult({
          content: [{ type: "text", text: JSON.stringify(response) }],
        });
      }

      const responseScript = consumeScriptValue<
        GenerateResponseValue | undefined
      >(generateResponses, undefined);
      if (!responseScript) {
        return createGenerateResult({
          content: [{ type: "text", text: "done" }],
        });
      }
      const response =
        typeof responseScript === "function"
          ? responseScript(callOptions)
          : responseScript;
      return createGenerateResult(response);
    },
    doStream: async () => {
      throw new Error("Streaming is not implemented for this test model.");
    },
  };

  return new AISdkClient({ model });
}

================================================
FILE: packages/core/tests/integration/text-selector-innermost.spec.ts
================================================
import { expect, test } from "@playwright/test";
import { Protocol } from "devtools-protocol";
import { V3 } from "../../lib/v3/v3.js";
import { v3DynamicTestConfig } from "./v3.dynamic.config.js";
import { closeV3 } from "./testUtils.js";

test.describe("Text selector innermost element matching", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3DynamicTestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await closeV3(v3);
  });

  test("text selector matches only innermost elements", async () => {
    const page = v3.context.pages()[0];
    // NOTE(review): the HTML fixture inside this template literal appears to
    // be missing from this copy of the file; the assertions below expect an
    // element with id "inner" containing "Click me". Confirm against the repo.
    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
`),
    );

    // Only the button should be counted, not the parent elements
    const count = await page.mainFrame().locator("text=Click me").count();
    expect(count).toBe(1);

    // Verify it finds the button element specifically
    const session = page.mainFrame().session;
    const { executionContextId } = await session.send<{
      executionContextId: number;
    }>("Page.createIsolatedWorld", {
      frameId: page.mainFrame().frameId,
      worldName: "test-world",
    });

    // The script collects every element whose text contains "Click me" and
    // returns the id of the first one that contains no other candidate.
    const evalRes = await session.send<Protocol.Runtime.EvaluateResponse>(
      "Runtime.evaluate",
      {
        expression: `(() => {
          const candidates = [];
          const iter = document.createNodeIterator(document.documentElement, NodeFilter.SHOW_ELEMENT);
          let n;
          while ((n = iter.nextNode())) {
            const el = n;
            const t = (el.innerText ?? el.textContent ?? '').trim();
            if (t && t.includes("Click me")) {
              candidates.push(el);
            }
          }
          // Find innermost
          for (const candidate of candidates) {
            let isInnermost = true;
            for (const other of candidates) {
              if (candidate !== other && candidate.contains(other)) {
                isInnermost = false;
                break;
              }
            }
            if (isInnermost) return candidate.id;
          }
          return null;
        })()`,
        contextId: executionContextId,
        returnByValue: true,
      },
    );
    expect(evalRes.result.value).toBe("inner");
  });

  test("multiple innermost elements with same text", async () => {
    const page = v3.context.pages()[0];
    // NOTE(review): the markup of this fixture appears to be missing from this
    // copy of the file (only its text content is left) - confirm against the repo.
    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
Some other content
`),
    );

    // Should find all three innermost elements (2 buttons + 1 link)
    const count = await page.mainFrame().locator("text=Submit").count();
    expect(count).toBe(3);
  });

  test("nested text with different innermost elements", async () => {
    const page = v3.context.pages()[0];
    // NOTE(review): the markup of this fixture appears to be missing from this
    // copy of the file (only its text content is left) - confirm against the repo.
    await page.goto(
      "data:text/html," +
        encodeURIComponent(`
Hello World
`),
    );

    // "Hello" is only in the parent div
    const helloCount = await page.mainFrame().locator("text=Hello").count();
    expect(helloCount).toBe(1); // Only the div

    // "World" is only in the span
    const worldCount = await page.mainFrame().locator("text=World").count();
    expect(worldCount).toBe(1); // Only the span

    // "Hello World" matches only the parent div (as it's the innermost containing both words)
    const bothCount = await page
      .mainFrame()
      .locator("text=Hello World")
      .count();
    expect(bothCount).toBe(1); // Only the div
  });
});

================================================
FILE: packages/core/tests/integration/timeouts.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3DynamicTestConfig } from "./v3.dynamic.config.js";
import { z } from "zod";
import { closeV3 } from "./testUtils.js";
import type { LLMClient } from "../../lib/v3/llm/LLMClient.js";
import { generateText } from "ai";

/** Agent tools exercised by the toolTimeout tests in this file. */
type AgentToolNameWithTimeout =
  | "act"
  | "extract"
  | "fillForm"
  | "ariaTree"
  | "click"
  | "type"
  | "dragAndDrop"
  | "clickAndHold"
  | "fillFormVision"
  | "goto"
  | "navback"
  | "screenshot"
  | "scroll"
  | "keys";

/** Minimal model shape used by the timeout tests: it only ever emits tool calls. */
type ToolTimeoutTestModel = {
  provider: string;
  modelId: string;
  specificationVersion: "v2";
  supportedUrls: Record<string, RegExp[]>;
  doGenerate: () => Promise<{
    content: Array<{
      type: "tool-call";
      toolCallId: string;
      toolName: string;
      input: string;
    }>;
    finishReason: "tool-calls";
    usage: { inputTokens: number; outputTokens: number; totalTokens: number };
    warnings: [];
  }>;
  doStream: (_options: unknown) => Promise<never>;
};

type ToolTimeoutTestLLMClient = LLMClient & {
  model: ToolTimeoutTestModel;
};

/**
 * Builds a fake LLM client whose model requests `toolName` with `toolInput` on
 * its first generate call and the "done" tool on every later call.
 */
function createToolTimeoutTestLlmClient(
  toolName: AgentToolNameWithTimeout,
  toolInput: Record<string, unknown>,
): ToolTimeoutTestLLMClient {
  const usage = {
    prompt_tokens: 0,
    completion_tokens: 0,
    reasoning_tokens: 0,
    cached_input_tokens: 0,
    total_tokens: 0,
  };
  let generateCallCount = 0;

  const model: ToolTimeoutTestModel
= {
    provider: "mock",
    modelId: "mock/tool-timeout-test",
    specificationVersion: "v2",
    supportedUrls: {},
    doGenerate: async () => {
      generateCallCount += 1;
      // First call: request the tool under test.
      if (generateCallCount === 1) {
        return {
          content: [
            {
              type: "tool-call",
              toolCallId: "tool-1",
              toolName,
              input: JSON.stringify(toolInput),
            },
          ],
          finishReason: "tool-calls",
          usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
          warnings: [],
        };
      }
      // Any later call: finish the run through the "done" tool.
      return {
        content: [
          {
            type: "tool-call",
            toolCallId: "done-1",
            toolName: "done",
            input: JSON.stringify({ reasoning: "done", taskComplete: true }),
          },
        ],
        finishReason: "tool-calls",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
        warnings: [],
      };
    },
    doStream: async () => {
      throw new Error("doStream not implemented in timeout test model");
    },
  };

  const llm = {
    type: "openai",
    modelName: "openai/gpt-4.1-mini",
    hasVision: false,
    clientOptions: {},
    model,
    getLanguageModel: () => model,
    generateText,
    // Canned structured responses, selected by the requested response_model name.
    createChatCompletion: async <T>(options: unknown): Promise<T> => {
      const responseModelName = (
        options as { options?: { response_model?: { name?: string } } }
      )?.options?.response_model?.name;

      if (responseModelName === "act") {
        return {
          data: {
            elementId: "1-0",
            description: "click body",
            method: "click",
            arguments: [],
            twoStep: false,
          },
          usage,
        } as T;
      }
      if (responseModelName === "Observation") {
        return { data: { elements: [] }, usage } as T;
      }
      if (responseModelName === "Extraction") {
        return { data: {}, usage } as T;
      }
      if (responseModelName === "Metadata") {
        return { data: { completed: true, progress: "" }, usage } as T;
      }
      return { data: {}, usage } as T;
    },
  };

  return llm as unknown as ToolTimeoutTestLLMClient;
}

/**
 * Returns the output of the first recorded step that called `toolName`, using
 * the tool call's index to pick the matching entry of `toolResults`.
 * Returns undefined when no step called the tool.
 */
function findToolOutput(
  stepEvents: Array<{
    toolCalls?: Array<{ toolName?: string }>;
    toolResults?: Array<{ output?: unknown }>;
  }>,
  toolName: string,
) {
  for (const event of stepEvents) {
    if (!event.toolCalls || !event.toolResults) continue;
    const toolIndex = event.toolCalls.findIndex(
      (tc) => tc.toolName === toolName,
    );
    if (toolIndex !== -1) {
      return event.toolResults[toolIndex]?.output;
    }
  }
  return undefined;
}

/**
 * Runs an agent against https://example.com with a scripted model that calls
 * `toolName`, using `toolTimeout: 1`, and returns the output recorded for that
 * tool. The V3 instance created here is always closed before returning.
 *
 * @throws Error when no output was captured for the tool.
 */
async function runAgentToolTimeoutScenario(
  toolName: AgentToolNameWithTimeout,
  toolInput: Record<string, unknown>,
  options?: { mode?: "dom" | "hybrid" },
) {
  const llmClient = createToolTimeoutTestLlmClient(toolName, toolInput);
  const stepEvents: Array<{
    toolCalls?: Array<{ toolName?: string }>;
    toolResults?: Array<{ output?: unknown }>;
  }> = [];

  const v3 = new V3({
    ...v3DynamicTestConfig,
    experimental: true,
    llmClient,
  });
  await v3.init();

  try {
    const page = v3.context.pages()[0];
    await page.goto("https://example.com");

    const agent = v3.agent({
      ...(options?.mode ? { mode: options.mode } : {}),
    });

    await agent.execute({
      instruction: `Use ${toolName} and then finish`,
      maxSteps: 2,
      toolTimeout: 1,
      callbacks: {
        onStepFinish: (event) => {
          stepEvents.push({
            toolCalls: event.toolCalls?.map((tc) => ({
              toolName: tc.toolName,
            })),
            toolResults: event.toolResults?.map((tr) => ({
              output: tr.output,
            })),
          });
        },
      },
    });

    const toolOutput = findToolOutput(stepEvents, toolName);
    if (!toolOutput) {
      throw new Error(`No tool output captured for ${toolName}`);
    }
    return { toolOutput };
  } finally {
    await closeV3(v3);
  }
}

/** One agent toolTimeout test: its exact title plus the tool call to script. */
type AgentToolTimeoutCase = {
  title: string;
  toolName: AgentToolNameWithTimeout;
  toolInput: Record<string, unknown>;
  mode?: "dom" | "hybrid";
};

const agentToolTimeoutCases: AgentToolTimeoutCase[] = [
  {
    title: "agent toolTimeout enforces timeout for act tool",
    toolName: "act",
    toolInput: { action: "click somewhere" },
  },
  {
    title: "agent toolTimeout enforces timeout for extract tool",
    toolName: "extract",
    toolInput: {
      instruction: "extract the page title",
      schema: { type: "object", properties: { title: { type: "string" } } },
    },
  },
  {
    title: "agent toolTimeout enforces timeout for fillForm tool",
    toolName: "fillForm",
    toolInput: { fields: [{ action: "type hello into name" }] },
  },
  {
    title: "agent toolTimeout enforces timeout for ariaTree",
    toolName: "ariaTree",
    toolInput: {},
  },
  {
    title: "agent toolTimeout enforces timeout for goto tool",
    toolName: "goto",
    toolInput: { url: "https://example.com/slow" },
  },
  {
    title: "agent toolTimeout enforces timeout for navback tool",
    toolName: "navback",
    toolInput: { reasoningText: "going back" },
  },
  {
    title: "agent toolTimeout enforces timeout for screenshot tool",
    toolName: "screenshot",
    toolInput: {},
  },
  {
    title: "agent toolTimeout enforces timeout for scroll tool",
    toolName: "scroll",
    toolInput: { direction: "down" },
  },
  {
    title: "agent toolTimeout enforces timeout for keys tool",
    toolName: "keys",
    toolInput: { method: "press", value: "Enter" },
  },
  {
    title: "agent toolTimeout enforces timeout for click tool (hybrid)",
    toolName: "click",
    toolInput: { describe: "click element", coordinates: [100, 100] },
    mode: "hybrid",
  },
  {
    title: "agent toolTimeout enforces timeout for type tool (hybrid)",
    toolName: "type",
    toolInput: {
      describe: "type into field",
      text: "hello",
      coordinates: [100, 100],
    },
    mode: "hybrid",
  },
  {
    title: "agent toolTimeout enforces timeout for dragAndDrop tool (hybrid)",
    toolName: "dragAndDrop",
    toolInput: {
      describe: "drag element",
      startCoordinates: [100, 100],
      endCoordinates: [200, 200],
    },
    mode: "hybrid",
  },
  {
    title: "agent toolTimeout enforces timeout for clickAndHold tool (hybrid)",
    toolName: "clickAndHold",
    toolInput: {
      describe: "hold element",
      coordinates: [100, 100],
      duration: 1000,
    },
    mode: "hybrid",
  },
  {
    title: "agent toolTimeout enforces timeout for fillFormVision tool (hybrid)",
    toolName: "fillFormVision",
    toolInput: {
      fields: [
        {
          action: "type hello into name",
          value: "hello",
          coordinates: { x: 100, y: 100 },
        },
        {
          action: "type world into email",
          value: "world",
          coordinates: { x: 100, y: 200 },
        },
      ],
    },
    mode: "hybrid",
  },
];

test.describe("V3 hard timeouts", () => {
  let v3: V3;

  test.beforeEach(async () => {
    v3 = new V3(v3DynamicTestConfig);
    await v3.init();
  });

  test.afterEach(async () => {
    await closeV3(v3);
  });

  test("observe() enforces timeoutMs", async () => {
    // Tiny timeout to force the race to hit the timeout branch
    await expect(v3.observe("find something", { timeout: 5 })).rejects.toThrow(
      /timed out/i,
    );
  });

  test("extract() enforces timeoutMs", async () => {
    const schema = z.object({ title: z.string().optional() });
    await expect(
      v3.extract("Extract title", schema, { timeout: 5 }),
    ).rejects.toThrow(/timed out/i);
  });

  test("act() enforces timeoutMs", async () => {
    await expect(v3.act("do nothing", { timeout: 5 })).rejects.toThrow(
      /timed out/i,
    );
  });

  // Every agent tool is expected to fail the same way under a 1ms toolTimeout.
  for (const { title, toolName, toolInput, mode } of agentToolTimeoutCases) {
    test(title, async () => {
      const { toolOutput } = await runAgentToolTimeoutScenario(
        toolName,
        toolInput,
        mode ? { mode } : undefined,
      );
      const output = toolOutput as { success: boolean; error: string };
      expect(output.success).toBe(false);
      expect(output.error).toContain("TimeoutError");
      expect(output.error).toContain("1ms");
    });
  }
});

================================================
FILE: packages/core/tests/integration/user-data-dir.spec.ts
================================================
import { test, expect } from "@playwright/test";
import { V3 } from "../../lib/v3/v3.js";
import { v3TestConfig } from "./v3.config.js";
import * as fs from "fs";
import * as path from "path";
import * as os from "os";
test.describe("userDataDir persistence", () => {
  let v3: V3;
  let testDir: string;

  // Each test gets its own temporary profile directory.
  test.beforeEach(() => {
    testDir = fs.mkdtempSync(
      path.join(os.tmpdir(), "stagehand-userdata-test-"),
    );
  });

  test.afterEach(async () => {
    // Best-effort close: v3 may never have been created (e.g. skipped test).
    await v3?.close?.().catch(() => {});
    if (testDir && fs.existsSync(testDir)) {
      fs.rmSync(testDir, { recursive: true, force: true });
    }
  });

  test("Chrome uses the specified userDataDir", async () => {
    const browserTarget = (
      process.env.STAGEHAND_BROWSER_TARGET ?? "local"
    ).toLowerCase();
    const isBrowserbase = browserTarget === "browserbase";
    test.skip(isBrowserbase, "Requires local Chromium for userDataDir checks");

    v3 = new V3({
      ...v3TestConfig,
      localBrowserLaunchOptions: {
        ...(v3TestConfig.localBrowserLaunchOptions ?? {}),
        userDataDir: testDir,
        preserveUserDataDir: true,
      },
    });
    await v3.init();

    const page = v3.context.pages()[0];
    await page.goto("about:blank");

    // Poll for the "Default" directory to appear inside the profile dir.
    await expect
      .poll(() => fs.existsSync(path.join(testDir, "Default")), {
        timeout: 10_000,
      })
      .toBe(true);
    expect(fs.existsSync(path.join(testDir, "Local State"))).toBe(true);
  });
});

================================================
FILE: packages/core/tests/integration/v3.config.ts
================================================
import type { V3Options } from "../../lib/v3/types/public/options.js";
import {
  v3DynamicTestConfig,
  getV3DynamicTestConfig,
} from "./v3.dynamic.config.js";

/** Alias of the dynamic test configuration. */
export const v3TestConfig: V3Options = v3DynamicTestConfig;

/** Returns the test configuration with `overrides` applied (delegates to getV3DynamicTestConfig). */
export function getV3TestConfig(overrides: Partial<V3Options> = {}): V3Options {
  return getV3DynamicTestConfig(overrides);
}

export default getV3TestConfig;

================================================
FILE: packages/core/tests/integration/v3.dynamic.config.ts
================================================
import type { V3Options } from "../../lib/v3/types/public/options.js";
import type { BrowserbaseSessionCreateParams } from "../../lib/v3/types/public/api.js";
import type { LogLine } from "../../lib/v3/types/public/logs.js";

const browserTarget = (
process.env.STAGEHAND_BROWSER_TARGET ?? "local" ).toLowerCase(); const isBrowserbase = browserTarget === "browserbase"; const browserbaseRegionRaw = process.env.BROWSERBASE_REGION; const browserbaseRegion = ( [ "us-west-2", "us-east-1", "eu-central-1", "ap-southeast-1", ] as BrowserbaseSessionCreateParams["region"][] ).includes(browserbaseRegionRaw as BrowserbaseSessionCreateParams["region"]) ? (browserbaseRegionRaw as BrowserbaseSessionCreateParams["region"]) : undefined; const baseConfig = { verbose: 0 as const, disablePino: true, logger: (line: LogLine) => console.log(line), disableAPI: true, }; export const v3DynamicTestConfig: V3Options = isBrowserbase ? { ...baseConfig, env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY!, projectId: process.env.BROWSERBASE_PROJECT_ID!, disableAPI: true, selfHeal: false, ...(browserbaseRegion ? { browserbaseSessionCreateParams: { region: browserbaseRegion } } : {}), } : { ...baseConfig, env: "LOCAL", localBrowserLaunchOptions: { executablePath: process.env.CHROME_PATH, args: process.env.CI ? ["--no-sandbox"] : undefined, headless: true, viewport: { width: 1288, height: 711 }, }, }; export function getV3DynamicTestConfig( overrides: Partial = {}, ): V3Options { return { ...v3DynamicTestConfig, ...overrides }; } export default getV3DynamicTestConfig; ================================================ FILE: packages/core/tests/integration/v3.playwright.config.ts ================================================ import { defineConfig, type ReporterDescription } from "@playwright/test"; import { getPackageRootDir } from "../../lib/v3/runtimePaths.js"; const coreDir = getPackageRootDir(); const testDir = `${coreDir}/dist/esm/tests/integration`; const browserTarget = ( process.env.STAGEHAND_BROWSER_TARGET ?? "local" ).toLowerCase(); const isBrowserbase = browserTarget === "browserbase"; const consoleReporter = process.env.PLAYWRIGHT_CONSOLE_REPORTER ?? 
"list"; const localWorkerOverride = Number( process.env.LOCAL_SESSION_LIMIT_PER_E2E_TEST, ); const localWorkers = Number.isFinite(localWorkerOverride) && localWorkerOverride > 0 ? localWorkerOverride : process.env.CI ? 3 : 5; const ciWorkerOverride = Number( process.env.BROWSERBASE_SESSION_LIMIT_PER_E2E_TEST, ); const bbWorkers = process.env.CI && Number.isFinite(ciWorkerOverride) && ciWorkerOverride > 0 ? ciWorkerOverride : 3; const ctrfJunitPath = process.env.CTRF_JUNIT_PATH; const reporter: ReporterDescription[] = ctrfJunitPath ? [ [consoleReporter] as ReporterDescription, [ "junit", { outputFile: ctrfJunitPath, includeProjectInTestName: true }, ] as ReporterDescription, ] : [[consoleReporter] as ReporterDescription]; export default defineConfig({ testDir, timeout: 90_000, expect: { timeout: 10_000 }, retries: process.env.CI ? 1 : 0, workers: isBrowserbase ? bbWorkers : localWorkers, fullyParallel: true, projects: [ { name: isBrowserbase ? "e2e-bb" : "e2e-local", }, ], reporter, use: { // we're not launching Playwright browsers in these tests; we connect via Puppeteer/CDP to V3. 
headless: false, }, }); ================================================ FILE: packages/core/tests/integration/wait-for-selector.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { closeV3 } from "./testUtils.js"; test.describe.configure({ mode: "serial" }); test.describe("Page.waitForSelector tests", () => { let v3: V3; test.beforeAll(async () => { v3 = new V3(v3DynamicTestConfig); await v3.init(); }); test.beforeEach(async () => { const pages = v3.context.pages(); if (pages.length === 0) { await v3.context.newPage("about:blank"); return; } const [primary, ...extras] = pages; for (const page of extras) { await page.close().catch(() => {}); } v3.context.setActivePage(primary); await primary.goto("about:blank", { waitUntil: "load", timeoutMs: 15_000, }); }); test.afterAll(async () => { await closeV3(v3); }); test.describe("Basic state tests", () => { test("resolves when element is already visible", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent(''), ); const result = await page.waitForSelector("#submit-btn"); expect(result).toBe(true); }); test("resolves when element appears after delay", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
" + "", ), ); const result = await page.waitForSelector("#delayed-btn", { timeout: 5000, }); expect(result).toBe(true); }); test("state 'attached' resolves for hidden elements", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '', ), ); const result = await page.waitForSelector("#hidden-div", { state: "attached", }); expect(result).toBe(true); }); test("state 'visible' waits for element to become visible", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + "", ), ); const result = await page.waitForSelector("#show-later", { state: "visible", timeout: 5000, }); expect(result).toBe(true); }); test("state 'hidden' waits for element to become hidden", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Will Hide
' + "", ), ); const result = await page.waitForSelector("#hide-later", { state: "hidden", timeout: 5000, }); expect(result).toBe(true); }); test("state 'detached' waits for element to be removed", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Will Be Removed
' + "", ), ); const result = await page.waitForSelector("#remove-me", { state: "detached", timeout: 5000, }); expect(result).toBe(true); }); test("state 'detached' resolves immediately for non-existent element", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent("
Content
"), ); const result = await page.waitForSelector("#does-not-exist", { state: "detached", timeout: 1000, }); expect(result).toBe(true); }); }); test.describe("Timeout behavior", () => { test("throws on timeout when element never appears", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent("
No button here
"), ); let error: Error | null = null; try { await page.waitForSelector("#nonexistent", { timeout: 300 }); } catch (e) { error = e as Error; } expect(error).not.toBeNull(); expect(error?.message).toContain("Timeout"); expect(error?.message).toContain("#nonexistent"); }); test("respects custom timeout duration", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent("
Content
"), ); const startTime = Date.now(); try { await page.waitForSelector("#nonexistent", { timeout: 500 }); } catch { // Expected to timeout } const elapsed = Date.now() - startTime; // Should timeout around 500ms (allow some margin) expect(elapsed).toBeGreaterThanOrEqual(450); expect(elapsed).toBeLessThan(2000); }); }); test.describe("CSS selector variants", () => { test("handles complex CSS selectors", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + '
' + '' + "
" + "
", ), ); const result = await page.waitForSelector( ".container #login-form button[type='submit']", ); expect(result).toBe(true); }); }); test.describe("Open shadow DOM", () => { test("finds element inside open shadow DOM with pierceShadow: true", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector("#shadow-btn", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); test("does NOT find shadow DOM element with pierceShadow: false", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); let error: Error | null = null; try { await page.waitForSelector("#shadow-only-btn", { pierceShadow: false, timeout: 300, }); } catch (e) { error = e as Error; } expect(error).not.toBeNull(); expect(error?.message).toContain("Timeout"); }); test("finds element in nested open shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector("#deep-element", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); }); test.describe("Closed shadow DOM (via piercer)", () => { test("finds element inside closed shadow DOM via custom element", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "" + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); // The piercer hooks attachShadow and stores closed shadow roots const result = await page.waitForSelector("#closed-btn", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); test("finds element in nested closed shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "" + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector("#deeply-closed", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); test("finds element in mixed open/closed nested shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector("#mixed-deep-btn", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); test("waits for element to appear inside closed shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "" + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); const result = await page.waitForSelector("#delayed-closed-btn", { pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); }); test.describe("XPath selectors", () => { test("finds element with basic XPath", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent(''), ); const result = await page.waitForSelector("//button[@id='xpath-btn']", { timeout: 5000, }); expect(result).toBe(true); }); test("finds element with xpath= prefix", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Target
', ), ); const result = await page.waitForSelector( "xpath=//span[@class='target']", { timeout: 5000, }, ); expect(result).toBe(true); }); test("waits for element to appear with XPath", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
" + "", ), ); const result = await page.waitForSelector("//span[@id='delayed-xpath']", { timeout: 5000, }); expect(result).toBe(true); }); test("finds element in open shadow DOM with XPath", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector( "//button[@id='shadow-xpath-btn']", { pierceShadow: true, timeout: 5000, }, ); expect(result).toBe(true); }); test("finds element in closed shadow DOM with XPath", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "" + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector( "//span[@id='xpath-closed-target']", { pierceShadow: true, timeout: 5000, }, ); expect(result).toBe(true); }); }); test.describe("Iframe hop notation (>>)", () => { test("finds element inside single iframe", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + '' + "", ), ); await page.waitForTimeout(100); const result = await page.waitForSelector( "iframe#my-frame >> #frame-btn", { timeout: 5000, }, ); expect(result).toBe(true); }); test("finds element through multiple iframe hops", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + "", ), ); await page.waitForTimeout(300); const result = await page.waitForSelector( "iframe#outer-frame >> iframe#inner-frame >> #nested-content", { timeout: 5000 }, ); expect(result).toBe(true); }); test("waits for element to appear inside iframe", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + "", ), ); const result = await page.waitForSelector( "iframe#delay-frame >> #delayed-in-frame", { timeout: 5000, }, ); expect(result).toBe(true); }); }); test.describe("Visibility edge cases", () => { test("visibility: hidden is not visible", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '', ), ); // Should be attached but not visible const attached = await 
page.waitForSelector("#vis-hidden", { state: "attached", }); expect(attached).toBe(true); let error: Error | null = null; try { await page.waitForSelector("#vis-hidden", { state: "visible", timeout: 200, }); } catch (e) { error = e as Error; } expect(error).not.toBeNull(); }); test("opacity: 0 is not visible", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Transparent
', ), ); const attached = await page.waitForSelector("#transparent", { state: "attached", }); expect(attached).toBe(true); let error: Error | null = null; try { await page.waitForSelector("#transparent", { state: "visible", timeout: 200, }); } catch (e) { error = e as Error; } expect(error).not.toBeNull(); }); test("zero dimensions is not visible", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Zero
', ), ); const attached = await page.waitForSelector("#zero-size", { state: "attached", }); expect(attached).toBe(true); let error: Error | null = null; try { await page.waitForSelector("#zero-size", { state: "visible", timeout: 200, }); } catch (e) { error = e as Error; } expect(error).not.toBeNull(); }); test("detects visibility change via class toggle", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "" + '' + "", ), ); const result = await page.waitForSelector("#class-toggle", { state: "visible", timeout: 5000, }); expect(result).toBe(true); }); test("detects visibility change via style attribute", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '' + "", ), ); const result = await page.waitForSelector("#style-toggle", { state: "visible", timeout: 5000, }); expect(result).toBe(true); }); }); test.describe("Dynamic DOM scenarios", () => { test("handles rapid DOM mutations", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
" + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); // Small delay to ensure script starts await page.waitForTimeout(50); const result = await page.waitForSelector("#item-7", { timeout: 10000 }); expect(result).toBe(true); }); test("handles element removed and re-added", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent('
Toggle
'), ); const browserTarget = ( process.env.STAGEHAND_BROWSER_TARGET ?? "local" ).toLowerCase(); const isBrowserbase = browserTarget === "browserbase"; const removeDelayMs = isBrowserbase ? 1000 : 200; const addDelayMs = isBrowserbase ? 1600 : 500; const waitTimeoutMs = isBrowserbase ? 10000 : 5000; // Start waiting before scheduling DOM changes to avoid racey timing in CI. const detachedPromise = page.waitForSelector("#toggle-me", { state: "detached", timeout: waitTimeoutMs, }); await page.evaluate( ({ removeDelay, addDelay }) => { const el = document.getElementById("toggle-me"); const parent = el?.parentNode; if (!el || !parent) return; setTimeout(() => parent.removeChild(el), removeDelay); setTimeout(() => parent.appendChild(el), addDelay); }, { removeDelay: removeDelayMs, addDelay: addDelayMs }, ); const detached = await detachedPromise; expect(detached).toBe(true); // Then wait for visible again const visible = await page.waitForSelector("#toggle-me", { state: "visible", timeout: waitTimeoutMs, }); expect(visible).toBe(true); }); test("handles dynamically replaced innerHTML", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
Loading...
' + "", ), ); const result = await page.waitForSelector("#loaded-btn", { timeout: 5000, }); expect(result).toBe(true); }); test("handles element created via insertAdjacentHTML", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), ); const result = await page.waitForSelector("#inserted", { timeout: 5000 }); expect(result).toBe(true); }); }); test.describe("Shadow DOM visibility changes", () => { test("detects element becoming visible inside open shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); const result = await page.waitForSelector("#shadow-btn", { state: "visible", pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); test("detects element becoming hidden inside shadow DOM", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( '
' + "", ), { waitUntil: "load", timeoutMs: 30000 }, ); await page.waitForTimeout(100); const result = await page.waitForSelector("#hide-shadow-btn", { state: "hidden", pierceShadow: true, timeout: 5000, }); expect(result).toBe(true); }); }); }); ================================================ FILE: packages/core/tests/integration/wait-for-timeout.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { closeV3 } from "./testUtils.js"; test.describe("Page.waitForTimeout tests", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3DynamicTestConfig); await v3.init(); }); test.afterEach(async () => { await closeV3(v3); }); test("waitForTimeout resolves after specified duration", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent("
Test Page
"), ); const startTime = Date.now(); await page.waitForTimeout(200); const elapsed = Date.now() - startTime; // Should have waited at least 200ms (allow some tolerance) expect(elapsed).toBeGreaterThanOrEqual(190); }); test("waitForTimeout resolves immediately for 0ms", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent("
Test Page
"), ); const startTime = Date.now(); await page.waitForTimeout(0); const elapsed = Date.now() - startTime; // Should resolve nearly immediately (within 50ms tolerance) expect(elapsed).toBeLessThan(50); }); test("waitForTimeout can be chained with other operations", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
0
" + "", ), ); // Wait for counter to increment await page.waitForTimeout(350); // Counter should have incremented at least 3 times const text = await page.mainFrame().locator("#counter").textContent(); expect(parseInt(text ?? "0")).toBeGreaterThanOrEqual(3); }); test("waitForTimeout works with async/await syntax", async () => { const page = v3.context.pages()[0]; await page.goto("data:text/html," + encodeURIComponent("
Test
")); const results: number[] = []; results.push(1); await page.waitForTimeout(50); results.push(2); await page.waitForTimeout(50); results.push(3); expect(results).toEqual([1, 2, 3]); }); test("waitForTimeout allows DOM to update", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
" + "", ), ); // Trigger the delayed update await page.evaluate(() => { (window as unknown as { startUpdate: () => void }).startUpdate(); }); // Wait for the timeout to allow DOM update await page.waitForTimeout(300); // Content should now be loaded const afterText = await page.mainFrame().locator("#delayed").textContent(); expect(afterText).toBe("Loaded"); }); test("waitForTimeout with small increments", async () => { const page = v3.context.pages()[0]; await page.goto("data:text/html," + encodeURIComponent("
Test
")); const startTime = Date.now(); // Multiple small waits await page.waitForTimeout(50); await page.waitForTimeout(50); await page.waitForTimeout(50); await page.waitForTimeout(50); const elapsed = Date.now() - startTime; // Should have waited at least 200ms total (4 * 50ms) expect(elapsed).toBeGreaterThanOrEqual(190); }); test("waitForTimeout does not block other async operations", async () => { const page = v3.context.pages()[0]; await page.goto( "data:text/html," + encodeURIComponent( "
Initial
" + "", ), ); // Start a timeout const timeoutPromise = page.waitForTimeout(100); // Execute something else while waiting await page.evaluate(() => { (window as unknown as { updateText: () => void }).updateText(); }); // Verify the update happened const text = await page.mainFrame().locator("#async-test").textContent(); expect(text).toBe("Updated"); // Wait for the timeout to complete await timeoutPromise; }); }); ================================================ FILE: packages/core/tests/integration/xpath-for-location-deep.spec.ts ================================================ import { expect, test } from "@playwright/test"; import { V3 } from "../../lib/v3/v3.js"; import { v3DynamicTestConfig } from "./v3.dynamic.config.js"; import { resolveXpathForLocation } from "../../lib/v3/understudy/a11y/snapshot/index.js"; import { executionContexts } from "../../lib/v3/understudy/executionContextRegistry.js"; import { closeV3 } from "./testUtils.js"; test.describe("resolveNodeForLocationDeep", () => { let v3: V3; test.beforeEach(async () => { v3 = new V3(v3DynamicTestConfig); await v3.init(); }); test.afterEach(async () => { await closeV3(v3); }); test("click resolves inside same-process iframe and returns absolute XPath", async () => { const page = await v3.context.awaitActivePage(); // Set consistent viewport size to ensure stable rendering across environments await page.setViewportSize(1280, 720); await page.goto( "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-hn/", { waitUntil: "networkidle" }, ); await page.waitForSelector("section iframe", { state: "attached", timeout: 10000, }); const frame = await page.frameLocator("section iframe").resolveFrame(); await executionContexts.waitForMainWorld( frame.session, frame.frameId, 5000, ); // scroll to the bottom of the page await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight); }); // scroll to the bottom of the iframe await frame.evaluate(() => { window.scrollTo(0, 
document.body.scrollHeight); }); // Wait a bit for the iframe content to settle after scrolling await new Promise((resolve) => setTimeout(resolve, 500)); // Get the iframe's position in the main page const iframeOffset = await page.evaluate(() => { const iframe = document.querySelector("section iframe"); if (!iframe) return null; const rect = iframe.getBoundingClientRect(); return { left: rect.left, top: rect.top, }; }); // Get the link's position within the iframe const linkOffsetInFrame = await frame.evaluate(() => { // Find the 88th row, 3rd column link (the one we're testing) const table = document.querySelector( "center > table > tbody > tr:nth-child(3) > td > table", ); if (!table) return null; const row88 = table.querySelector("tbody > tr:nth-child(88)"); if (!row88) return null; const cell3 = row88.querySelector("td:nth-child(3)"); if (!cell3) return null; const link = cell3.querySelector("span > a"); if (!link) return null; const rect = link.getBoundingClientRect(); // Return center coordinates of the link relative to iframe return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2, }; }); // Combine iframe offset and link offset to get page-level coordinates // Fallback to hardcoded coordinates if element not found (shouldn't happen) const x = iframeOffset && linkOffsetInFrame ? iframeOffset.left + linkOffsetInFrame.x : 356; const y = iframeOffset && linkOffsetInFrame ? 
iframeOffset.top + linkOffsetInFrame.y : 503; const result = await resolveXpathForLocation(page, x, y); console.log("=== Coordinates used:", { x, y }); console.log("=== Result:", result); const xpath = result.absoluteXPath; expect(xpath).toBe( "/html[1]/body[1]/main[1]/section[3]/iframe[1]/html[1]/body[1]/center[1]/table[1]/tbody[1]/tr[3]/td[1]/table[1]/tbody[1]/tr[88]/td[3]/span[1]/a[1]", ); }); }); ================================================ FILE: packages/core/tests/unit/agent-captcha-hooks.test.ts ================================================ import { beforeEach, describe, expect, it, vi } from "vitest"; import type { LogLine } from "../../lib/v3/types/public/logs.js"; import { CaptchaSolver } from "../../lib/v3/agent/utils/captchaSolver.js"; import { V3AgentHandler } from "../../lib/v3/handlers/v3AgentHandler.js"; const SOLVING_STARTED = "browserbase-solving-started"; const SOLVING_FINISHED = "browserbase-solving-finished"; const SOLVING_ERRORED = "browserbase-solving-errored"; type ConsoleListener = (message: { text: () => string }) => void; class MockPage { private listeners = new Set(); public captchaBoxes: Array<{ left: number; top: number; right: number; bottom: number; }> = []; on(event: string, listener: ConsoleListener): void { if (event === "console") { this.listeners.add(listener); } } off(event: string, listener: ConsoleListener): void { if (event === "console") { this.listeners.delete(listener); } } emitConsole(text: string): void { const message = { text: () => text }; for (const listener of this.listeners) { listener(message); } } url(): string { return "https://example.com"; } async screenshot(): Promise { return Buffer.from("fake-image"); } async evaluate(): Promise { return this.captchaBoxes as T; } mainFrame(): { evaluate: () => Promise<{ w: number; h: number }> } { return { evaluate: async () => ({ w: 1288, h: 711 }), }; } } class FakeCuaClient { public contextNotes: string[] = []; public preStepHook?: () => Promise; public 
actionHandler?: (action: Record) => Promise; public executeImpl = vi.fn(async (options: unknown) => { void options; return { success: true, message: "ok", actions: [], completed: true, }; }); public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); public setScreenshotProvider = vi.fn(); public setSafetyConfirmationHandler = vi.fn(); setActionHandler( handler: (action: Record) => Promise, ): void { this.actionHandler = handler; } setPreStepHook(handler: () => Promise): void { this.preStepHook = handler; } addContextNote(note: string): void { this.contextNotes.push(note); } async execute(options: unknown): Promise<{ success: boolean; message: string; actions: unknown[]; completed: boolean; }> { return this.executeImpl(options); } } let fakeCuaClient: FakeCuaClient; vi.mock("../../lib/v3/agent/AgentProvider", () => ({ AgentProvider: class { constructor(logger: unknown) { void logger; } getClient(): FakeCuaClient { return fakeCuaClient; } }, })); import { V3CuaAgentHandler } from "../../lib/v3/handlers/v3CuaAgentHandler.js"; function collectUserMessages( messages: Array<{ role: string; content: unknown }>, ): Array<{ role: "user"; content: string }> { return messages.filter( (message): message is { role: "user"; content: string } => message.role === "user" && typeof message.content === "string", ); } describe("agent captcha hooks", () => { let page: MockPage; let logs: LogLine[]; let logger: (line: LogLine) => void; beforeEach(() => { page = new MockPage(); logs = []; logger = (line) => { logs.push(line); }; fakeCuaClient = new FakeCuaClient(); }); it("blocks regular agent prepareStep until the solver finishes and injects one solved message", async () => { const handler = new V3AgentHandler( { isCaptchaAutoSolveEnabled: true, } as never, logger, {} as never, ); const solver = new CaptchaSolver(); solver.init(async () => page as never); const userCallback = vi.fn(async (options) => options); const prepareStep = ( 
handler as unknown as { createPrepareStep: ( callback?: (options: Record) => Promise, captchaSolver?: CaptchaSolver, ) => (options: Record) => Promise; } ).createPrepareStep(userCallback, solver); const options = { messages: [{ role: "user", content: "start" }], }; await prepareStep(options); page.emitConsole(SOLVING_STARTED); const secondCall = prepareStep(options); await Promise.resolve(); expect(userCallback).toHaveBeenCalledTimes(1); page.emitConsole(SOLVING_FINISHED); await secondCall; expect(userCallback).toHaveBeenCalledTimes(2); expect( collectUserMessages( options.messages as Array<{ role: string; content: unknown }>, ).filter((message) => message.content.includes("automatically detected and solved"), ), ).toHaveLength(1); });

  // A solver error emitted while prepareStep is pending must leave exactly one
  // "automatic captcha solver failed" user message in the conversation.
  it("injects one error message when the regular agent solver errors", async () => {
    const handler = new V3AgentHandler(
      {
        isCaptchaAutoSolveEnabled: true,
      } as never,
      logger,
      {} as never,
    );
    const solver = new CaptchaSolver();
    solver.init(async () => page as never);

    // createPrepareStep is reached through a structural cast; the callback and
    // result types are kept deliberately loose (`unknown`).
    const prepareStep = (
      handler as unknown as {
        createPrepareStep: (
          callback?: (options: Record<string, unknown>) => Promise<unknown>,
          captchaSolver?: CaptchaSolver,
        ) => (options: Record<string, unknown>) => Promise<unknown>;
      }
    ).createPrepareStep(undefined, solver);
    const options = {
      messages: [{ role: "user", content: "start" }],
    };

    await prepareStep(options);
    page.emitConsole(SOLVING_STARTED);
    const pending = prepareStep(options);
    page.emitConsole(SOLVING_ERRORED);
    await pending;

    expect(
      collectUserMessages(
        options.messages as Array<{ role: string; content: unknown }>,
      ).filter((message) =>
        message.content.includes("automatic captcha solver failed"),
      ),
    ).toHaveLength(1);
  });

  // The fake CUA client calls the pre-step hook a second time after the
  // "solving started" console message; that second call must stay pending
  // until "solving finished" is emitted.
  it("pauses the CUA loop at prepareStep while Browserbase solves a captcha", async () => {
    let secondPrepareStarted = false;
    fakeCuaClient.executeImpl = vi.fn(async () => {
      await fakeCuaClient.preStepHook?.();
      page.emitConsole(SOLVING_STARTED);
      const blockedPrepare =
        fakeCuaClient.preStepHook?.() ?? Promise.resolve();
      secondPrepareStarted = true;
      await blockedPrepare;
      return {
        success: true,
        message: "ok",
        actions: [],
        completed: true,
      };
    });

    const handler = new V3CuaAgentHandler(
      {
        context: {
          awaitActivePage: async () => page,
        },
        bus: { emit: vi.fn() },
        isCaptchaAutoSolveEnabled: true,
        isAdvancedStealth: false,
        configuredViewport: { width: 1288, height: 711 },
        isAgentReplayActive: () => false,
        updateMetrics: vi.fn(),
      } as never,
      logger,
      {
        modelName: "anthropic/claude-haiku-4-5-20251001",
        clientOptions: { waitBetweenActions: 1 },
      } as never,
    );

    const execution = handler.execute({
      instruction: "Describe the page briefly.",
      highlightCursor: false,
    });

    await vi.waitFor(() => {
      expect(secondPrepareStarted).toBe(true);
      expect(
        logs.some((line) =>
          line.message.includes("waiting for Browserbase to solve"),
        ),
      ).toBe(true);
    });
    // Nothing may be reported as solved before the finished event fires.
    expect(logs.some((line) => line.message.includes("Captcha solved"))).toBe(
      false,
    );

    page.emitConsole(SOLVING_FINISHED);
    await execution;

    expect(fakeCuaClient.contextNotes).toEqual([
      expect.stringContaining("automatically detected and solved"),
    ]);
    expect(logs.some((line) => line.message.includes("Captcha solved"))).toBe(
      true,
    );
  });

  // Same gate, exercised through the action handler: executeAction must not be
  // invoked until the solver reports completion.
  it("pauses CUA actions until the captcha solver finishes", async () => {
    let actionStarted = false;
    fakeCuaClient.executeImpl = vi.fn(async () => {
      await fakeCuaClient.preStepHook?.();
      page.emitConsole(SOLVING_STARTED);
      const pendingAction =
        fakeCuaClient.actionHandler?.({ type: "screenshot" }) ??
        Promise.resolve();
      actionStarted = true;
      await pendingAction;
      return {
        success: true,
        message: "ok",
        actions: [],
        completed: true,
      };
    });

    const handler = new V3CuaAgentHandler(
      {
        context: {
          awaitActivePage: async () => page,
        },
        bus: { emit: vi.fn() },
        isCaptchaAutoSolveEnabled: true,
        isAdvancedStealth: false,
        configuredViewport: { width: 1288, height: 711 },
        isAgentReplayActive: () => false,
        updateMetrics: vi.fn(),
      } as never,
      logger,
      {
        modelName: "anthropic/claude-haiku-4-5-20251001",
        clientOptions: { waitBetweenActions: 1 },
      } as never,
    );
    const executeActionSpy = vi
      .spyOn(
        handler as unknown as {
          executeAction: (action: Record<string, unknown>) => Promise<unknown>;
        },
        "executeAction",
      )
      .mockResolvedValue({ success: true });
    vi.spyOn(handler, "captureAndSendScreenshot").mockResolvedValue(null);

    const execution = handler.execute({
      instruction: "Describe the page briefly.",
      highlightCursor: false,
    });

    await vi.waitFor(() => {
      expect(actionStarted).toBe(true);
    });
    expect(executeActionSpy).not.toHaveBeenCalled();

    page.emitConsole(SOLVING_FINISHED);
    await execution;

    expect(executeActionSpy).toHaveBeenCalledTimes(1);
    expect(fakeCuaClient.contextNotes).toEqual([
      expect.stringContaining("automatically detected and solved"),
    ]);
    expect(logs.some((line) => line.message.includes("Captcha solved"))).toBe(
      true,
    );
  });

  it("skips post-solve clicks on the captcha widget and injects another note", async () => {
    page.captchaBoxes = [{ left: 0, top: 400, right: 140, bottom: 470 }];
    fakeCuaClient.executeImpl = vi.fn(async () => {
      await fakeCuaClient.preStepHook?.();
      page.emitConsole(SOLVING_STARTED);
      const blockedPrepare = fakeCuaClient.preStepHook?.() ??
Promise.resolve();
      page.emitConsole(SOLVING_FINISHED);
      await blockedPrepare;
      // Click coordinates (63, 436) lie inside the captcha box configured on
      // the page for this test.
      await fakeCuaClient.actionHandler?.({
        type: "click",
        button: "left",
        x: 63,
        y: 436,
      });
      return {
        success: true,
        message: "ok",
        actions: [],
        completed: true,
      };
    });

    const handler = new V3CuaAgentHandler(
      {
        context: {
          awaitActivePage: async () => page,
        },
        bus: { emit: vi.fn() },
        isCaptchaAutoSolveEnabled: true,
        isAdvancedStealth: false,
        configuredViewport: { width: 1288, height: 711 },
        isAgentReplayActive: () => false,
        updateMetrics: vi.fn(),
      } as never,
      logger,
      {
        modelName: "anthropic/claude-haiku-4-5-20251001",
        clientOptions: { waitBetweenActions: 1 },
      } as never,
    );
    const executeActionSpy = vi
      .spyOn(
        handler as unknown as {
          executeAction: (action: Record<string, unknown>) => Promise<unknown>;
        },
        "executeAction",
      )
      .mockResolvedValue({ success: true });
    vi.spyOn(handler, "captureAndSendScreenshot").mockResolvedValue(null);

    await handler.execute({
      instruction: "Describe the page briefly.",
      highlightCursor: false,
    });

    // The click is expected to be dropped, with a second context note that
    // restates the original task.
    expect(executeActionSpy).not.toHaveBeenCalled();
    expect(fakeCuaClient.contextNotes).toEqual([
      expect.stringContaining("automatically detected and solved"),
      expect.stringContaining("Original task: Describe the page briefly."),
    ]);
    expect(
      logs.some((line) =>
        line.message.includes("Skipped click on solved captcha widget"),
      ),
    ).toBe(true);
  });
});

================================================
FILE: packages/core/tests/unit/agent-execution-model.test.ts
================================================
import { describe, expect, it, vi } from "vitest";
import { actTool } from "../../lib/v3/agent/tools/act.js";
import { extractTool } from "../../lib/v3/agent/tools/extract.js";
import { fillFormTool } from "../../lib/v3/agent/tools/fillform.js";
import type { V3 } from "../../lib/v3/v3.js";

/**
 * Minimal mock of V3 that captures how tools pass `model` options
 * into v3.act(), v3.extract(), and v3.observe().
*/
function createMockV3() {
  // Each mocked v3 method records the `model` option it was called with.
  const calls: { method: string; model: unknown }[] = [];
  const mock = {
    logger: vi.fn(),
    recordAgentReplayStep: vi.fn(),
    act: vi.fn(async (_instruction: unknown, options?: { model?: unknown }) => {
      calls.push({ method: "act", model: options?.model });
      return {
        success: true,
        message: "ok",
        actionDescription: "clicked",
        actions: [],
      };
    }),
    extract: vi.fn(
      async (
        _instruction: unknown,
        _schema: unknown,
        options?: { model?: unknown },
      ) => {
        calls.push({ method: "extract", model: options?.model });
        return { extraction: "data" };
      },
    ),
    observe: vi.fn(
      async (_instruction: unknown, options?: { model?: unknown }) => {
        calls.push({ method: "observe", model: options?.model });
        return [];
      },
    ),
    calls,
  };
  return mock as unknown as V3 & { calls: typeof calls };
}

describe("agent tools pass full executionModel config to v3 methods", () => {
  const modelConfig = {
    modelName: "openai/gpt-4o-mini",
    apiKey: "sk-test-key",
    baseURL: "https://custom.api",
  };

  it("actTool passes AgentModelConfig object to v3.act()", async () => {
    const v3 = createMockV3();
    const tool = actTool(v3, modelConfig);
    await tool.execute!(
      { action: "click the button" },
      {
        toolCallId: "t1",
        messages: [],
        abortSignal: new AbortController().signal,
      },
    );
    expect(v3.calls).toHaveLength(1);
    expect(v3.calls[0].method).toBe("act");
    // toBe: the very same config object must be forwarded, not a copy.
    expect(v3.calls[0].model).toBe(modelConfig);
  });

  it("extractTool passes AgentModelConfig object to v3.extract()", async () => {
    const v3 = createMockV3();
    const tool = extractTool(v3, modelConfig);
    await tool.execute!(
      { instruction: "get the title", schema: undefined },
      {
        toolCallId: "t2",
        messages: [],
        abortSignal: new AbortController().signal,
      },
    );
    expect(v3.calls).toHaveLength(1);
    expect(v3.calls[0].method).toBe("extract");
    expect(v3.calls[0].model).toBe(modelConfig);
  });

  it("fillFormTool passes AgentModelConfig object to v3.observe()", async () => {
    const v3 = createMockV3();
    const tool = fillFormTool(v3, modelConfig);
    await tool.execute!(
      { fields: [{ action: "type hello into name" }] },
      {
        toolCallId: "t3",
        messages: [],
        abortSignal: new AbortController().signal,
      },
    );
    expect(v3.calls).toHaveLength(1);
    expect(v3.calls[0].method).toBe("observe");
    expect(v3.calls[0].model).toBe(modelConfig);
  });

  it("actTool passes undefined when no executionModel is set", async () => {
    const v3 = createMockV3();
    const tool = actTool(v3, undefined);
    await tool.execute!(
      { action: "click the button" },
      {
        toolCallId: "t4",
        messages: [],
        abortSignal: new AbortController().signal,
      },
    );
    expect(v3.calls).toHaveLength(1);
    expect(v3.calls[0].model).toBeUndefined();
  });

  it("actTool passes plain string executionModel to v3.act()", async () => {
    const v3 = createMockV3();
    const tool = actTool(v3, "openai/gpt-4o-mini");
    await tool.execute!(
      { action: "click the button" },
      {
        toolCallId: "t5",
        messages: [],
        abortSignal: new AbortController().signal,
      },
    );
    expect(v3.calls).toHaveLength(1);
    expect(v3.calls[0].model).toBe("openai/gpt-4o-mini");
  });
});

describe("executionModel fallback logic", () => {
  // This mirrors the resolution in V3.prepareAgentExecution (v3.ts:1682):
  //   const resolvedExecutionModel = options?.executionModel ?? options?.model;
  function resolveExecutionModel(options?: {
    executionModel?: string | { modelName: string };
    model?: string | { modelName: string };
  }) {
    return options?.executionModel ??
options?.model;
  }

  it("prefers explicit executionModel over model", () => {
    const result = resolveExecutionModel({
      executionModel: "openai/gpt-4o-mini",
      model: "anthropic/claude-sonnet-4-20250514",
    });
    expect(result).toBe("openai/gpt-4o-mini");
  });

  it("falls back to model when executionModel is not set", () => {
    const modelConfig = {
      modelName: "anthropic/claude-sonnet-4-20250514",
      apiKey: "sk-test",
    };
    const result = resolveExecutionModel({ model: modelConfig });
    expect(result).toBe(modelConfig);
  });

  it("returns undefined when neither is set", () => {
    expect(resolveExecutionModel({})).toBeUndefined();
    expect(resolveExecutionModel(undefined)).toBeUndefined();
  });
});

================================================
FILE: packages/core/tests/unit/api-multiregion.test.ts
================================================
import { describe, expect, it } from "vitest";
import { getApiUrlForRegion, REGION_API_URLS } from "../../lib/v3/api";

describe("Multi-region API URL mapping", () => {
  describe("REGION_API_URLS constant", () => {
    it("should have the correct URL for us-west-2 (default)", () => {
      expect(REGION_API_URLS["us-west-2"]).toBe(
        "https://api.stagehand.browserbase.com",
      );
    });

    it("should have the correct URL for us-east-1", () => {
      expect(REGION_API_URLS["us-east-1"]).toBe(
        "https://api.use1.stagehand.browserbase.com",
      );
    });

    it("should have the correct URL for eu-central-1", () => {
      expect(REGION_API_URLS["eu-central-1"]).toBe(
        "https://api.euc1.stagehand.browserbase.com",
      );
    });

    it("should have the correct URL for ap-southeast-1", () => {
      expect(REGION_API_URLS["ap-southeast-1"]).toBe(
        "https://api.apse1.stagehand.browserbase.com",
      );
    });
  });

  describe("getApiUrlForRegion", () => {
    it("should return the correct URL for us-west-2", () => {
      expect(getApiUrlForRegion("us-west-2")).toBe(
        "https://api.stagehand.browserbase.com/v1",
      );
    });

    it("should return the correct URL for us-east-1", () => {
      expect(getApiUrlForRegion("us-east-1")).toBe(
        "https://api.use1.stagehand.browserbase.com/v1",
      );
    });

    it("should return the correct URL for eu-central-1", () => {
      expect(getApiUrlForRegion("eu-central-1")).toBe(
        "https://api.euc1.stagehand.browserbase.com/v1",
      );
    });

    it("should return the correct URL for ap-southeast-1", () => {
      expect(getApiUrlForRegion("ap-southeast-1")).toBe(
        "https://api.apse1.stagehand.browserbase.com/v1",
      );
    });

    it("should return the default us-west-2 URL when no region is specified", () => {
      expect(getApiUrlForRegion(undefined)).toBe(
        "https://api.stagehand.browserbase.com/v1",
      );
    });

    it("should return the default us-west-2 URL for unknown regions", () => {
      // @ts-expect-error - testing invalid region
      expect(getApiUrlForRegion("invalid-region")).toBe(
        "https://api.stagehand.browserbase.com/v1",
      );
    });
  });

  describe("URL /v1 suffix handling", () => {
    it("getApiUrlForRegion always includes /v1 suffix for consistency", () => {
      // getApiUrlForRegion returns a URL with /v1
      // This documents the expected contract that all API base URLs include /v1
      const url = getApiUrlForRegion("us-west-2");
      expect(url.endsWith("/v1")).toBe(true);
    });

    it("all regional URLs should be base URLs without /v1 in REGION_API_URLS", () => {
      // Verify REGION_API_URLS contains base URLs (without /v1)
      // The /v1 suffix is added by getApiUrlForRegion
      for (const [region, baseUrl] of Object.entries(REGION_API_URLS)) {
        expect(baseUrl.endsWith("/v1")).toBe(false);
        expect(getApiUrlForRegion(region as keyof typeof REGION_API_URLS)).toBe(
          `${baseUrl}/v1`,
        );
      }
    });
  });
});

================================================
FILE: packages/core/tests/unit/browserbase-session-accessors.test.ts
================================================
import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
import { V3 } from "../../lib/v3/v3.js";

const MOCK_SESSION_ID = "session-123";
const MOCK_SESSION_URL = `https://www.browserbase.com/sessions/${MOCK_SESSION_ID}`;
const MOCK_DEBUG_URL =
`https://debug.browserbase.com/${MOCK_SESSION_ID}`;

// Replace the understudy context with an in-memory stand-in.
vi.mock("../../lib/v3/understudy/context", () => {
  class MockConnection {
    onTransportClosed = vi.fn();
    offTransportClosed = vi.fn();
    send = vi.fn(async () => {});
  }

  class MockV3Context {
    static async create(): Promise<MockV3Context> {
      return new MockV3Context();
    }

    conn = new MockConnection();

    pages(): never[] {
      return [];
    }

    async close(): Promise<void> {
      // noop
    }
  }

  return { V3Context: MockV3Context };
});

// Session creation resolves to fixed identifiers so the accessor values are
// predictable.
vi.mock("../../lib/v3/launch/browserbase", () => ({
  createBrowserbaseSession: vi.fn(async () => ({
    ws: "wss://mock-browserbase",
    sessionId: MOCK_SESSION_ID,
    bb: {
      sessions: {
        debug: vi.fn(async () => ({ debuggerUrl: MOCK_DEBUG_URL })),
      },
    },
  })),
}));

vi.mock("../../lib/v3/launch/local", () => ({
  launchLocalChrome: vi.fn(async () => ({
    ws: "ws://local-cdp",
    chrome: { kill: vi.fn(async () => {}) },
  })),
}));

describe("browserbase accessors", () => {
  beforeEach(() => {
    process.env.BROWSERBASE_API_KEY = "fake-key";
    process.env.BROWSERBASE_PROJECT_ID = "fake-project";
  });

  afterEach(() => {
    delete process.env.BROWSERBASE_API_KEY;
    delete process.env.BROWSERBASE_PROJECT_ID;
    vi.clearAllMocks();
  });

  it("exposes Browserbase session and debug URLs after init", async () => {
    const v3 = new V3({
      env: "BROWSERBASE",
      disableAPI: true,
      verbose: 0,
    });

    try {
      await v3.init();
      expect(v3.browserbaseSessionURL).toBe(MOCK_SESSION_URL);
      expect(v3.browserbaseDebugURL).toBe(MOCK_DEBUG_URL);
      expect(v3.isCaptchaAutoSolveEnabled).toBe(true);
    } finally {
      // Best-effort cleanup: a close failure must not mask the assertions.
      await v3.close().catch(() => {});
    }
  });

  it("clears stored URLs after close", async () => {
    const v3 = new V3({
      env: "BROWSERBASE",
      disableAPI: true,
      verbose: 0,
    });

    await v3.init();
    await v3.close();

    expect(v3.browserbaseSessionURL).toBeUndefined();
    expect(v3.browserbaseDebugURL).toBeUndefined();
  });

  it("disables captcha solving when solveCaptchas is explicitly false", async () => {
    const v3 = new V3({
      env: "BROWSERBASE",
      disableAPI: true,
      verbose: 0,
      browserbaseSessionCreateParams: {
        browserSettings: {
solveCaptchas: false,
        },
      },
    });

    try {
      await v3.init();
      expect(v3.isCaptchaAutoSolveEnabled).toBe(false);
    } finally {
      await v3.close().catch(() => {});
    }
  });
});

describe("local accessors", () => {
  it("stay empty for LOCAL environments", async () => {
    const v3 = new V3({
      env: "LOCAL",
      disableAPI: true,
      verbose: 0,
      localBrowserLaunchOptions: {
        cdpUrl: "ws://local-existing-session",
      },
    });

    try {
      await v3.init();
      expect(v3.browserbaseSessionURL).toBeUndefined();
      expect(v3.browserbaseDebugURL).toBeUndefined();
    } finally {
      await v3.close().catch(() => {});
    }
  });
});

================================================
FILE: packages/core/tests/unit/cache-llm-resolution.test.ts
================================================
import { describe, expect, it, vi } from "vitest";
import { ActCache } from "../../lib/v3/cache/ActCache.js";
import { AgentCache } from "../../lib/v3/cache/AgentCache.js";
import type { CacheStorage } from "../../lib/v3/cache/CacheStorage.js";
import type { ActHandler } from "../../lib/v3/handlers/actHandler.js";
import type { LLMClient } from "../../lib/v3/llm/LLMClient.js";
import type { Page } from "../../lib/v3/understudy/page.js";
import type { V3Context } from "../../lib/v3/understudy/context.js";
import type {
  ActCacheContext,
  CachedActEntry,
  CachedAgentEntry,
  AgentCacheContext,
  AgentReplayActStep,
} from "../../lib/v3/types/private/index.js";
import type {
  Action,
  AgentResult,
  AvailableModel,
} from "../../lib/v3/types/public/index.js";

/**
 * Builds an enabled CacheStorage stub.
 *
 * @param entry - Value that every `readJson` call resolves to, wrapped as
 *   `{ value: entry }`.
 * @returns A stub whose `writeJson` resolves to `{}` and whose `directory`
 *   is "/tmp/cache".
 */
function createFakeStorage<T>(entry: T): CacheStorage {
  return {
    enabled: true,
    readJson: vi.fn().mockResolvedValue({ value: entry }),
    writeJson: vi.fn().mockResolvedValue({}),
    directory: "/tmp/cache",
  } as unknown as CacheStorage;
}

describe("Cache LLM client selection", () => {
  it("ActCache uses provided override client during replay", async () => {
    const action: Action = {
      selector: "xpath=/html/body/button",
      description: "click button",
      method: "click",
      arguments: [],
    };
    const entry: CachedActEntry = {
      version:
1,
      instruction: "click button",
      url: "https://example.com",
      variableKeys: [],
      actions: [action],
      actionDescription: "click button",
      message: "done",
    };
    const storage = createFakeStorage(entry);

    // Handler stub: the deterministic action always succeeds.
    const actHandler = {
      takeDeterministicAction: vi.fn().mockResolvedValue({
        success: true,
        message: "ok",
        actionDescription: "click button",
        actions: [action],
      }),
    } as unknown as ActHandler;

    const defaultClient = { id: "default" } as unknown as LLMClient;
    const overrideClient = { id: "override" } as unknown as LLMClient;

    const actCache = new ActCache({
      storage,
      logger: vi.fn(),
      getActHandler: () => actHandler,
      getDefaultLlmClient: () => defaultClient,
      domSettleTimeoutMs: undefined,
    });

    const replayContext: ActCacheContext = {
      instruction: "click button",
      cacheKey: "abc",
      pageUrl: "https://example.com",
      variableKeys: [],
      variables: undefined,
    };

    const replayResult = await actCache.tryReplay(
      replayContext,
      {} as Page,
      undefined,
      overrideClient,
    );

    expect(replayResult?.success).toBe(true);
    expect(actHandler.takeDeterministicAction).toHaveBeenCalledTimes(1);
    // The fourth positional argument of the replayed call must be the override.
    const replayCallArgs = vi.mocked(actHandler.takeDeterministicAction).mock
      .calls[0];
    expect(replayCallArgs?.[3]).toBe(overrideClient);
  });

  it("AgentCache uses provided override client during replay", async () => {
    const typeAction: Action = {
      selector: "xpath=/html/body/input",
      description: "type email",
      method: "type",
      arguments: ["test@example.com"],
    };
    const replayStep: AgentReplayActStep = {
      type: "act",
      instruction: "type email",
      actions: [typeAction],
    };
    const cachedEntry: CachedAgentEntry = {
      version: 1,
      instruction: "fill form",
      startUrl: "https://example.com",
      options: {},
      configSignature: "sig",
      steps: [replayStep],
      result: { success: true, actions: [] } as AgentResult,
      timestamp: new Date().toISOString(),
    };

    const storage = {
      enabled: true,
      readJson: vi.fn().mockImplementation(async () => ({ value: cachedEntry })),
      writeJson: vi.fn().mockResolvedValue({}),
      directory: "/tmp/cache",
    } as unknown as CacheStorage;

    const actHandler = {
      takeDeterministicAction: vi.fn().mockResolvedValue({
        success: true,
        message: "ok",
        actionDescription: "type email",
        actions: [typeAction],
      }),
    } as unknown as ActHandler;

    const stubPage = {} as Page;
    const v3Context = {
      awaitActivePage: vi.fn().mockResolvedValue(stubPage),
    } as unknown as V3Context;

    const defaultClient = { id: "default-agent" } as unknown as LLMClient;
    const overrideClient = { id: "override-agent" } as unknown as LLMClient;

    const agentCache = new AgentCache({
      storage,
      logger: vi.fn(),
      getActHandler: () => actHandler,
      getContext: () => v3Context,
      getDefaultLlmClient: () => defaultClient,
      getBaseModelName: () => "openai/gpt-4.1-mini" as AvailableModel,
      getSystemPrompt: () => undefined,
      domSettleTimeoutMs: undefined,
      act: vi.fn(),
    });

    const replayContext: AgentCacheContext = {
      instruction: "fill form",
      startUrl: "https://example.com",
      options: {},
      configSignature: "sig",
      cacheKey: "agent-key",
      variableKeys: [],
    };

    const replayResult = await agentCache.tryReplay(
      replayContext,
      overrideClient,
    );

    expect(replayResult?.success).toBe(true);
    expect(actHandler.takeDeterministicAction).toHaveBeenCalledTimes(1);
    const replayCallArgs = vi.mocked(actHandler.takeDeterministicAction).mock
      .calls[0];
    expect(replayCallArgs?.[3]).toBe(overrideClient);
  });

  it("AgentCache replays non-act steps without requiring an override client", async () => {
    // A cached run made of a single "goto" step, i.e. no act step at all.
    const gotoEntry: CachedAgentEntry = {
      version: 1,
      instruction: "navigate home",
      startUrl: "https://example.com/source",
      options: {},
      configSignature: "sig",
      steps: [
        {
          type: "goto",
          url: "https://example.com/target",
          waitUntil: "load",
        },
      ],
      result: { success: true, actions: [] } as AgentResult,
      timestamp: new Date().toISOString(),
    };

    const storage = {
      enabled: true,
      readJson: vi.fn().mockResolvedValue({ value: gotoEntry }),
      writeJson: vi.fn().mockResolvedValue({}),
      directory: "/tmp/cache",
    } as unknown as CacheStorage;

    const handler = {
      takeDeterministicAction: vi.fn(),
    } as unknown as ActHandler;

    const fakePage = { goto: vi.fn() } as unknown as Page;
    const ctx = {
      awaitActivePage: vi.fn().mockResolvedValue(fakePage),
    } as unknown as V3Context;

    const cache = new
AgentCache({
      storage,
      logger: vi.fn(),
      getActHandler: () => handler,
      getContext: () => ctx,
      getDefaultLlmClient: () => ({ id: "default" }) as unknown as LLMClient,
      getBaseModelName: () => "openai/gpt-4.1-mini" as AvailableModel,
      getSystemPrompt: () => undefined,
      domSettleTimeoutMs: undefined,
      act: vi.fn(),
    });

    const context: AgentCacheContext = {
      instruction: "navigate home",
      startUrl: "https://example.com/source",
      options: {},
      configSignature: "sig",
      cacheKey: "agent-goto",
      variableKeys: [],
    };

    // No override client is passed to tryReplay here.
    const result = await cache.tryReplay(context);

    expect(result?.success).toBe(true);
    expect(handler.takeDeterministicAction).not.toHaveBeenCalled();
    expect(fakePage.goto).toHaveBeenCalledWith("https://example.com/target", {
      waitUntil: "load",
    });
  });
});

================================================
FILE: packages/core/tests/unit/captcha-solver.test.ts
================================================
import { describe, expect, it } from "vitest";
import { CaptchaSolver } from "../../lib/v3/agent/utils/captchaSolver.js";

const SOLVING_STARTED = "browserbase-solving-started";
const SOLVING_FINISHED = "browserbase-solving-finished";
const SOLVING_ERRORED = "browserbase-solving-errored";

type ConsoleListener = (message: { text: () => string }) => void;

/**
 * Page stand-in that tracks "console" listeners and lets a test emit console
 * messages to them. Registrations for any other event name are ignored.
 * `onCalls` / `offCalls` count the "console" subscriptions and removals.
 */
class MockPage {
  private listeners = new Set<ConsoleListener>();
  public onCalls = 0;
  public offCalls = 0;

  on(event: string, listener: ConsoleListener): void {
    if (event !== "console") return;
    this.onCalls++;
    this.listeners.add(listener);
  }

  off(event: string, listener: ConsoleListener): void {
    if (event !== "console") return;
    this.offCalls++;
    this.listeners.delete(listener);
  }

  /** Delivers a console message whose `text()` returns `text` to every listener. */
  emitConsole(text: string): void {
    const message = { text: () => text };
    for (const listener of this.listeners) {
      listener(message);
    }
  }

  /** Number of currently registered console listeners. */
  listenerCount(): number {
    return this.listeners.size;
  }
}

describe("CaptchaSolver", () => {
  it("resolves all concurrent waiters when a solve finishes", async () => {
    const page = new MockPage();
    const solver = new CaptchaSolver();
solver.init(async () => page as never);
    await solver.ensureAttached();

    page.emitConsole(SOLVING_STARTED);

    const firstWait = solver.waitIfSolving();
    const secondWait = solver.waitIfSolving();
    // Yield one macrotask before inspecting the solver's internal wait promise.
    await new Promise((resolve) => setTimeout(resolve, 0));

    // `waitPromise` is read through a structural cast; its resolved type does
    // not matter here, so it is typed as `unknown`.
    const sharedWaitPromise = (
      solver as unknown as { waitPromise: Promise<unknown> | null }
    ).waitPromise;
    expect(sharedWaitPromise).not.toBeNull();
    expect(
      (solver as unknown as { waitPromise: Promise<unknown> | null })
        .waitPromise,
    ).toBe(sharedWaitPromise);

    let firstResolved = false;
    let secondResolved = false;
    void firstWait.then(() => {
      firstResolved = true;
    });
    void secondWait.then(() => {
      secondResolved = true;
    });

    await Promise.resolve();
    expect(firstResolved).toBe(false);
    expect(secondResolved).toBe(false);

    page.emitConsole(SOLVING_FINISHED);
    await Promise.all([firstWait, secondWait]);

    expect(firstResolved).toBe(true);
    expect(secondResolved).toBe(true);
    // The result is reported once; a second consume reads back as cleared.
    expect(solver.consumeSolveResult()).toEqual({
      solved: true,
      errored: false,
    });
    expect(solver.consumeSolveResult()).toEqual({
      solved: false,
      errored: false,
    });
  });

  it("re-attaches to a new page and settles stale waiters when the active page changes", async () => {
    const firstPage = new MockPage();
    const secondPage = new MockPage();
    let activePage = firstPage;

    const solver = new CaptchaSolver();
    solver.init(async () => activePage as never);
    await solver.ensureAttached();

    firstPage.emitConsole(SOLVING_STARTED);
    const pendingWait = solver.waitIfSolving();

    let settled = false;
    void pendingWait.then(() => {
      settled = true;
    });

    // Swap the page the provider returns, then ask the solver to wait again.
    activePage = secondPage;
    await solver.waitIfSolving();
    await pendingWait;

    expect(settled).toBe(true);
    expect(firstPage.offCalls).toBe(1);
    expect(firstPage.listenerCount()).toBe(0);
    expect(secondPage.onCalls).toBe(1);
    expect(secondPage.listenerCount()).toBe(1);
    expect(solver.isSolving()).toBe(false);
  });

  it("surfaces solver errors exactly once per consume", async () => {
    const page = new MockPage();
    const solver = new CaptchaSolver();
    solver.init(async () => page as
never);
    await solver.ensureAttached();

    page.emitConsole(SOLVING_STARTED);
    const wait = solver.waitIfSolving();
    page.emitConsole(SOLVING_ERRORED);
    await wait;

    expect(solver.consumeSolveResult()).toEqual({
      solved: false,
      errored: true,
    });
    expect(solver.consumeSolveResult()).toEqual({
      solved: false,
      errored: false,
    });
  });

  it("disposes cleanly while a solve is in progress", async () => {
    const page = new MockPage();
    const solver = new CaptchaSolver();
    solver.init(async () => page as never);
    await solver.ensureAttached();

    page.emitConsole(SOLVING_STARTED);
    const wait = solver.waitIfSolving();
    await new Promise((resolve) => setTimeout(resolve, 0));

    let settled = false;
    void wait.then(() => {
      settled = true;
    });

    solver.dispose();
    await wait;

    expect(settled).toBe(true);
    expect(solver.isSolving()).toBe(false);
    expect(page.listenerCount()).toBe(0);
    // After dispose the consumed result reports neither solved nor errored.
    expect(solver.consumeSolveResult()).toEqual({
      solved: false,
      errored: false,
    });
  });

  it("marks errored when detached mid-solve due to page change", async () => {
    const firstPage = new MockPage();
    const secondPage = new MockPage();
    let activePage = firstPage;

    const solver = new CaptchaSolver();
    solver.init(async () => activePage as never);
    await solver.ensureAttached();

    firstPage.emitConsole(SOLVING_STARTED);
    const wait = solver.waitIfSolving();

    // Switch to a new page while the solve is in progress
    activePage = secondPage;
    await solver.waitIfSolving();
    await wait;

    // The interrupted solve should be reported as errored
    expect(solver.consumeSolveResult()).toEqual({
      solved: false,
      errored: true,
    });
  });
});

================================================
FILE: packages/core/tests/unit/cdp-connection-close.test.ts
================================================
import { describe, it, expect, afterEach } from "vitest";
import { WebSocketServer, type WebSocket as ServerWebSocket } from "ws";
import { CdpConnection } from "../../lib/v3/understudy/cdp.js";

/**
 * Races a promise against a timeout.
Returns "resolved" if the promise
 * settles before the deadline, or "timeout" if it doesn't.
 */
// TODO: dedupe this with the implementation in testUtils.ts after we unify the test directories
function raceTimeout<T>(
  promise: Promise<T>,
  ms: number,
): Promise<T | "timeout"> {
  let timer: ReturnType<typeof setTimeout>;
  const timeout = new Promise<"timeout">((resolve) => {
    timer = setTimeout(() => resolve("timeout"), ms);
  });
  // Clear the timer whichever side wins so it cannot keep the process alive.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

/**
 * Creates a local WebSocket server and connects a CdpConnection to it.
 * Returns the connection plus a handle to the server-side socket.
 */
async function createPair(): Promise<{
  conn: CdpConnection;
  serverSocket: ServerWebSocket;
  wss: WebSocketServer;
}> {
  // Port 0 asks the OS for a free port; read back the one actually bound.
  const wss = new WebSocketServer({ port: 0 });
  const port = (wss.address() as { port: number }).port;

  // Subscribe before connecting so the "connection" event cannot be missed.
  const serverSocketPromise = new Promise<ServerWebSocket>((resolve) => {
    wss.once("connection", resolve);
  });

  const conn = await CdpConnection.connect(`ws://localhost:${port}`);
  const serverSocket = await serverSocketPromise;

  return { conn, serverSocket, wss };
}

describe("CdpConnection", () => {
  let wss: WebSocketServer | null = null;

  afterEach(async () => {
    // Narrow into a local so the closure below needs no non-null assertion.
    const server = wss;
    if (server) {
      await new Promise<void>((resolve) => server.close(() => resolve()));
      wss = null;
    }
  });

  describe("close() when WebSocket is already closed", () => {
    it("resolves instead of hanging forever", async () => {
      const pair = await createPair();
      wss = pair.wss;

      // Wait for the client-side close event to be fully processed.
      const transportClosed = new Promise<void>((resolve) => {
        pair.conn.onTransportClosed(() => resolve());
      });

      // Simulate the hosted API terminating the Browserbase session:
      // the server closes the WebSocket from its side.
      pair.serverSocket.close();
      await transportClosed;

      // conn.close() on an already-CLOSED WebSocket must resolve.
      // Without the fix it awaits a "close" event that already fired → hangs.
const result = await raceTimeout( pair.conn.close().then(() => "resolved"), 3_000, ); expect(result).toBe("resolved"); }); }); describe("inflight CDP calls on unexpected close", () => { it("rejects pending calls instead of hanging forever", async () => { const pair = await createPair(); wss = pair.wss; // Send a CDP command; the mock server will never reply. const pending = pair.conn.send("Runtime.evaluate", { expression: "1+1", }); // Server terminates the connection while the call is inflight. pair.serverSocket.close(); // The pending promise must reject, not hang. const result = await raceTimeout( pending.then(() => "resolved").catch(() => "rejected"), 3_000, ); expect(result).toBe("rejected"); }); }); }); ================================================ FILE: packages/core/tests/unit/context-extra-http-headers.test.ts ================================================ import { describe, expect, it } from "vitest"; import { V3Context } from "../../lib/v3/understudy/context.js"; import { MockCDPSession } from "./helpers/mockCDPSession.js"; import { StagehandSetExtraHTTPHeadersError } from "../../lib/v3/types/public/sdkErrors.js"; type ContextStub = { _sessionInit: Set; conn: { getSession: (id: string) => MockCDPSession | undefined; }; extraHttpHeaders: Record | null; }; const makeContext = (sessions: MockCDPSession[]): ContextStub => { const sessionsById = new Map( sessions.map((session) => [session.id, session]), ); return { _sessionInit: new Set(sessions.map((session) => session.id)), conn: { getSession: (id: string) => sessionsById.get(id), }, extraHttpHeaders: null, }; }; describe("V3Context.setExtraHTTPHeaders", () => { const setExtraHTTPHeaders = V3Context.prototype.setExtraHTTPHeaders as ( this: ContextStub, headers: Record, ) => Promise; it("sends headers to all sessions", async () => { const sessionA = new MockCDPSession({}, "session-a"); const sessionB = new MockCDPSession({}, "session-b"); const ctx = makeContext([sessionA, sessionB]); await 
setExtraHTTPHeaders.call(ctx, { "x-stagehand-test": "yes", }); for (const session of [sessionA, sessionB]) { expect(session.callsFor("Network.enable").length).toBe(1); expect( session.callsFor("Network.setExtraHTTPHeaders")[0]?.params, ).toEqual({ headers: { "x-stagehand-test": "yes" }, }); } }); it("throws a custom error with session failure details", async () => { const sessionA = new MockCDPSession( { "Network.setExtraHTTPHeaders": () => { throw new Error("boom"); }, }, "session-a", ); const sessionB = new MockCDPSession({}, "session-b"); const ctx = makeContext([sessionA, sessionB]); const promise = setExtraHTTPHeaders.call(ctx, { "x-stagehand-test": "yes", }); await expect(promise).rejects.toBeInstanceOf( StagehandSetExtraHTTPHeadersError, ); try { await promise; } catch (error) { const err = error as StagehandSetExtraHTTPHeadersError; expect(err.failures).toHaveLength(1); expect(err.failures[0]).toContain("session=session-a"); expect(err.failures[0]).toContain("boom"); } expect(sessionA.callsFor("Network.setExtraHTTPHeaders").length).toBe(1); expect(sessionB.callsFor("Network.setExtraHTTPHeaders").length).toBe(1); }); }); ================================================ FILE: packages/core/tests/unit/cookies.test.ts ================================================ import { beforeEach, describe, expect, it } from "vitest"; import { filterCookies, normalizeCookieParams, cookieMatchesFilter, } from "../../lib/v3/understudy/cookies.js"; import { MockCDPSession } from "./helpers/mockCDPSession.js"; import type { V3Context } from "../../lib/v3/understudy/context.js"; import { Cookie, CookieParam } from "../../lib/v3/types/public/context.js"; function makeCookie(overrides: Partial = {}): Cookie { return { name: "sid", value: "abc123", domain: "example.com", path: "/", expires: -1, httpOnly: false, secure: false, sameSite: "Lax", ...overrides, }; } /** Convert our Cookie type into the shape CDP's Storage.getCookies returns. 
*/
function toCdpCookie(c: Cookie) {
  return {
    name: c.name,
    value: c.value,
    domain: c.domain,
    path: c.path,
    expires: c.expires,
    httpOnly: c.httpOnly,
    secure: c.secure,
    sameSite: c.sameSite,
    size: c.name.length + c.value.length,
    session: c.expires === -1,
    priority: "Medium",
    sameParty: false,
    sourceScheme: "Secure",
    sourcePort: 443,
  };
}

/** URL-based filtering: domain, path-boundary and secure/protocol matching. */
describe("filterCookies", () => {
  const cookies: Cookie[] = [
    makeCookie({ name: "a", domain: "example.com", path: "/", secure: false }),
    makeCookie({
      name: "b",
      domain: ".example.com",
      path: "/app",
      secure: true,
    }),
    makeCookie({ name: "c", domain: "other.com", path: "/", secure: false }),
    makeCookie({
      name: "d",
      domain: "sub.example.com",
      path: "/",
      secure: false,
    }),
  ];

  it("returns all cookies when urls is empty", () => {
    expect(filterCookies(cookies, [])).toEqual(cookies);
  });

  it("filters by domain (exact host match)", () => {
    const result = filterCookies(cookies, ["http://example.com/"]);
    const names = result.map((c) => c.name);
    expect(names).toContain("a");
    // "b" (.example.com) domain-matches but is secure — excluded on http://
    expect(names).not.toContain("b");
    expect(names).not.toContain("c");
    expect(names).not.toContain("d");
  });

  it("filters by domain (dot-prefixed domain matches on https)", () => {
    const result = filterCookies(cookies, ["https://example.com/app/settings"]);
    const names = result.map((c) => c.name);
    expect(names).toContain("a"); // example.com domain match, path "/" prefix
    expect(names).toContain("b"); // .example.com domain match + secure + https
  });

  it("filters by domain (subdomain matches dot-prefixed domain)", () => {
    const result = filterCookies(cookies, ["http://sub.example.com/"]);
    const names = result.map((c) => c.name);
    // "a" (example.com) → prepend dot → .example.com → matches .sub.example.com
    expect(names).toContain("a");
    // "b" (.example.com) domain-matches sub.example.com but is secure — excluded on http://
    expect(names).not.toContain("b");
    expect(names).toContain("d"); // sub.example.com matches exactly
    expect(names).not.toContain("c");
  });

  it("filters by path prefix", () => {
    const result = filterCookies(cookies, ["https://example.com/app/settings"]);
    const names = result.map((c) => c.name);
    expect(names).toContain("a"); // path "/" is a prefix of "/app/settings"
    expect(names).toContain("b"); // path "/app" is a prefix of "/app/settings"
  });

  it("excludes secure cookies for non-https URLs", () => {
    const result = filterCookies(cookies, ["http://example.com/app/page"]);
    const names = result.map((c) => c.name);
    expect(names).toContain("a");
    expect(names).not.toContain("b"); // secure cookie, http URL
  });

  it("allows secure cookies on loopback addresses regardless of protocol", () => {
    const cases = [
      { domain: "localhost", url: "http://localhost/" },
      { domain: "127.0.0.1", url: "http://127.0.0.1/" },
      { domain: "[::1]", url: "http://[::1]/" },
    ];
    for (const { domain, url } of cases) {
      const cookie = makeCookie({ name: "loop", domain, secure: true });
      const result = filterCookies([cookie], [url]);
      expect(result).toHaveLength(1);
      expect(result[0]!.name).toBe("loop");
    }
  });

  it("matches against multiple URLs (union)", () => {
    const result = filterCookies(cookies, [
      "http://example.com/",
      "http://other.com/",
    ]);
    const names = result.map((c) => c.name);
    expect(names).toContain("a");
    expect(names).toContain("c");
  });

  it("returns empty array when no cookies match any URL", () => {
    const result = filterCookies(cookies, ["http://nomatch.dev/"]);
    expect(result).toHaveLength(0);
  });

  it("returns empty array when cookie list is empty", () => {
    const result = filterCookies([], ["http://example.com/"]);
    expect(result).toHaveLength(0);
  });

  it("does not match a sibling subdomain against a host-only domain", () => {
    // Cookie for "api.example.com" should NOT match "www.example.com"
    const apiCookie = makeCookie({ name: "api", domain: "api.example.com" });
    const result = filterCookies([apiCookie], ["http://www.example.com/"]);
    expect(result).toHaveLength(0);
  });

  it("does not match a parent domain against a more specific cookie", () => {
    // Cookie for "sub.example.com" should NOT match "example.com"
    const subCookie = makeCookie({ name: "sub", domain: "sub.example.com" });
    const result = filterCookies([subCookie], ["http://example.com/"]);
    expect(result).toHaveLength(0);
  });

  it("does not match when path does not prefix the URL path", () => {
    const deepCookie = makeCookie({
      name: "deep",
      domain: "example.com",
      path: "/admin",
    });
    const result = filterCookies([deepCookie], ["http://example.com/public"]);
    expect(result).toHaveLength(0);
  });

  it("does not match when cookie path is a string prefix but not a path boundary", () => {
    // "/foo" should NOT match "/foobar" — only "/foo", "/foo/", "/foo/bar"
    const cookie = makeCookie({
      name: "boundary",
      domain: "example.com",
      path: "/foo",
    });
    expect(filterCookies([cookie], ["http://example.com/foobar"])).toHaveLength(
      0,
    );
    expect(filterCookies([cookie], ["http://example.com/foo"])).toHaveLength(1);
    expect(
      filterCookies([cookie], ["http://example.com/foo/bar"]),
    ).toHaveLength(1);
  });

  it("matches root path against any URL path", () => {
    const rootCookie = makeCookie({
      name: "root",
      domain: "example.com",
      path: "/",
    });
    const result = filterCookies(
      [rootCookie],
      ["http://example.com/deeply/nested/page"],
    );
    expect(result).toHaveLength(1);
  });

  it("handles URL with port numbers", () => {
    const c = makeCookie({ name: "port", domain: "localhost", path: "/" });
    const result = filterCookies([c], ["http://localhost:3000/api"]);
    expect(result).toHaveLength(1);
  });

  it("handles URL with query string and fragment", () => {
    const c = makeCookie({ name: "q", domain: "example.com", path: "/" });
    const result = filterCookies(
      [c],
      ["http://example.com/page?q=1&r=2#section"],
    );
    expect(result).toHaveLength(1);
  });

  it("throws CookieValidationError for malformed URL", () => {
    const c = makeCookie({ name: "a", domain: "example.com" });
    expect(() => filterCookies([c], ["not-a-valid-url"])).toThrow(
      /Invalid URL passed to cookies\(\)/,
    );
  });
});

/** Validation and url → domain/path/secure derivation for cookies being set. */
describe("normalizeCookieParams", () => {
  it("passes through cookies with domain+path", () => {
    const input: CookieParam[] = [
      { name: "a", value: "1", domain: "example.com", path: "/" },
    ];
    const result = normalizeCookieParams(input);
    expect(result[0]!.domain).toBe("example.com");
    expect(result[0]!.path).toBe("/");
    expect(result[0]!.url).toBeUndefined();
  });

  it("derives domain, path, and secure from url", () => {
    const input: CookieParam[] = [
      { name: "a", value: "1", url: "https://example.com/app/page" },
    ];
    const result = normalizeCookieParams(input);
    expect(result[0]!.domain).toBe("example.com");
    expect(result[0]!.path).toBe("/app/");
    expect(result[0]!.secure).toBe(true);
    expect(result[0]!.url).toBeUndefined();
  });

  it("sets secure to false for http urls", () => {
    const input: CookieParam[] = [
      { name: "a", value: "1", url: "http://example.com/" },
    ];
    const result = normalizeCookieParams(input);
    expect(result[0]!.secure).toBe(false);
  });

  it("throws when neither url nor domain+path is provided", () => {
    expect(() => normalizeCookieParams([{ name: "a", value: "1" }])).toThrow(
      /must have a url or a domain\/path pair/,
    );
  });

  it("throws when both url and domain are provided", () => {
    expect(() =>
      normalizeCookieParams([
        { name: "a", value: "1", url: "https://x.com/", domain: "x.com" },
      ]),
    ).toThrow(/should have either url or domain/);
  });

  it("throws when both url and path are provided", () => {
    expect(() =>
      normalizeCookieParams([
        { name: "a", value: "1", url: "https://x.com/", path: "/" },
      ]),
    ).toThrow(/should have either url or path/);
  });

  it("throws for invalid expires (negative, not -1)", () => {
    expect(() =>
      normalizeCookieParams([
        { name: "a", value: "1", domain: "x.com", path: "/", expires: -5 },
      ]),
    ).toThrow(/invalid expires/);
  });

  it("allows expires of -1 (session cookie)", () => {
    const result = normalizeCookieParams([
      { name: "a", value: "1", domain: "x.com", path: "/", expires: -1 },
    ]);
    expect(result[0]!.expires).toBe(-1);
  });

  it("allows a positive expires timestamp", () => {
    const future = Math.floor(Date.now() / 1000) + 3600;
    const result = normalizeCookieParams([
      { name: "a", value: "1", domain: "x.com", path: "/", expires: future },
    ]);
    expect(result[0]!.expires).toBe(future);
  });

  it("throws for about:blank url", () => {
    expect(() =>
      normalizeCookieParams([{ name: "a", value: "1", url: "about:blank" }]),
    ).toThrow(/Blank page/);
  });

  it("throws for data: url", () => {
    expect(() =>
      normalizeCookieParams([
        { name: "a", value: "1", url: "data:text/html,hi" },
      ]),
    ).toThrow(/Data URL/);
  });

  it("throws CookieValidationError for malformed url", () => {
    expect(() =>
      normalizeCookieParams([{ name: "a", value: "1", url: "not-a-url" }]),
    ).toThrow(/Cookie "a" has an invalid url/);
  });

  it("throws when sameSite is None but secure is false", () => {
    expect(() =>
      normalizeCookieParams([
        {
          name: "a",
          value: "1",
          domain: "x.com",
          path: "/",
          sameSite: "None",
          secure: false,
        },
      ]),
    ).toThrow(/sameSite: "None" without secure: true/);
  });

  it("throws when sameSite is None and secure is omitted (undefined)", () => {
    // CDP defaults secure to false when omitted, so the browser will reject it.
    expect(() =>
      normalizeCookieParams([
        { name: "a", value: "1", domain: "x.com", path: "/", sameSite: "None" },
      ]),
    ).toThrow(/sameSite: "None" without secure: true/);
  });

  it("does NOT throw when sameSite is None and secure is true", () => {
    const result = normalizeCookieParams([
      {
        name: "a",
        value: "1",
        domain: "x.com",
        path: "/",
        sameSite: "None",
        secure: true,
      },
    ]);
    expect(result[0]!.sameSite).toBe("None");
    expect(result[0]!.secure).toBe(true);
  });

  it("derives root path from URL with no trailing path segments", () => {
    const result = normalizeCookieParams([
      { name: "a", value: "1", url: "https://example.com" },
    ]);
    // URL("https://example.com").pathname is "/", lastIndexOf("/") + 1 = 1 → "/"
    expect(result[0]!.path).toBe("/");
  });

  it("handles URL with port number", () => {
    const result = normalizeCookieParams([
      { name: "a", value: "1", url: "https://localhost:3000/api/v1" },
    ]);
    expect(result[0]!.domain).toBe("localhost");
    expect(result[0]!.path).toBe("/api/");
    expect(result[0]!.secure).toBe(true);
  });

  it("handles URL with query string (ignores query)", () => {
    const result = normalizeCookieParams([
      { name: "a", value: "1", url: "https://example.com/page?q=1" },
    ]);
    expect(result[0]!.domain).toBe("example.com");
    expect(result[0]!.path).toBe("/");
  });

  it("normalises multiple cookies in a single call", () => {
    const result = normalizeCookieParams([
      { name: "a", value: "1", url: "https://one.com/x" },
      { name: "b", value: "2", domain: "two.com", path: "/" },
      { name: "c", value: "3", url: "http://three.com/y/z" },
    ]);
    expect(result).toHaveLength(3);
    expect(result[0]!.domain).toBe("one.com");
    expect(result[1]!.domain).toBe("two.com");
    expect(result[2]!.domain).toBe("three.com");
    expect(result[2]!.secure).toBe(false);
  });

  it("does not mutate the original input array", () => {
    const input: CookieParam[] = [
      { name: "a", value: "1", url: "https://example.com/app" },
    ];
    // Shallow snapshot taken before the call, compared against the input afterwards.
    const frozen = { ...input[0]! };
    normalizeCookieParams(input);
    expect(input[0]).toEqual(frozen);
  });

  it("preserves optional fields that are explicitly set", () => {
    const result = normalizeCookieParams([
      {
        name: "full",
        value: "val",
        domain: "x.com",
        path: "/p",
        expires: 9999999999,
        httpOnly: true,
        secure: true,
        sameSite: "Strict",
      },
    ]);
    const c = result[0]!;
    expect(c.httpOnly).toBe(true);
    expect(c.secure).toBe(true);
    expect(c.sameSite).toBe("Strict");
    expect(c.expires).toBe(9999999999);
  });

  it("allows expires of 0 (epoch — effectively expired)", () => {
    // 0 is a positive-ish edge case; browsers treat it as already expired
    const result = normalizeCookieParams([
      { name: "a", value: "1", domain: "x.com", path: "/", expires: 0 },
    ]);
    expect(result[0]!.expires).toBe(0);
  });

  it("throws on the first invalid cookie in a batch", () => {
    expect(() =>
      normalizeCookieParams([
        { name: "ok", value: "1", domain: "x.com", path: "/" },
        { name: "bad", value: "2" }, // missing url/domain+path
      ]),
    ).toThrow(/Cookie "bad"/);
  });

  it("includes cookie name in every error message", () => {
    const cases = [
      () => normalizeCookieParams([{ name: "NAMED", value: "1" }]),
      () =>
        normalizeCookieParams([
          { name: "NAMED", value: "1", url: "https://x.com/", domain: "x" },
        ]),
      () =>
        normalizeCookieParams([
          { name: "NAMED", value: "1", url: "about:blank" },
        ]),
      () =>
        normalizeCookieParams([
          {
            name: "NAMED",
            value: "1",
            domain: "x.com",
            path: "/",
            sameSite: "None",
            secure: false,
          },
        ]),
    ];
    for (const fn of cases) {
      expect(fn).toThrow(/NAMED/);
    }
  });
});

/** Name/domain/path filter matching with exact strings and regular expressions. */
describe("cookieMatchesFilter", () => {
  const cookie = makeCookie({
    name: "session",
    domain: ".example.com",
    path: "/app",
  });

  it("matches when all filters match (exact strings)", () => {
    expect(
      cookieMatchesFilter(cookie, {
        name: "session",
        domain: ".example.com",
        path: "/app",
      }),
    ).toBe(true);
  });

  it("does not match when name differs", () => {
    expect(cookieMatchesFilter(cookie, { name: "other" })).toBe(false);
  });

  it("does not match when domain differs", () => {
    expect(cookieMatchesFilter(cookie, { domain: "other.com" })).toBe(false);
  });

  it("does not match when path differs", () => {
    expect(cookieMatchesFilter(cookie, { path: "/other" })).toBe(false);
  });

  it("matches with regex name", () => {
    expect(cookieMatchesFilter(cookie, { name: /^sess/ })).toBe(true);
    expect(cookieMatchesFilter(cookie, { name: /^nope/ })).toBe(false);
  });

  it("matches with regex domain", () => {
    expect(cookieMatchesFilter(cookie, { domain: /example\.com$/ })).toBe(true);
    expect(cookieMatchesFilter(cookie, { domain: /^other/ })).toBe(false);
  });

  it("matches with regex path", () => {
    expect(cookieMatchesFilter(cookie, { path: /^\/app/ })).toBe(true);
  });

  it("undefined filters match everything", () => {
    expect(cookieMatchesFilter(cookie, {})).toBe(true);
    expect(cookieMatchesFilter(cookie, { name: undefined })).toBe(true);
  });

  it("requires ALL filters to match (AND logic)", () => {
    // name matches but domain does not
    expect(
      cookieMatchesFilter(cookie, { name: "session", domain: "wrong.com" }),
    ).toBe(false);
  });

  it("handles global regex lastIndex correctly", () => {
    // A /g regex keeps state in lastIndex; a stale value must not cause a miss.
    const re = /sess/g;
    re.lastIndex = 999;
    expect(cookieMatchesFilter(cookie, { name: re })).toBe(true);
  });

  it("exact string does not do substring matching", () => {
    // filter name "sess" should NOT match cookie name "session"
    expect(cookieMatchesFilter(cookie, { name: "sess" })).toBe(false);
  });

  it("regex can do substring matching", () => {
    // regex /sess/ SHOULD match cookie name "session" (substring)
    expect(cookieMatchesFilter(cookie, { name: /sess/ })).toBe(true);
  });

  it("works with all three regex filters combined", () => {
    expect(
      cookieMatchesFilter(cookie, {
        name: /^session$/,
        domain: /example/,
        path: /^\/app$/,
      }),
    ).toBe(true);

    // One of three fails
    expect(
      cookieMatchesFilter(cookie, {
        name: /^session$/,
        domain: /example/,
        path: /^\/wrong$/,
      }),
    ).toBe(false);
  });

  it("empty string filter only matches empty cookie property", () => {
    const emptyPathCookie = makeCookie({
      name: "x",
      domain: "a.com",
      path: "",
    });
    expect(cookieMatchesFilter(emptyPathCookie, { path: "" })).toBe(true);
    expect(cookieMatchesFilter(cookie, { path: "" })).toBe(false);
  });

  it("is called once per cookie (no cross-contamination between calls)", () => {
    const c1 = makeCookie({ name: "alpha", domain: "a.com", path: "/" });
    const c2 = makeCookie({ name: "beta", domain: "b.com", path: "/x" });
    const filter = { name: "alpha", domain: "a.com" };
    expect(cookieMatchesFilter(c1, filter)).toBe(true);
    expect(cookieMatchesFilter(c2, filter)).toBe(false);
  });
});

describe("V3Context cookie methods", () => {
  // We test V3Context methods by constructing a minimal instance with a mock
  // CDP connection. V3Context.create() requires a real WebSocket, so we build
  // one via type-casting a MockCDPSession into the `conn` slot.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  let V3ContextClass: { prototype: V3Context } & Record<string, any>;

  beforeEach(async () => {
    const mod = await import("../../lib/v3/understudy/context.js");
    V3ContextClass = mod.V3Context as typeof V3ContextClass;
  });

  // NOTE(review): the handler parameter shape below is reconstructed from a
  // truncated annotation — confirm it against MockCDPSession's constructor.
  function makeContext(
    cdpHandlers: Record<string, (params?: Record<string, unknown>) => unknown>,
  ): V3Context {
    const mockConn = new MockCDPSession(cdpHandlers, "root");
    // V3Context stores the connection as `conn` (readonly). We create an
    // object with the real prototype so we get the actual method implementations.
const ctx = Object.create(V3ContextClass.prototype) as V3Context & { conn: MockCDPSession; }; // Assign the mock connection Object.defineProperty(ctx, "conn", { value: mockConn, writable: false }); return ctx; } function getMockConn(ctx: V3Context): MockCDPSession { return (ctx as unknown as { conn: MockCDPSession }).conn; } describe("cookies()", () => { it("returns all cookies from Storage.getCookies", async () => { const cdpCookies = [ toCdpCookie(makeCookie({ name: "a", domain: "example.com" })), toCdpCookie(makeCookie({ name: "b", domain: "other.com" })), ]; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: cdpCookies }), }); const result = await ctx.cookies(); expect(result).toHaveLength(2); expect(result.map((c) => c.name)).toEqual(["a", "b"]); }); it("filters by URL when provided as string", async () => { const cdpCookies = [ toCdpCookie(makeCookie({ name: "a", domain: "example.com" })), toCdpCookie(makeCookie({ name: "b", domain: "other.com" })), ]; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: cdpCookies }), }); const result = await ctx.cookies("http://example.com/"); expect(result).toHaveLength(1); expect(result[0]!.name).toBe("a"); }); it("filters by URL when provided as array", async () => { const cdpCookies = [ toCdpCookie(makeCookie({ name: "a", domain: "example.com" })), toCdpCookie(makeCookie({ name: "b", domain: "other.com" })), ]; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: cdpCookies }), }); const result = await ctx.cookies(["http://other.com/"]); expect(result).toHaveLength(1); expect(result[0]!.name).toBe("b"); }); it("defaults sameSite to Lax when CDP returns undefined", async () => { const cdpCookie = { ...toCdpCookie(makeCookie()), sameSite: undefined as string | undefined, }; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [cdpCookie] }), }); const result = await ctx.cookies(); expect(result[0]!.sameSite).toBe("Lax"); }); it("returns empty array when browser has no 
cookies", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [] }), }); const result = await ctx.cookies(); expect(result).toEqual([]); }); it("maps all CDP cookie fields to our Cookie type", async () => { const cdpCookie = toCdpCookie( makeCookie({ name: "full", value: "v", domain: ".test.com", path: "/p", expires: 1700000000, httpOnly: true, secure: true, sameSite: "Strict", }), ); const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [cdpCookie] }), }); const result = await ctx.cookies(); expect(result[0]).toEqual({ name: "full", value: "v", domain: ".test.com", path: "/p", expires: 1700000000, httpOnly: true, secure: true, sameSite: "Strict", }); }); it("strips extra CDP fields (size, priority, etc.) from result", async () => { const cdpCookie = toCdpCookie(makeCookie({ name: "stripped" })); const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [cdpCookie] }), }); const result = await ctx.cookies(); const keys = Object.keys(result[0]!); expect(keys).not.toContain("size"); expect(keys).not.toContain("priority"); expect(keys).not.toContain("sourceScheme"); expect(keys).not.toContain("sourcePort"); }); it("calls Storage.getCookies exactly once per invocation", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [] }), }); await ctx.cookies(); await ctx.cookies("http://example.com"); const calls = getMockConn(ctx).callsFor("Storage.getCookies"); expect(calls).toHaveLength(2); }); }); describe("addCookies()", () => { it("sends all cookies in a single Storage.setCookies call", async () => { const ctx = makeContext({ "Storage.setCookies": () => ({}), }); await ctx.addCookies([ { name: "a", value: "1", domain: "example.com", path: "/" }, { name: "b", value: "2", domain: "other.com", path: "/" }, ]); const calls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(calls).toHaveLength(1); expect(calls[0]!.params).toMatchObject({ cookies: [ { name: "a", domain: "example.com" }, { name: 
"b", domain: "other.com" }, ], }); }); it("derives domain/path/secure from url", async () => { const ctx = makeContext({ "Storage.setCookies": () => ({}), }); await ctx.addCookies([ { name: "a", value: "1", url: "https://example.com/app/page" }, ]); const calls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(calls[0]!.params).toMatchObject({ cookies: [ { name: "a", domain: "example.com", path: "/app/", secure: true }, ], }); }); it("throws when Storage.setCookies fails", async () => { const ctx = makeContext({ "Storage.setCookies": () => { throw new Error("CDP failure"); }, }); await expect( ctx.addCookies([ { name: "bad", value: "x", domain: "example.com", path: "/" }, ]), ).rejects.toThrow(/Failed to set cookies \["bad"\]/); }); it("throws for sameSite None without secure", async () => { const ctx = makeContext({ "Storage.setCookies": () => ({}), }); await expect( ctx.addCookies([ { name: "x", value: "1", domain: "example.com", path: "/", sameSite: "None", secure: false, }, ]), ).rejects.toThrow(/sameSite: "None" without secure: true/); }); it("does nothing when passed an empty array", async () => { const ctx = makeContext({ "Storage.setCookies": () => ({}), }); await ctx.addCookies([]); const calls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(calls).toHaveLength(0); }); it("sends all cookie fields to CDP (including optional ones)", async () => { const ctx = makeContext({ "Storage.setCookies": () => ({}), }); await ctx.addCookies([ { name: "full", value: "val", domain: "x.com", path: "/p", expires: 9999999999, httpOnly: true, secure: true, sameSite: "Strict", }, ]); const calls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(calls[0]!.params).toEqual({ cookies: [ { name: "full", value: "val", domain: "x.com", path: "/p", expires: 9999999999, httpOnly: true, secure: true, sameSite: "Strict", }, ], }); }); it("error message includes all cookie names when batch fails", async () => { const ctx = makeContext({ "Storage.setCookies": () 
=> { throw new Error("CDP failure"); }, }); await expect( ctx.addCookies([ { name: "alpha", value: "1", domain: "a.com", path: "/" }, { name: "beta", value: "2", domain: "b.com", path: "/" }, ]), ).rejects.toThrow(/Failed to set cookies \["alpha", "beta"\]/); }); }); describe("clearCookies()", () => { const cdpCookies = [ toCdpCookie( makeCookie({ name: "session", domain: "example.com", path: "/" }), ), toCdpCookie( makeCookie({ name: "_ga", domain: ".example.com", path: "/" }), ), toCdpCookie( makeCookie({ name: "pref", domain: "other.com", path: "/settings" }), ), ]; it("uses atomic Storage.clearCookies when called with no options", async () => { const ctx = makeContext({ "Storage.clearCookies": () => ({}), }); await ctx.clearCookies(); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); // Should NOT have fetched or re-set anything const getCalls = getMockConn(ctx).callsFor("Storage.getCookies"); expect(getCalls).toHaveLength(0); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("clears and re-adds only non-matching cookies (name filter)", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: "_ga" }); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(1); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toEqual(["session", "pref"]); }); it("clears and re-adds only non-matching cookies (domain filter)", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ 
domain: "other.com" }); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toEqual(["session", "_ga"]); }); it("clears and re-adds only non-matching cookies (regex name)", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: /^_ga/ }); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toEqual(["session", "pref"]); }); it("applies AND logic across multiple filters", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: "session", domain: "example.com" }); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toEqual(["_ga", "pref"]); }); it("does nothing when filter matches no cookies", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: "nonexistent" }); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(0); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("clears without re-adding when filter matches all cookies", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: /.*/ }); const clearCalls = 
getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("handles regex that matches multiple cookies", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [ toCdpCookie( makeCookie({ name: "_ga_ABC", domain: "example.com", path: "/" }), ), toCdpCookie( makeCookie({ name: "_ga_DEF", domain: "example.com", path: "/" }), ), toCdpCookie( makeCookie({ name: "_gid", domain: "example.com", path: "/" }), ), toCdpCookie( makeCookie({ name: "session", domain: "example.com", path: "/" }), ), ], }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ name: /^_ga/ }); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toContain("_gid"); expect(kept).toContain("session"); expect(kept).not.toContain("_ga_ABC"); expect(kept).not.toContain("_ga_DEF"); }); it("regex domain filter combined with path filter", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ domain: /example/, path: "/settings" }); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(0); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("clearCookies with empty options object uses atomic clear (same as no args)", async () => { const ctx = makeContext({ "Storage.clearCookies": () => ({}), }); await ctx.clearCookies({}); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); }); it("clears and re-adds only non-matching cookies (path filter)", async () => { const ctx = makeContext({ 
"Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await ctx.clearCookies({ path: "/settings" }); const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(1); const kept = ( setCalls[0]!.params?.cookies as Array<{ name: string }> ).map((c) => c.name); expect(kept).toEqual(["session", "_ga"]); expect(kept).not.toContain("pref"); }); it("throws when Storage.getCookies fails during filtered clear", async () => { const ctx = makeContext({ "Storage.getCookies": () => { throw new Error("CDP getCookies failure"); }, "Storage.clearCookies": () => ({}), "Storage.setCookies": () => ({}), }); await expect(ctx.clearCookies({ name: "session" })).rejects.toThrow( /CDP getCookies failure/, ); // clearCookies and setCookies should never have been called const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(0); const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("throws when Storage.clearCookies fails during filtered clear", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => { throw new Error("CDP clearCookies failure"); }, "Storage.setCookies": () => ({}), }); await expect(ctx.clearCookies({ name: "session" })).rejects.toThrow( /CDP clearCookies failure/, ); // setCookies should never have been called — cookies are untouched const setCalls = getMockConn(ctx).callsFor("Storage.setCookies"); expect(setCalls).toHaveLength(0); }); it("throws when Storage.setCookies fails during re-add, cookies are already wiped", async () => { const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [...cdpCookies] }), "Storage.clearCookies": () => ({}), "Storage.setCookies": () => { throw 
new Error("CDP setCookies failure"); }, }); await expect(ctx.clearCookies({ name: "session" })).rejects.toThrow( /cookie jar is now empty/, ); // clearCookies WAS called — cookies are gone const clearCalls = getMockConn(ctx).callsFor("Storage.clearCookies"); expect(clearCalls).toHaveLength(1); }); }); describe("cookies() sameSite mapping", () => { it("passes through valid sameSite values as-is", async () => { for (const sameSite of ["Strict", "Lax", "None"] as const) { const cdpCookie = { ...toCdpCookie(makeCookie()), sameSite }; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [cdpCookie] }), }); const result = await ctx.cookies(); expect(result[0]!.sameSite).toBe(sameSite); } }); it("does not normalize lowercase sameSite values from CDP", async () => { // CDP may return lowercase values; the current implementation casts // without normalizing, so "none" passes through as-is. const cdpCookie = { ...toCdpCookie(makeCookie()), sameSite: "none" }; const ctx = makeContext({ "Storage.getCookies": () => ({ cookies: [cdpCookie] }), }); const result = await ctx.cookies(); // This documents the current behavior: lowercase is NOT normalized. 
expect(result[0]!.sameSite).toBe("none"); }); }); }); ================================================ FILE: packages/core/tests/unit/flowlogger-capturing-cdp.test.ts ================================================ import { EventEmitter } from "node:events"; import { describe, it, expect } from "vitest"; import { CdpConnection } from "../../lib/v3/understudy/cdp.js"; import { InMemoryEventSink } from "../../lib/v3/flowlogger/EventSink.js"; import { EventEmitterWithWildcardSupport } from "../../lib/v3/flowlogger/EventEmitter.js"; import { EventStore } from "../../lib/v3/flowlogger/EventStore.js"; import { FlowEvent, FlowLogger } from "../../lib/v3/flowlogger/FlowLogger.js"; function attachEventStoreToBus( store: EventStore, bus: EventEmitterWithWildcardSupport, ): () => void { const onFlowEvent = (event: unknown) => { if (event instanceof FlowEvent) { void store.emit(event); } }; bus.on("*", onFlowEvent); return () => { bus.off("*", onFlowEvent); }; } class FakeSocket extends EventEmitter { sentPayloads: string[] = []; readyState = 1; send(payload: string): void { this.sentPayloads.push(payload); } close(): void { this.readyState = 3; this.emit("close", 1000, ""); } } function createConnection(socket: FakeSocket): CdpConnection { // The production constructor is private; tests instantiate it directly so // they can drive raw websocket messages without a real browser. 
const ConnectionCtor = CdpConnection as unknown as { new (ws: FakeSocket): CdpConnection; }; return new ConnectionCtor(socket); } function requireEvent( events: FlowEvent[], predicate: (event: FlowEvent) => boolean, description: string, ): FlowEvent { const match = events.find(predicate); expect(match, `missing ${description}`).toBeDefined(); return match as FlowEvent; } describe("flow logger cdp context", () => { it("preserves the active parent chain when a session event handler issues a nested CDP call", async () => { const sessionId = "session-test"; const socket = new FakeSocket(); const eventBus = new EventEmitterWithWildcardSupport(); const sink = new InMemoryEventSink(); const eventStore = new EventStore(sessionId, undefined, sink); const detachBus = attachEventStoreToBus(eventStore, eventBus); const conn = createConnection(socket); conn.flowLoggerContext = FlowLogger.init(sessionId, eventBus); // Seed the target/session mapping the same way a real attach flow would // before any session-scoped messages are dispatched. (conn as unknown as { onMessage(json: string): void }).onMessage( JSON.stringify({ method: "Target.attachedToTarget", params: { sessionId: "target-session", targetInfo: { targetId: "target-1" }, }, }), ); const session = conn.getSession("target-session"); expect(session).toBeDefined(); session!.on("Runtime.consoleAPICalled", () => { // This nested send used to lose its parent chain because the callback ran // after the original ALS scope had already unwound. 
void session!.send("Runtime.evaluate", { expression: "2 + 2", }); }); await FlowLogger.runWithLogging( { context: conn.flowLoggerContext, eventType: "SyntheticParentEvent", }, async () => { void session!.send("Page.navigate", { url: "https://example.com", }); }, [], ); (conn as unknown as { onMessage(json: string): void }).onMessage( JSON.stringify({ method: "Runtime.consoleAPICalled", sessionId: "target-session", params: { type: "log" }, }), ); // The nested Runtime.evaluate call should still attach under the synthetic // parent event even though it was triggered by a later session callback. const events = await eventStore.query({}); const parentEvent = requireEvent( events, (event) => event.eventType === "SyntheticParentEvent", "SyntheticParentEvent", ); const nestedCallEvent = requireEvent( events, (event) => event.eventType === "CdpCallEvent" && String(event.data.method) === "Runtime.evaluate", "nested Runtime.evaluate CdpCallEvent", ); expect(nestedCallEvent.eventParentIds).toEqual([parentEvent.eventId]); detachBus(); await eventStore.destroy(); }); }); ================================================ FILE: packages/core/tests/unit/flowlogger-capturing-llm.test.ts ================================================ import { describe, expect, it } from "vitest"; import { FlowLogger } from "../../lib/v3/flowlogger/FlowLogger.js"; describe("flow logger llm logging", () => { it("no-ops direct llm logging calls when no flow context is active", () => { // These helpers are called from multiple model adapters, so they must stay // safe even when a test or utility invokes them outside any ALS flow scope. 
expect(() =>
      FlowLogger.logLlmRequest({
        requestId: "req-1",
        model: "mock-model",
        prompt: "hello",
      }),
    ).not.toThrow();

    expect(() =>
      FlowLogger.logLlmResponse({
        requestId: "req-1",
        model: "mock-model",
        output: "world",
        inputTokens: 1,
        outputTokens: 1,
      }),
    ).not.toThrow();
  });

  it("does not throw from llm middleware when no flow context is active", async () => {
    const middleware = FlowLogger.createLlmLoggingMiddleware("mock-model");

    // Missing flow context should degrade to a silent no-op and preserve the
    // underlying model result.
    await expect(
      middleware.wrapGenerate({
        doGenerate: async () => ({
          text: "done",
          usage: {
            inputTokens: 1,
            outputTokens: 1,
            totalTokens: 2,
          },
        }),
        params: {
          prompt: [],
        },
      } as never),
    ).resolves.toMatchObject({
      text: "done",
    });
  });
});

================================================
FILE: packages/core/tests/unit/flowlogger-eventstore.test.ts
================================================
import { afterEach, describe, expect, it } from "vitest";
import { EventStore } from "../../lib/v3/flowlogger/EventStore.js";
import { EventEmitterWithWildcardSupport } from "../../lib/v3/flowlogger/EventEmitter.js";
import { FlowEvent } from "../../lib/v3/flowlogger/FlowLogger.js";

/**
 * Forwards every `FlowEvent` emitted on `bus` (via the "*" wildcard listener)
 * into `store`. Non-FlowEvent payloads are ignored.
 *
 * @returns a function that removes the wildcard listener again.
 */
function attachEventStoreToBus(
  store: EventStore,
  bus: EventEmitterWithWildcardSupport,
): () => void {
  const onFlowEvent = (event: unknown) => {
    if (event instanceof FlowEvent) {
      // Fire-and-forget: the bus listener is synchronous, the store is not.
      void store.emit(event);
    }
  };

  bus.on("*", onFlowEvent);
  return () => {
    bus.off("*", onFlowEvent);
  };
}

/**
 * Replaces `process.stderr.write` with a stub that records each chunk in
 * `writes`, then builds an `EventStore` wired to a fresh event bus.
 *
 * The stub is NOT restored here; the suite's `afterEach` puts the real
 * `process.stderr.write` back.
 *
 * @returns the captured writes plus the store, the bus, and a detach function.
 */
function createVerboseStoreHarness(): {
  writes: string[];
  store: EventStore;
  bus: EventEmitterWithWildcardSupport;
  detachBus: () => void;
} {
  const writes: string[] = [];
  process.stderr.write = ((
    chunk: string,
    encodingOrCb?: BufferEncoding | ((error?: Error | null) => void),
    cb?: (error?: Error | null) => void,
  ) => {
    writes.push(String(chunk));
    // `write` may be called as (chunk, cb) or (chunk, encoding, cb); pick
    // whichever argument is actually the callback so an encoding string is
    // never invoked as a function.
    const done = typeof encodingOrCb === "function" ? encodingOrCb : cb;
    done?.(null);
    return true;
  }) as typeof process.stderr.write;

  const store = new EventStore("session-test");
  const bus = new EventEmitterWithWildcardSupport();
  const detachBus = attachEventStoreToBus(store, bus);

  return { writes, store, bus, detachBus };
}

describe("flow logger event store", () => {
  const stderrWrite = process.stderr.write.bind(process.stderr);

  afterEach(() => {
    process.stderr.write = stderrWrite;
  });

  it("queries recent events from the default in-memory sink", async () => {
    const store = new EventStore("session-test");

    await store.emit(
      new FlowEvent({
        eventType: "StagehandExtractEvent",
        sessionId: "session-test",
        eventId: "stagehand-1234",
        eventCreatedAt: "2026-03-16T21:45:00.000Z",
        data: { params: ["grab title"] },
      }),
    );

    const events = await store.query({});
    expect(events).toHaveLength(1);
    expect(events[0].eventType).toBe("StagehandExtractEvent");

    await store.destroy();
  });

  it("drops payloads from the default in-memory sink", async () => {
    const store = new EventStore("session-test");

    await store.emit(
      new FlowEvent({
        eventType: "LlmRequestEvent",
        sessionId: "session-test",
        eventId: "llm-1234",
        eventCreatedAt: "2026-03-16T21:45:00.000Z",
        data: {
          prompt: [{ type: "image_url", image_url: { url: "huge" } }],
          output: "huge",
        },
      }),
    );

    const [event] = await store.query({});
    expect(event.eventType).toBe("LlmRequestEvent");
    expect(event.eventId).toBe("llm-1234");
    expect(event.data).toEqual({});

    await store.destroy();
  });

  it("renders semantic hierarchy tags for non-cdp stderr events only", async () => {
    // Intercept stderr so the pretty sink can be asserted without polluting the
    // real test runner output.
const { writes, store, bus, detachBus } = createVerboseStoreHarness(); const stepEvent = new FlowEvent({ eventType: "StagehandExtractEvent", sessionId: "session-test", eventId: "stagehand-1234", eventCreatedAt: "2026-03-16T21:45:00.000Z", data: { params: ["grab title"] }, }); const cdpEvent = new FlowEvent({ eventType: "CdpCallEvent", sessionId: "session-test", eventId: "cdp-call-5678", eventCreatedAt: "2026-03-16T21:45:00.100Z", eventParentIds: [stepEvent.eventId], data: { method: "Runtime.evaluate", params: { expression: "2 + 2" }, targetId: "1234567890ABCDEF1234567890ABCDEF", }, }); // The stderr sink intentionally suppresses CDP noise even though the event // still exists for in-memory and file-backed sinks. bus.emit(stepEvent.eventType, stepEvent); bus.emit(cdpEvent.eventType, cdpEvent); await new Promise((resolve) => setTimeout(resolve, 0)); expect(writes).toHaveLength(1); expect(writes[0]).toContain("[🆂 #1234 EXTRACT]"); expect(writes[0]).toContain("Stagehand.extract"); expect(writes[0]).not.toContain("Runtime.evaluate"); detachBus(); await store.destroy(); }); it("renders generic stagehand events without crashing the stderr sink", async () => { const { writes, store, bus, detachBus } = createVerboseStoreHarness(); // `StagehandEvent` has no action suffix, so this guards the formatter path // that cannot assume a method name exists. 
bus.emit( "StagehandEvent", new FlowEvent({ eventType: "StagehandEvent", sessionId: "session-test", eventId: "stagehand-0001", eventCreatedAt: "2026-03-16T21:45:00.000Z", data: { params: ["noop"] }, }), ); await new Promise((resolve) => setTimeout(resolve, 0)); expect(writes).toHaveLength(1); expect(writes[0]).toContain("[🆂 #0001"); expect(writes[0]).toContain("Stagehand("); detachBus(); await store.destroy(); }); it("colorizes pretty stderr output with ansi escapes when enabled", async () => { const previousForceColor = process.env.FORCE_COLOR; const previousNoColor = process.env.NO_COLOR; delete process.env.NO_COLOR; process.env.FORCE_COLOR = "1"; const { writes, store, bus, detachBus } = createVerboseStoreHarness(); try { bus.emit( "StagehandActEvent", new FlowEvent({ eventType: "StagehandActEvent", sessionId: "session-test", eventId: "stagehand-0002", eventCreatedAt: "2026-03-16T21:45:00.000Z", data: { params: ["click submit"] }, }), ); await new Promise((resolve) => setTimeout(resolve, 0)); expect(writes).toHaveLength(1); expect(writes[0]).toContain("\u001B["); } finally { if (previousNoColor === undefined) { delete process.env.NO_COLOR; } else { process.env.NO_COLOR = previousNoColor; } if (previousForceColor === undefined) { delete process.env.FORCE_COLOR; } else { process.env.FORCE_COLOR = previousForceColor; } detachBus(); await store.destroy(); } }); it("keeps agent ancestry and start ids for completion events after many child events", async () => { const { writes, store, bus, detachBus } = createVerboseStoreHarness(); const agentEvent = new FlowEvent({ eventType: "AgentExecuteEvent", sessionId: "session-test", eventId: "agent-1234", eventCreatedAt: "2026-03-16T21:45:00.000Z", data: { params: [{ instruction: "click the button" }] }, }); const actEvent = new FlowEvent({ eventType: "StagehandActEvent", sessionId: "session-test", eventId: "stagehand-2222", eventCreatedAt: "2026-03-16T21:45:00.001Z", eventParentIds: [agentEvent.eventId], data: { params: 
["click the button"] }, }); const clickEvent = new FlowEvent({ eventType: "UnderstudyClickEvent", sessionId: "session-test", eventId: "action-3333", eventCreatedAt: "2026-03-16T21:45:00.002Z", eventParentIds: [agentEvent.eventId, actEvent.eventId], data: { target: "xpath=/button[1]" }, }); bus.emit(agentEvent.eventType, agentEvent); bus.emit(actEvent.eventType, actEvent); bus.emit(clickEvent.eventType, clickEvent); // Flood the retained history with child events so the completion lines have // to recover their displayed ancestry from the queryable sink. for (let index = 0; index < 150; index += 1) { bus.emit( "CdpCallEvent", new FlowEvent({ eventType: "CdpCallEvent", sessionId: "session-test", eventId: `cdp-${String(index).padStart(4, "0")}`, eventCreatedAt: `2026-03-16T21:45:00.${String(index + 10).padStart(3, "0")}Z`, eventParentIds: [ agentEvent.eventId, actEvent.eventId, clickEvent.eventId, ], data: { method: "Runtime.evaluate", params: { expression: `${index}` }, targetId: "1234567890ABCDEF1234567890ABCDEF", }, }), ); } bus.emit( "UnderstudyClickCompletedEvent", new FlowEvent({ eventType: "UnderstudyClickCompletedEvent", sessionId: "session-test", eventId: "done-4444", eventCreatedAt: "2026-03-16T21:45:01.000Z", eventParentIds: [ agentEvent.eventId, actEvent.eventId, clickEvent.eventId, ], data: { durationMs: 250 }, }), ); bus.emit( "StagehandActCompletedEvent", new FlowEvent({ eventType: "StagehandActCompletedEvent", sessionId: "session-test", eventId: "done-5555", eventCreatedAt: "2026-03-16T21:45:01.001Z", eventParentIds: [agentEvent.eventId, actEvent.eventId], data: { durationMs: 500 }, }), ); bus.emit( "AgentExecuteCompletedEvent", new FlowEvent({ eventType: "AgentExecuteCompletedEvent", sessionId: "session-test", eventId: "done-6666", eventCreatedAt: "2026-03-16T21:45:01.002Z", eventParentIds: [agentEvent.eventId], data: { durationMs: 750 }, }), ); await new Promise((resolve) => setTimeout(resolve, 0)); // Completion lines should reference the original 
// started-event ids, not the
    // synthetic completed-event ids emitted at the end of the lifecycle.
    const clickCompletedLine = writes.find((line) =>
      line.includes("CLICK completed"),
    );
    const actCompletedLine = writes.find((line) =>
      line.includes("ACT completed"),
    );
    const agentCompletedLine = writes.find((line) =>
      line.includes("Agent.execute() completed"),
    );

    expect(clickCompletedLine).toContain("[🅰 #1234]");
    expect(clickCompletedLine).toContain("[🆂 #2222 ACT]");
    expect(clickCompletedLine).toContain("[🆄 #3333 CLICK]");
    expect(clickCompletedLine).not.toContain("#4444");

    expect(actCompletedLine).toContain("[🅰 #1234]");
    expect(actCompletedLine).toContain("[🆂 #2222 ACT]");
    expect(actCompletedLine).not.toContain("#5555");

    expect(agentCompletedLine).toContain("[🅰 #1234]");
    expect(agentCompletedLine).not.toContain("#6666");

    detachBus();
    await store.destroy();
  });
});

================================================
FILE: packages/core/tests/unit/helpers/mockCDPSession.ts
================================================
import type { CDPSessionLike } from "../../../lib/v3/understudy/cdp.js";

/** A canned responder for one CDP method; may be sync or async. */
type Handler = (params?: Record<string, unknown>) => Promise<unknown> | unknown;

/**
 * In-memory stand-in for a CDP session.
 *
 * Every `send` is recorded in `calls`. If a handler is registered for the
 * method its (awaited) result is returned; otherwise an empty object is
 * returned. `on`, `off` and `close` are no-ops.
 */
export class MockCDPSession implements CDPSessionLike {
  public readonly id: string;
  /** Chronological log of every `send` invocation. */
  public readonly calls: Array<{
    method: string;
    params?: Record<string, unknown>;
  }> = [];

  /**
   * @param handlers - map of CDP method name to canned responder.
   * @param sessionId - value exposed as `id`; defaults to "mock-session".
   */
  constructor(
    private readonly handlers: Record<string, Handler> = {},
    sessionId = "mock-session",
  ) {
    this.id = sessionId;
  }

  /**
   * Records the call, then delegates to the registered handler if any.
   *
   * @returns the handler's result cast to `R`, or `{}` when no handler exists.
   * @throws whatever the handler throws or rejects with.
   */
  async send<R = unknown>(
    method: string,
    params: Record<string, unknown> = {},
  ): Promise<R> {
    // Record before dispatching so a throwing handler still leaves a trace.
    this.calls.push({ method, params });
    const handler = this.handlers[method];
    if (!handler) return {} as R;
    return (await handler(params)) as R;
  }

  on(): void {}
  off(): void {}
  async close(): Promise<void> {}

  /** Returns the params of every recorded call to `method`, in call order. */
  callsFor(method: string): Array<{ params?: Record<string, unknown> }> {
    return this.calls
      .filter((call) => call.method === method)
      .map(({ params }) => ({ params }));
  }
}

================================================
FILE: packages/core/tests/unit/llm-provider.test.ts
================================================
import {
describe, expect, it } from "vitest"; import { getAISDKLanguageModel } from "../../lib/v3/llm/LLMProvider.js"; describe("getAISDKLanguageModel", () => { describe("ollama provider", () => { it("works without clientOptions", () => { const model = getAISDKLanguageModel("ollama", "llama3.2"); expect(model).toBeDefined(); }); it("works with empty clientOptions", () => { const model = getAISDKLanguageModel("ollama", "llama3.2", {}); expect(model).toBeDefined(); }); it("works with clientOptions containing only undefined values", () => { const model = getAISDKLanguageModel("ollama", "llama3.2", { apiKey: undefined, }); expect(model).toBeDefined(); }); it("works with clientOptions containing only null values", () => { const model = getAISDKLanguageModel("ollama", "llama3.2", { apiKey: null as unknown as string, }); expect(model).toBeDefined(); }); it("works with custom baseURL", () => { const model = getAISDKLanguageModel("ollama", "llama3.2", { baseURL: "http://custom-ollama:11434", }); expect(model).toBeDefined(); }); it("works even when apiKey is mistakenly provided", () => { // Ollama doesn't need an API key, but users might set one anyway const model = getAISDKLanguageModel("ollama", "llama3.2", { apiKey: "unnecessary-key", }); expect(model).toBeDefined(); }); }); describe("providers with API keys", () => { it("openai requires valid clientOptions for custom configuration", () => { // Without clientOptions, uses default provider const defaultModel = getAISDKLanguageModel("openai", "gpt-4o"); expect(defaultModel).toBeDefined(); // With valid apiKey, uses custom provider const customModel = getAISDKLanguageModel("openai", "gpt-4o", { apiKey: "test-key", }); expect(customModel).toBeDefined(); }); }); describe("hasValidOptions logic", () => { it("treats undefined apiKey as no options", () => { // This should use the default provider path (AISDKProviders) // not the custom provider path (AISDKProvidersWithAPIKey) const model = getAISDKLanguageModel("ollama", "llama3.2", { 
apiKey: undefined, }); expect(model).toBeDefined(); }); }); }); ================================================ FILE: packages/core/tests/unit/model-deprecation.test.ts ================================================ import { describe, expect, it } from "vitest"; import { LLMProvider } from "../../lib/v3/llm/LLMProvider.js"; import { UnsupportedModelError, UnsupportedAISDKModelProviderError, } from "../../lib/v3/types/public/sdkErrors.js"; import type { LogLine } from "../../lib/v3/types/public/logs.js"; // Mock client options with fake API keys for testing const mockClientOptions = { apiKey: "test-api-key-for-testing" }; describe("Model format deprecation", () => { describe("UnsupportedModelError", () => { it("includes guidance to use provider/model format for unknown model names", () => { const error = new UnsupportedModelError(["gpt-4o", "gemini-2.0-flash"]); // Should mention the new format expect(error.message).toContain("provider/model"); // Should include link to docs expect(error.message).toContain( "https://docs.stagehand.dev/v3/configuration/models", ); }); it("includes example of provider/model format", () => { const error = new UnsupportedModelError(["gpt-4o"]); // Should provide examples like openai/gpt-4o expect(error.message).toContain("openai/gpt-4o"); expect(error.message).toContain("anthropic/claude-sonnet-4"); }); it("works with feature parameter", () => { const error = new UnsupportedModelError(["gpt-4o"], "extract"); expect(error.message).toContain("extract"); expect(error.message).toContain("provider/model"); expect(error.message).toContain( "https://docs.stagehand.dev/v3/configuration/models", ); }); }); describe("LLMProvider.getClient deprecation warning", () => { it("logs deprecation warning for legacy model names", () => { const logs: LogLine[] = []; const logger = (line: LogLine) => logs.push(line); const provider = new LLMProvider(logger); // Using a legacy model name like "gpt-4o" instead of "openai/gpt-4o" // Should not throw, but 
should log a deprecation warning const client = provider.getClient("gpt-4o", mockClientOptions); // Should return a client (not throw) expect(client).toBeDefined(); // Should have logged a deprecation warning at level 0 const deprecationWarning = logs.find( (log) => log.message.toLowerCase().includes("deprecated") || log.message.toLowerCase().includes("deprecation"), ); expect(deprecationWarning).toBeDefined(); expect(deprecationWarning!.level).toBe(0); }); it("deprecation warning mentions provider/model format", () => { const logs: LogLine[] = []; const logger = (line: LogLine) => logs.push(line); const provider = new LLMProvider(logger); provider.getClient("gpt-4o", mockClientOptions); const deprecationWarning = logs.find( (log) => log.message.toLowerCase().includes("deprecated") || log.message.toLowerCase().includes("deprecation"), ); expect(deprecationWarning).toBeDefined(); const message = deprecationWarning!.message; // Should mention the provider/model format expect(message).toContain("provider/model"); // Should give an example expect(message).toContain("openai/gpt-5"); }); it("returns OpenAIClient for legacy OpenAI model names", () => { const logs: LogLine[] = []; const logger = (line: LogLine) => logs.push(line); const provider = new LLMProvider(logger); const client = provider.getClient("gpt-4o", mockClientOptions); // Should return a client expect(client).toBeDefined(); // The client should be an OpenAIClient (check constructor name) expect(client.constructor.name).toBe("OpenAIClient"); }); it("returns GoogleClient for legacy Google model names", () => { const logs: LogLine[] = []; const logger = (line: LogLine) => logs.push(line); const provider = new LLMProvider(logger); const client = provider.getClient("gemini-2.0-flash", mockClientOptions); // Should return a client expect(client).toBeDefined(); // The client should be a GoogleClient expect(client.constructor.name).toBe("GoogleClient"); }); }); describe("LLMProvider.getClient error handling", () => 
{
    it("throws UnsupportedModelError for unknown model without slash", () => {
      const logs: LogLine[] = [];
      const logger = (line: LogLine) => logs.push(line);
      const provider = new LLMProvider(logger);

      // Unknown model without slash should throw UnsupportedModelError
      expect(() => {
        provider.getClient("some-unknown-model", mockClientOptions);
      }).toThrow(UnsupportedModelError);
    });

    it("UnsupportedModelError includes provider/model format guidance", () => {
      const logs: LogLine[] = [];
      const logger = (line: LogLine) => logs.push(line);
      const provider = new LLMProvider(logger);

      let caught: unknown;
      try {
        provider.getClient("some-unknown-model", mockClientOptions);
      } catch (error) {
        caught = error;
      }

      // Assert outside the catch so the test fails (instead of passing
      // vacuously) if getClient ever stops throwing.
      expect(caught).toBeInstanceOf(UnsupportedModelError);
      expect((caught as Error).message).toContain("provider/model");
    });

    it("throws UnsupportedAISDKModelProviderError for invalid provider in provider/model format", () => {
      const logs: LogLine[] = [];
      const logger = (line: LogLine) => logs.push(line);
      const provider = new LLMProvider(logger);

      // Invalid provider but correct format
      expect(() => {
        provider.getClient("invalid-provider/some-model", mockClientOptions);
      }).toThrow(UnsupportedAISDKModelProviderError);
    });

    it("UnsupportedAISDKModelProviderError lists valid providers", () => {
      const logs: LogLine[] = [];
      const logger = (line: LogLine) => logs.push(line);
      const provider = new LLMProvider(logger);

      let caught: unknown;
      try {
        provider.getClient("invalid-provider/some-model", mockClientOptions);
      } catch (error) {
        caught = error;
      }

      // Assert outside the catch so a missing throw is reported as a failure.
      expect(caught).toBeInstanceOf(UnsupportedAISDKModelProviderError);
      const message = (caught as Error).message;
      // Should list valid providers
      expect(message).toContain("openai");
      expect(message).toContain("anthropic");
      expect(message).toContain("google");
    });
  });

  describe("new provider/model format", () => {
    it("does not log deprecation warning for provider/model format", () => {
      const logs: LogLine[] = [];
      const logger = (line: LogLine) => logs.push(line);
      const provider = new LLMProvider(logger);

      // Using the new format
      const client = provider.getClient("openai/gpt-4o", mockClientOptions);
      expect(client).toBeDefined();

      // Should NOT have a
deprecation warning const deprecationWarning = logs.find( (log) => log.message.toLowerCase().includes("deprecated") || log.message.toLowerCase().includes("deprecation"), ); expect(deprecationWarning).toBeUndefined(); }); }); }); ================================================ FILE: packages/core/tests/unit/model-utils.test.ts ================================================ import { describe, expect, it } from "vitest"; import { extractModelName, resolveModel } from "../../lib/modelUtils.js"; describe("extractModelName", () => { it("returns undefined for undefined input", () => { expect(extractModelName(undefined)).toBeUndefined(); }); it("returns the string as-is for a string input", () => { expect(extractModelName("openai/gpt-4o")).toBe("openai/gpt-4o"); }); it("returns modelName from an object input", () => { expect( extractModelName({ modelName: "anthropic/claude-sonnet-4-20250514" }), ).toBe("anthropic/claude-sonnet-4-20250514"); }); it("returns modelName from an object with extra properties", () => { expect( extractModelName({ modelName: "openai/gpt-4o-mini", apiKey: "sk-test", baseURL: "https://custom.endpoint", }), ).toBe("openai/gpt-4o-mini"); }); }); describe("resolveModel", () => { it("extracts provider and modelName from a string", () => { const result = resolveModel("openai/gpt-4o"); expect(result.provider).toBe("openai"); expect(result.modelName).toBe("gpt-4o"); expect(result.clientOptions).toEqual({}); }); it("extracts clientOptions from an object config", () => { const result = resolveModel({ modelName: "openai/gpt-4o" as never, apiKey: "sk-test", }); expect(result.provider).toBe("openai"); expect(result.modelName).toBe("gpt-4o"); expect(result.clientOptions).toMatchObject({ apiKey: "sk-test" }); // modelName should not leak into clientOptions expect(result.clientOptions).not.toHaveProperty("modelName"); }); }); ================================================ FILE: packages/core/tests/unit/openai-cua-client.test.ts 
================================================ import { describe, expect, it, vi } from "vitest"; import { OpenAICUAClient } from "../../lib/v3/agent/OpenAICUAClient.js"; function createClient() { return new OpenAICUAClient( "openai", "computer-use-preview-2025-03-11", undefined, { apiKey: "test-key" }, ); } describe("OpenAICUAClient", () => { it("exposes captchaSolvedProceed tool after a captcha context note", () => { const client = createClient(); // Before captcha note — tool should not be active expect( (client as unknown as { captchaSolvedToolActive: boolean }) .captchaSolvedToolActive, ).toBe(false); // Simulate a captcha context note being added (as the CUA handler does) client.addContextNote( "A captcha was automatically detected and solved — no further interaction needed.", ); expect( (client as unknown as { captchaSolvedToolActive: boolean }) .captchaSolvedToolActive, ).toBe(true); }); it("does NOT activate captcha tool for non-captcha context notes", () => { const client = createClient(); client.addContextNote("The page has finished loading."); expect( (client as unknown as { captchaSolvedToolActive: boolean }) .captchaSolvedToolActive, ).toBe(false); }); it("deactivates captcha tool after takeAction handles the function call", async () => { const client = createClient(); client.addContextNote("A captcha was solved."); expect( (client as unknown as { captchaSolvedToolActive: boolean }) .captchaSolvedToolActive, ).toBe(true); // Simulate the model calling the captchaSolvedProceed tool const result = await ( client as unknown as { takeAction: ( output: unknown[], logger: (msg: unknown) => void, ) => Promise; } ).takeAction( [ { type: "function_call", name: "captchaSolvedProceed", call_id: "call-1", arguments: "{}", }, ], vi.fn(), ); // Tool should be deactivated expect( (client as unknown as { captchaSolvedToolActive: boolean }) .captchaSolvedToolActive, ).toBe(false); // Result should contain a function_call_output confirming proceed 
expect(result).toEqual([ { type: "function_call_output", call_id: "call-1", output: expect.stringContaining("Continue completing"), }, ]); }); it("does NOT auto-continue follow-up questions without a captcha context", async () => { const client = createClient(); // No captcha context note — no tool should be exposed type ExecuteStepResult = { actions: Array<{ type: string }>; message: string; completed: boolean; nextInputItems: unknown[]; responseId: string; usage: { input_tokens: number; output_tokens: number; inference_time_ms: number; }; }; const executeStepSpy = vi.spyOn( client as unknown as { executeStep: ( inputItems: unknown[], previousResponseId: string | undefined, logger: (message: { message: string }) => void, ) => Promise; }, "executeStep", ); executeStepSpy.mockResolvedValueOnce({ actions: [], message: "I've located the Submit button. Should I go ahead and submit it?", completed: true, nextInputItems: [], responseId: "response-1", usage: { input_tokens: 1, output_tokens: 1, inference_time_ms: 1 }, }); const result = await client.execute({ options: { instruction: "Submit the form.", maxSteps: 10 } as never, logger: vi.fn(), }); // Should NOT have continued — the model's follow-up is treated as completion expect(executeStepSpy).toHaveBeenCalledTimes(1); expect(result.completed).toBe(true); }); }); ================================================ FILE: packages/core/tests/unit/page-extra-http-headers.test.ts ================================================ import { describe, expect, it } from "vitest"; import { Page } from "../../lib/v3/understudy/page.js"; import { MockCDPSession } from "./helpers/mockCDPSession.js"; import { StagehandSetExtraHTTPHeadersError } from "../../lib/v3/types/public/sdkErrors.js"; type PageStub = { mainSession: MockCDPSession; sessions: Map; extraHTTPHeaders: Record; applyExtraHTTPHeadersToSession: ( session: MockCDPSession, headers: Record, ) => Promise; }; const makePage = (sessions: MockCDPSession[]): PageStub => { const 
mainSession = sessions[0] ?? new MockCDPSession({}, "main"); const stub: PageStub = { mainSession, sessions: new Map(sessions.map((s) => [s.id, s])), extraHTTPHeaders: {}, // Bind the private helper from Page.prototype so setExtraHTTPHeaders can call it applyExtraHTTPHeadersToSession: (Page.prototype as unknown as PageStub) .applyExtraHTTPHeadersToSession, }; return stub; }; /* Exercises Page.prototype.setExtraHTTPHeaders against PageStub objects rather than real Page instances. */ describe("Page.setExtraHTTPHeaders", () => { /* Re-typed so the method can be invoked via .call with a PageStub as its this-value. */ const setExtraHTTPHeaders = Page.prototype.setExtraHTTPHeaders as ( this: PageStub, headers: Record, ) => Promise; it("sends headers to all sessions owned by the page", async () => { const sessionA = new MockCDPSession({}, "session-a"); const sessionB = new MockCDPSession({}, "session-b"); const page = makePage([sessionA, sessionB]); await setExtraHTTPHeaders.call(page, { "x-stagehand-test": "hello", }); for (const session of [sessionA, sessionB]) { expect(session.callsFor("Network.enable").length).toBe(1); expect( session.callsFor("Network.setExtraHTTPHeaders")[0]?.params, ).toEqual({ headers: { "x-stagehand-test": "hello" }, }); } }); it("applies headers to mainSession even when sessions map is empty", async () => { const page = makePage([]); await setExtraHTTPHeaders.call(page, { "x-test": "value" }); // mainSession should still receive headers even though it's not in the sessions map expect(page.mainSession.callsFor("Network.enable").length).toBe(1); expect( page.mainSession.callsFor("Network.setExtraHTTPHeaders")[0]?.params, ).toEqual({ headers: { "x-test": "value" }, }); }); it("throws StagehandSetExtraHTTPHeadersError with session failure details", async () => { /* sessionA is configured with a Network.setExtraHTTPHeaders handler that throws; sessionB has no overrides. */ const sessionA = new MockCDPSession( { "Network.setExtraHTTPHeaders": () => { throw new Error("connection closed"); }, }, "session-a", ); const sessionB = new MockCDPSession({}, "session-b"); const page = makePage([sessionA, sessionB]); let caughtError: StagehandSetExtraHTTPHeadersError | undefined; try { await setExtraHTTPHeaders.call(page, { "x-stagehand-test": "yes", }); } catch
(error) { caughtError = error as StagehandSetExtraHTTPHeadersError; } expect(caughtError).toBeInstanceOf(StagehandSetExtraHTTPHeadersError); expect(caughtError?.failures).toHaveLength(1); expect(caughtError?.failures[0]).toContain("session=session-a"); expect(caughtError?.failures[0]).toContain("connection closed"); // sessionB should still have been called successfully expect(sessionB.callsFor("Network.setExtraHTTPHeaders").length).toBe(1); }); it("applies headers to sessions adopted after the call", async () => { const sessionA = new MockCDPSession({}, "session-a"); const page = makePage([sessionA]); await setExtraHTTPHeaders.call(page, { "x-before": "yes" }); // A new OOPIF session is adopted after headers were set const sessionB = new MockCDPSession({}, "session-b"); page.sessions.set(sessionB.id, sessionB); // Simulate what adoptOopifSession does: replay headers onto the new session await page.applyExtraHTTPHeadersToSession.call( page, sessionB, page.extraHTTPHeaders, ); // The late-arriving session should have received the headers expect(sessionB.callsFor("Network.enable").length).toBe(1); expect(sessionB.callsFor("Network.setExtraHTTPHeaders")[0]?.params).toEqual( { headers: { "x-before": "yes" }, }, ); }); it("does not mutate the original headers object", async () => { const session = new MockCDPSession({}, "session-a"); const page = makePage([session]); const original = { "x-custom": "value" }; /* "frozen" is a shallow copy taken before the call (not Object.freeze); it is the baseline for the equality check below. */ const frozen = { ...original }; await setExtraHTTPHeaders.call(page, original); expect(original).toEqual(frozen); }); }); ================================================ FILE: packages/core/tests/unit/page-snapshot.test.ts ================================================ import { afterEach, describe, expect, it, vi } from "vitest"; import { promises as fs } from "fs"; import { Page } from "../../lib/v3/understudy/page.js"; import * as snapshotModule from "../../lib/v3/understudy/a11y/snapshot/index.js"; import type { HybridSnapshot } from
"../../lib/v3/types/private/index.js"; /* Canned HybridSnapshot that the mocked captureHybridSnapshot resolves with. */ const baseSnapshot: HybridSnapshot = { combinedTree: "tree", combinedXpathMap: {}, combinedUrlMap: {}, perFrame: [], }; /* Calls Page.prototype.snapshot on an empty object cast to Page, with fs.writeFile and captureHybridSnapshot replaced by spies; all mocks are restored after each test. */ describe("Page.snapshot", () => { afterEach(() => { vi.restoreAllMocks(); }); it("forwards the includeIframes flag to captureHybridSnapshot", async () => { vi.spyOn(fs, "writeFile").mockResolvedValue(); const captureSpy = vi .spyOn(snapshotModule, "captureHybridSnapshot") .mockResolvedValue(baseSnapshot); const fakePage = {} as Page; await Page.prototype.snapshot.call(fakePage, { includeIframes: false }); expect(captureSpy).toHaveBeenCalledWith(fakePage, { pierceShadow: true, includeIframes: false, }); }); it("falls back to default iframe inclusion when option is omitted", async () => { vi.spyOn(fs, "writeFile").mockResolvedValue(); const captureSpy = vi .spyOn(snapshotModule, "captureHybridSnapshot") .mockResolvedValue(baseSnapshot); const fakePage = {} as Page; await Page.prototype.snapshot.call(fakePage); /* With no options argument the call is expected to pass includeIframes: undefined through unchanged. */ expect(captureSpy).toHaveBeenCalledWith(fakePage, { pierceShadow: true, includeIframes: undefined, }); }); }); ================================================ FILE: packages/core/tests/unit/public-api/export-surface.test.ts ================================================ import { describe, expect, it } from "vitest"; import StagehandDefaultExport, * as Stagehand from "@browserbasehq/stagehand"; import { publicErrorTypes } from "./public-error-types.test.js"; // Type matcher guidelines: // // toEqualTypeOf – Default. Assert full, deep type equality; any type change should fail. // e.g. expectTypeOf>().toEqualTypeOf() // // toMatchObjectType – Assert (part of) an object's shape while allowing extra fields. // e.g. expectTypeOf(user).toMatchObjectType<{ id: string; email: string }>() // // toExtend – Assert that a type is compatible with a broader contract (assignable/extends). // e.g.
expectTypeOf().toExtend() /* Manifest of every runtime export expected from "@browserbasehq/stagehand", including the error classes spread in from publicErrorTypes; the tests below compare its keys with the module's actual keys. */ const publicApiShape = { __internalMaybeRunShutdownSupervisorFromArgv: Stagehand.__internalMaybeRunShutdownSupervisorFromArgv, __internalCreateInMemoryAgentCacheHandle: Stagehand.__internalCreateInMemoryAgentCacheHandle, AISdkClient: Stagehand.AISdkClient, Api: Stagehand.Api, AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS, AgentProvider: Stagehand.AgentProvider, AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText, ConsoleMessage: Stagehand.ConsoleMessage, CustomOpenAIClient: Stagehand.CustomOpenAIClient, LLMClient: Stagehand.LLMClient, LOG_LEVEL_NAMES: Stagehand.LOG_LEVEL_NAMES, Response: Stagehand.Response, Stagehand: Stagehand.Stagehand, V3: Stagehand.V3, V3Evaluator: Stagehand.V3Evaluator, V3FunctionName: Stagehand.V3FunctionName, connectToMCPServer: Stagehand.connectToMCPServer, default: StagehandDefaultExport, defaultExtractSchema: Stagehand.defaultExtractSchema, getAISDKLanguageModel: Stagehand.getAISDKLanguageModel, getZodType: Stagehand.getZodType, injectUrls: Stagehand.injectUrls, isRunningInBun: Stagehand.isRunningInBun, isZod3Schema: Stagehand.isZod3Schema, isZod4Schema: Stagehand.isZod4Schema, jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, toGeminiSchema: Stagehand.toGeminiSchema, toJsonSchema: Stagehand.toJsonSchema, tool: Stagehand.tool, transformSchema: Stagehand.transformSchema, trimTrailingTextNode: Stagehand.trimTrailingTextNode, validateZodSchema: Stagehand.validateZodSchema, ...publicErrorTypes, } as const; /* The namespace import's type extended with the default export, so "default" can be looked up like any other key. */ type StagehandExports = typeof Stagehand & { default: typeof StagehandDefaultExport; }; /* For each manifest key, the type of the corresponding module export. */ type PublicAPI = { [K in keyof typeof publicApiShape]: StagehandExports[K]; }; describe("Stagehand public API export surface", () => { it("public API
shape matches module exports", () => { /* Compile-time check only: the assignment fails type-checking if a manifest entry is not assignable to the matching module export's type. */ const _check: PublicAPI = publicApiShape; void _check; }); it("does not expose unexpected top-level exports", () => { /* Both key lists are sorted so the comparison is independent of declaration order. */ const expected = Object.keys(publicApiShape).sort(); const actual = Object.keys(Stagehand).sort(); expect(actual).toStrictEqual(expected); }); it("default export mirrors the named export surface", () => { const expected = Object.keys(Stagehand) .filter((key) => key !== "default") .sort(); const actual = Object.keys(StagehandDefaultExport).sort(); expect(actual).toStrictEqual(expected); }); }); ================================================ FILE: packages/core/tests/unit/public-api/llm-and-agents.test.ts ================================================ import { describe, expect, expectTypeOf, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; describe("LLM and Agents public API types", () => { describe("ModelConfiguration", () => { it("accepts Vertex headers in model config", () => { /* "satisfies" makes this a type-level check; the value itself is never used at runtime. */ const googleConfig = { modelName: "google/gemini-3-flash-preview", project: "test-project", location: "global", headers: { "X-Goog-Priority": "high", }, } satisfies Stagehand.ModelConfiguration; void googleConfig; }); }); describe("AISdkClient", () => { type AISdkClientInstance = InstanceType; it("is exported", () => { expect(Stagehand.AISdkClient).toBeDefined(); }); it("extends LLMClient", () => { expectTypeOf().toExtend(); }); it("constructor accepts model parameter", () => { // AISdkClient constructor takes { model: LanguageModelV2 } type CtorParams = ConstructorParameters; expectTypeOf().toEqualTypeOf<1>(); }); }); describe("AVAILABLE_CUA_MODELS", () => { /* Literal list whose element type is compared with the exported model union in the test below. */ const expectedModels = [ "openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-opus-4-5-20251101", "anthropic/claude-opus-4-6", "anthropic/claude-sonnet-4-6", "anthropic/claude-haiku-4-5-20251001", "anthropic/claude-sonnet-4-20250514", "anthropic/claude-sonnet-4-5-20250929",
"google/gemini-2.5-computer-use-preview-10-2025", "google/gemini-3-flash-preview", "google/gemini-3-pro-preview", "microsoft/fara-7b", ] as const; it("AvailableCuaModel matches the known literals", () => { expectTypeOf().toEqualTypeOf< (typeof expectedModels)[number] >(); void expectedModels; // Mark as used to satisfy ESLint }); }); describe("AgentProvider", () => { type AgentProviderInstance = InstanceType; it("is exported", () => { expect(Stagehand.AgentProvider).toBeDefined(); }); it("has getClient method", () => { expectTypeOf().toBeCallableWith( "test-model", ); }); it("constructor accepts logger parameter", () => { expectTypeOf< ConstructorParameters >().toEqualTypeOf<[(message: Stagehand.LogLine) => void]>(); }); }); describe("AnnotatedScreenshotText", () => { type ExpectedAnnotatedScreenshotText = string; it("is a string literal", () => { expectTypeOf< typeof Stagehand.AnnotatedScreenshotText >().toExtend(); }); }); describe("ConsoleMessage", () => { /* Accessor methods a ConsoleMessage instance is expected to provide. */ type ExpectedShape = { type: () => string; text: () => string; args: () => unknown[]; location: () => { url?: string; lineNumber?: number; columnNumber?: number; }; page: () => unknown; timestamp: () => number | undefined; raw: () => unknown; toString: () => string; }; type ConsoleMessageInstance = InstanceType; it("has correct public interface shape", () => { expectTypeOf().toExtend(); }); }); /* Per the case title, checks the object returned by getClient against the method signatures listed in ExpectedShape. */ describe("AgentClient", () => { type AgentProviderInstance = InstanceType; type GetClientReturn = ReturnType; it("getClient returns object with expected methods", () => { type ExpectedShape = { execute: ( options: Stagehand.AgentExecutionOptions, ) => Promise; captureScreenshot: ( options?: Record, ) => Promise; setViewport: (width: number, height: number) => void; setCurrentUrl: (url: string) => void; setScreenshotProvider: (provider: () => Promise) => void; setActionHandler: ( handler: (action: Stagehand.AgentAction) => Promise, ) => void; }; expectTypeOf().toExtend(); }); }); describe("LLMClient", () => { type
ExpectedShape = { type: "openai" | "anthropic" | "cerebras" | "groq" | (string & {}); modelName: Stagehand.AvailableModel | (string & {}); hasVision: boolean; clientOptions: Stagehand.ClientOptions; userProvidedInstructions?: string; }; type ExpectedCtorParams = [Stagehand.AvailableModel, string?]; /* Minimal createChatCompletion argument: messages plus a logger, with optional retries. */ type ExpectedBasicOptions = { options: { messages: Array<{ role: "system" | "user" | "assistant"; content: string | Array; }>; }; logger: (message: unknown) => void; retries?: number; }; /* ExpectedBasicOptions with a response_model (name and schema) added to options. */ type ExpectedWithResponseModel = ExpectedBasicOptions & { options: ExpectedBasicOptions["options"] & { response_model: { name: string; schema: Stagehand.StagehandZodSchema; }; }; }; type LLMClientInstance = InstanceType; it("has correct public interface shape", () => { expectTypeOf().toExtend(); }); it("constructor parameters match expected signature", () => { expectTypeOf< ConstructorParameters >().toEqualTypeOf(); }); it("createChatCompletion can be called with basic options", () => { expectTypeOf< LLMClientInstance["createChatCompletion"] >().toBeCallableWith({ options: { messages: [ { role: "user", content: "Hello", }, ], }, logger: () => {}, } satisfies ExpectedBasicOptions); }); it("createChatCompletion can be called with response_model", () => { /* Empty object cast to the schema type; only its static type matters in this type-level test. */ const mockSchema = {} as Stagehand.StagehandZodSchema; expectTypeOf< LLMClientInstance["createChatCompletion"] >().toBeCallableWith({ options: { messages: [ { role: "user", content: "Extract data", }, ], response_model: { name: "extracted", schema: mockSchema, }, }, logger: () => {}, } satisfies ExpectedWithResponseModel); }); it("createChatCompletion supports generic return type", () => { type Result = { custom: string }; type ExpectedSignature = ( options: Stagehand.CreateChatCompletionOptions, ) => Promise; expectTypeOf< LLMClientInstance["createChatCompletion"] >().toExtend(); }); it("has additional methods", () => { // These methods exist on LLMClient but have complex signatures from the 'ai' library // We verify they exist by
checking they're functions expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); expectTypeOf().toExtend< (...args: unknown[]) => unknown >(); }); }); describe("modelToAgentProviderMap", () => { type ExpectedModelToAgentProviderMap = Record< string, Stagehand.AgentProviderType >; it("only stores valid provider types", () => { expectTypeOf< typeof Stagehand.modelToAgentProviderMap >().toExtend(); }); }); describe("Response", () => { /* Methods the exported Response class is expected to provide; checked with toExtend, so extra members are allowed. */ type ExpectedShape = { url: () => string; status: () => number; statusText: () => string; ok: () => boolean; frame: () => unknown; fromServiceWorker: () => boolean; securityDetails: () => Promise; serverAddr: () => Promise; headers: () => Record; allHeaders: () => Promise>; headerValue: (name: string) => Promise; headerValues: (name: string) => Promise; headersArray: () => Promise>; body: () => Promise; text: () => Promise; json: () => Promise; finished: () => Promise; markFinished: (error: Error | null) => void; applyExtraInfo: (info: unknown) => void; }; type ResponseInstance = InstanceType; it("has correct public interface shape", () => { expectTypeOf().toExtend(); }); }); }); ================================================ FILE: packages/core/tests/unit/public-api/public-error-types.test.ts ================================================ import { describe, expectTypeOf, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; /* Runtime manifest of the error classes exported by the package; it is exported so export-surface.test.ts can spread it into its own manifest, and its keys drive the it.each cases at the end of this file. */ export const publicErrorTypes = { AgentAbortError: Stagehand.AgentAbortError, CdpConnectionClosedError: Stagehand.CdpConnectionClosedError,
AgentScreenshotProviderError: Stagehand.AgentScreenshotProviderError, BrowserbaseSessionNotFoundError: Stagehand.BrowserbaseSessionNotFoundError, CaptchaTimeoutError: Stagehand.CaptchaTimeoutError, ConnectionTimeoutError: Stagehand.ConnectionTimeoutError, ContentFrameNotFoundError: Stagehand.ContentFrameNotFoundError, CookieSetError: Stagehand.CookieSetError, CookieValidationError: Stagehand.CookieValidationError, CreateChatCompletionResponseError: Stagehand.CreateChatCompletionResponseError, CuaModelRequiredError: Stagehand.CuaModelRequiredError, ElementNotVisibleError: Stagehand.ElementNotVisibleError, ExperimentalApiConflictError: Stagehand.ExperimentalApiConflictError, ExperimentalNotConfiguredError: Stagehand.ExperimentalNotConfiguredError, HandlerNotInitializedError: Stagehand.HandlerNotInitializedError, InvalidAISDKModelFormatError: Stagehand.InvalidAISDKModelFormatError, LLMResponseError: Stagehand.LLMResponseError, MCPConnectionError: Stagehand.MCPConnectionError, MissingEnvironmentVariableError: Stagehand.MissingEnvironmentVariableError, MissingLLMConfigurationError: Stagehand.MissingLLMConfigurationError, PageNotFoundError: Stagehand.PageNotFoundError, ResponseBodyError: Stagehand.ResponseBodyError, ResponseParseError: Stagehand.ResponseParseError, StagehandAPIError: Stagehand.StagehandAPIError, StagehandAPIUnauthorizedError: Stagehand.StagehandAPIUnauthorizedError, StagehandClickError: Stagehand.StagehandClickError, StagehandClosedError: Stagehand.StagehandClosedError, StagehandDefaultError: Stagehand.StagehandDefaultError, StagehandDomProcessError: Stagehand.StagehandDomProcessError, StagehandElementNotFoundError: Stagehand.StagehandElementNotFoundError, StagehandEnvironmentError: Stagehand.StagehandEnvironmentError, StagehandError: Stagehand.StagehandError, StagehandEvalError: Stagehand.StagehandEvalError, StagehandHttpError: Stagehand.StagehandHttpError, StagehandIframeError: Stagehand.StagehandIframeError, StagehandInitError:
Stagehand.StagehandInitError, StagehandInvalidArgumentError: Stagehand.StagehandInvalidArgumentError, StagehandLocatorError: Stagehand.StagehandLocatorError, StagehandMissingArgumentError: Stagehand.StagehandMissingArgumentError, StagehandNotInitializedError: Stagehand.StagehandNotInitializedError, StagehandResponseBodyError: Stagehand.StagehandResponseBodyError, StagehandResponseParseError: Stagehand.StagehandResponseParseError, StagehandServerError: Stagehand.StagehandServerError, StagehandShadowRootMissingError: Stagehand.StagehandShadowRootMissingError, StagehandShadowSegmentEmptyError: Stagehand.StagehandShadowSegmentEmptyError, StagehandShadowSegmentNotFoundError: Stagehand.StagehandShadowSegmentNotFoundError, StreamingCallbacksInNonStreamingModeError: Stagehand.StreamingCallbacksInNonStreamingModeError, StagehandSnapshotError: Stagehand.StagehandSnapshotError, TimeoutError: Stagehand.TimeoutError, UnsupportedAISDKModelProviderError: Stagehand.UnsupportedAISDKModelProviderError, UnsupportedModelError: Stagehand.UnsupportedModelError, UnsupportedModelProviderError: Stagehand.UnsupportedModelProviderError, XPathResolutionError: Stagehand.XPathResolutionError, ZodSchemaValidationError: Stagehand.ZodSchemaValidationError, ActTimeoutError: Stagehand.ActTimeoutError, ObserveTimeoutError: Stagehand.ObserveTimeoutError, ExtractTimeoutError: Stagehand.ExtractTimeoutError, UnderstudyCommandException: Stagehand.UnderstudyCommandException, StagehandSetExtraHTTPHeadersError: Stagehand.StagehandSetExtraHTTPHeadersError, } as const; /* Names of all manifest entries, cast to the manifest's key union so they can index the Stagehand namespace below. */ const errorTypes = Object.keys(publicErrorTypes) as Array< keyof typeof publicErrorTypes >; describe("Stagehand public error types", () => { describe("errors", () => { /* One parameterised case per manifest key; per the case title each exported class is expected to extend Error. The assertion is type-level only. */ it.each(errorTypes)("%s extends Error", (errorTypeName) => { const ErrorClass = Stagehand[errorTypeName]; type ErrorClassType = typeof ErrorClass; expectTypeOf>().toExtend(); void ErrorClass; // Mark as used to satisfy ESLint }); }); });
================================================ FILE: packages/core/tests/unit/public-api/public-types.test.ts ================================================ import { describe, expectTypeOf, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; /* Each property below references one exported type, so a removed or renamed type export becomes a compile error in this file. */ // Type-level manifest of all expected exported types // Since these types don't exist at runtime, we currently need to manually add new publicly exported types // to this list ourselves - it's not automatically going to catch changes like our export-surface.test.ts does. // eslint-disable-next-line @typescript-eslint/no-unused-vars type ExpectedExportedTypes = { // Types from model.ts AvailableModel: Stagehand.AvailableModel; AvailableCuaModel: Stagehand.AvailableCuaModel; ModelProvider: Stagehand.ModelProvider; ClientOptions: Stagehand.ClientOptions; ModelConfiguration: Stagehand.ModelConfiguration; AnthropicJsonSchemaObject: Stagehand.AnthropicJsonSchemaObject; AISDKProvider: Stagehand.AISDKProvider; AISDKCustomProvider: Stagehand.AISDKCustomProvider; LLMTool: Stagehand.LLMTool; // Types from methods.ts ActOptions: Stagehand.ActOptions; ActResult: Stagehand.ActResult; ExtractResult: Stagehand.ExtractResult; Action: Stagehand.Action; HistoryEntry: Stagehand.HistoryEntry; ExtractOptions: Stagehand.ExtractOptions; ObserveOptions: Stagehand.ObserveOptions; ObserveResult: Stagehand.ObserveResult; V3FunctionName: Stagehand.V3FunctionName; // Types from agent.ts Tool: Stagehand.Tool; AgentAction: Stagehand.AgentAction; AgentResult: Stagehand.AgentResult; AgentExecuteOptions: Stagehand.AgentExecuteOptions; AgentType: Stagehand.AgentType; AgentExecutionOptions: Stagehand.AgentExecutionOptions; AgentHandlerOptions: Stagehand.AgentHandlerOptions; ActionExecutionResult: Stagehand.ActionExecutionResult; ToolUseItem: Stagehand.ToolUseItem; AnthropicMessage: Stagehand.AnthropicMessage; AnthropicContentBlock: Stagehand.AnthropicContentBlock; AnthropicTextBlock: Stagehand.AnthropicTextBlock; AnthropicToolResult:
Stagehand.AnthropicToolResult; ResponseItem: Stagehand.ResponseItem; ComputerCallItem: Stagehand.ComputerCallItem; FunctionCallItem: Stagehand.FunctionCallItem; ResponseInputItem: Stagehand.ResponseInputItem; AgentInstance: Stagehand.AgentInstance; AgentProviderType: Stagehand.AgentProviderType; AgentModelConfig: Stagehand.AgentModelConfig; AgentConfig: Stagehand.AgentConfig; AgentToolMode: Stagehand.AgentToolMode; VariableValue: Stagehand.VariableValue; Variables: Stagehand.Variables; AgentCallbacks: Stagehand.AgentCallbacks; AgentExecuteCallbacks: Stagehand.AgentExecuteCallbacks; AgentStreamCallbacks: Stagehand.AgentStreamCallbacks; AgentExecuteOptionsBase: Stagehand.AgentExecuteOptionsBase; AgentStreamExecuteOptions: Stagehand.AgentStreamExecuteOptions; ModelMessage: Stagehand.ModelMessage; // Types from agent/tools AgentTools: Stagehand.AgentTools; AgentToolTypesMap: Stagehand.AgentToolTypesMap; AgentUITools: Stagehand.AgentUITools; AgentToolCall: Stagehand.AgentToolCall; AgentToolResult: Stagehand.AgentToolResult; // Types from logs.ts LogLevel: Stagehand.LogLevel; LogLine: Stagehand.LogLine; Logger: Stagehand.Logger; // Types from metrics.ts StagehandMetrics: Stagehand.StagehandMetrics; // Types from options.ts V3Env: Stagehand.V3Env; LocalBrowserLaunchOptions: Stagehand.LocalBrowserLaunchOptions; V3Options: Stagehand.V3Options; // Types from page.ts AnyPage: Stagehand.AnyPage; Page: Stagehand.Page; PlaywrightPage: Stagehand.PlaywrightPage; PatchrightPage: Stagehand.PatchrightPage; PuppeteerPage: Stagehand.PuppeteerPage; ConsoleListener: Stagehand.ConsoleListener; LoadState: Stagehand.LoadState; // Types from LLMClient.ts ChatMessage: Stagehand.ChatMessage; ChatMessageContent: Stagehand.ChatMessageContent; ChatMessageImageContent: Stagehand.ChatMessageImageContent; ChatMessageTextContent: Stagehand.ChatMessageTextContent; ChatCompletionOptions: Stagehand.ChatCompletionOptions; LLMResponse: Stagehand.LLMResponse; CreateChatCompletionOptions:
Stagehand.CreateChatCompletionOptions; LLMUsage: Stagehand.LLMUsage; LLMParsedResponse: Stagehand.LLMParsedResponse>; // Types from zodCompat.ts StagehandZodSchema: Stagehand.StagehandZodSchema; StagehandZodObject: Stagehand.StagehandZodObject; InferStagehandSchema: Stagehand.InferStagehandSchema; JsonSchemaDocument: Stagehand.JsonSchemaDocument; // Types from utils.ts JsonSchema: Stagehand.JsonSchema; JsonSchemaProperty: Stagehand.JsonSchemaProperty; // Types from cookies.ts Cookie: Stagehand.Cookie; CookieParam: Stagehand.CookieParam; ClearCookieOptions: Stagehand.ClearCookieOptions; }; /* Shape tests: each nested describe declares an expected shape locally and compares one exported type against it. */ describe("Stagehand public API types", () => { describe("AnyPage", () => { type ExpectedAnyPage = | Stagehand.PlaywrightPage | Stagehand.PuppeteerPage | Stagehand.PatchrightPage | Stagehand.Page; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("ActOptions", () => { type ExpectedActOptions = { model?: Stagehand.ModelConfiguration; variables?: Stagehand.Variables; timeout?: number; page?: Stagehand.AnyPage; serverCache?: boolean; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("ActResult", () => { type ExpectedActResult = { success: boolean; message: string; actionDescription: string; actions: Stagehand.Action[]; cacheStatus?: "HIT" | "MISS"; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); /* ExtractOptions and ObserveOptions below are expected to have the same five optional fields. */ describe("ExtractOptions", () => { type ExpectedExtractOptions = { model?: Stagehand.ModelConfiguration; timeout?: number; selector?: string; page?: Stagehand.AnyPage; serverCache?: boolean; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("ObserveOptions", () => { type ExpectedObserveOptions = { model?: Stagehand.ModelConfiguration; timeout?: number; selector?: string; page?: Stagehand.AnyPage; serverCache?: boolean; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("ObserveResult", ()
=> { it("is an Action array with optional cacheStatus", () => { expectTypeOf().toExtend(); expectTypeOf().toEqualTypeOf< "HIT" | "MISS" | undefined >(); }); }); describe("Action", () => { type ExpectedAction = { selector: string; description: string; method?: string; arguments?: string[]; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("AgentAction", () => { // AgentAction is a separate type from Action, not an extension // It has additional fields like type, reasoning, taskCompleted, etc. it("has type field", () => { type TestAction = { type: string } & Stagehand.AgentAction; expectTypeOf().toEqualTypeOf(); }); }); describe("AgentExecuteOptions", () => { type ExpectedAgentExecuteOptions = { instruction: string; maxSteps?: number; page?: Stagehand.AnyPage; highlightCursor?: boolean; messages?: Stagehand.ModelMessage[]; signal?: AbortSignal; excludeTools?: string[]; output?: Stagehand.StagehandZodObject; callbacks?: Stagehand.AgentExecuteCallbacks; variables?: Stagehand.Variables; toolTimeout?: number; useSearch?: boolean; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); /* Same expected fields as AgentExecuteOptions above, except that callbacks is AgentStreamCallbacks. */ describe("AgentStreamExecuteOptions", () => { type ExpectedAgentStreamExecuteOptions = { instruction: string; maxSteps?: number; page?: Stagehand.AnyPage; highlightCursor?: boolean; messages?: Stagehand.ModelMessage[]; signal?: AbortSignal; excludeTools?: string[]; output?: Stagehand.StagehandZodObject; callbacks?: Stagehand.AgentStreamCallbacks; variables?: Stagehand.Variables; toolTimeout?: number; useSearch?: boolean; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("AgentExecutionOptions", () => { type ExpectedAgentExecutionOptions = { options: T; logger: (message: Stagehand.LogLine) => void; retries?: number; }; it("matches expected type shape", () => { expectTypeOf< Stagehand.AgentExecutionOptions >().toEqualTypeOf< ExpectedAgentExecutionOptions >(); }); });
describe("AgentResult", () => { type ExpectedAgentResult = { success: boolean; message: string; actions: Stagehand.AgentAction[]; completed: boolean; metadata?: Record; usage?: { input_tokens: number; output_tokens: number; reasoning_tokens?: number; cached_input_tokens?: number; inference_time_ms: number; }; messages?: Stagehand.ModelMessage[]; output?: Record; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); /* Unlike most cases in this file, AgentConfig is checked with toExtend, i.e. assignability rather than exact equality. */ describe("AgentConfig", () => { type ExpectedAgentConfig = { systemPrompt?: string; integrations?: (unknown | string)[]; tools?: unknown; cua?: boolean; model?: string | Stagehand.AgentModelConfig; executionModel?: string | Stagehand.AgentModelConfig; stream?: boolean; mode?: Stagehand.AgentToolMode; }; it("matches expected type shape", () => { expectTypeOf().toExtend(); }); }); describe("AgentToolMode", () => { type ExpectedAgentToolMode = "dom" | "hybrid" | "cua"; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("HistoryEntry", () => { type ExpectedHistoryEntry = { method: "act" | "extract" | "observe" | "navigate" | "agent"; parameters: unknown; result: unknown; timestamp: string; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); /* Cookie lists every field as required; CookieParam below makes all but name and value optional and adds url. */ describe("Cookie", () => { type ExpectedCookie = { name: string; value: string; domain: string; path: string; expires: number; httpOnly: boolean; secure: boolean; sameSite: "Strict" | "Lax" | "None"; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("CookieParam", () => { type ExpectedCookieParam = { name: string; value: string; url?: string; domain?: string; path?: string; expires?: number; httpOnly?: boolean; secure?: boolean; sameSite?: "Strict" | "Lax" | "None"; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); describe("ClearCookieOptions", () => { type ExpectedClearCookieOptions = { name?: string | RegExp; domain?: string | RegExp;
path?: string | RegExp; }; it("matches expected type shape", () => { expectTypeOf().toEqualTypeOf(); }); }); }); ================================================ FILE: packages/core/tests/unit/public-api/runtime-utils.test.ts ================================================ import { describe, expectTypeOf, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; /* Most cases compare a runtime helper's parameter tuple with an expected tuple type via parameters.branded.toEqualTypeOf; providerEnvVarMap is checked with toExtend instead. */ describe("Runtime Utils public API types", () => { describe("injectUrls", () => { type ExpectedInjectUrlsParams = [ unknown, Array, Record, ]; it("has correct parameter types", () => { expectTypeOf( Stagehand.injectUrls, ).parameters.branded.toEqualTypeOf(); }); }); describe("isRunningInBun", () => { type ExpectedIsRunningInBunParams = []; it("has correct parameter types", () => { expectTypeOf( Stagehand.isRunningInBun, ).parameters.branded.toEqualTypeOf(); }); }); describe("loadApiKeyFromEnv", () => { type ExpectedLoadApiKeyFromEnvParams = [ string | undefined, (logLine: Stagehand.LogLine) => void, ]; it("has correct parameter types", () => { expectTypeOf( Stagehand.loadApiKeyFromEnv, ).parameters.branded.toEqualTypeOf(); }); }); describe("providerEnvVarMap", () => { type ExpectedProviderEnvVarMap = Partial< Record> >; it("maps providers to environment variable names", () => { expectTypeOf< typeof Stagehand.providerEnvVarMap >().toExtend(); }); }); }); ================================================ FILE: packages/core/tests/unit/public-api/schema-utils.test.ts ================================================ import { describe, expectTypeOf, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; describe("Schema Utils public API types", () => { describe("defaultExtractSchema", () => { /* The type inferred from defaultExtractSchema is expected to be exactly { extraction: string }. */ type ExpectedInferredType = { extraction: string }; it("infers to the correct type", () => { expectTypeOf< Stagehand.InferStagehandSchema >().toEqualTypeOf(); }); }); describe("getZodType", () => { type ExpectedGetZodTypeParams = [Stagehand.StagehandZodSchema]; it("has correct parameter
types", () => { expectTypeOf( Stagehand.getZodType, ).parameters.branded.toEqualTypeOf(); }); }); /* isZod3Schema and isZod4Schema are each expected to take a single StagehandZodSchema argument. */ describe("isZod3Schema", () => { type ExpectedIsZod3SchemaParams = [Stagehand.StagehandZodSchema]; it("has correct parameter types", () => { expectTypeOf( Stagehand.isZod3Schema, ).parameters.branded.toEqualTypeOf(); }); }); describe("isZod4Schema", () => { type ExpectedIsZod4SchemaParams = [Stagehand.StagehandZodSchema]; it("has correct parameter types", () => { expectTypeOf( Stagehand.isZod4Schema, ).parameters.branded.toEqualTypeOf(); }); }); describe("jsonSchemaToZod", () => { type ExpectedJsonSchemaToZodParams = [Stagehand.JsonSchema]; it("has correct parameter types", () => { expectTypeOf( Stagehand.jsonSchemaToZod, ).parameters.branded.toEqualTypeOf(); }); }); describe("pageTextSchema", () => { /* The type inferred from pageTextSchema is expected to be exactly { pageText: string }. */ type ExpectedInferredType = { pageText: string }; it("infers to the correct type", () => { expectTypeOf< Stagehand.InferStagehandSchema >().toEqualTypeOf(); }); }); describe("toGeminiSchema", () => { type ExpectedToGeminiSchemaParams = [Stagehand.StagehandZodSchema]; it("has correct parameter types", () => { expectTypeOf( Stagehand.toGeminiSchema, ).parameters.branded.toEqualTypeOf(); }); }); describe("toJsonSchema", () => { type ExpectedToJsonSchemaParams = [Stagehand.StagehandZodSchema]; it("has correct parameter types", () => { expectTypeOf( Stagehand.toJsonSchema, ).parameters.branded.toEqualTypeOf(); }); }); describe("transformSchema", () => { type ExpectedTransformSchemaParams = [ Stagehand.StagehandZodSchema, Array, ]; it("has correct parameter types", () => { expectTypeOf( Stagehand.transformSchema, ).parameters.branded.toEqualTypeOf(); }); }); describe("trimTrailingTextNode", () => { type ExpectedTrimTrailingTextNodeParams = [string | undefined]; it("has correct parameter types", () => { expectTypeOf( Stagehand.trimTrailingTextNode, ).parameters.branded.toEqualTypeOf(); }); }); describe("validateZodSchema", () => { type ExpectedValidateZodSchemaParams = [
Stagehand.StagehandZodSchema, unknown, ]; it("has correct parameter types", () => { expectTypeOf( Stagehand.validateZodSchema, ).parameters.branded.toEqualTypeOf(); }); }); }); ================================================ FILE: packages/core/tests/unit/public-api/timeout-error-types.test.ts ================================================ import { describe, expect, it } from "vitest"; import * as Stagehand from "@browserbasehq/stagehand"; // ============================================================================ // Public Timeout Error Types Runtime Tests // ============================================================================ // These tests verify the runtime behavior of exported timeout error types, // complementing the type-level tests in public-error-types.test.ts describe("Public timeout error types runtime behavior", () => { /* ActTimeoutError is constructed from a timeout in milliseconds; the cases check its name, its message contents and its place in the error hierarchy. */ describe("ActTimeoutError", () => { it("is exported and extends Error", () => { const error = new Stagehand.ActTimeoutError(1000); expect(error).toBeInstanceOf(Error); expect(error).toBeInstanceOf(Stagehand.ActTimeoutError); expect(error.name).toBe("ActTimeoutError"); }); it("contains timeout value in milliseconds in message", () => { const error = new Stagehand.ActTimeoutError(500); expect(error.message).toContain("500ms"); }); it("contains operation name in message", () => { const error = new Stagehand.ActTimeoutError(100); expect(error.message).toContain("act()"); }); it("extends TimeoutError", () => { const error = new Stagehand.ActTimeoutError(1000); expect(error).toBeInstanceOf(Stagehand.TimeoutError); }); }); /* Same four checks as ActTimeoutError, with "extract()" as the operation name. */ describe("ExtractTimeoutError", () => { it("is exported and extends Error", () => { const error = new Stagehand.ExtractTimeoutError(1000); expect(error).toBeInstanceOf(Error); expect(error).toBeInstanceOf(Stagehand.ExtractTimeoutError); expect(error.name).toBe("ExtractTimeoutError"); }); it("contains timeout value in milliseconds in message", () => { const error = new Stagehand.ExtractTimeoutError(1000);
      expect(error.message).toContain("1000ms");
    });

    it("contains operation name in message", () => {
      const error = new Stagehand.ExtractTimeoutError(100);
      expect(error.message).toContain("extract()");
    });

    it("extends TimeoutError", () => {
      const error = new Stagehand.ExtractTimeoutError(1000);
      expect(error).toBeInstanceOf(Stagehand.TimeoutError);
    });
  });

  // Same four checks as the other concrete timeout errors, for observe().
  describe("ObserveTimeoutError", () => {
    it("is exported and extends Error", () => {
      const error = new Stagehand.ObserveTimeoutError(1000);
      expect(error).toBeInstanceOf(Error);
      expect(error).toBeInstanceOf(Stagehand.ObserveTimeoutError);
      expect(error.name).toBe("ObserveTimeoutError");
    });

    it("contains timeout value in milliseconds in message", () => {
      const error = new Stagehand.ObserveTimeoutError(1500);
      expect(error.message).toContain("1500ms");
    });

    it("contains operation name in message", () => {
      const error = new Stagehand.ObserveTimeoutError(100);
      expect(error.message).toContain("observe()");
    });

    it("extends TimeoutError", () => {
      const error = new Stagehand.ObserveTimeoutError(1000);
      expect(error).toBeInstanceOf(Stagehand.TimeoutError);
    });
  });

  // The base class is constructed with (operation label, timeout in ms).
  describe("TimeoutError (base class)", () => {
    it("is exported and extends Error", () => {
      const error = new Stagehand.TimeoutError("custom operation", 2000);
      expect(error).toBeInstanceOf(Error);
      expect(error).toBeInstanceOf(Stagehand.TimeoutError);
    });

    it("contains operation name and timeout in message", () => {
      const error = new Stagehand.TimeoutError("custom operation", 2000);
      expect(error.message).toContain("custom operation");
      expect(error.message).toContain("2000ms");
    });

    it("extends StagehandError", () => {
      const error = new Stagehand.TimeoutError("operation", 1000);
      expect(error).toBeInstanceOf(Stagehand.StagehandError);
    });
  });
});
================================================
FILE: packages/core/tests/unit/public-api/tool-type-export.test.ts
================================================
import { describe, expectTypeOf, it, expect } from "vitest";
import * as
Stagehand from "@browserbasehq/stagehand";
import { type Tool } from "ai";
import { z } from "zod";

/**
 * Test to verify tool-related exports from Stagehand.
 * Users should be able to create custom tools using the exported `tool` function
 * without needing to install the ai package directly.
 */
describe("Tool exports from AI SDK", () => {
  it("exports Tool type that matches AI SDK Tool type", () => {
    // NOTE(review): the imported `Tool` type is never referenced and both calls
    // lack type arguments in this copy — the generics look lost; confirm upstream.
    expectTypeOf().toEqualTypeOf();
  });

  it("exports tool function", () => {
    expect(typeof Stagehand.tool).toBe("function");
  });

  it("tool function can be used to define custom tools", () => {
    // Builds a minimal tool (description + zod input schema + async execute)
    // and checks that the returned object keeps the description.
    const customTool = Stagehand.tool({
      description: "A test tool",
      inputSchema: z.object({
        input: z.string(),
      }),
      execute: async ({ input }) => {
        return { result: `Processed: ${input}` };
      },
    });

    expect(customTool).toBeDefined();
    expect(customTool.description).toBe("A test tool");
  });
});
================================================
FILE: packages/core/tests/unit/public-api/v3-core.test.ts
================================================
import { describe, expect, expectTypeOf, it } from "vitest";
import * as Stagehand from "@browserbasehq/stagehand";

describe("V3 Core public API types", () => {
  describe("Stagehand", () => {
    // Structural description of the public instance surface that the class is
    // expected to extend.
    // NOTE(review): the bare `Promise` members have no type argument and
    // `history: Promise>` is malformed in this copy — generics look lost;
    // confirm against upstream.
    type ExpectedShape = {
      init: () => Promise;
      close: (opts?: { force?: boolean }) => Promise;
      act: (
        input: string | Stagehand.Action,
        options?: Stagehand.ActOptions,
      ) => Promise;
      extract: (...args: unknown[]) => Promise;
      observe: (...args: unknown[]) => Promise;
      agent: (config?: Stagehand.AgentConfig) => {
        execute: (
          instructionOrOptions: string | Stagehand.AgentExecuteOptions,
        ) => Promise;
      };
      connectURL: () => string;
      context: unknown;
      metrics: Promise;
      history: Promise>;
      llmClient: Stagehand.LLMClient;
      browserbaseSessionID: string | undefined;
      browserbaseSessionURL: string | undefined;
      browserbaseDebugURL: string | undefined;
      experimental: boolean;
      logInferenceToFile: boolean;
      verbose: 0 | 1 | 2;
      logger: (logLine: Stagehand.LogLine) => void;
      isAgentReplayActive:
() => boolean;
      recordAgentReplayStep: (step: unknown) => void;
    };

    // NOTE(review): `InstanceType` and the `expectTypeOf()` calls below carry no
    // type arguments in this copy — the generics look lost; confirm upstream.
    type StagehandInstance = InstanceType;

    it("has correct public interface shape", () => {
      expectTypeOf().toExtend();
    });

    // The next three cases check call signatures with placeholder values that
    // are cast from `{}`.
    it("act accepts Action as first parameter", () => {
      const mockAction = {} as Stagehand.Action;
      expectTypeOf().toBeCallableWith(
        mockAction,
        {} as Stagehand.ActOptions,
      );
    });

    it("extract accepts instruction and schema", () => {
      const mockSchema = {} as Stagehand.StagehandZodSchema;
      expectTypeOf().toBeCallableWith(
        "instruction",
        mockSchema,
        {} as Stagehand.ExtractOptions,
      );
    });

    it("observe accepts instruction and options", () => {
      expectTypeOf().toBeCallableWith(
        "instruction",
        {} as Stagehand.ObserveOptions,
      );
    });

    it("agent execute accepts page option", () => {
      type AgentReturn = ReturnType;
      const mockPage = {} as Stagehand.AnyPage;
      // `satisfies` checks the literal against AgentExecuteOptions without widening it.
      expectTypeOf().toBeCallableWith({
        instruction: "test",
        page: mockPage,
      } satisfies Stagehand.AgentExecuteOptions);
    });
  });

  describe("StagehandMetrics", () => {
    // Five counters (prompt, completion, reasoning, cached-input tokens and
    // inference time) for each of act / extract / observe / agent, plus totals.
    type ExpectedStagehandMetrics = {
      actPromptTokens: number;
      actCompletionTokens: number;
      actReasoningTokens: number;
      actCachedInputTokens: number;
      actInferenceTimeMs: number;
      extractPromptTokens: number;
      extractCompletionTokens: number;
      extractReasoningTokens: number;
      extractCachedInputTokens: number;
      extractInferenceTimeMs: number;
      observePromptTokens: number;
      observeCompletionTokens: number;
      observeReasoningTokens: number;
      observeCachedInputTokens: number;
      observeInferenceTimeMs: number;
      agentPromptTokens: number;
      agentCompletionTokens: number;
      agentReasoningTokens: number;
      agentCachedInputTokens: number;
      agentInferenceTimeMs: number;
      totalPromptTokens: number;
      totalCompletionTokens: number;
      totalReasoningTokens: number;
      totalCachedInputTokens: number;
      totalInferenceTimeMs: number;
    };

    it("matches the published metrics shape", () => {
      expectTypeOf().toEqualTypeOf();
    });
  });

  describe("V3", () => {
    // V3 is the same class as Stagehand, just re-exported with a different name.
// The public interface shape is already tested in the "Stagehand" test above. it("is exported", () => { expect(Stagehand.V3).toBeDefined(); }); }); describe("V3Evaluator", () => { type V3EvaluatorInstance = InstanceType; it("is exported", () => { expect(Stagehand.V3Evaluator).toBeDefined(); }); it("has ask method", () => { expectTypeOf().toExtend< (options: unknown) => Promise >(); }); it("has batchAsk method", () => { expectTypeOf().toExtend< (options: unknown) => Promise >(); }); }); describe("V3FunctionName", () => { const expectedFunctionNames = [ "ACT", "EXTRACT", "OBSERVE", "AGENT", ] as const; it("matches the known function name literals", () => { expectTypeOf().toExtend< (typeof expectedFunctionNames)[number] >(); void expectedFunctionNames; // Mark as used to satisfy ESLint }); }); describe("connectToMCPServer", () => { type ExpectedServerConfig = | string | URL | { command: string; args?: string[]; env?: Record } | { serverUrl: string | URL; clientOptions?: unknown; requestOptions?: unknown; }; it("has correct parameter types", () => { expectTypeOf( Stagehand.connectToMCPServer, ).parameters.branded.toEqualTypeOf<[ExpectedServerConfig]>(); }); }); describe("LOG_LEVEL_NAMES", () => { type ExpectedLOG_LEVEL_NAMES = Record; it("maps numeric levels to strings", () => { expectTypeOf< typeof Stagehand.LOG_LEVEL_NAMES >().toExtend(); }); }); }); ================================================ FILE: packages/core/tests/unit/safety-confirmation.test.ts ================================================ import { describe, it, expect, vi } from "vitest"; import { OpenAICUAClient } from "../../lib/v3/agent/OpenAICUAClient.js"; import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js"; import type { SafetyCheck, SafetyConfirmationHandler, } from "../../lib/v3/types/public/agent.js"; import type { LogLine } from "../../lib/v3/types/public/logs.js"; type LoggerMock = (message: LogLine) => void; const openAISafetyInvoker = ( OpenAICUAClient.prototype as 
unknown as {
    handleSafetyConfirmation: (
      this: OpenAICUAClient,
      pendingSafetyChecks: SafetyCheck[],
      logger: LoggerMock,
    ) => Promise;
  }
).handleSafetyConfirmation;

// Same prototype-cast technique for the Google client; its method takes an
// untyped `safetyDecision` instead of a SafetyCheck array.
// NOTE(review): presumably the cast is needed because the method is not part of
// the class's public type — confirm against the client implementations.
const googleSafetyInvoker = (
  GoogleCUAClient.prototype as unknown as {
    handleSafetyConfirmation: (
      this: GoogleCUAClient,
      safetyDecision: unknown,
      logger: LoggerMock,
    ) => Promise;
  }
).handleSafetyConfirmation;

// Builds an OpenAI client with fixed placeholder arguments and a dummy API key.
function createOpenAIClient(): OpenAICUAClient {
  return new OpenAICUAClient(
    "openai",
    "openai/computer-use-preview",
    "test instructions",
    { apiKey: "test" },
  );
}

// Builds a Google client with fixed placeholder arguments and a dummy API key.
function createGoogleClient(): GoogleCUAClient {
  return new GoogleCUAClient(
    "google",
    "google/gemini-2.5-computer-use-preview-10-2025",
    "test instructions",
    { apiKey: "test" },
  );
}

describe("Safety Confirmation Handler", () => {
  describe("OpenAI-style (pending_safety_checks)", () => {
    // Single pending check shared by all OpenAI-style cases.
    const mockChecks: SafetyCheck[] = [
      {
        id: "check-1",
        code: "malicious_instructions",
        message: "Potentially harmful action detected",
      },
    ];

    it("returns checks when handler acknowledges", async () => {
      const client = createOpenAIClient();
      const handler: SafetyConfirmationHandler = vi.fn(async () => ({
        acknowledged: true,
      }));
      client.setSafetyConfirmationHandler(handler);

      const logger = vi.fn();
      const result = await openAISafetyInvoker.call(client, mockChecks, logger);

      // Acknowledging returns the same checks that were passed in.
      expect(handler).toHaveBeenCalledWith(mockChecks);
      expect(result).toEqual(mockChecks);
    });

    it("returns undefined when handler rejects", async () => {
      const client = createOpenAIClient();
      const handler: SafetyConfirmationHandler = vi.fn(async () => ({
        acknowledged: false,
      }));
      client.setSafetyConfirmationHandler(handler);

      const logger = vi.fn();
      const result = await openAISafetyInvoker.call(client, mockChecks, logger);

      expect(handler).toHaveBeenCalledWith(mockChecks);
      expect(result).toBeUndefined();
    });

    it("auto-acknowledges when no handler is set", async () => {
      // No handler is registered on this client before the call.
      const client = createOpenAIClient();
      const logger = vi.fn();
      const result = await openAISafetyInvoker.call(client, mockChecks,
logger);

      expect(result).toEqual(mockChecks);
    });
  });

  describe("Google-style (safety_decision)", () => {
    // Object-shaped decision shared by the first three cases.
    const mockDecision = {
      decision: "require_confirmation",
      explanation: "Cookie consent dialog detected",
    };

    it("returns 'true' when handler acknowledges", async () => {
      const client = createGoogleClient();
      const handler: SafetyConfirmationHandler = vi.fn(async () => ({
        acknowledged: true,
      }));
      client.setSafetyConfirmationHandler(handler);

      const logger = vi.fn();
      const result = await googleSafetyInvoker.call(
        client,
        mockDecision,
        logger,
      );

      // The handler is expected to receive the decision wrapped as a single
      // check whose message is the pretty-printed JSON of the decision.
      expect(handler).toHaveBeenCalledWith([
        {
          id: "google-safety-decision",
          code: "safety_decision",
          message: JSON.stringify(mockDecision, null, 2),
        },
      ]);
      // The acknowledged result is the string "true", not a boolean.
      expect(result).toBe("true");
    });

    it("returns undefined when handler rejects", async () => {
      const client = createGoogleClient();
      const handler: SafetyConfirmationHandler = vi.fn(async () => ({
        acknowledged: false,
      }));
      client.setSafetyConfirmationHandler(handler);

      const logger = vi.fn();
      const result = await googleSafetyInvoker.call(
        client,
        mockDecision,
        logger,
      );

      expect(handler).toHaveBeenCalled();
      expect(result).toBeUndefined();
    });

    it("auto-acknowledges when no handler is set", async () => {
      // No handler is registered on this client before the call.
      const client = createGoogleClient();
      const logger = vi.fn();
      const result = await googleSafetyInvoker.call(
        client,
        mockDecision,
        logger,
      );

      expect(result).toBe("true");
    });

    it("handles string safety decisions", async () => {
      const client = createGoogleClient();
      const handler: SafetyConfirmationHandler = vi.fn(async () => ({
        acknowledged: true,
      }));
      client.setSafetyConfirmationHandler(handler);

      const logger = vi.fn();
      const result = await googleSafetyInvoker.call(
        client,
        "Simple string decision",
        logger,
      );

      // A string decision is expected to be forwarded verbatim as the message.
      expect(handler).toHaveBeenCalledWith([
        {
          id: "google-safety-decision",
          code: "safety_decision",
          message: "Simple string decision",
        },
      ]);
      expect(result).toBe("true");
    });
  });
});
================================================
FILE:
packages/core/tests/unit/snapshot-a11y-resolvers.test.ts
================================================
import type { Protocol } from "devtools-protocol";
import { beforeEach, describe, expect, it, vi } from "vitest";
import { a11yForFrame } from "../../lib/v3/understudy/a11y/snapshot/a11yTree.js";
import type { AccessibilityTreeResult } from "../../lib/v3/types/private/index.js";
import * as focusSelectors from "../../lib/v3/understudy/a11y/snapshot/focusSelectors.js";
import { MockCDPSession } from "./helpers/mockCDPSession.js";
import { executionContexts } from "../../lib/v3/understudy/executionContextRegistry.js";
import { tryScopedSnapshot } from "../../lib/v3/understudy/a11y/snapshot/capture.js";
import type {
  FrameContext,
  A11yOptions,
} from "../../lib/v3/types/private/index.js";
import type { Page } from "../../lib/v3/understudy/page.js";
import * as domTree from "../../lib/v3/understudy/a11y/snapshot/domTree.js";
import * as a11yTree from "../../lib/v3/understudy/a11y/snapshot/a11yTree.js";
import * as logger from "../../lib/v3/logger.js";

// Shorthand for the "string" AX value type used by every fixture node.
const stringType = "string" as Protocol.Accessibility.AXValueType;

// Returns a fresh two-node AX tree on every call: a RootWebArea (node "1",
// backend id 100) with one link child "Docs" (node "2", backend id 101) that
// carries a `url` property.
const baseAxNodes = (): Protocol.Accessibility.AXNode[] => [
  {
    nodeId: "1",
    role: { type: stringType, value: "RootWebArea" },
    backendDOMNodeId: 100,
    childIds: ["2"],
    ignored: false,
  },
  {
    nodeId: "2",
    role: { type: stringType, value: "link" },
    name: { type: stringType, value: "Docs" },
    backendDOMNodeId: 101,
    parentId: "1",
    childIds: [],
    properties: [
      {
        name: "url",
        value: { type: stringType, value: "https://example.com" },
      },
    ],
    ignored: false,
  },
];

// CDP "enable" commands answered with an empty result; spread into each mock session.
const baseHandlers = {
  "Accessibility.enable": async () => ({}),
  "Runtime.enable": async () => ({}),
  "DOM.enable": async () => ({}),
};

describe("a11yForFrame", () => {
  beforeEach(() => {
    vi.restoreAllMocks();
  });

  it("returns full outline and url map when no focus selector is provided", async () => {
    const session = new MockCDPSession({
      ...baseHandlers,
      "Accessibility.getFullAXTree": async () => ({ nodes:
baseAxNodes() }),
    });

    const opts: A11yOptions = {
      focusSelector: undefined,
      experimental: false,
      tagNameMap: { "enc-100": "#document", "enc-101": "a" },
      scrollableMap: {},
      // Encoded ids in these tests are simply "enc-<backendNodeId>".
      encode: (backend) => `enc-${backend}`,
    };

    const result = await a11yForFrame(session, undefined, opts);

    expect(result.scopeApplied).toBe(false);
    expect(result.urlMap["enc-101"]).toBe("https://example.com");
    expect(result.outline).toContain("Docs");
  });

  it("scopes the tree to the resolved focus selector target", async () => {
    // Extend the base tree: the link (node "2") gains a StaticText child (node "3").
    const nodes = baseAxNodes().map((n) =>
      n.nodeId === "2"
        ? {
            ...n,
            childIds: ["3"],
          }
        : n,
    );
    nodes.push({
      nodeId: "3",
      parentId: "2",
      childIds: [],
      role: { type: stringType, value: "StaticText" },
      backendDOMNodeId: 102,
      ignored: false,
    });

    // The first frame-scoped getFullAXTree call throws; every later call
    // (and any call without a frameId) returns the nodes.
    let scopedOnce = false;
    const session = new MockCDPSession({
      ...baseHandlers,
      "Accessibility.getFullAXTree": async (params) => {
        if (params?.frameId && !scopedOnce) {
          scopedOnce = true;
          throw new Error("does not belong to the target");
        }
        return { nodes };
      },
      // The focus target resolves to backend node 101 (the link).
      "DOM.describeNode": async () => ({
        node: { backendNodeId: 101 },
      }),
    });

    const resolveSpy = vi
      .spyOn(focusSelectors, "resolveObjectIdForXPath")
      .mockResolvedValue("object-1");

    const opts: A11yOptions = {
      focusSelector: "xpath=//a",
      experimental: false,
      tagNameMap: { "enc-101": "a" },
      scrollableMap: {},
      encode: (backend) => `enc-${backend}`,
    };

    const result = await a11yForFrame(session, "frame-1", opts);

    expect(result.scopeApplied).toBe(true);
    expect(result.outline).not.toContain("RootWebArea");
    expect(resolveSpy).toHaveBeenCalled();
    resolveSpy.mockRestore();
  });

  it("falls back to full tree when resolveObjectId throws", async () => {
    const session = new MockCDPSession({
      ...baseHandlers,
      "Accessibility.getFullAXTree": async () => ({ nodes: baseAxNodes() }),
    });
    // A non-"xpath=" selector goes through the CSS resolver, which is made to reject.
    vi.spyOn(focusSelectors, "resolveObjectIdForCss").mockRejectedValue(
      new Error("fail"),
    );

    const opts: A11yOptions = {
      focusSelector: ".btn",
      experimental: false,
      tagNameMap: {},
      scrollableMap: {},
      encode: (backend) =>
`enc-${backend}`,
    };

    const result = await a11yForFrame(session, "frame-1", opts);
    expect(result.scopeApplied).toBe(false);
  });
});

describe("resolveObjectIdForXPath", () => {
  beforeEach(() => {
    vi.restoreAllMocks();
  });

  it("evaluates in the target frame's main world when available", async () => {
    // waitForMainWorld resolves to context id 42; the mock handler asserts that
    // Runtime.evaluate is called with that context id.
    vi.spyOn(executionContexts, "waitForMainWorld").mockResolvedValue(42);
    vi.spyOn(executionContexts, "getMainWorld").mockReturnValue(undefined);
    const session = new MockCDPSession({
      "Runtime.evaluate": async (params) => {
        expect(params?.contextId).toBe(42);
        return { result: { objectId: "node-obj" } };
      },
    });

    const objectId = await focusSelectors.resolveObjectIdForXPath(
      session,
      "//div",
      "frame-1",
    );
    expect(objectId).toBe("node-obj");
  });

  it("returns null when evaluation throws or reports exception details", async () => {
    // Here waitForMainWorld rejects and the evaluation reports exceptionDetails.
    vi.spyOn(executionContexts, "waitForMainWorld").mockRejectedValue(
      new Error("missing"),
    );
    vi.spyOn(executionContexts, "getMainWorld").mockReturnValue(undefined);
    const session = new MockCDPSession({
      "Runtime.evaluate": async () => ({
        result: {},
        exceptionDetails: { exception: { description: "bad" } },
      }),
    });

    const objectId = await focusSelectors.resolveObjectIdForXPath(
      session,
      "//div",
      "frame-2",
    );
    expect(objectId).toBeNull();
  });
});

describe("resolveObjectIdForCss", () => {
  beforeEach(() => {
    vi.restoreAllMocks();
  });

  it("returns primary evaluation result when available", async () => {
    vi.spyOn(executionContexts, "waitForMainWorld").mockResolvedValue(7);
    const session = new MockCDPSession({
      "Runtime.evaluate": async () => ({
        result: { objectId: "primary-obj" },
      }),
    });

    const objectId = await focusSelectors.resolveObjectIdForCss(
      session,
      ".btn",
      "frame-1",
    );
    expect(objectId).toBe("primary-obj");
  });

  it("falls back to the pierce selector when the primary lookup fails", async () => {
    // Counts Runtime.evaluate invocations so the handler can answer the first
    // (primary) and second (fallback) call differently.
    let call = 0;
    const session = new MockCDPSession({
      "Runtime.evaluate": async (params) => {
        call++;
        if (call === 1) {
          // NOTE(review): "resolveCssSelector" is also a prefix of
          // "resolveCssSelectorPierce", so this substring check alone cannot tell
          // the primary expression from the fallback one.
          expect(String(params?.expression)).toContain("resolveCssSelector");
          return { result: {} };
        }
        expect(String(params?.expression)).toContain(
          "resolveCssSelectorPierce",
        );
        return { result: { objectId: "css-obj" } };
      },
    });

    const objectId = await focusSelectors.resolveObjectIdForCss(
      session,
      ".btn",
      undefined,
    );
    expect(objectId).toBe("css-obj");
  });

  it("returns null when both primary and fallback evaluations throw", async () => {
    vi.spyOn(executionContexts, "waitForMainWorld").mockResolvedValue(11);
    vi.spyOn(executionContexts, "getMainWorld").mockReturnValue(undefined);
    // Every evaluation reports exceptionDetails, so neither lookup yields an object id.
    const session = new MockCDPSession({
      "Runtime.evaluate": async () => ({
        result: {},
        exceptionDetails: { exception: { description: "fail" } },
      }),
    });

    const objectId = await focusSelectors.resolveObjectIdForCss(
      session,
      ".missing",
      "frame-1",
    );
    expect(objectId).toBeNull();
  });
});

describe("tryScopedSnapshot", () => {
  // Frame ordinals used by the page stub: "frame-1" -> 0, anything else -> 1.
  const ordinal = (frameId: string) => (frameId === "frame-1" ? 0 : 1);
  // Two-frame context: "frame-2" is a child of the root "frame-1".
  const context: FrameContext = {
    rootId: "frame-1",
    frames: ["frame-1", "frame-2"],
    parentByFrame: new Map([
      ["frame-1", null],
      ["frame-2", "frame-1"],
    ]),
  };

  // Minimal Page stand-in exposing only the members listed here; every frame
  // maps to the supplied session.
  // NOTE(review): `Partial` has no type argument in this copy — confirm upstream.
  const makePage = (session: MockCDPSession, overrides?: Partial): Page =>
    ({
      mainFrameId: () => "frame-1",
      asProtocolFrameTree: () => ({
        frame: { id: "frame-1" as Protocol.Page.FrameId },
        childFrames: [{ frame: { id: "frame-2" as Protocol.Page.FrameId } }],
      }),
      listAllFrameIds: () => ["frame-1", "frame-2"],
      getSessionForFrame: () => session,
      getOrdinal: (fid: string) => ordinal(fid),
      ...overrides,
    }) as unknown as Page;

  beforeEach(() => {
    vi.restoreAllMocks();
  });

  it("returns scoped snapshot when focus selector resolves via CSS hops", async () => {
    const session = new MockCDPSession({});
    const domMapsSpy = vi
      .spyOn(domTree, "domMapsForSession")
      .mockResolvedValue({
        tagNameMap: { "1-10": "div" },
        xpathMap: { "1-10": "/div[1]" },
        scrollableMap: {},
      });
    const a11ySpy = vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({
      outline: "[1-10] div",
      urlMap: { "1-10": "https://example.com" },
      scopeApplied: true,
    } as AccessibilityTreeResult);
    vi.spyOn(focusSelectors, "resolveCssFocusFrameAndTail").mockResolvedValue({
      targetFrameId: "frame-2",
      tailSelector: ".btn-inner",
      absPrefix: "/html/body/iframe[1]",
    });

    const result = await tryScopedSnapshot(
      makePage(session),
      { focusSelector: ".btn" },
      context,
      true,
    );

    expect(result).not.toBeNull();
    // The frame-local xpath "/div[1]" is expected to be prefixed with the
    // resolver's absPrefix.
    expect(result?.combinedXpathMap["1-10"]).toBe(
      "/html/body/iframe[1]/div[1]",
    );
    expect(domMapsSpy).toHaveBeenCalled();
    expect(a11ySpy).toHaveBeenCalled();
  });

  it("returns null and logs fallback when scope is not applied", async () => {
    const session = new MockCDPSession({});
    vi.spyOn(domTree, "domMapsForSession").mockResolvedValue({
      tagNameMap: { "1-10": "div" },
      xpathMap: { "1-10": "/div[1]" },
      scrollableMap: {},
    });
    // a11yForFrame reports that scoping was not applied.
    vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({
      outline: "ignored",
      urlMap: {},
      scopeApplied: false,
    } as AccessibilityTreeResult);
    // NOTE(review): resolveCssFocusFrameAndTail is not stubbed in this case, so
    // the real resolver presumably runs against the empty mock session — confirm
    // this is intended.
    const loggerSpy = vi.spyOn(logger, "v3Logger").mockImplementation(() => {});

    const result = await tryScopedSnapshot(
      makePage(session),
      { focusSelector: ".btn" },
      context,
      false,
    );

    expect(result).toBeNull();
    expect(loggerSpy).toHaveBeenCalled();
  });

  it("returns null immediately when no focus selector is provided", async () => {
    const result = await tryScopedSnapshot(
      makePage(new MockCDPSession({})),
      {},
      context,
      true,
    );
    expect(result).toBeNull();
  });

  it("supports XPath focus resolution branch", async () => {
    const session = new MockCDPSession({});
    vi.spyOn(domTree, "domMapsForSession").mockResolvedValue({
      tagNameMap: { "1-10": "div" },
      xpathMap: { "1-10": "/div[1]" },
      scrollableMap: {},
    });
    vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({
      outline: "[1-10] div",
      urlMap: {},
      scopeApplied: true,
    } as AccessibilityTreeResult);
    // XPath selectors go through resolveFocusFrameAndTail; an empty absPrefix
    // keeps the target in the root frame.
    vi.spyOn(focusSelectors, "resolveFocusFrameAndTail").mockResolvedValue({
      targetFrameId: "frame-1",
      tailXPath: "//div[1]",
      absPrefix: "",
    });

    const result = await tryScopedSnapshot(
      makePage(session),
      {
focusSelector: "xpath=//div" },
      context,
      true,
    );

    expect(result).not.toBeNull();
    expect(result?.combinedXpathMap["1-10"]).toBe("/div[1]");
  });

  it("logs and returns null when resolver throws", async () => {
    const session = new MockCDPSession({});
    vi.spyOn(focusSelectors, "resolveCssFocusFrameAndTail").mockRejectedValue(
      new Error("bad selector"),
    );
    const loggerSpy = vi.spyOn(logger, "v3Logger").mockImplementation(() => {});

    const result = await tryScopedSnapshot(
      makePage(session),
      { focusSelector: ".bad" },
      context,
      true,
    );

    expect(result).toBeNull();
    expect(loggerSpy).toHaveBeenCalled();
  });
});
================================================
FILE: packages/core/tests/unit/snapshot-a11y-tree-utils.test.ts
================================================
import type { Protocol } from "devtools-protocol";
import { describe, expect, it } from "vitest";
import type {
  A11yNode,
  A11yOptions,
} from "../../lib/v3/types/private/snapshot.js";
import {
  buildHierarchicalTree,
  decorateRoles,
  extractUrlFromAXNode,
  isStructural,
  removeRedundantStaticTextChildren,
} from "../../lib/v3/understudy/a11y/snapshot/a11yTree.js";

// Wraps a plain string as an AXValue of type "string".
const axString = (value: string): Protocol.Accessibility.AXValue => ({
  type: "string",
  value,
});

// Baseline options: no focus selector, empty maps, ids encoded as "enc-<backendNodeId>".
const defaultOpts: A11yOptions = {
  focusSelector: undefined,
  experimental: false,
  tagNameMap: {},
  scrollableMap: {},
  encode: (backendNodeId: number) => `enc-${backendNodeId}`,
};

// Builds an AXNode from partial overrides. When `nodeId` or `backendDOMNodeId`
// is not supplied it is filled with a Math.random()-derived value, so tests
// that depend on ids must pass them explicitly.
// NOTE(review): `Partial` has no type argument in this copy — confirm upstream.
const makeAxNode = (
  overrides: Partial = {},
): Protocol.Accessibility.AXNode => ({
  nodeId: overrides.nodeId ?? String(Math.random()),
  backendDOMNodeId:
    overrides.backendDOMNodeId ?? Math.floor(Math.random() * 1e6),
  role: overrides.role ?? axString("generic"),
  childIds: overrides.childIds ?? [],
  parentId: overrides.parentId,
  properties: overrides.properties ?? [],
  name: overrides.name,
  description: overrides.description,
  value: overrides.value,
  ignored: overrides.ignored ??
false,
});

describe("decorateRoles", () => {
  it("marks scrollable DOM nodes with tag labels and encoded ids", () => {
    const opts: A11yOptions = {
      ...defaultOpts,
      tagNameMap: {
        "enc-1": "div",
        "enc-2": "html",
        "enc-3": "#document",
        "enc-4": "#svg",
      },
      // Only enc-1 and enc-4 are flagged scrollable here.
      scrollableMap: { "enc-1": true, "enc-4": true },
    };
    const nodes = [
      makeAxNode({
        backendDOMNodeId: 1,
        role: { type: "string", value: "region" },
      }),
      makeAxNode({
        backendDOMNodeId: 2,
        role: { type: "string", value: "generic" },
      }),
      makeAxNode({
        backendDOMNodeId: 3,
        role: { type: "string", value: "generic" },
      }),
      makeAxNode({
        backendDOMNodeId: 4,
        role: { type: "string", value: "generic" },
      }),
    ];

    const decorated = decorateRoles(nodes, opts);

    // Expectations encoded below: enc-2 ("html") is labelled scrollable even
    // though it is absent from scrollableMap, and "#svg" is rendered as "svg".
    expect(decorated).toMatchObject([
      { encodedId: "enc-1", role: "scrollable, div" },
      { encodedId: "enc-2", role: "scrollable, html" },
      { encodedId: "enc-3", role: "generic" },
      { encodedId: "enc-4", role: "scrollable, svg" },
    ]);
  });

  it("falls back when encoding fails", () => {
    // An `encode` that throws should leave `encodedId` undefined rather than
    // propagate the error.
    const opts: A11yOptions = {
      ...defaultOpts,
      encode: () => {
        throw new Error("boom");
      },
    };
    const nodes = [makeAxNode({ backendDOMNodeId: 4 })];

    const decorated = decorateRoles(nodes, opts);

    expect(decorated[0]?.encodedId).toBeUndefined();
  });
});

describe("buildHierarchicalTree", () => {
  // Tag names are keyed by encodedId; the fixtures below use "root" / "child".
  const opts: A11yOptions = {
    ...defaultOpts,
    tagNameMap: { root: "div", child: "span" },
  };

  it("drops structural nodes without children or names", async () => {
    const nodes: A11yNode[] = [
      {
        role: "generic",
        name: "",
        nodeId: "root",
        encodedId: "root",
        parentId: undefined,
        childIds: ["child"],
      },
      {
        role: "generic",
        name: "",
        nodeId: "child",
        encodedId: "child",
        parentId: "root",
        childIds: [],
      },
    ];

    const { tree } = await buildHierarchicalTree(nodes, opts);

    expect(tree).toEqual([]);
  });

  it("promotes select/combobox tag names for structural nodes", async () => {
    const nodes: A11yNode[] = [
      {
        role: "combobox",
        name: "Select",
        nodeId: "root",
        encodedId: "root",
        parentId: undefined,
        childIds: ["child"],
      },
      {
        role: "StaticText",
        name: "Option",
        nodeId: "child",
        encodedId: "child",
        parentId: "root",
        childIds: [],
      },
    ];

    // The root is mapped to the "select" tag for this case only.
    const { tree } = await buildHierarchicalTree(nodes, {
      ...opts,
      tagNameMap: { root: "select" },
    });

    expect(tree[0]?.role).toBe("select");
  });

  it("drops structural parents with a single cleaned child while keeping it in place", async () => {
    const nodes: A11yNode[] = [
      {
        role: "generic",
        name: "",
        nodeId: "root",
        encodedId: "root",
        parentId: undefined,
        childIds: ["child"],
      },
      {
        role: "StaticText",
        name: "Ok",
        nodeId: "child",
        encodedId: "child",
        parentId: "root",
        childIds: [],
      },
    ];

    const { tree } = await buildHierarchicalTree(nodes, opts);

    // The unnamed generic wrapper disappears; its only child takes its place.
    expect(tree[0]?.role).toBe("StaticText");
  });

  // NOTE(review): this case uses the same fixture and assertion as "drops
  // structural nodes without children or names" above — possibly redundant.
  it("drops structural parents entirely when all descendants are pruned", async () => {
    const nodes: A11yNode[] = [
      {
        role: "generic",
        name: "",
        nodeId: "root",
        encodedId: "root",
        parentId: undefined,
        childIds: ["child"],
      },
      {
        role: "generic",
        name: "",
        nodeId: "child",
        encodedId: "child",
        parentId: "root",
        childIds: [],
      },
    ];

    const { tree } = await buildHierarchicalTree(nodes, opts);

    expect(tree).toEqual([]);
  });

  it("renames structural nodes to their tag names when not combobox", async () => {
    const nodes: A11yNode[] = [
      {
        role: "generic",
        name: "Container",
        nodeId: "root",
        encodedId: "root",
        parentId: undefined,
        childIds: ["child-a", "child-b"],
      },
      {
        role: "StaticText",
        name: "A",
        nodeId: "child-a",
        encodedId: "child-a",
        parentId: "root",
        childIds: [],
      },
      {
        role: "StaticText",
        name: "B",
        nodeId: "child-b",
        encodedId: "child-b",
        parentId: "root",
        childIds: [],
      },
    ];

    const { tree } = await buildHierarchicalTree(nodes, {
      ...opts,
      tagNameMap: { root: "section" },
    });

    expect(tree[0]?.role).toBe("section");
  });

  it("skips nodes with negative node ids early", async () => {
    // nodeId "-1" is dropped even though the role ("button") is not structural.
    const nodes: A11yNode[] = [
      {
        role: "button",
        name: "Hidden",
        nodeId: "-1",
        encodedId: "hidden",
        parentId: undefined,
        childIds: [],
      },
    ];

    const { tree } = await buildHierarchicalTree(nodes, opts);

    expect(tree).toEqual([]);
  });
});
describe("isStructural", () => {
  it("marks generic/none/InlineTextBox roles as structural", () => {
    expect(isStructural("generic")).toBe(true);
    expect(isStructural("none")).toBe(true);
    expect(isStructural("InlineTextBox")).toBe(true);
    // A semantic role such as "button" is not structural.
    expect(isStructural("button")).toBe(false);
  });
});

describe("removeRedundantStaticTextChildren", () => {
  it("removes static text children whose concatenated text equals the parent name", () => {
    const parent: A11yNode = {
      role: "button",
      name: "HelloWorld",
      nodeId: "root",
    };
    // "Hello" + "World" equals the parent name, so both StaticText children
    // are redundant; the non-StaticText child must survive.
    const children: A11yNode[] = [
      { role: "StaticText", name: "Hello", nodeId: "c1" },
      { role: "StaticText", name: "World", nodeId: "c2" },
      { role: "button", name: "Child", nodeId: "c3" },
    ];

    const pruned = removeRedundantStaticTextChildren(parent, children);

    expect(pruned).toEqual([{ role: "button", name: "Child", nodeId: "c3" }]);
  });

  it("keeps static text when combined text differs", () => {
    const parent: A11yNode = {
      role: "button",
      name: "Hello World",
      nodeId: "root",
    };
    // "HelloMars" does not match "Hello World", so nothing is removed.
    const children: A11yNode[] = [
      { role: "StaticText", name: "Hello", nodeId: "c1" },
      { role: "StaticText", name: "Mars", nodeId: "c2" },
    ];

    expect(removeRedundantStaticTextChildren(parent, children)).toEqual(
      children,
    );
  });

  it("returns original children when parent name is empty", () => {
    // The parent has no `name` property at all in this fixture.
    const parent: A11yNode = {
      role: "button",
      nodeId: "root",
    };
    const children: A11yNode[] = [
      { role: "StaticText", name: "Hello", nodeId: "c1" },
      { role: "StaticText", name: "World", nodeId: "c2" },
    ];

    expect(removeRedundantStaticTextChildren(parent, children)).toEqual(
      children,
    );
  });
});

describe("extractUrlFromAXNode", () => {
  it("returns trimmed URL string from node properties", () => {
    // The url value is padded with whitespace; an unrelated property comes first.
    const node = makeAxNode({
      properties: [
        { name: "busy", value: axString("bar") },
        { name: "url", value: axString(" https://example.com ") },
      ],
    });

    expect(extractUrlFromAXNode(node)).toBe("https://example.com");
  });

  it("returns undefined when url property missing or invalid", () => {
    expect(
      extractUrlFromAXNode(makeAxNode({ properties: [] })),
    ).toBeUndefined();
    // A `url` property whose value is a number is treated as invalid.
    expect(
      extractUrlFromAXNode(
        makeAxNode({
          properties: [{ name: "url", value: { type: "number", value: 123 } }],
        }),
      ),
    ).toBeUndefined();
  });
});
================================================
FILE: packages/core/tests/unit/snapshot-capture-orchestration.test.ts
================================================
import type { Protocol } from "devtools-protocol";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { CDPSessionLike } from "../../lib/v3/understudy/cdp.js";
import type { Page } from "../../lib/v3/understudy/page.js";
import type {
  FrameContext,
  SessionDomIndex,
} from "../../lib/v3/types/private/index.js";
import * as capture from "../../lib/v3/understudy/a11y/snapshot/capture.js";
import * as a11yTree from "../../lib/v3/understudy/a11y/snapshot/a11yTree.js";
import * as domTree from "../../lib/v3/understudy/a11y/snapshot/domTree.js";
import * as focusSelectors from "../../lib/v3/understudy/a11y/snapshot/focusSelectors.js";
import { MockCDPSession } from "./helpers/mockCDPSession.js";

// Builds a protocol Frame carrying only the fields set here; the double cast
// skips the remaining required members of Protocol.Page.Frame.
const makeProtocolFrame = (id: string): Protocol.Page.Frame =>
  ({
    id,
    loaderId: `${id}-loader`,
    url: "https://example.com",
    securityOrigin: "https://example.com",
    mimeType: "text/html",
  }) as unknown as Protocol.Page.Frame;

// Builds a FrameTree node for `id` with the given child subtrees (none by default).
const makeFrameTree = (
  id: string,
  children: Protocol.Page.FrameTree[] = [],
): Protocol.Page.FrameTree => ({
  frame: makeProtocolFrame(id),
  childFrames: children,
});

// The subset of Page members that the default stub implements.
type PageStub = Pick<
  Page,
  | "mainFrameId"
  | "asProtocolFrameTree"
  | "listAllFrameIds"
  | "getSessionForFrame"
  | "getOrdinal"
>;

// Returns a single-frame Page stub ("frame-1", ordinal 0, one shared session);
// `overrides` replace individual members.
// NOTE(review): `Partial` has no type argument in this copy — confirm upstream.
const makePage = (overrides: Partial = {}): Page => {
  const defaultSession = new MockCDPSession({}, "default-session");
  const base: PageStub = {
    mainFrameId: () => "frame-1",
    asProtocolFrameTree: () => makeFrameTree("frame-1"),
    listAllFrameIds: () => ["frame-1"],
    getSessionForFrame: () => defaultSession,
    getOrdinal: () => 0,
  };
  return { ...base, ...overrides } as
unknown as Page; }; const makeSessionIndex = (): SessionDomIndex => ({ rootBackend: 100, absByBe: new Map([ [100, "/"], [101, "/html[1]"], [102, "/html[1]/body[1]"], [150, "/html[1]/body[1]/iframe[1]"], [200, "/html[1]/body[1]/iframe[1]"], [201, "/html[1]/body[1]/iframe[1]/div[1]"], ]), tagByBe: new Map([ [100, "#document"], [101, "html"], [102, "body"], [150, "iframe"], [200, "#document"], [201, "div"], ]), scrollByBe: new Map([[201, true]]), docRootOf: new Map([ [100, 100], [101, 100], [102, 100], [150, 100], [200, 200], [201, 200], ]), contentDocRootByIframe: new Map([[150, 200]]), }); beforeEach(() => { vi.restoreAllMocks(); }); describe("buildFrameContext", () => { it("indexes parent relationships from the frame tree", () => { const frameTree = makeFrameTree("frame-1", [ makeFrameTree("frame-2", [makeFrameTree("frame-3")]), makeFrameTree("frame-4"), ]); const page = makePage({ asProtocolFrameTree: () => frameTree, listAllFrameIds: () => ["frame-1", "frame-2", "frame-3", "frame-4"], }); const context = capture.buildFrameContext(page); expect(context.rootId).toBe("frame-1"); expect(context.frames).toEqual([ "frame-1", "frame-2", "frame-3", "frame-4", ]); expect(context.parentByFrame.get("frame-1")).toBeNull(); expect(context.parentByFrame.get("frame-2")).toBe("frame-1"); expect(context.parentByFrame.get("frame-3")).toBe("frame-2"); expect(context.parentByFrame.get("frame-4")).toBe("frame-1"); }); }); describe("buildSessionIndexes", () => { it("deduplicates frames that share the same CDP session id", async () => { const session = new MockCDPSession({}, "session-a"); const page = makePage({ // Every frame lookup returns the same session instance, so buildSessionIndexes // should call buildSessionDomIndex only once and reuse the result. 
getSessionForFrame: () => session,
    });
    const idx = makeSessionIndex();
    const spy = vi
      .spyOn(domTree, "buildSessionDomIndex")
      .mockResolvedValue(idx);

    const result = await capture.buildSessionIndexes(
      page,
      ["frame-1", "frame-2"],
      true,
    );

    expect(spy).toHaveBeenCalledTimes(1); // only one DOM.getDocument per session id
    expect(spy).toHaveBeenCalledWith(session, true);
    expect(result.get("session-a")).toBe(idx);
  });

  it("builds indexes for sessions without ids using the 'root' key", async () => {
    // Hand-rolled session with no id; send ignores its arguments and resolves
    // to an empty object cast to the caller's requested result type.
    const sessionWithoutId: CDPSessionLike = {
      id: undefined,
      async send<R = unknown>(
        _method: string,
        _params?: Record<string, unknown>,
      ): Promise<R> {
        void _method;
        void _params;
        return {} as R;
      },
      on() {},
      off() {},
      async close() {},
    };
    const sessionWithId = new MockCDPSession({}, "child-session");
    const page = makePage({
      getSessionForFrame: (frameId: string) =>
        frameId === "frame-1" ? sessionWithoutId : sessionWithId,
    });
    const idxA = makeSessionIndex();
    const idxB = makeSessionIndex();
    const spy = vi
      .spyOn(domTree, "buildSessionDomIndex")
      .mockResolvedValueOnce(idxA)
      .mockResolvedValueOnce(idxB);

    const result = await capture.buildSessionIndexes(
      page,
      ["frame-1", "frame-2"],
      false,
    );

    // Verifies the helper invokes buildSessionDomIndex once for each unique session,
    // keying anonymous sessions as "root" so downstream lookups remain stable.
    expect(spy).toHaveBeenNthCalledWith(1, sessionWithoutId, false);
    expect(spy).toHaveBeenNthCalledWith(2, sessionWithId, false);
    expect(result.get("root")).toBe(idxA);
    expect(result.get("child-session")).toBe(idxB);
  });
});

describe("collectPerFrameMaps", () => {
  it("builds per-frame xpath/tag maps and outlines from a shared session index", async () => {
    const session = new MockCDPSession(
      {
        "DOM.getFrameOwner": async () => ({ backendNodeId: 150 }),
      },
      "session-a",
    );
    const page = makePage({
      getSessionForFrame: () => session,
      getOrdinal: (frameId: string) => (frameId === "frame-1" ?
0 : 1), }); const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ]), }; const sessionIndex = makeSessionIndex(); const sessionToIndex = new Map([[session.id, sessionIndex]]); vi.spyOn(a11yTree, "a11yForFrame").mockImplementation( async (_sess, frameId) => ({ outline: `outline-${frameId}`, urlMap: { [`url-${frameId}`]: `https://${frameId}.test` }, scopeApplied: false, }), ); const result = await capture.collectPerFrameMaps( page, context, sessionToIndex, { experimental: true }, true, context.frames, ); expect(result.perFrameOutlines).toEqual([ { frameId: "frame-1", outline: "outline-frame-1" }, { frameId: "frame-2", outline: "outline-frame-2" }, ]); const rootMaps = result.perFrameMaps.get("frame-1"); expect(rootMaps?.xpathMap["0-100"]).toBe("/"); expect(rootMaps?.xpathMap["0-101"]).toBe("/html[1]"); expect(rootMaps?.xpathMap["0-102"]).toBe("/html[1]/body[1]"); const childMaps = result.perFrameMaps.get("frame-2"); expect(childMaps?.xpathMap["1-200"]).toBe("/"); expect(childMaps?.xpathMap["1-201"]).toBe("/div[1]"); expect(childMaps?.scrollableMap["1-201"]).toBe(true); expect(childMaps?.urlMap).toEqual({ "url-frame-2": "https://frame-2.test", }); expect(session.callsFor("DOM.getFrameOwner")).toHaveLength(1); }); it("builds a missing session index on demand and memoizes it", async () => { const session = new MockCDPSession({}, "new-session"); const page = makePage({ getSessionForFrame: () => session, getOrdinal: () => 2, }); const context: FrameContext = { rootId: "frame-9", frames: ["frame-9"], parentByFrame: new Map([["frame-9", null]]), }; const idx = makeSessionIndex(); const buildSpy = vi .spyOn(domTree, "buildSessionDomIndex") .mockResolvedValue(idx); vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({ outline: "outline", urlMap: {}, scopeApplied: false, }); const sessionToIndex = new Map(); const result = await capture.collectPerFrameMaps( page, context, 
sessionToIndex, undefined, false, context.frames, ); expect(buildSpy).toHaveBeenCalledWith(session, false); expect(sessionToIndex.get("new-session")).toBe(idx); expect(result.perFrameMaps.get("frame-9")?.xpathMap["2-100"]).toBe("/"); }); it("skips frames that are not listed in the frameIds argument", async () => { const session = new MockCDPSession({}, "session-a"); const page = makePage({ getSessionForFrame: () => session, getOrdinal: (frameId: string) => (frameId === "frame-1" ? 0 : 1), }); const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ]), }; const sessionIndex = makeSessionIndex(); const sessionToIndex = new Map([[session.id, sessionIndex]]); const a11ySpy = vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({ outline: "outline", urlMap: {}, scopeApplied: false, }); const result = await capture.collectPerFrameMaps( page, context, sessionToIndex, undefined, true, ["frame-1"], ); expect(a11ySpy).toHaveBeenCalledTimes(1); expect(result.perFrameMaps.has("frame-2")).toBe(false); expect(result.perFrameOutlines.map((o) => o.frameId)).toEqual(["frame-1"]); }); }); describe("captureHybridSnapshot", () => { it("returns early when the scoped snapshot path succeeds", async () => { const session = new MockCDPSession({}, "session-a"); const page = makePage({ getSessionForFrame: () => session, }); const options = { focusSelector: "/html" }; vi.spyOn(focusSelectors, "resolveFocusFrameAndTail").mockResolvedValue({ targetFrameId: "frame-1", tailXPath: "", absPrefix: "", }); const domMapsSpy = vi .spyOn(domTree, "domMapsForSession") .mockResolvedValue({ tagNameMap: { "0-100": "#document" }, xpathMap: { "0-100": "/" }, scrollableMap: {}, }); const a11ySpy = vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({ outline: "scoped outline", urlMap: { "0-100": "https://frame-1.test" }, scopeApplied: true, }); const buildIndexSpy = vi .spyOn(domTree, "buildSessionDomIndex") 
.mockImplementation(() => { throw new Error("should not build session index when scoped"); }); const result = await capture.captureHybridSnapshot(page, options); expect(result.combinedTree).toBe("scoped outline"); expect(result.combinedUrlMap["0-100"]).toBe("https://frame-1.test"); expect(domMapsSpy).toHaveBeenCalled(); expect(a11ySpy).toHaveBeenCalled(); expect(buildIndexSpy).not.toHaveBeenCalled(); }); it("scoped snapshot still succeeds when iframe inclusion is disabled", async () => { const session = new MockCDPSession({}, "session-a"); const page = makePage({ getSessionForFrame: () => session, }); const options = { focusSelector: "/html", includeIframes: false }; vi.spyOn(focusSelectors, "resolveFocusFrameAndTail").mockResolvedValue({ targetFrameId: "frame-1", tailXPath: "", absPrefix: "", }); const domMapsSpy = vi .spyOn(domTree, "domMapsForSession") .mockResolvedValue({ tagNameMap: { "0-100": "#document" }, xpathMap: { "0-100": "/" }, scrollableMap: {}, }); const a11ySpy = vi.spyOn(a11yTree, "a11yForFrame").mockResolvedValue({ outline: "scoped outline", urlMap: { "0-100": "https://frame-1.test" }, scopeApplied: true, }); const buildIndexSpy = vi .spyOn(domTree, "buildSessionDomIndex") .mockImplementation(() => { throw new Error("should not build session index when scoped"); }); const result = await capture.captureHybridSnapshot(page, options); expect(result.combinedTree).toBe("scoped outline"); expect(result.combinedUrlMap["0-100"]).toBe("https://frame-1.test"); expect(domMapsSpy).toHaveBeenCalled(); expect(a11ySpy).toHaveBeenCalled(); expect(buildIndexSpy).not.toHaveBeenCalled(); }); it("collects per-frame data and merges it when no scoped snapshot is available", async () => { const session = new MockCDPSession( { "DOM.getFrameOwner": async () => ({ backendNodeId: 150 }), }, "session-a", ); const page = makePage({ asProtocolFrameTree: () => makeFrameTree("frame-1", [makeFrameTree("frame-2")]), listAllFrameIds: () => ["frame-1", "frame-2"], 
getSessionForFrame: () => session,
      getOrdinal: (frameId: string) => (frameId === "frame-1" ? 0 : 1),
    });
    const idx = makeSessionIndex();
    vi.spyOn(domTree, "buildSessionDomIndex").mockResolvedValue(idx);
    vi.spyOn(a11yTree, "a11yForFrame").mockImplementation(
      async (_sess, frameId) => ({
        outline:
          frameId === "frame-1"
            ? "[0-150] iframe host"
            : "[1-200] child subtree",
        urlMap: { [`url-${frameId}`]: `https://${frameId}.test` },
        scopeApplied: false,
      }),
    );

    const snapshot = await capture.captureHybridSnapshot(page);

    expect(snapshot.combinedTree).toContain("[1-200] child subtree");
    expect(snapshot.combinedXpathMap["0-100"]).toBe("/");
    expect(snapshot.combinedXpathMap["1-201"]).toBe(
      "/html[1]/body[1]/iframe[1]/div[1]",
    );
    expect(snapshot.combinedUrlMap["url-frame-2"]).toBe("https://frame-2.test");
    expect(snapshot.perFrame?.map((pf) => pf.frameId)).toEqual([
      "frame-1",
      "frame-2",
    ]);
  });

  it("omits iframe frames when includeIframes is false", async () => {
    const session = new MockCDPSession(
      {
        "DOM.getFrameOwner": async () => ({ backendNodeId: 150 }),
      },
      "session-a",
    );
    const page = makePage({
      asProtocolFrameTree: () =>
        makeFrameTree("frame-1", [makeFrameTree("frame-2")]),
      listAllFrameIds: () => ["frame-1", "frame-2"],
      getSessionForFrame: () => session,
      getOrdinal: (frameId: string) => (frameId === "frame-1" ? 0 : 1),
    });
    const idx = makeSessionIndex();
    vi.spyOn(domTree, "buildSessionDomIndex").mockResolvedValue(idx);
    const a11ySpy = vi
      .spyOn(a11yTree, "a11yForFrame")
      .mockImplementation(async (_sess, frameId) => ({
        outline:
          frameId === "frame-1"
            ? "[0-150] iframe host"
            : "[1-200] child subtree",
        urlMap: { [`url-${frameId}`]: `https://${frameId}.test` },
        scopeApplied: false,
      }));

    const snapshot = await capture.captureHybridSnapshot(page, {
      includeIframes: false,
    });

    expect(a11ySpy).toHaveBeenCalledTimes(1);
    expect(session.callsFor("DOM.getFrameOwner")).toHaveLength(0);
    expect(snapshot.perFrame?.map((pf) => pf.frameId)).toEqual(["frame-1"]);
    expect(snapshot.combinedXpathMap["1-201"]).toBeUndefined();
    expect(snapshot.combinedTree).not.toContain("[1-200] child subtree");
  });
});

================================================ FILE: packages/core/tests/unit/snapshot-cbor.test.ts ================================================
import { describe, expect, it } from "vitest";
import type { Protocol } from "devtools-protocol";
import { captureHybridSnapshot } from "../../lib/v3/understudy/a11y/snapshot/index.js";
import { MockCDPSession } from "./helpers/mockCDPSession.js";
import type { Page } from "../../lib/v3/understudy/page.js";
import { StagehandDomProcessError } from "../../lib/v3/types/public/sdkErrors.js";
// Only used in a type position below, so a type-only import is sufficient.
import type { CDPSessionLike } from "../../lib/v3/understudy/cdp.js";

/** Shape of a mocked CDP method handler: optional params in, sync or async result out. */
type Handler = (
  params?: Record<string, unknown>,
) => Promise<unknown> | unknown;

/**
 * Builds a Page stub with a single frame ("root") whose every frame lookup
 * returns `session`.
 */
function createFakePage(session: CDPSessionLike): Page {
  const frameTree: Protocol.Page.FrameTree = {
    frame: {
      id: "root" as Protocol.Page.FrameId,
      loaderId: "root-loader" as Protocol.Network.LoaderId,
      url: "http://fake",
      domainAndRegistry: "fake",
      securityOrigin: "http://fake",
      mimeType: "text/html",
      secureContextType: "Secure",
      crossOriginIsolatedContextType: "NotIsolated",
      gatedAPIFeatures: [],
    },
    childFrames: [],
  };

  return {
    mainFrameId: () => "root",
    asProtocolFrameTree: () => frameTree,
    listAllFrameIds: () => ["root"],
    getSessionForFrame: () => session,
    getOrdinal: () => 0,
  } as unknown as Page;
}

/** DOM fixture: #document > HTML > BODY > DIV with every declared child present. */
function completeDomTree(): Protocol.DOM.Node {
  return {
    nodeId: 1,
    backendNodeId: 1,
    nodeType: 9,
    nodeName: "#document",
    childNodeCount: 1,
    children: [
      {
        nodeId: 2,
        backendNodeId: 2,
nodeType: 1,
        nodeName: "HTML",
        childNodeCount: 1,
        children: [
          {
            nodeId: 3,
            backendNodeId: 3,
            nodeType: 1,
            nodeName: "BODY",
            childNodeCount: 1,
            children: [
              {
                nodeId: 4,
                backendNodeId: 4,
                nodeType: 1,
                nodeName: "DIV",
                childNodeCount: 0,
                children: [],
              },
            ],
          },
        ],
      },
    ],
  } as Protocol.DOM.Node;
}

/** DOM fixture whose HTML node declares one child but carries an empty children array. */
function truncatedDomTree(): Protocol.DOM.Node {
  return {
    nodeId: 1,
    backendNodeId: 1,
    nodeType: 9,
    nodeName: "#document",
    childNodeCount: 1,
    children: [
      {
        nodeId: 2,
        backendNodeId: 2,
        nodeType: 1,
        nodeName: "HTML",
        childNodeCount: 1,
        children: [],
      },
    ],
  } as Protocol.DOM.Node;
}

/** The HTML node (backend id 2) with its BODY > DIV descendants populated. */
function htmlWithChildren(): Protocol.DOM.Node {
  return {
    nodeId: 2,
    backendNodeId: 2,
    nodeType: 1,
    nodeName: "HTML",
    childNodeCount: 1,
    children: [
      {
        nodeId: 3,
        backendNodeId: 3,
        nodeType: 1,
        nodeName: "BODY",
        childNodeCount: 1,
        children: [
          {
            nodeId: 4,
            backendNodeId: 4,
            nodeType: 1,
            nodeName: "DIV",
            childNodeCount: 0,
            children: [],
          },
        ],
      },
    ],
  } as Protocol.DOM.Node;
}

/**
 * Two-node accessibility tree: a RootWebArea bound to backend node 2 with a
 * single "generic" child named "Content" bound to backend node 4.
 */
function simpleAxNodes(): Protocol.Accessibility.AXNode[] {
  const stringType: Protocol.Accessibility.AXValueType = "string";
  return [
    {
      nodeId: "1",
      role: { type: stringType, value: "RootWebArea" },
      backendDOMNodeId: 2,
      childIds: ["2"],
      ignored: false,
    },
    {
      nodeId: "2",
      role: { type: stringType, value: "generic" },
      name: { type: stringType, value: "Content" },
      backendDOMNodeId: 4,
      parentId: "1",
      childIds: [] as string[],
      ignored: false,
    },
  ];
}

/** Handlers spread into every mock session below; the DOM.* handlers are added per test. */
const baseHandlers: Record<string, Handler> = {
  "DOM.enable": async () => ({}),
  "Runtime.enable": async () => ({}),
  "Accessibility.enable": async () => ({}),
  "Accessibility.getFullAXTree": async () => ({ nodes: simpleAxNodes() }),
};

/** Error carrying the "CBOR: stack limit exceeded" message the tests throw from mocked CDP calls. */
function makeCborError(): Error {
  return new Error("CBOR: stack limit exceeded");
}

describe("captureHybridSnapshot CBOR fallbacks", () => {
  it("retries DOM.getDocument with reduced depths before succeeding", async () => {
    let domCalls = 0;
    const session = new MockCDPSession({
      ...baseHandlers,
      "DOM.getDocument": async (params) => {
        domCalls += 1;
        // Keep `throw` and its operand on one line: a line break after
        // `throw` is a syntax error.
        if (domCalls === 1) throw makeCborError();
        expect(params?.depth).toBe(256);
        return { root: completeDomTree() };
      },
    });

    const page = createFakePage(session);
    const snapshot = await captureHybridSnapshot(page);

    expect(snapshot.combinedTree).toContain("html");
    const depths = session
      .callsFor("DOM.getDocument")
      .map((c) => c.params?.depth);
    expect(depths).toEqual([-1, 256]);
  });

  it("throws StagehandDomProcessError after all DOM.getDocument attempts fail", async () => {
    const session = new MockCDPSession({
      ...baseHandlers,
      "DOM.getDocument": async () => {
        throw makeCborError();
      },
    });
    const page = createFakePage(session);

    await expect(captureHybridSnapshot(page)).rejects.toThrow(
      StagehandDomProcessError,
    );
  });

  it("hydrates truncated nodes by retrying DOM.describeNode depths", async () => {
    let domAttempts = 0;
    let describeAttempts = 0;
    const session = new MockCDPSession({
      ...baseHandlers,
      "DOM.getDocument": async (params) => {
        domAttempts += 1;
        if (domAttempts === 1) throw makeCborError();
        expect(params?.depth).toBe(256);
        return { root: truncatedDomTree() };
      },
      "DOM.describeNode": async (params) => {
        describeAttempts += 1;
        if (describeAttempts === 1) throw makeCborError();
        expect(params?.depth).toBe(64);
        return { node: htmlWithChildren() };
      },
    });

    const page = createFakePage(session);
    const snapshot = await captureHybridSnapshot(page);

    const describeDepths = session
      .callsFor("DOM.describeNode")
      .map((c) => c.params?.depth);
    expect(describeDepths).toEqual([-1, 64]);
    expect(snapshot.combinedXpathMap["0-4"]).toBe("/html[1]/body[1]/div[1]");
  });
});

================================================ FILE: packages/core/tests/unit/snapshot-dom-session-builders.test.ts ================================================
import type { Protocol } from "devtools-protocol";
import { describe, expect, it } from "vitest";
import {
  buildSessionDomIndex,
  domMapsForSession,
  getDomTreeWithFallback,
  hydrateDomTree,
} from "../../lib/v3/understudy/a11y/snapshot/domTree.js";
import { StagehandDomProcessError } from "../../lib/v3/types/public/sdkErrors.js";
import { MockCDPSession } from "./helpers/mockCDPSession.js";

// Shared counter that hands out node ids and backend node ids not set by the caller.
let nextNodeId = 1;

/**
 * Creates a Protocol.DOM.Node. Fields missing from `overrides` fall back to an
 * element named "DIV" with no children; missing ids come from nextNodeId, and
 * childNodeCount defaults to the number of supplied children.
 */
const makeDomNode = (
  overrides: Partial<Protocol.DOM.Node> = {},
): Protocol.DOM.Node => {
  const nodeId = overrides.nodeId ?? nextNodeId++;
  const backendNodeId = overrides.backendNodeId ?? nextNodeId++;
  const nodeName = overrides.nodeName ?? "DIV";
  const nodeType = overrides.nodeType ?? 1;
  const children = overrides.children ?? [];
  return {
    nodeId,
    backendNodeId,
    nodeName,
    nodeType,
    localName: overrides.localName ?? nodeName.toLowerCase(),
    nodeValue: overrides.nodeValue ?? "",
    childNodeCount: overrides.childNodeCount ?? children.length,
    children,
    shadowRoots: overrides.shadowRoots,
    contentDocument: overrides.contentDocument,
    isScrollable: overrides.isScrollable,
  };
};

/**
 * Builds #document > HTML > BODY holding a scrollable DIV and an IFRAME whose
 * contentDocument is #document > HTML > scrollable BODY > P. Every node is
 * returned by name so tests can look up its generated ids.
 */
const buildSampleDomTree = () => {
  const iframeChild = makeDomNode({ nodeName: "P" });
  const iframeBody = makeDomNode({
    nodeName: "BODY",
    children: [iframeChild],
    isScrollable: true,
  });
  const iframeHtml = makeDomNode({ nodeName: "HTML", children: [iframeBody] });
  const iframeDoc = makeDomNode({
    nodeName: "#document",
    nodeType: 9,
    children: [iframeHtml],
  });

  const iframeElement = makeDomNode({
    nodeName: "IFRAME",
    contentDocument: iframeDoc,
  });
  const scrollDiv = makeDomNode({
    nodeName: "DIV",
    isScrollable: true,
  });
  const body = makeDomNode({
    nodeName: "BODY",
    children: [scrollDiv, iframeElement],
  });
  const html = makeDomNode({ nodeName: "HTML", children: [body] });
  const root = makeDomNode({
    nodeName: "#document",
    nodeType: 9,
    children: [html],
  });

  return {
    root,
    html,
    body,
    scrollDiv,
    iframeElement,
    iframeDoc,
    iframeHtml,
    iframeBody,
    iframeChild,
  };
};

describe("hydrateDomTree", () => {
  it("expands truncated nodes by calling DOM.describeNode", async () => {
    const child = makeDomNode({ nodeName: "DIV" });
    const root = makeDomNode({
      nodeName: "HTML",
      childNodeCount: 1,
      children: [],
    });
    const session = new MockCDPSession({
      "DOM.describeNode": async () => ({
        node: {
          ...root,
          children: [child],
childNodeCount: 1, }, }), }); await hydrateDomTree(session, root, true); expect(root.children).toEqual([child]); }); it("retries describeNode when CBOR errors occur before succeeding", async () => { const child = makeDomNode({ nodeName: "DIV" }); const root = makeDomNode({ nodeName: "HTML", childNodeCount: 1, children: [], }); let attempts = 0; const session = new MockCDPSession({ "DOM.describeNode": async () => { attempts++; if (attempts === 1) throw new Error("CBOR: stack limit exceeded"); return { node: { ...root, children: [child], childNodeCount: 1 } }; }, }); await hydrateDomTree(session, root, true); expect(attempts).toBe(2); expect(root.children).toEqual([child]); }); it("throws StagehandDomProcessError after exhausting describeNode retries", async () => { const root = makeDomNode({ nodeName: "HTML", childNodeCount: 1, children: [], }); const session = new MockCDPSession({ "DOM.describeNode": async () => { throw new Error("CBOR: stack limit exceeded"); }, }); await expect(hydrateDomTree(session, root, true)).rejects.toBeInstanceOf( StagehandDomProcessError, ); }); }); describe("getDomTreeWithFallback", () => { it("retries DOM.getDocument after CBOR errors and returns the hydrated root", async () => { const root = makeDomNode({ nodeName: "#document", nodeType: 9, children: [], }); const depths: number[] = []; const session = new MockCDPSession({ "DOM.getDocument": async (params) => { const depth = (params?.depth ?? 
0) as number; depths.push(depth); if (depth === -1) throw new Error("CBOR: stack limit exceeded"); return { root }; }, "DOM.describeNode": async () => ({ node: root }), }); const result = await getDomTreeWithFallback(session, true); expect(result).toBe(root); expect(depths).toEqual([-1, 256]); }); it("propagates non-CBOR DOM.getDocument errors", async () => { const session = new MockCDPSession({ "DOM.getDocument": async () => { throw new Error("network fail"); }, }); await expect(getDomTreeWithFallback(session, false)).rejects.toThrow( "network fail", ); }); it("throws StagehandDomProcessError when all depth attempts hit CBOR limits", async () => { const session = new MockCDPSession({ "DOM.getDocument": async () => { throw new Error("CBOR: stack limit exceeded"); }, }); await expect(getDomTreeWithFallback(session, false)).rejects.toBeInstanceOf( StagehandDomProcessError, ); }); }); describe("buildSessionDomIndex", () => { it("collects absolute paths, scrollability, and content-document metadata", async () => { const tree = buildSampleDomTree(); const session = new MockCDPSession({ "DOM.enable": async () => ({}), "DOM.getDocument": async () => ({ root: tree.root }), "DOM.describeNode": async () => ({ node: tree.root }), }); const index = await buildSessionDomIndex(session, true); expect(index.rootBackend).toBe(tree.root.backendNodeId); expect(index.absByBe.get(tree.body.backendNodeId)).toBe("/html[1]/body[1]"); expect(index.absByBe.get(tree.scrollDiv.backendNodeId)).toBe( "/html[1]/body[1]/div[1]", ); expect(index.scrollByBe.get(tree.scrollDiv.backendNodeId)).toBe(true); expect(index.docRootOf.get(tree.iframeHtml.backendNodeId)).toBe( tree.iframeDoc.backendNodeId, ); expect( index.contentDocRootByIframe.get(tree.iframeElement.backendNodeId), ).toBe(tree.iframeDoc.backendNodeId); }); }); describe("domMapsForSession", () => { it("derives frame-relative xpath/tag/scrollable maps for a frame's document root", async () => { const tree = buildSampleDomTree(); const 
session = new MockCDPSession({ "DOM.enable": async () => ({}), "DOM.getDocument": async () => ({ root: tree.root }), "DOM.getFrameOwner": async () => ({ backendNodeId: tree.iframeElement.backendNodeId, }), "DOM.describeNode": async () => ({ node: tree.root }), }); const encode = (frameId: string, backendNodeId: number) => `${frameId}-${backendNodeId}`; const maps = await domMapsForSession( session, "frame-A", true, encode, true, ); const iframeDocKey = `frame-A-${tree.iframeDoc.backendNodeId}`; const iframeBodyKey = `frame-A-${tree.iframeBody.backendNodeId}`; const iframeChildKey = `frame-A-${tree.iframeChild.backendNodeId}`; expect(maps.tagNameMap[iframeDocKey]).toBe("#document"); expect(maps.xpathMap[iframeDocKey]).toBe("/"); expect(maps.xpathMap[iframeBodyKey]).toBe("/html[1]/body[1]"); expect(maps.xpathMap[iframeChildKey]).toBe("/html[1]/body[1]/p[1]"); expect(maps.scrollableMap[iframeBodyKey]).toBe(true); expect(Object.keys(maps.tagNameMap)).not.toContain( `frame-A-${tree.html.backendNodeId}`, ); }); it("falls back to the root document when frame owner lookup fails", async () => { const tree = buildSampleDomTree(); const session = new MockCDPSession({ "DOM.enable": async () => ({}), "DOM.getDocument": async () => ({ root: tree.root }), "DOM.getFrameOwner": async () => { throw new Error("owner lookup failed"); }, "DOM.describeNode": async () => ({ node: tree.root }), }); const encode = (frameId: string, backendNodeId: number) => `${frameId}-${backendNodeId}`; const maps = await domMapsForSession( session, "frame-B", false, encode, true, ); expect(maps.xpathMap[`frame-B-${tree.html.backendNodeId}`]).toBe( "/html[1]", ); expect(maps.xpathMap[`frame-B-${tree.scrollDiv.backendNodeId}`]).toBe( "/html[1]/body[1]/div[1]", ); }); }); ================================================ FILE: packages/core/tests/unit/snapshot-dom-tree-utils.test.ts ================================================ import type { Protocol } from "devtools-protocol"; import { describe, expect, 
it } from "vitest";
import {
  collectDomTraversalTargets,
  findNodeByBackendId,
  mergeDomNodes,
  shouldExpandNode,
} from "../../lib/v3/understudy/a11y/snapshot/domTree.js";

// Shared counter that hands out default node ids and backend node ids.
let nextNodeId = 1;

/**
 * Creates a DIV element node with fresh ids, then applies `overrides` on top.
 * childNodeCount defaults to the number of children supplied in `overrides`
 * (0 when none are given).
 */
const makeNode = (
  overrides: Partial<Protocol.DOM.Node> = {},
): Protocol.DOM.Node => {
  const base: Protocol.DOM.Node = {
    nodeId: nextNodeId++,
    backendNodeId: nextNodeId++,
    nodeType: 1,
    nodeName: "DIV",
    localName: "div",
    nodeValue: "",
    childNodeCount:
      overrides.childNodeCount ??
      (overrides.children ? overrides.children.length : 0),
  };
  return { ...base, ...overrides };
};

describe("shouldExpandNode", () => {
  it("returns true when declared children exceed realized children", () => {
    const node = makeNode({
      childNodeCount: 2,
      children: [makeNode()],
    });
    expect(shouldExpandNode(node)).toBe(true);
  });

  it("returns false when all declared children are realized", () => {
    const child = makeNode();
    const node = makeNode({
      childNodeCount: 1,
      children: [child],
    });
    expect(shouldExpandNode(node)).toBe(false);
  });
});

describe("mergeDomNodes", () => {
  it("overrides structural fields with expanded node data", () => {
    const originalChildren = [makeNode({ nodeName: "SPAN" })];
    const target = makeNode({
      childNodeCount: 1,
      children: originalChildren,
      shadowRoots: [makeNode({ nodeName: "shadow-old" })],
      contentDocument: makeNode({ nodeName: "doc-old" }),
    });
    const source = makeNode({
      childNodeCount: 3,
      children: [makeNode({ nodeName: "DIV" })],
      shadowRoots: [],
      contentDocument: makeNode({ nodeName: "doc-new" }),
    });

    mergeDomNodes(target, source);

    expect(target.childNodeCount).toBe(3);
    expect(target.children).toEqual(source.children);
    expect(target.shadowRoots).toEqual([]);
    expect(target.contentDocument?.nodeName).toBe("doc-new");
  });

  it("preserves original structures when source omits them", () => {
    const child = makeNode();
    const target = makeNode({
      childNodeCount: 1,
      children: [child],
    });
    const source = makeNode({
      childNodeCount: 5,
    });

    mergeDomNodes(target, source);

    expect(target.childNodeCount).toBe(5);
expect(target.children).toEqual([child]); }); }); describe("collectDomTraversalTargets", () => { it("returns children, shadow roots, and content document in order", () => { const childA = makeNode({ nodeName: "CHILD-A" }); const childB = makeNode({ nodeName: "CHILD-B" }); const shadow = makeNode({ nodeName: "SHADOW" }); const content = makeNode({ nodeName: "CONTENT" }); const node = makeNode({ children: [childA, childB], shadowRoots: [shadow], contentDocument: content, }); const targets = collectDomTraversalTargets(node); expect(targets).toEqual([childA, childB, shadow, content]); }); }); describe("findNodeByBackendId", () => { it("finds nodes nested within children and shadow roots", () => { const target = makeNode({ backendNodeId: 999, nodeName: "TARGET" }); const root = makeNode({ children: [ makeNode({ children: [makeNode(), target], }), ], shadowRoots: [makeNode()], }); expect(findNodeByBackendId(root, 999)).toBe(target); }); it("returns undefined when no node matches the backend id", () => { const root = makeNode({ children: [makeNode()], shadowRoots: [makeNode()], }); expect(findNodeByBackendId(root, 123456)).toBeUndefined(); }); }); ================================================ FILE: packages/core/tests/unit/snapshot-focus-selectors-utils.test.ts ================================================ import type { Step } from "../../lib/v3/types/private/snapshot.js"; import { describe, expect, it } from "vitest"; import { buildXPathFromSteps, IFRAME_STEP_RE, listChildrenOf, parseXPathToSteps, } from "../../lib/v3/understudy/a11y/snapshot/focusSelectors.js"; describe("parseXPathToSteps", () => { it("records axis direction and normalized names", () => { const steps = parseXPathToSteps(" //iframe[1]/div[2]//SPAN "); expect(steps).toEqual([ { axis: "desc", raw: "iframe[1]", name: "iframe" }, { axis: "child", raw: "div[2]", name: "div" }, { axis: "desc", raw: "SPAN", name: "span" }, ]); }); it("drops empty segments and returns [] for blank input", () => { 
expect(parseXPathToSteps(" ")).toEqual([]);
    expect(parseXPathToSteps("/ ")).toEqual([]);
  });
});

describe("buildXPathFromSteps", () => {
  it("reconstructs descendant and child hops as a string", () => {
    // "child" steps are expected to join with "/" and "desc" steps with "//".
    const steps: ReadonlyArray<Step> = [
      { axis: "child", raw: "iframe[1]", name: "iframe" },
      { axis: "desc", raw: "div[@id='main']", name: "div" },
      { axis: "child", raw: "span", name: "span" },
    ];
    expect(buildXPathFromSteps(steps)).toBe("/iframe[1]//div[@id='main']/span");
  });

  it("returns '/' for empty sequences", () => {
    expect(buildXPathFromSteps([])).toBe("/");
  });
});

describe("IFRAME_STEP_RE — frame boundary detection", () => {
  it("matches both iframe and frame with optional index", () => {
    expect(IFRAME_STEP_RE.test("iframe")).toBe(true);
    expect(IFRAME_STEP_RE.test("iframe[1]")).toBe(true);
    expect(IFRAME_STEP_RE.test("frame")).toBe(true);
    expect(IFRAME_STEP_RE.test("frame[4]")).toBe(true);
  });

  it("does NOT match frameset", () => {
    expect(IFRAME_STEP_RE.test("frameset")).toBe(false);
    expect(IFRAME_STEP_RE.test("frameset[1]")).toBe(false);
  });
});

describe("parseXPathToSteps — frameset XPaths", () => {
  it("parses a frameset page XPath with frame[N] steps", () => {
    const steps = parseXPathToSteps(
      "/html[1]/frameset[1]/frame[4]/html[1]/body[1]/table[1]",
    );
    expect(steps).toEqual([
      { axis: "child", raw: "html[1]", name: "html" },
      { axis: "child", raw: "frameset[1]", name: "frameset" },
      { axis: "child", raw: "frame[4]", name: "frame" },
      { axis: "child", raw: "html[1]", name: "html" },
      { axis: "child", raw: "body[1]", name: "body" },
      { axis: "child", raw: "table[1]", name: "table" },
    ]);

    // frame[4] step should be detected as a frame boundary
    const frameBoundaries = steps.filter((s) => IFRAME_STEP_RE.test(s.name));
    expect(frameBoundaries).toHaveLength(1);
    expect(frameBoundaries[0].raw).toBe("frame[4]");
  });

  it("detects iframe boundaries in standard iframe XPaths", () => {
    const steps = parseXPathToSteps(
      "/html[1]/body[1]/div[2]/iframe[1]/html[1]/body[1]/p[1]",
    );
    const
frameBoundaries = steps.filter((s) => IFRAME_STEP_RE.test(s.name));
    expect(frameBoundaries).toHaveLength(1);
    expect(frameBoundaries[0].raw).toBe("iframe[1]");
  });

  it("does NOT detect frameset as a frame boundary", () => {
    const steps = parseXPathToSteps("/html[1]/frameset[1]/frame[2]");
    const frameBoundaries = steps.filter((s) => IFRAME_STEP_RE.test(s.name));
    expect(frameBoundaries).toHaveLength(1);
    // Only frame[2] matches, not frameset[1]
    expect(frameBoundaries[0].raw).toBe("frame[2]");
  });
});

describe("listChildrenOf", () => {
  it("returns direct children whose parent matches the provided id", () => {
    const parentByFrame = new Map([
      ["frame-1", null],
      ["frame-2", "frame-1"],
      ["frame-3", "frame-1"],
      ["frame-4", "frame-2"],
    ]);

    expect(listChildrenOf(parentByFrame, "frame-1")).toEqual([
      "frame-2",
      "frame-3",
    ]);
    expect(listChildrenOf(parentByFrame, "frame-4")).toEqual([]);
  });
});

================================================ FILE: packages/core/tests/unit/snapshot-frame-merge.test.ts ================================================
import { describe, expect, it } from "vitest";
import type {
  FrameContext,
  FrameDomMaps,
} from "../../lib/v3/types/private/index.js";
import type { Page } from "../../lib/v3/understudy/page.js";
import { MockCDPSession } from "./helpers/mockCDPSession.js";
import {
  computeFramePrefixes,
  mergeFramesIntoSnapshot,
} from "../../lib/v3/understudy/a11y/snapshot/capture.js";

/**
 * Builds a Page stub from a frame-id -> session table. Frames missing from the
 * table fall back to the "root" entry; ordinals are 0 for "frame-1", 1 for
 * "frame-2" and 2 for anything else.
 */
const makePage = (sessions: Record<string, MockCDPSession>): Page =>
  ({
    getSessionForFrame: (frameId: string) =>
      sessions[frameId] ?? sessions.root,
    getOrdinal: (frameId: string) =>
      frameId === "frame-1" ? 0 : frameId === "frame-2" ?
/**
 * computeFramePrefixes: resolves, for every frame in scope, the absolute
 * XPath prefix of its hosting iframe element and the encoded id of that host.
 *
 * Fixtures are declared as `FrameDomMaps` / `Map<string, FrameDomMaps>` rather
 * than being cast at the call site, so the compiler checks them.
 */
describe("computeFramePrefixes", () => {
  /** FrameDomMaps fixture in which only `xpathMap` carries data. */
  const domMaps = (xpathMap: FrameDomMaps["xpathMap"] = {}): FrameDomMaps => ({
    tagNameMap: {},
    scrollableMap: {},
    urlMap: {},
    xpathMap,
  });

  /** Root frame-1 with a single child frame-2. */
  const twoFrameContext = (): FrameContext => ({
    rootId: "frame-1",
    frames: ["frame-1", "frame-2"],
    parentByFrame: new Map([
      ["frame-1", null],
      ["frame-2", "frame-1"],
    ]),
  });

  it("derives prefixes from parent iframe xpaths within the same session", async () => {
    const parentSession = new MockCDPSession({
      "DOM.getFrameOwner": async () => ({ backendNodeId: 200 }),
    });
    const page = makePage({
      "frame-1": parentSession,
      "frame-2": parentSession,
      root: parentSession,
    });
    // Key "0-200" lines up with the backendNodeId (200) returned by the
    // mocked DOM.getFrameOwner and frame-1's ordinal (0) from makePage.
    const perFrameMaps = new Map<string, FrameDomMaps>([
      ["frame-1", domMaps({ "0-200": "/html[1]/body[1]/iframe[1]" })],
    ]);
    const context = twoFrameContext();

    const { absPrefix, iframeHostEncByChild } = await computeFramePrefixes(
      page,
      context,
      perFrameMaps,
      context.frames,
    );

    expect(absPrefix.get("frame-1")).toBe("");
    expect(absPrefix.get("frame-2")).toBe("/html[1]/body[1]/iframe[1]");
    expect(iframeHostEncByChild.get("frame-2")).toBe("0-200");
  });

  it("inherits the parent prefix when frame owner lookups fail (OOPIF)", async () => {
    const parentSession = new MockCDPSession({
      "DOM.getFrameOwner": async (params) => {
        if (params?.frameId === "frame-2") return { backendNodeId: 200 };
        // The lookup for the grandchild fails outright.
        if (params?.frameId === "frame-3") throw new Error("unavailable");
        return {};
      },
    });
    const page = makePage({
      "frame-1": parentSession,
      "frame-2": parentSession,
      "frame-3": parentSession,
      root: parentSession,
    });
    const perFrameMaps = new Map<string, FrameDomMaps>([
      ["frame-1", domMaps({ "0-200": "/iframe[1]" })],
      ["frame-2", domMaps({ "1-300": "/div[1]/iframe[1]" })],
    ]);
    const context: FrameContext = {
      rootId: "frame-1",
      frames: ["frame-1", "frame-2", "frame-3"],
      parentByFrame: new Map([
        ["frame-1", null],
        ["frame-2", "frame-1"],
        ["frame-3", "frame-2"],
      ]),
    };

    const maps = await computeFramePrefixes(
      page,
      context,
      perFrameMaps,
      context.frames,
    );

    expect(maps.absPrefix.get("frame-2")).toBe("/iframe[1]");
    // frame-3 has no resolvable host, so it reuses frame-2's prefix.
    expect(maps.absPrefix.get("frame-3")).toBe("/iframe[1]");
  });

  it("inherits parent prefix when iframe xpath mapping is missing", async () => {
    // Owner node 999 has no entry in frame-1's (empty) xpathMap.
    const session = new MockCDPSession({
      "DOM.getFrameOwner": async () => ({ backendNodeId: 999 }),
    });
    const page = makePage({
      "frame-1": session,
      "frame-2": session,
      root: session,
    });
    const perFrameMaps = new Map<string, FrameDomMaps>([
      ["frame-1", domMaps()],
    ]);
    const context = twoFrameContext();

    const result = await computeFramePrefixes(
      page,
      context,
      perFrameMaps,
      context.frames,
    );

    expect(result.absPrefix.get("frame-2")).toBe("");
  });

  it("does not compute prefixes for frames excluded from the scope", async () => {
    const session = new MockCDPSession({
      "DOM.getFrameOwner": async () => ({ backendNodeId: 200 }),
    });
    const page = makePage({
      "frame-1": session,
      "frame-2": session,
      root: session,
    });
    const perFrameMaps = new Map<string, FrameDomMaps>([
      ["frame-1", domMaps({ "0-200": "/iframe[1]" })],
    ]);
    const context = twoFrameContext();

    // Only the root frame is requested, although the context knows frame-2.
    const { absPrefix, iframeHostEncByChild } = await computeFramePrefixes(
      page,
      context,
      perFrameMaps,
      ["frame-1"],
    );

    expect(absPrefix.has("frame-2")).toBe(false);
    expect(iframeHostEncByChild.has("frame-2")).toBe(false);
  });
});
"https://example.com" }, xpathMap: { "0-10": "/html[1]/body[1]" }, }, ], [ "frame-2", { tagNameMap: {}, scrollableMap: {}, urlMap: { "1-20": "https://child.com" }, xpathMap: { "1-20": "/div[1]/span[1]" }, }, ], ]); const perFrameOutlines = [ { frameId: "frame-1", outline: "[0-10] body\n [0-200] iframe" }, { frameId: "frame-2", outline: "[1-20] child" }, ]; const absPrefix = new Map([ ["frame-1", ""], ["frame-2", "/html[1]/body[1]/iframe[1]"], ]); const iframeHostEncByChild = new Map([ ["frame-2", "0-200"], ]); const snapshot = mergeFramesIntoSnapshot( context, perFrameMaps, perFrameOutlines, absPrefix, iframeHostEncByChild, context.frames, ); expect(snapshot.combinedXpathMap["0-10"]).toBe("/html[1]/body[1]"); expect(snapshot.combinedXpathMap["1-20"]).toBe( "/html[1]/body[1]/iframe[1]/div[1]/span[1]", ); expect(snapshot.combinedUrlMap["1-20"]).toBe("https://child.com"); expect(snapshot.combinedTree).toContain("[1-20] child"); }); it("skips frames without maps and handles missing iframe mappings", () => { const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ]), }; const perFrameMaps = new Map([ [ "frame-1", { tagNameMap: {}, scrollableMap: {}, urlMap: {}, xpathMap: { "0-10": "/html[1]" }, }, ], ]); const perFrameOutlines = [ { frameId: "frame-1", outline: "[0-10] html" }, { frameId: "frame-2", outline: "[1-20] orphan" }, ]; const absPrefix = new Map([ ["frame-1", ""], ["frame-2", "/missing"], ]); const snapshot = mergeFramesIntoSnapshot( context, perFrameMaps, perFrameOutlines, absPrefix, new Map(), context.frames, ); expect(snapshot.combinedXpathMap["1-20"]).toBeUndefined(); expect(snapshot.combinedTree).toBe("[0-10] html"); }); it("falls back to first outline when root frame outline is missing", () => { const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ]), }; const 
perFrameMaps = new Map([ [ "frame-2", { tagNameMap: {}, scrollableMap: {}, urlMap: {}, xpathMap: {}, }, ], ]); const perFrameOutlines = [ { frameId: "frame-2", outline: "[child] frame2" }, ]; const snapshot = mergeFramesIntoSnapshot( context, perFrameMaps, perFrameOutlines, new Map([["frame-2", "/iframe[1]"]]), new Map(), context.frames, ); expect(snapshot.combinedTree).toBe("[child] frame2"); }); it("overwrites duplicate iframe host entries when multiple children map to the same parent", () => { const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2", "frame-3"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ["frame-3", "frame-1"], ]), }; const perFrameMaps = new Map([ [ "frame-1", { tagNameMap: {}, scrollableMap: {}, urlMap: {}, xpathMap: {}, }, ], ]); const perFrameOutlines = [ { frameId: "frame-1", outline: "[root] frame1\n [0-200] iframe slot" }, { frameId: "frame-2", outline: "[child] frame2" }, { frameId: "frame-3", outline: "[child] frame3" }, ]; const snapshot = mergeFramesIntoSnapshot( context, perFrameMaps, perFrameOutlines, new Map([ ["frame-1", ""], ["frame-2", ""], ["frame-3", ""], ]), new Map([ ["frame-2", "0-200"], ["frame-3", "0-200"], ]), context.frames, ); expect(snapshot.combinedTree).toContain("[child] frame3"); expect(snapshot.combinedTree).not.toContain("[child] frame2"); }); it("only merges xpath and url maps for frames included in frameIds", () => { const context: FrameContext = { rootId: "frame-1", frames: ["frame-1", "frame-2"], parentByFrame: new Map([ ["frame-1", null], ["frame-2", "frame-1"], ]), }; const perFrameMaps = new Map([ [ "frame-1", { tagNameMap: {}, scrollableMap: {}, urlMap: { "0-10": "https://root.test" }, xpathMap: { "0-10": "/html[1]" }, }, ], [ "frame-2", { tagNameMap: {}, scrollableMap: {}, urlMap: { "1-20": "https://child.test" }, xpathMap: { "1-20": "/div[1]" }, }, ], ]); const perFrameOutlines = [{ frameId: "frame-1", outline: "[root] doc" }]; const snapshot = 
/**
 * formatTreeLine: renders a node as "[id] role: name" and places each child
 * on its own indented line. The bracketed id is `encodedId` when present and
 * `nodeId` otherwise (the child below has no `encodedId`).
 */
describe("formatTreeLine", () => {
  it("includes encoded ids and indents children", () => {
    const submitButton = {
      role: "button",
      name: "Submit",
      nodeId: "ax-2",
    };
    const container = {
      role: "section",
      name: "Container",
      encodedId: "frame-1",
      nodeId: "ax-1",
      children: [submitButton],
    };

    // NOTE(review): the child indent inside the expected string is reproduced
    // exactly as it appears in this view of the file — confirm it against the
    // indent width formatTreeLine actually emits.
    const expectedOutline =
      "[frame-1] section: Container\n [ax-2] button: Submit";

    expect(formatTreeLine(container)).toBe(expectedOutline);
  });
});
// NOTE(review): several literals in these suites are whitespace-sensitive
// (indent strings, outline indentation, runs of spaces). They are reproduced
// exactly as they appear in this view of the file — confirm them against the
// checked-in test before applying.

/** indentBlock: prepends the given indent to every line of a block. */
describe("indentBlock", () => {
  it("prefixes each line with the provided indent", () => {
    const indent = " ";

    const twoLines = indentBlock("a\nb", indent);
    expect(twoLines).toBe(" a\n b");

    // An empty block stays empty; no indent is emitted for it.
    const empty = indentBlock("", indent);
    expect(empty).toBe("");
  });
});

/** diffCombinedTrees: reports the lines present in `next` but not in `prev`. */
describe("diffCombinedTrees", () => {
  it("returns newly-added lines relative to previous outline", () => {
    const before = "[root] document\n [child] a";
    const after = "[root] document\n [child] a\n [child-2] b";

    const added = diffCombinedTrees(before, after);

    expect(added).toBe("[child-2] b");
  });

  it("normalizes indentation for added lines with stray spaces", () => {
    const before = "[root] document\n [child] a";
    const after = "[root] document\n [child] a\n [child-2] b";

    const added = diffCombinedTrees(before, after);

    expect(added).toBe("[child-2] b");
  });
});

/** cleanText: strips NBSP / private-use code points and collapses spacing. */
describe("cleanText", () => {
  it("removes NBSP and private-use characters while collapsing spaces", () => {
    // Two NBSPs between the words, then a private-use code point (U+E000).
    const input = "Hello\u00A0\u00A0world\uE000 !";

    const cleaned = cleanText(input);

    expect(cleaned).toBe("Hello world !");
  });
});

/** normaliseSpaces: collapses any run of whitespace into one space. */
describe("normaliseSpaces", () => {
  it("replaces whitespace runs with a single space", () => {
    const input = "a b\tc\nd";

    const normalised = normaliseSpaces(input);

    expect(normalised).toBe("a b c d");
  });
});
/**
 * normalizeXPath: removes an `xpath=` prefix, trims whitespace and a trailing
 * slash, and makes the result absolute. Empty or missing input yields "".
 */
describe("normalizeXPath", () => {
  it("strips prefixes, trims whitespace, and enforces absolute roots", () => {
    expect(normalizeXPath(" xpath=/html/body/ ")).toBe("/html/body");
    expect(normalizeXPath("div/span")).toBe("/div/span");
    expect(normalizeXPath("")).toBe("");
    expect(normalizeXPath()).toBe("");
  });
});

/**
 * relativizeXPath(base, target): expresses `target` relative to `base` when
 * `target` lies underneath it; otherwise the absolute path is returned.
 */
describe("relativizeXPath", () => {
  it("returns '/' when paths match exactly", () => {
    expect(relativizeXPath("/html/body", "/html/body")).toBe("/");
  });

  it("omits duplicate prefixes and preserves descendant hops", () => {
    expect(relativizeXPath("/html/body", "/html/body/div[2]")).toBe("/div[2]");
    // The `//` hop directly after the base must survive relativisation.
    expect(relativizeXPath("/html/body", "/html/body//shadow-root[1]")).toBe(
      "//shadow-root[1]",
    );
  });

  it("falls back to absolute paths outside of the base document", () => {
    expect(relativizeXPath("/html/body", "/head")).toBe("/head");
    expect(relativizeXPath("/", "/html/body")).toBe("/html/body");
  });
});

/**
 * buildChildXPathSegments: produces one positional XPath step per sibling,
 * counting positions separately for each kind of step.
 */
describe("buildChildXPathSegments", () => {
  it("produces positional selectors for each node type", () => {
    /**
     * Minimal CDP DOM node fixture. `override` may replace the default
     * fields, but `nodeType` and `nodeName` are spread last and always win.
     */
    const makeNode = (
      nodeType: number,
      nodeName: string,
      override?: Partial<Protocol.DOM.Node>,
    ): Protocol.DOM.Node => ({
      nodeId: 1,
      backendNodeId: 1,
      localName: nodeName.toLowerCase(),
      nodeValue: "",
      ...override,
      nodeType,
      nodeName,
    });

    // nodeType 1 = element, 3 = text, 8 = comment (DOM Node.nodeType values).
    const nodes: Protocol.DOM.Node[] = [
      makeNode(1, "DIV"),
      makeNode(1, "DIV"),
      makeNode(1, "svg:path"),
      makeNode(3, "#text"),
      makeNode(8, "#comment"),
    ];

    expect(buildChildXPathSegments(nodes)).toEqual([
      "div[1]",
      "div[2]",
      // Names containing a colon are matched through name() instead of a
      // plain name test.
      "*[name()='svg:path'][1]",
      "text()[1]",
      "comment()[1]",
    ]);
  });
});
/**
 * ActHandler timeout guard: `createTimeoutGuard` is mocked so that the guard
 * it hands out starts throwing at a chosen invocation. Each test then checks
 * which collaborators were reached before act() rejected.
 */
describe("ActHandler timeout guard", () => {
  /**
   * Makes every guard produced by createTimeoutGuard throw from its
   * `firstThrowingCall`-th invocation onwards. The counter lives inside the
   * factory, so each guard counts its own invocations.
   */
  const armGuard = (firstThrowingCall: number): void => {
    vi.mocked(createTimeoutGuard).mockImplementation(
      (timeoutMs, errorFactory) => {
        let seen = 0;
        return vi.fn(() => {
          seen += 1;
          if (seen < firstThrowingCall) return;
          throw errorFactory
            ? errorFactory(timeoutMs!)
            : new ActTimeoutError(timeoutMs!);
        });
      },
    );
  };

  /** Smallest page stub used by these tests: only mainFrame() is provided. */
  const stubPage = (): Page =>
    ({
      mainFrame: vi.fn().mockReturnValue({}),
    }) as unknown as Page;

  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("throws ActTimeoutError when timeout expires before snapshot", async () => {
    const domQuiet = vi.mocked(waitForDomNetworkQuiet);
    domQuiet.mockResolvedValue(undefined);
    const snapshot = vi.mocked(captureHybridSnapshot);
    snapshot.mockResolvedValue({
      combinedTree: "",
      combinedXpathMap: {},
      combinedUrlMap: {},
    });

    // Guard throws on call #2
    armGuard(2);

    const handler = buildActHandler();
    const pending = handler.act({
      instruction: "do something",
      page: stubPage(),
      timeout: 5,
    });

    await expect(pending).rejects.toThrow(ActTimeoutError);

    // Verify pre-timeout helper ran
    expect(domQuiet).toHaveBeenCalledTimes(1);
    // Verify snapshot was NOT called (timeout fired before it)
    expect(snapshot).not.toHaveBeenCalled();
  });

  it("throws ActTimeoutError when timeout expires before LLM call", async () => {
    const domQuiet = vi.mocked(waitForDomNetworkQuiet);
    domQuiet.mockResolvedValue(undefined);
    const snapshot = vi.mocked(captureHybridSnapshot);
    snapshot.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: {},
      combinedUrlMap: {},
    });
    const inference = vi.mocked(actInference);

    // Throw on call #3 (after snapshot but before LLM)
    armGuard(3);

    const handler = buildActHandler();
    const pending = handler.act({
      instruction: "do something",
      page: stubPage(),
      timeout: 5,
    });

    await expect(pending).rejects.toThrow(ActTimeoutError);

    // Snapshot should have been called
    expect(snapshot).toHaveBeenCalledTimes(1);
    // LLM inference should NOT have been called
    expect(inference).not.toHaveBeenCalled();
  });

  it("throws ActTimeoutError with correct message format", async () => {
    const domQuiet = vi.mocked(waitForDomNetworkQuiet);
    domQuiet.mockResolvedValue(undefined);
    const timeoutMs = 100;

    // Guard throws on its very first invocation.
    armGuard(1);

    const handler = buildActHandler();
    const fakePage = stubPage();

    try {
      await handler.act({
        instruction: "do something",
        page: fakePage,
        timeout: timeoutMs,
      });
      throw new Error("Expected ActTimeoutError to be thrown");
    } catch (error) {
      expect(error).toBeInstanceOf(ActTimeoutError);
      const timeoutError = error as ActTimeoutError;
      expect(timeoutError.message).toContain("act()");
      expect(timeoutError.message).toContain(`${timeoutMs}ms`);
      expect(timeoutError.name).toBe("ActTimeoutError");
    }
  });
});
"../../lib/v3/handlers/handlerUtils/actHandlerUtils.js" ); const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod); performUnderstudyMethodMock.mockResolvedValue(undefined); const actInferenceMock = vi.mocked(actInference); // First call returns a two-step action actInferenceMock.mockResolvedValueOnce({ element: { elementId: "1-0", description: "click button", method: "click", arguments: [], }, twoStep: true, prompt_tokens: 100, completion_tokens: 50, inference_time_ms: 500, } as ReturnType extends Promise ? T : never); const diffCombinedTreesMock = vi.mocked( (await import("../../lib/v3/understudy/a11y/snapshot/index.js")) .diffCombinedTrees, ); diffCombinedTreesMock.mockReturnValue("diff tree"); // Timeout fires after step 1 completes, during step 2 snapshot // ensureTimeRemaining calls: 1=before wait, 2=after wait/before snap1, 3=before LLM1, // 4=before action1, 5=inside takeDeterministicAction, 6=performUnderstudy, // 7=before snap2 (this one should throw) let callCount = 0; vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { return vi.fn(() => { callCount += 1; if (callCount >= 7) { throw errorFactory ? errorFactory(timeoutMs!) 
/**
 * ActHandler self-heal timeout: the first action attempt is made to fail so
 * the handler (built with `selfHeal: true`) enters its retry path; the
 * timeout guard then fires during that path and act() must reject with
 * ActTimeoutError.
 */
describe("ActHandler self-heal timeout", () => {
  /** Resolved value of the mocked act inference call. */
  type ActInferenceResult = Awaited<ReturnType<typeof actInference>>;

  /** Inference payload describing a single (non two-step) click on "1-0". */
  const clickInference = (): ActInferenceResult =>
    ({
      element: {
        elementId: "1-0",
        description: "click button",
        method: "click",
        arguments: [],
      },
      twoStep: false,
      prompt_tokens: 100,
      completion_tokens: 50,
      inference_time_ms: 500,
    }) as ActInferenceResult;

  /**
   * Makes the timeout guard throw from the `firstThrowingCall`-th guard
   * invocation onwards. The counter is shared by every guard created during
   * the test, so invocations are counted across the whole act() call.
   */
  const armSharedGuard = (firstThrowingCall: number): void => {
    let callCount = 0;
    vi.mocked(createTimeoutGuard).mockImplementation(
      (timeoutMs, errorFactory) => {
        return vi.fn(() => {
          callCount += 1;
          if (callCount >= firstThrowingCall) {
            throw errorFactory
              ? errorFactory(timeoutMs!)
              : new ActTimeoutError(timeoutMs!);
          }
        });
      },
    );
  };

  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("throws ActTimeoutError during self-heal snapshot; no retry action executes", async () => {
    const waitForDomNetworkQuietMock = vi.mocked(waitForDomNetworkQuiet);
    waitForDomNetworkQuietMock.mockResolvedValue(undefined);
    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const { performUnderstudyMethod } = await import(
      "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js"
    );
    const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod);
    // First call fails, triggering self-heal
    performUnderstudyMethodMock.mockRejectedValueOnce(
      new Error("Element not found"),
    );

    const actInferenceMock = vi.mocked(actInference);
    actInferenceMock.mockResolvedValue(clickInference());

    // Timeout during self-heal snapshot (call 7 or later)
    armSharedGuard(7);

    const handler = buildActHandler({ selfHeal: true });
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    await expect(
      handler.act({
        instruction: "click button",
        page: fakePage,
        timeout: 50,
      }),
    ).rejects.toThrow(ActTimeoutError);

    // First action attempt should have been tried
    expect(performUnderstudyMethodMock).toHaveBeenCalledTimes(1);
    // First LLM call should have happened
    expect(actInferenceMock).toHaveBeenCalledTimes(1);
    // Self-heal snapshot should have been started (call happened)
    expect(captureHybridSnapshotMock).toHaveBeenCalled();
  });

  it("throws ActTimeoutError during self-heal LLM inference; no retry action executes", async () => {
    const waitForDomNetworkQuietMock = vi.mocked(waitForDomNetworkQuiet);
    waitForDomNetworkQuietMock.mockResolvedValue(undefined);
    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const { performUnderstudyMethod } = await import(
      "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js"
    );
    const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod);
    // First call fails, triggering self-heal
    performUnderstudyMethodMock.mockRejectedValueOnce(
      new Error("Element not found"),
    );

    const actInferenceMock = vi.mocked(actInference);
    // Only the first inference is stubbed; the retry inference must never run.
    actInferenceMock.mockResolvedValueOnce(clickInference());

    // Timeout during self-heal LLM inference (call 8)
    armSharedGuard(8);

    const handler = buildActHandler({ selfHeal: true });
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    await expect(
      handler.act({
        instruction: "click button",
        page: fakePage,
        timeout: 50,
      }),
    ).rejects.toThrow(ActTimeoutError);

    // Self-heal snapshot was captured
    expect(captureHybridSnapshotMock).toHaveBeenCalledTimes(2);
    // Only one LLM inference (the retry inference was aborted by timeout)
    expect(actInferenceMock).toHaveBeenCalledTimes(1);
  });
});
: new ExtractTimeoutError(timeoutMs!); }); }, ); const handler = buildExtractHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; await expect( handler.extract({ instruction: "extract title", page: fakePage, timeout: 5, }), ).rejects.toThrow(ExtractTimeoutError); // Snapshot should NOT have been called expect(captureHybridSnapshotMock).not.toHaveBeenCalled(); // LLM inference should NOT have been called expect(extractInferenceMock).not.toHaveBeenCalled(); }); it("throws ExtractTimeoutError when timeout expires before LLM call", async () => { const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot); captureHybridSnapshotMock.mockResolvedValue({ combinedTree: "tree content", combinedXpathMap: {}, combinedUrlMap: {}, }); const extractInferenceMock = vi.mocked(extractInference); // Throw on call #2 (after snapshot but before LLM) vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { let calls = 0; return vi.fn(() => { calls += 1; if (calls >= 2) { throw errorFactory ? errorFactory(timeoutMs!) : new ExtractTimeoutError(timeoutMs!); } }); }, ); const handler = buildExtractHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; await expect( handler.extract({ instruction: "extract title", page: fakePage, timeout: 5, }), ).rejects.toThrow(ExtractTimeoutError); // Snapshot should have been called expect(captureHybridSnapshotMock).toHaveBeenCalledTimes(1); // LLM inference should NOT have been called expect(extractInferenceMock).not.toHaveBeenCalled(); }); it("throws ExtractTimeoutError with correct message format", async () => { const timeoutMs = 200; vi.mocked(createTimeoutGuard).mockImplementation((ms, errorFactory) => { return vi.fn(() => { throw errorFactory ? errorFactory(ms!) 
: new ExtractTimeoutError(ms!); }); }); const handler = buildExtractHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; try { await handler.extract({ instruction: "extract title", page: fakePage, timeout: timeoutMs, }); throw new Error("Expected ExtractTimeoutError to be thrown"); } catch (error) { expect(error).toBeInstanceOf(ExtractTimeoutError); expect((error as ExtractTimeoutError).message).toContain("extract()"); expect((error as ExtractTimeoutError).message).toContain( `${timeoutMs}ms`, ); expect((error as ExtractTimeoutError).name).toBe("ExtractTimeoutError"); } }); it("stops LLM and post-processing when timeout expires", async () => { const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot); captureHybridSnapshotMock.mockResolvedValue({ combinedTree: "tree content", combinedXpathMap: {}, combinedUrlMap: { "1-0": "https://example.com" }, }); const extractInferenceMock = vi.mocked(extractInference); // Allow snapshot but timeout before LLM vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { let calls = 0; return vi.fn(() => { calls += 1; if (calls >= 2) { throw errorFactory ? errorFactory(timeoutMs!) 
: new ExtractTimeoutError(timeoutMs!); } }); }, ); const handler = buildExtractHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; await expect( handler.extract({ instruction: "extract links", page: fakePage, timeout: 5, }), ).rejects.toThrow(ExtractTimeoutError); // Post-processing (URL injection) never runs because LLM was never called expect(extractInferenceMock).not.toHaveBeenCalled(); }); }); describe("ObserveHandler timeout guard", () => { beforeEach(() => { vi.clearAllMocks(); }); it("throws ObserveTimeoutError when timeout expires before snapshot", async () => { const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot); captureHybridSnapshotMock.mockResolvedValue({ combinedTree: "tree content", combinedXpathMap: {}, combinedUrlMap: {}, }); const observeInferenceMock = vi.mocked(observeInference); // Throw immediately on first call vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { return vi.fn(() => { throw errorFactory ? errorFactory(timeoutMs!) 
: new ObserveTimeoutError(timeoutMs!); }); }, ); const handler = buildObserveHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; await expect( handler.observe({ instruction: "find buttons", page: fakePage, timeout: 5, }), ).rejects.toThrow(ObserveTimeoutError); // Snapshot should NOT have been called expect(captureHybridSnapshotMock).not.toHaveBeenCalled(); // LLM inference should NOT have been called expect(observeInferenceMock).not.toHaveBeenCalled(); }); it("throws ObserveTimeoutError when timeout expires before LLM call", async () => { const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot); captureHybridSnapshotMock.mockResolvedValue({ combinedTree: "tree content", combinedXpathMap: {}, combinedUrlMap: {}, }); const observeInferenceMock = vi.mocked(observeInference); // Throw on call #2 (after snapshot but before LLM) vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { let calls = 0; return vi.fn(() => { calls += 1; if (calls >= 2) { throw errorFactory ? errorFactory(timeoutMs!) : new ObserveTimeoutError(timeoutMs!); } }); }, ); const handler = buildObserveHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; await expect( handler.observe({ instruction: "find buttons", page: fakePage, timeout: 5, }), ).rejects.toThrow(ObserveTimeoutError); // Snapshot should have been called expect(captureHybridSnapshotMock).toHaveBeenCalledTimes(1); // LLM inference should NOT have been called expect(observeInferenceMock).not.toHaveBeenCalled(); }); it("throws ObserveTimeoutError with correct message format", async () => { const timeoutMs = 150; vi.mocked(createTimeoutGuard).mockImplementation((ms, errorFactory) => { return vi.fn(() => { throw errorFactory ? errorFactory(ms!) 
: new ObserveTimeoutError(ms!); }); }); const handler = buildObserveHandler(); const fakePage = { mainFrame: vi.fn().mockReturnValue({}), } as unknown as Page; try { await handler.observe({ instruction: "find buttons", page: fakePage, timeout: timeoutMs, }); throw new Error("Expected ObserveTimeoutError to be thrown"); } catch (error) { expect(error).toBeInstanceOf(ObserveTimeoutError); expect((error as ObserveTimeoutError).message).toContain("observe()"); expect((error as ObserveTimeoutError).message).toContain( `${timeoutMs}ms`, ); expect((error as ObserveTimeoutError).name).toBe("ObserveTimeoutError"); } }); it("aborts result processing when timeout expires", async () => { const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot); captureHybridSnapshotMock.mockResolvedValue({ combinedTree: "tree content", combinedXpathMap: { "1-0": "/html/body/button" }, combinedUrlMap: {}, }); const observeInferenceMock = vi.mocked(observeInference); // Timeout before LLM call vi.mocked(createTimeoutGuard).mockImplementation( (timeoutMs, errorFactory) => { let calls = 0; return vi.fn(() => { calls += 1; if (calls >= 2) { throw errorFactory ? errorFactory(timeoutMs!) 
: new ObserveTimeoutError(timeoutMs!);
          }
        });
      },
    );

    const handler = buildObserveHandler();
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    await expect(
      handler.observe({
        instruction: "find all interactive elements",
        page: fakePage,
        timeout: 5,
      }),
    ).rejects.toThrow(ObserveTimeoutError);

    // Result mapping/processing never happens
    expect(observeInferenceMock).not.toHaveBeenCalled();
  });
});

/**
 * Success paths. In every test here `createTimeoutGuard` is mocked so that the
 * returned guard does not throw for the timeout value used by the test, and the
 * handler is expected to finish normally.
 *
 * The three "records metrics" tests additionally assert that the token counts
 * and inference time of the mocked inference result are passed to `onMetrics`
 * in the order: prompt, completion, reasoning, cached input, inference time.
 */
describe("No-timeout success paths", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("act() completes successfully without timeout and records metrics", async () => {
    const waitForDomNetworkQuietMock = vi.mocked(waitForDomNetworkQuiet);
    waitForDomNetworkQuietMock.mockResolvedValue(undefined);

    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const { performUnderstudyMethod } = await import(
      "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js"
    );
    const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod);
    performUnderstudyMethodMock.mockResolvedValue(undefined);

    const actInferenceMock = vi.mocked(actInference);
    // The fixture is cast to the type that actInference's promise resolves to.
    actInferenceMock.mockResolvedValue({
      element: {
        elementId: "1-0",
        description: "click button",
        method: "click",
        arguments: [],
      },
      twoStep: false,
      prompt_tokens: 100,
      completion_tokens: 50,
      reasoning_tokens: 10,
      cached_input_tokens: 5,
      inference_time_ms: 500,
    } as ReturnType<typeof actInference> extends Promise<infer T> ? T : never);

    // No timeout - guard never throws
    vi.mocked(createTimeoutGuard).mockImplementation(() => {
      return vi.fn(() => {
        // No-op - never throws
      });
    });

    const metricsCallback = vi.fn();
    const handler = buildActHandler({ onMetrics: metricsCallback });
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    const result = await handler.act({
      instruction: "click button",
      page: fakePage,
      // No timeout specified
    });

    expect(result.success).toBe(true);
    expect(metricsCallback).toHaveBeenCalledWith(
      V3FunctionName.ACT,
      100,
      50,
      10,
      5,
      500,
    );
  });

  it("extract() completes successfully without timeout and records metrics", async () => {
    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: {},
      combinedUrlMap: {},
    });

    const extractInferenceMock = vi.mocked(extractInference);
    // The fixture is cast to the type that extractInference's promise resolves to.
    extractInferenceMock.mockResolvedValue({
      title: "Test Title",
      metadata: { completed: true, progress: "100%" },
      prompt_tokens: 200,
      completion_tokens: 100,
      reasoning_tokens: 20,
      cached_input_tokens: 10,
      inference_time_ms: 800,
    } as ReturnType<typeof extractInference> extends Promise<infer T>
      ? T
      : never);

    // No timeout - guard never throws
    vi.mocked(createTimeoutGuard).mockImplementation(() => {
      return vi.fn(() => {
        // No-op - never throws
      });
    });

    const metricsCallback = vi.fn();
    const handler = buildExtractHandler({ onMetrics: metricsCallback });
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    const result = await handler.extract({
      instruction: "extract title",
      page: fakePage,
      // No timeout specified
    });

    expect(result).toHaveProperty("title", "Test Title");
    expect(metricsCallback).toHaveBeenCalledWith(
      V3FunctionName.EXTRACT,
      200,
      100,
      20,
      10,
      800,
    );
  });

  it("observe() completes successfully without timeout and records metrics", async () => {
    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const observeInferenceMock = vi.mocked(observeInference);
    // The fixture is cast to the type that observeInference's promise resolves to.
    observeInferenceMock.mockResolvedValue({
      elements: [
        {
          elementId: "1-0",
          description: "Submit button",
        },
      ],
      prompt_tokens: 150,
      completion_tokens: 75,
      reasoning_tokens: 15,
      cached_input_tokens: 8,
      inference_time_ms: 600,
    } as ReturnType<typeof observeInference> extends Promise<infer T>
      ? T
      : never);

    // No timeout - guard never throws
    vi.mocked(createTimeoutGuard).mockImplementation(() => {
      return vi.fn(() => {
        // No-op - never throws
      });
    });

    const metricsCallback = vi.fn();
    const handler = buildObserveHandler({ onMetrics: metricsCallback });
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    const result = await handler.observe({
      instruction: "find buttons",
      page: fakePage,
      // No timeout specified
    });

    expect(result).toHaveLength(1);
    expect(result[0]).toHaveProperty("description", "Submit button");
    expect(metricsCallback).toHaveBeenCalledWith(
      V3FunctionName.OBSERVE,
      150,
      75,
      15,
      8,
      600,
    );
  });

  it("act() with zero timeout behaves as no timeout", async () => {
    const waitForDomNetworkQuietMock = vi.mocked(waitForDomNetworkQuiet);
    waitForDomNetworkQuietMock.mockResolvedValue(undefined);

    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const { performUnderstudyMethod } = await import(
      "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js"
    );
    const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod);
    performUnderstudyMethodMock.mockResolvedValue(undefined);

    const actInferenceMock = vi.mocked(actInference);
    actInferenceMock.mockResolvedValue({
      element: {
        elementId: "1-0",
        description: "click button",
        method: "click",
        arguments: [],
      },
      twoStep: false,
      prompt_tokens: 100,
      completion_tokens: 50,
      inference_time_ms: 500,
    } as ReturnType<typeof actInference> extends Promise<infer T> ? T : never);

    // When timeout is 0 or negative, createTimeoutGuard returns a no-op
    vi.mocked(createTimeoutGuard).mockImplementation((timeoutMs) => {
      if (!timeoutMs || timeoutMs <= 0) {
        return vi.fn(() => {
          // No-op
        });
      }
      return vi.fn(() => {
        throw new ActTimeoutError(timeoutMs);
      });
    });

    const handler = buildActHandler();
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    const result = await handler.act({
      instruction: "click button",
      page: fakePage,
      timeout: 0, // Zero timeout should be treated as "no timeout"
    });

    expect(result.success).toBe(true);
  });

  it("act() with negative timeout behaves as no timeout", async () => {
    const waitForDomNetworkQuietMock = vi.mocked(waitForDomNetworkQuiet);
    waitForDomNetworkQuietMock.mockResolvedValue(undefined);

    const captureHybridSnapshotMock = vi.mocked(captureHybridSnapshot);
    captureHybridSnapshotMock.mockResolvedValue({
      combinedTree: "tree content",
      combinedXpathMap: { "1-0": "/html/body/button" },
      combinedUrlMap: {},
    });

    const { performUnderstudyMethod } = await import(
      "../../lib/v3/handlers/handlerUtils/actHandlerUtils.js"
    );
    const performUnderstudyMethodMock = vi.mocked(performUnderstudyMethod);
    performUnderstudyMethodMock.mockResolvedValue(undefined);

    const actInferenceMock = vi.mocked(actInference);
    actInferenceMock.mockResolvedValue({
      element: {
        elementId: "1-0",
        description: "click button",
        method: "click",
        arguments: [],
      },
      twoStep: false,
      prompt_tokens: 100,
      completion_tokens: 50,
      inference_time_ms: 500,
    } as ReturnType<typeof actInference> extends Promise<infer T> ?
T : never);

    vi.mocked(createTimeoutGuard).mockImplementation((timeoutMs) => {
      if (!timeoutMs || timeoutMs <= 0) {
        return vi.fn(() => {
          // No-op
        });
      }
      return vi.fn(() => {
        throw new ActTimeoutError(timeoutMs);
      });
    });

    const handler = buildActHandler();
    const fakePage = {
      mainFrame: vi.fn().mockReturnValue({}),
    } as unknown as Page;

    const result = await handler.act({
      instruction: "click button",
      page: fakePage,
      timeout: -100, // Negative timeout should be treated as "no timeout"
    });

    expect(result.success).toBe(true);
  });
});

/**
 * Signature of the optional metrics hook handed to each handler constructor.
 * Declared once so the three builder option types cannot drift apart.
 */
type MetricsCallback = (
  functionName: V3FunctionName,
  promptTokens: number,
  completionTokens: number,
  reasoningTokens: number,
  cachedInputTokens: number,
  inferenceTimeMs: number,
) => void;

/**
 * Creates the stub LLM pieces shared by every handler builder below: empty
 * client options, a minimal object cast to `LLMClient`, and a mocked resolver
 * that always returns that client.
 */
function createFakeLlm() {
  const clientOptions = {} as ClientOptions;
  const client = {
    type: "openai",
    modelName: "gpt-4o",
    clientOptions,
  } as LLMClient;

  return {
    client,
    clientOptions,
    resolveLlmClient: vi.fn().mockReturnValue(client),
  };
}

interface BuildActHandlerOptions {
  selfHeal?: boolean;
  onMetrics?: MetricsCallback;
}

/**
 * Builds an ActHandler wired to the fake LLM client.
 *
 * @param options - `selfHeal` defaults to `false`; `onMetrics` is forwarded as given.
 * @returns A new ActHandler instance.
 */
function buildActHandler(options: BuildActHandlerOptions = {}): ActHandler {
  const { client, clientOptions, resolveLlmClient } = createFakeLlm();

  // NOTE(review): the meaning of the unnamed positional arguments is defined by
  // the ActHandler constructor, which is not visible here; values are unchanged.
  return new ActHandler(
    client,
    "gpt-4o",
    clientOptions,
    resolveLlmClient,
    undefined,
    false,
    options.selfHeal ?? false,
    options.onMetrics,
    undefined,
  );
}

interface BuildExtractHandlerOptions {
  onMetrics?: MetricsCallback;
}

/**
 * Builds an ExtractHandler wired to the fake LLM client.
 *
 * @param options - `onMetrics` is forwarded to the handler as given.
 * @returns A new ExtractHandler instance.
 */
function buildExtractHandler(
  options: BuildExtractHandlerOptions = {},
): ExtractHandler {
  const { client, clientOptions, resolveLlmClient } = createFakeLlm();

  return new ExtractHandler(
    client,
    "gpt-4o",
    clientOptions,
    resolveLlmClient,
    undefined,
    false,
    false,
    options.onMetrics,
  );
}

interface BuildObserveHandlerOptions {
  onMetrics?: MetricsCallback;
}

/**
 * Builds an ObserveHandler wired to the fake LLM client.
 *
 * @param options - `onMetrics` is forwarded to the handler as given.
 * @returns A new ObserveHandler instance.
 */
function buildObserveHandler(
  options: BuildObserveHandlerOptions = {},
): ObserveHandler {
  const { client, clientOptions, resolveLlmClient } = createFakeLlm();

  return new ObserveHandler(
    client,
    "gpt-4o",
    clientOptions,
    resolveLlmClient,
    undefined,
    false,
    false,
    options.onMetrics,
  );
}

================================================
FILE: packages/core/tests/unit/understudy-command-exception.test.ts
================================================
import { describe, expect, it } from "vitest";
import {
  UnderstudyCommandException,
  StagehandError,
} from "../../lib/v3/types/public/sdkErrors.js";

describe("UnderstudyCommandException", () => {
  it("extends StagehandError", () => {
    const err = new UnderstudyCommandException("test");
    expect(err).toBeInstanceOf(StagehandError);
    expect(err).toBeInstanceOf(Error);
  });

  it("has the correct name", () => {
    const err = new
UnderstudyCommandException("test"); expect(err.name).toBe("UnderstudyCommandException"); }); it("preserves the message", () => { const err = new UnderstudyCommandException("something broke"); expect(err.message).toBe("something broke"); }); it("stores the original error as cause when provided", () => { const original = new Error("root cause"); const err = new UnderstudyCommandException("wrapper message", original); expect(err.cause).toBe(original); expect((err.cause as Error).message).toBe("root cause"); expect((err.cause as Error).stack).toBeDefined(); }); it("stores non-Error cause values", () => { const err = new UnderstudyCommandException("failed", "string cause"); expect(err.cause).toBe("string cause"); }); it("has undefined cause when none is provided", () => { const err = new UnderstudyCommandException("no cause"); expect(err.cause).toBeUndefined(); }); it("generates its own stack trace", () => { const err = new UnderstudyCommandException("test"); expect(err.stack).toBeDefined(); expect(err.stack).toContain("UnderstudyCommandException"); }); it("preserves the original stack via cause for debugging", () => { function deepFunction() { throw new Error("deep error"); } let original: Error; try { deepFunction(); } catch (e) { original = e as Error; } const wrapped = new UnderstudyCommandException(original!.message, original); // The wrapper has its own stack expect(wrapped.stack).toBeDefined(); // The original stack is accessible via cause expect((wrapped.cause as Error).stack).toContain("deepFunction"); }); }); ================================================ FILE: packages/core/tests/unit/xpath-parser.test.ts ================================================ import { describe, expect, it } from "vitest"; import { applyPredicates, parseXPathSteps, type XPathPredicate, } from "../../lib/v3/dom/locatorScripts/xpathParser.js"; describe("parseXPathSteps", () => { describe("basic tag parsing", () => { it("parses a simple absolute path", () => { 
expect(parseXPathSteps("/html/body/div")).toEqual([ { axis: "child", tag: "html", predicates: [] }, { axis: "child", tag: "body", predicates: [] }, { axis: "child", tag: "div", predicates: [] }, ]); }); it("lowercases tag names", () => { const steps = parseXPathSteps("/HTML/BODY"); expect(steps[0].tag).toBe("html"); expect(steps[1].tag).toBe("body"); }); it("treats wildcard correctly", () => { const steps = parseXPathSteps("//*"); expect(steps).toEqual([{ axis: "desc", tag: "*", predicates: [] }]); }); }); describe("axes", () => { it("distinguishes child (/) from descendant (//)", () => { const steps = parseXPathSteps("/html//div/span"); expect(steps).toEqual([ { axis: "child", tag: "html", predicates: [] }, { axis: "desc", tag: "div", predicates: [] }, { axis: "child", tag: "span", predicates: [] }, ]); }); it("handles leading //", () => { const steps = parseXPathSteps("//div"); expect(steps[0].axis).toBe("desc"); }); }); describe("positional indices", () => { it("parses positional index", () => { const steps = parseXPathSteps("/div[1]/span[3]"); expect(steps[0]).toMatchObject({ tag: "div", predicates: [{ type: "index", index: 1 }], }); expect(steps[1]).toMatchObject({ tag: "span", predicates: [{ type: "index", index: 3 }], }); }); it("clamps index to minimum 1", () => { const steps = parseXPathSteps("/div[0]"); expect(steps[0].predicates[0]).toMatchObject({ type: "index", index: 1, }); }); it("keeps multiple positional predicates in order", () => { const steps = parseXPathSteps("//div[2][3]"); expect(steps[0].predicates).toEqual([ { type: "index", index: 2 }, { type: "index", index: 3 }, ]); }); }); describe("attribute predicates", () => { it("parses single attribute predicate with single quotes", () => { const steps = parseXPathSteps("//img[@alt='Stagehand']"); expect(steps).toEqual([ { axis: "desc", tag: "img", predicates: [{ type: "attrEquals", name: "alt", value: "Stagehand" }], }, ]); }); it("parses single attribute predicate with double quotes", () => { 
const steps = parseXPathSteps('//img[@alt="Stagehand"]');
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "alt", value: "Stagehand" },
      ]);
    });

    it("parses multiple attribute predicates", () => {
      const steps = parseXPathSteps("//div[@class='foo'][@id='bar']");
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "class", value: "foo" },
        { type: "attrEquals", name: "id", value: "bar" },
      ]);
    });

    it("parses attribute predicate combined with positional index", () => {
      const steps = parseXPathSteps("//div[@class='item'][2]");
      expect(steps[0]).toMatchObject({
        tag: "div",
        predicates: [
          { type: "attrEquals", name: "class", value: "item" },
          { type: "index", index: 2 },
        ],
      });
    });

    it("parses attribute with hyphenated name", () => {
      const steps = parseXPathSteps("//div[@data-testid='submit']");
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "data-testid", value: "submit" },
      ]);
    });

    it("parses attribute with empty value", () => {
      const steps = parseXPathSteps("//input[@value='']");
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "value", value: "" },
      ]);
    });

    // The next two tests check that brackets inside a quoted value end up in
    // the predicate's `value` unchanged.
    it("parses attribute value containing closing bracket", () => {
      const steps = parseXPathSteps("//div[@title='array[0]']");
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "title", value: "array[0]" },
      ]);
    });

    it("parses attribute value containing multiple brackets", () => {
      const steps = parseXPathSteps("//div[@data-json='[1,2,3]']");
      expect(steps[0].predicates).toEqual([
        { type: "attrEquals", name: "data-json", value: "[1,2,3]" },
      ]);
    });

    // Titled differently from the bracket tests above because this one checks
    // step splitting: the path continues with another step after the predicate.
    it("does not split steps on a closing bracket inside a quoted attribute value", () => {
      // The step splitter should ignore ] characters inside quotes.
const steps = parseXPathSteps("//div[@title='a]b']/span"); expect(steps).toEqual([ { axis: "desc", tag: "div", predicates: [{ type: "attrEquals", name: "title", value: "a]b" }], }, { axis: "child", tag: "span", predicates: [] }, ]); }); it("parses attribute existence predicates", () => { const steps = parseXPathSteps("//iframe[@data-test]"); expect(steps[0].predicates).toEqual([ { type: "attrExists", name: "data-test" }, ]); }); it("parses attribute contains predicates", () => { const steps = parseXPathSteps("//iframe[contains(@src,'checkout')]"); expect(steps[0].predicates).toEqual([ { type: "attrContains", name: "src", value: "checkout" }, ]); }); it("parses attribute starts-with predicates", () => { const steps = parseXPathSteps("//button[starts-with(@id,'save-')]"); expect(steps[0].predicates).toEqual([ { type: "attrStartsWith", name: "id", value: "save-" }, ]); }); }); describe("text predicates", () => { it("parses text equality", () => { const steps = parseXPathSteps("//button[text()='Submit']"); expect(steps[0].predicates).toEqual([ { type: "textEquals", value: "Submit" }, ]); }); it("parses text contains", () => { const steps = parseXPathSteps("//div[contains(text(),'Welcome')]"); expect(steps[0].predicates).toEqual([ { type: "textContains", value: "Welcome" }, ]); }); it("parses normalize-space on text", () => { const steps = parseXPathSteps( "//div[normalize-space(text())='Hello world']", ); expect(steps[0].predicates).toEqual([ { type: "textEquals", value: "Hello world", normalize: true }, ]); }); }); describe("boolean predicates", () => { it("parses and predicates", () => { const steps = parseXPathSteps("//div[@a='x' and @b='y']"); expect(steps[0].predicates).toEqual([ { type: "and", predicates: [ { type: "attrEquals", name: "a", value: "x" }, { type: "attrEquals", name: "b", value: "y" }, ], }, ]); }); it("parses operators without surrounding whitespace", () => { const steps = parseXPathSteps("//div[not(@x)and@y='z']"); 
expect(steps[0].predicates).toEqual([ { type: "and", predicates: [ { type: "not", predicate: { type: "attrExists", name: "x" } }, { type: "attrEquals", name: "y", value: "z" }, ], }, ]); }); it("parses or predicates", () => { const steps = parseXPathSteps("//div[@a='x' or @b='y']"); expect(steps[0].predicates).toEqual([ { type: "or", predicates: [ { type: "attrEquals", name: "a", value: "x" }, { type: "attrEquals", name: "b", value: "y" }, ], }, ]); }); it("parses not predicates", () => { const steps = parseXPathSteps("//button[not(@disabled)]"); expect(steps[0].predicates).toEqual([ { type: "not", predicate: { type: "attrExists", name: "disabled" } }, ]); }); it("does not treat @and as a boolean operator", () => { const steps = parseXPathSteps("//div[@and='x' and @y='z']"); expect(steps[0].predicates).toEqual([ { type: "and", predicates: [ { type: "attrEquals", name: "and", value: "x" }, { type: "attrEquals", name: "y", value: "z" }, ], }, ]); }); }); describe("multi-step with predicates", () => { it("parses complex path with mixed predicates", () => { const steps = parseXPathSteps( "/html/body//div[@class='container']/ul/li[3]", ); expect(steps).toEqual([ { axis: "child", tag: "html", predicates: [] }, { axis: "child", tag: "body", predicates: [] }, { axis: "desc", tag: "div", predicates: [ { type: "attrEquals", name: "class", value: "container" }, ], }, { axis: "child", tag: "ul", predicates: [] }, { axis: "child", tag: "li", predicates: [{ type: "index", index: 3 }] }, ]); }); }); describe("edge cases", () => { it("returns empty array for empty string", () => { expect(parseXPathSteps("")).toEqual([]); }); it("strips xpath= prefix", () => { const steps = parseXPathSteps("xpath=//div"); expect(steps).toEqual([{ axis: "desc", tag: "div", predicates: [] }]); }); it("strips XPATH= prefix (case-insensitive)", () => { const steps = parseXPathSteps("XPATH=//div"); expect(steps).toEqual([{ axis: "desc", tag: "div", predicates: [] }]); }); it("handles forward slashes 
inside attribute values", () => {
      const steps = parseXPathSteps("//a[@href='/api/endpoint']");
      expect(steps).toEqual([
        {
          axis: "desc",
          tag: "a",
          predicates: [
            { type: "attrEquals", name: "href", value: "/api/endpoint" },
          ],
        },
      ]);
    });

    it("handles URL attribute values with multiple slashes", () => {
      const steps = parseXPathSteps(
        "//a[@data-url='http://example.com/path/to/page']",
      );
      expect(steps).toEqual([
        {
          axis: "desc",
          tag: "a",
          predicates: [
            {
              type: "attrEquals",
              name: "data-url",
              value: "http://example.com/path/to/page",
            },
          ],
        },
      ]);
    });

    it("handles whitespace", () => {
      const steps = parseXPathSteps(" //div ");
      expect(steps.length).toBe(1);
      expect(steps[0].tag).toBe("div");
    });
  });
});

describe("applyPredicates", () => {
  // Minimal Element stand-in: exposes only `localName` and an `id` attribute lookup.
  const makeElement = (id: string): Element => {
    return {
      localName: "div",
      getAttribute: (name: string) => (name === "id" ? id : null),
    } as unknown as Element;
  };

  it("applies positional predicates sequentially", () => {
    const elements = ["a", "b", "c", "d"].map(makeElement);
    // Presumably [2] narrows the list to one element, leaving nothing for [3]
    // to select; the expected result below is an empty array.
    const predicates: XPathPredicate[] = [
      { type: "index", index: 2 },
      { type: "index", index: 3 },
    ];
    expect(applyPredicates(elements, predicates)).toEqual([]);
  });
});

================================================
FILE: packages/core/tests/unit/xpath-resolver.test.ts
================================================
import { JSDOM } from "jsdom";
import { afterAll, beforeAll, beforeEach, describe, expect, it } from "vitest";
import {
  countXPathMatches,
  resolveXPathAtIndex,
} from "../../lib/v3/dom/locatorScripts/xpathResolver.js";

/** DOM globals that these tests install on `globalThis` from a jsdom window. */
type DomGlobals = {
  window: Window & typeof globalThis;
  document: Document;
  Node: typeof Node;
  NodeFilter: typeof NodeFilter;
  Element: typeof Element;
  HTMLElement: typeof HTMLElement;
  Document: typeof Document;
  DocumentFragment: typeof DocumentFragment;
  ShadowRoot: typeof ShadowRoot;
  XPathResult: typeof XPathResult;
};

const globalRef = globalThis as typeof globalThis & Partial<DomGlobals>;

// Snapshot of whatever was on globalThis before the suite ran, so it can be
// put back afterwards.
const originalGlobals: Partial<DomGlobals> = {
  window: globalRef.window,
  document: globalRef.document,
  Node: globalRef.Node,
  NodeFilter: globalRef.NodeFilter,
  Element: globalRef.Element,
  HTMLElement: globalRef.HTMLElement,
  Document: globalRef.Document,
  DocumentFragment: globalRef.DocumentFragment,
  ShadowRoot: globalRef.ShadowRoot,
  XPathResult: globalRef.XPathResult,
};

let dom: JSDOM;

/** Copies the jsdom window and its DOM constructors onto `globalThis`. */
const installDomGlobals = () => {
  const win = dom.window;
  globalRef.window = win as unknown as Window & typeof globalThis;
  globalRef.document = win.document;
  globalRef.Node = win.Node as unknown as typeof Node;
  globalRef.NodeFilter = win.NodeFilter as unknown as typeof NodeFilter;
  globalRef.Element = win.Element as unknown as typeof Element;
  globalRef.HTMLElement = win.HTMLElement as unknown as typeof HTMLElement;
  globalRef.Document = win.Document as unknown as typeof Document;
  globalRef.DocumentFragment =
    win.DocumentFragment as unknown as typeof DocumentFragment;
  globalRef.ShadowRoot = win.ShadowRoot as unknown as typeof ShadowRoot;
  globalRef.XPathResult = win.XPathResult as unknown as typeof XPathResult;
};

/**
 * Puts the saved globals back: keys whose saved value was `undefined` are
 * deleted from `globalThis`, all others are reassigned.
 */
const restoreDomGlobals = () => {
  for (const [key, value] of Object.entries(originalGlobals)) {
    if (value === undefined) {
      delete (globalRef as Record<string, unknown>)[key];
    } else {
      (globalRef as Record<string, unknown>)[key] = value;
    }
  }
};

describe("xpathResolver composed traversal", () => {
  beforeAll(() => {
    dom = new JSDOM("");
    installDomGlobals();
  });

  afterAll(() => {
    dom.window.close();
    restoreDomGlobals();
  });

  beforeEach(() => {
    document.body.innerHTML = "";
  });

  it("counts matches across light + shadow DOM without double counting", () => {
    // NOTE(review): the fixture markup was missing from this copy of the file
    // (empty string literals). It is reconstructed from what the tests require:
    // an element with id "host" that is not a <div>, and four <div>s in total.
    // The <span> host tag and the ids in this test are assumptions — confirm
    // against the repository.
    document.body.innerHTML =
      '<div id="light-1"></div>' +
      '<span id="host"></span>' +
      '<div id="light-2"></div>';

    const host = document.getElementById("host") as HTMLElement;
    const shadow = host.attachShadow({ mode: "open" });
    shadow.innerHTML = '<div id="shadow-1"></div><div id="shadow-2"></div>';

    expect(countXPathMatches("//div")).toBe(4);
  });

  it("resolves nth over composed tree in document-order DFS", () => {
    // NOTE(review): reconstructed fixture (see the note in the previous test).
    // The div ids and their order come from the assertions below; the <span>
    // host tag is an assumption — confirm against the repository.
    document.body.innerHTML =
      '<div id="light-1"></div>' +
      '<span id="host"></span>' +
      '<div id="light-2"></div>';

    const host = document.getElementById("host") as HTMLElement;
    const shadow = host.attachShadow({ mode: "open" });
    shadow.innerHTML = '<div id="shadow-1"></div><div id="shadow-2"></div>';

    expect(resolveXPathAtIndex("//div", 0)?.id).toBe("light-1");
    expect(resolveXPathAtIndex("//div", 1)?.id).toBe("shadow-1");
    expect(resolveXPathAtIndex("//div", 2)?.id).toBe("shadow-2");
    expect(resolveXPathAtIndex("//div", 3)?.id).toBe("light-2");
  });
});

================================================
FILE: packages/core/tests/unit/zod-enum-compatibility.test.ts
================================================
import { describe, expect, it } from "vitest";
import * as z3 from "zod/v3";
import { z as z4 } from "zod";
import { SupportedUnderstudyAction } from "../../lib/v3/types/private/handlers.js";

/**
 * Tests for Zod v3/v4 compatibility with the SupportedUnderstudyAction enum.
 *
 * This test ensures that z.enum() works correctly with both Zod v3 and v4.
 * The key issue is that z.enum() in Zod v3 does NOT accept TypeScript enums directly -
 * it only accepts string literal tuples. For TypeScript enums, you need to use
 * Object.values() to convert the enum to an array first.
 *
 * In Zod v4, z.enum() was updated to accept TypeScript enums directly, but for
 * backwards compatibility, we should use Object.values() which works with both.
* * See PR #1613: https://github.com/browserbase/stagehand/pull/1613 */ describe("SupportedUnderstudyAction enum Zod compatibility", () => { const testInput = { elementId: "1-2", method: "click", arguments: [] as string[], }; const invalidInput = { elementId: "1-2", method: "invalidMethod", arguments: [] as string[], }; it("Object.values(SupportedUnderstudyAction) produces correct array for z.enum()", () => { const enumValues = Object.values( SupportedUnderstudyAction, ) as unknown as readonly [string, ...string[]]; expect(enumValues).toContain("click"); expect(enumValues).toContain("fill"); expect(enumValues).toContain("type"); expect(enumValues).toContain("press"); expect(enumValues).toContain("scrollTo"); expect(enumValues).toContain("nextChunk"); expect(enumValues).toContain("prevChunk"); expect(enumValues).toContain("selectOptionFromDropdown"); expect(enumValues).toContain("hover"); expect(enumValues).toContain("doubleClick"); expect(enumValues).toContain("dragAndDrop"); expect(enumValues.length).toBe(11); }); it("Zod v3 z.enum() with Object.values(SupportedUnderstudyAction) works correctly", () => { const enumValues = Object.values( SupportedUnderstudyAction, ) as unknown as readonly [string, ...string[]]; const schema = z3.z.object({ elementId: z3.z.string(), method: z3.z.enum(enumValues), arguments: z3.z.array(z3.z.string()), }); // Valid input should pass const validResult = schema.safeParse(testInput); expect(validResult.success).toBe(true); if (validResult.success) { expect(validResult.data.method).toBe("click"); } // Invalid input should fail const invalidResult = schema.safeParse(invalidInput); expect(invalidResult.success).toBe(false); }); it("Zod v4 z.enum() with Object.values(SupportedUnderstudyAction) works correctly", () => { const enumValues = Object.values( SupportedUnderstudyAction, ) as unknown as readonly [string, ...string[]]; const schema = z4.object({ elementId: z4.string(), method: z4.enum(enumValues), arguments: z4.array(z4.string()), 
}); // Valid input should pass const validResult = schema.safeParse(testInput); expect(validResult.success).toBe(true); if (validResult.success) { expect(validResult.data.method).toBe("click"); } // Invalid input should fail const invalidResult = schema.safeParse(invalidInput); expect(invalidResult.success).toBe(false); }); it("Zod v3 z.enum() with raw TypeScript enum throws error on parse", () => { // This demonstrates the bug that PR #1613 would introduce // In Zod v3, z.enum() does NOT accept TypeScript enums directly // The schema creation might succeed, but parsing will fail const schema = z3.z.object({ elementId: z3.z.string(), // eslint-disable-next-line @typescript-eslint/no-explicit-any method: z3.z.enum(SupportedUnderstudyAction as any), arguments: z3.z.array(z3.z.string()), }); // This should throw an error because the enum is not iterable expect(() => schema.safeParse(testInput)).toThrow("object is not iterable"); }); it("Zod v4 z.enum() with raw TypeScript enum works (but not v3 compatible)", () => { // Zod v4 allows passing TypeScript enums directly to z.enum() // But this approach is NOT backwards compatible with v3 const schema = z4.object({ elementId: z4.string(), method: z4.enum(SupportedUnderstudyAction), arguments: z4.array(z4.string()), }); // In v4, this works fine const validResult = schema.safeParse(testInput); expect(validResult.success).toBe(true); }); it("All SupportedUnderstudyAction values are valid enum options", () => { const enumValues = Object.values( SupportedUnderstudyAction, ) as unknown as readonly [string, ...string[]]; // Test with both v3 and v4 schemas const v3Schema = z3.z.enum(enumValues); const v4Schema = z4.enum(enumValues); for (const action of enumValues) { expect(v3Schema.safeParse(action).success).toBe(true); expect(v4Schema.safeParse(action).success).toBe(true); } }); }); ================================================ FILE: packages/core/tsconfig.json ================================================ { "extends": 
"../../tsconfig.base.json", "compilerOptions": { "baseUrl": "../../", "rootDir": ".", "outDir": "./dist/esm", "allowJs": true, "paths": { "@browserbasehq/stagehand": ["packages/core/lib/v3/index.ts"], "@browserbasehq/stagehand/*": ["packages/core/lib/*"], "*": ["node_modules/*", "packages/core/lib/types/*"], "@/*": ["./*"] } }, "include": [ "lib/**/*.ts", "tests/**/*.ts", "lib/v3/cli.js" ], "exclude": ["node_modules", "dist", "lib/v3/dom/gen*.ts"] } ================================================ FILE: packages/core/vitest.cjs.config.mjs ================================================ import { defineConfig } from "vitest/config"; import path from "node:path"; import { fileURLToPath } from "node:url"; const rootDir = path.dirname(fileURLToPath(import.meta.url)); export default defineConfig({ resolve: { alias: { "@browserbasehq/stagehand": path.join(rootDir, "dist", "cjs", "index.js"), }, }, test: { environment: "node", include: ["**/dist/cjs/tests/unit/**/*.test.js"], }, }); ================================================ FILE: packages/core/vitest.config.ts ================================================ import { defineConfig } from "vitest/config"; import path from "node:path"; import { fileURLToPath } from "node:url"; const rootDir = path.dirname(fileURLToPath(import.meta.url)); export default defineConfig({ resolve: { alias: { "@browserbasehq/stagehand": path.join(rootDir, "dist", "esm", "index.js"), }, }, test: { environment: "node", include: ["**/dist/esm/tests/unit/**/*.test.js"], }, }); ================================================ FILE: packages/core/vitest.esm.config.mjs ================================================ import { defineConfig } from "vitest/config"; import path from "node:path"; import { fileURLToPath } from "node:url"; const rootDir = path.dirname(fileURLToPath(import.meta.url)); export default defineConfig({ resolve: { alias: { "@browserbasehq/stagehand": path.join(rootDir, "dist", "esm", "index.js"), }, }, test: { environment: 
"node", include: ["**/dist/esm/tests/unit/**/*.test.js"], }, }); ================================================ FILE: packages/docs/.gitignore ================================================ node_modules downloads .DS_Store ================================================ FILE: packages/docs/README.md ================================================ # Mintlify Starter Kit Click on `Use this template` to copy the Mintlify starter kit. The starter kit contains examples including - Guide pages - Navigation - Customizations - API Reference pages - Use of popular components ### Development Install dependencies with pnpm ``` pnpm install ``` Run the following command at the root of your documentation (where mint.json is) ``` pnpm mintlify dev ``` ### Publishing Changes Install our Github App to auto propagate changes from your repo to your deployment. Changes will be deployed to production automatically after pushing to the default branch. Find the link to install on your dashboard. #### Troubleshooting - Mintlify dev isn't running - Run `mintlify install` it'll re-install dependencies. 
- Page loads as a 404 - Make sure you are running in a folder with `mint.json` ================================================ FILE: packages/docs/docs.json ================================================ { "$schema": "https://mintlify.com/docs.json", "theme": "willow", "name": "🤘 Stagehand", "colors": { "primary": "#B88100", "light": "#FFC83C", "dark": "#FFC83C" }, "favicon": "/images/favicon.svg", "seo": { "indexing": "all", "metatags": { "og:type": "website", "og:site_name": "Stagehand Docs" } }, "openapi": "https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml", "navigation": { "versions": [ { "version": "v3", "dropdowns": [ { "dropdown": "TypeScript", "icon": "code", "pages": [ "v3/first-steps/introduction" ], "groups": [ { "group": "First Steps", "pages": [ "v3/first-steps/introduction", "v3/first-steps/quickstart", "v3/first-steps/installation", "v3/first-steps/ai-rules" ] }, { "group": "The Basics", "pages": [ "v3/basics/agent", "v3/basics/act", "v3/basics/extract", "v3/basics/observe", "v3/basics/evals" ] }, { "group": "Configuration", "pages": [ "v3/configuration/browser", "v3/configuration/observability", "v3/configuration/logging", "v3/configuration/models" ] }, { "group": "Best Practices", "pages": [ "v3/best-practices/caching", "v3/best-practices/cost-optimization", "v3/best-practices/deterministic-agent", "v3/best-practices/using-multiple-tabs", "v3/best-practices/deployments", "v3/best-practices/history", "v3/best-practices/computer-use", "v3/best-practices/agent-fallbacks", "v3/best-practices/prompting-best-practices", "v3/best-practices/mcp-integrations", "v3/best-practices/speed-optimization" ] }, { "group": "Integrations", "pages": [ { "group": "MCP Server", "pages": [ "v3/integrations/mcp/introduction", "v3/integrations/mcp/setup", "v3/integrations/mcp/tools", "v3/integrations/mcp/configuration" ] }, { "group": "CrewAI", "pages": [ "v3/integrations/crew-ai/introduction", "v3/integrations/crew-ai/configuration" ] }, { 
"group": "Langchain", "pages": [ "v3/integrations/langchain/introduction", "v3/integrations/langchain/configuration" ] }, { "group": "Next.js + Vercel", "pages": [ "v3/integrations/vercel/introduction", "v3/integrations/vercel/configuration" ] }, { "group": "Convex", "pages": [ "v3/integrations/convex/introduction", "v3/integrations/convex/configuration" ] }, "v3/integrations/playwright", "v3/integrations/puppeteer", "v3/integrations/selenium" ] }, { "group": "Reference", "pages": [ "v3/references/stagehand", "v3/references/agent", "v3/references/act", "v3/references/extract", "v3/references/observe", "v3/references/context", "v3/references/page", "v3/references/locator", "v3/references/deeplocator", "v3/references/response" ] }, { "group": "Migration Guides", "pages": [ "v3/migrations/v2", "v3/migrations/python" ] } ] }, { "dropdown": "Python", "icon": "code", "pages": [ "v3/sdk/python" ], "groups": [ { "group": "SDK Reference", "pages": [ "v3/sdk/python" ] }, { "group": "API Reference", "openapi": { "source": "https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml", "directory": "v3/api-reference/python" }, "pages": [ "POST /v1/sessions/start", "POST /v1/sessions/{id}/navigate", "POST /v1/sessions/{id}/act", "POST /v1/sessions/{id}/observe", "POST /v1/sessions/{id}/extract", "POST /v1/sessions/{id}/agentExecute", "POST /v1/sessions/{id}/end", "GET /v1/sessions/{id}/replay" ] } ] }, { "dropdown": "Java", "icon": "code", "pages": [ "v3/sdk/java" ], "groups": [ { "group": "SDK Reference", "pages": [ "v3/sdk/java" ] }, { "group": "API Reference", "openapi": { "source": "https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml", "directory": "v3/api-reference/java" }, "pages": [ "POST /v1/sessions/start", "POST /v1/sessions/{id}/navigate", "POST /v1/sessions/{id}/act", "POST /v1/sessions/{id}/observe", "POST /v1/sessions/{id}/extract", "POST /v1/sessions/{id}/agentExecute", "POST /v1/sessions/{id}/end", "GET 
/v1/sessions/{id}/replay" ] } ] }, { "dropdown": "Go", "icon": "code", "pages": [ "v3/sdk/go" ], "groups": [ { "group": "SDK Reference", "pages": [ "v3/sdk/go" ] }, { "group": "API Reference", "openapi": { "source": "https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml", "directory": "v3/api-reference/go" }, "pages": [ "POST /v1/sessions/start", "POST /v1/sessions/{id}/navigate", "POST /v1/sessions/{id}/act", "POST /v1/sessions/{id}/observe", "POST /v1/sessions/{id}/extract", "POST /v1/sessions/{id}/agentExecute", "POST /v1/sessions/{id}/end", "GET /v1/sessions/{id}/replay" ] } ] }, { "dropdown": "Ruby", "icon": "code", "pages": [ "v3/sdk/ruby" ], "groups": [ { "group": "SDK Reference", "pages": [ "v3/sdk/ruby" ] }, { "group": "API Reference", "openapi": { "source": "https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml", "directory": "v3/api-reference/ruby" }, "pages": [ "POST /v1/sessions/start", "POST /v1/sessions/{id}/navigate", "POST /v1/sessions/{id}/act", "POST /v1/sessions/{id}/observe", "POST /v1/sessions/{id}/extract", "POST /v1/sessions/{id}/agentExecute", "POST /v1/sessions/{id}/end", "GET /v1/sessions/{id}/replay" ] } ] } ] }, { "version": "v2", "groups": [ { "group": "First Steps", "pages": [ "v2/first-steps/introduction", "v2/first-steps/quickstart", "v2/first-steps/installation", "v2/first-steps/ai-rules" ] }, { "group": "The Basics", "pages": [ "v2/basics/agent", "v2/basics/act", "v2/basics/extract", "v2/basics/observe" ] }, { "group": "Configuration", "pages": [ "v2/configuration/browser", "v2/configuration/observability", "v2/configuration/logging", "v2/configuration/models", "v2/configuration/evals" ] }, { "group": "Best Practices", "pages": [ "v2/best-practices/caching", "v2/best-practices/cost-optimization", "v2/best-practices/using-multiple-tabs", "v2/best-practices/working-with-iframes", "v2/best-practices/deployments", "v2/best-practices/computer-use", "v2/best-practices/contributing", 
"v2/best-practices/playwright-interop", "v2/best-practices/build-agent", "v2/best-practices/agent-fallbacks", "v2/best-practices/prompting-best-practices", "v2/best-practices/mcp-integrations", "v2/best-practices/speed-optimization" ] }, { "group": "Integrations", "pages": [ { "group": "MCP Server", "pages": [ "v2/integrations/mcp/introduction", "v2/integrations/mcp/setup", "v2/integrations/mcp/tools", "v2/integrations/mcp/configuration" ] }, { "group": "CrewAI", "pages": [ "v2/integrations/crew-ai/introduction", "v2/integrations/crew-ai/configuration" ] }, { "group": "Langchain", "pages": [ "v2/integrations/langchain/introduction", "v2/integrations/langchain/configuration" ] }, { "group": "Next.js + Vercel", "pages": [ "v2/integrations/vercel/introduction", "v2/integrations/vercel/configuration" ] } ] }, { "group": "Reference", "pages": [ "v2/references/stagehand", "v2/references/act", "v2/references/extract", "v2/references/observe", "v2/references/agent" ] } ] } ], "global": { "anchors": [ { "anchor": "Discord", "href": "https://stagehand.dev/discord", "icon": "discord" }, { "anchor": "GitHub", "href": "https://github.com/browserbase/stagehand", "icon": "github" }, { "anchor": "Changelog", "href": "https://github.com/browserbase/stagehand/releases", "icon": "scroll" } ] } }, "logo": { "light": "/logo/light_logo.png", "dark": "/logo/dark_logo.png", "href": "https://stagehand.dev" }, "navbar": { "links": [ { "label": "Discord", "href": "https://stagehand.dev/discord" }, { "label": "Support", "href": "mailto:support@browserbase.com" } ] }, "footer": { "socials": { "discord": "https://stagehand.dev/discord", "x": "https://x.com/stagehanddev", "github": "https://github.com/browserbase/stagehand", "linkedin": "https://linkedin.com/company/browserbasehq" } }, "integrations": { "posthog": { "apiKey": "phc_hmwkFrlc9UVrdE1jyG8AEKoCQCSr8dScjsRpKoLBEiV", "apiHost": "https://us.i.posthog.com" } }, "contextual": { "options": [ "copy", "chatgpt", "claude", "view" ] }, 
"redirects": [ { "source": "/first-steps/:slug*", "destination": "/v3/first-steps/:slug*" }, { "source": "/basics/:slug*", "destination": "/v3/basics/:slug*" }, { "source": "/configuration/:slug*", "destination": "/v3/configuration/:slug*" }, { "source": "/best-practices/:slug*", "destination": "/v3/best-practices/:slug*" }, { "source": "/integrations/mcp/:slug*", "destination": "/v3/integrations/mcp/:slug*" }, { "source": "/integrations/crew-ai/:slug*", "destination": "/v3/integrations/crew-ai/:slug*" }, { "source": "/integrations/langchain/:slug*", "destination": "/v3/integrations/langchain/:slug*" }, { "source": "/integrations/vercel/:slug*", "destination": "/v3/integrations/vercel/:slug*" }, { "source": "/integrations/convex/:slug*", "destination": "/v3/integrations/convex/:slug*" }, { "source": "/references/:slug*", "destination": "/v3/references/:slug*" }, { "source": "/migrations/:slug*", "destination": "/v3/migrations/:slug*" } ] } ================================================ FILE: packages/docs/language-selector.js ================================================ // Language switcher for Stagehand docs // Handles: 1) Sidebar language dropdown selection 2) Code block language syncing (function() { // ============================================ // CONFIGURATION // ============================================ const DROPDOWN_LANGUAGES = ['TypeScript', 'Python', 'Java', 'Go', 'Ruby']; const LANGUAGE_MAP = { 'TypeScript': 'Javascript', 'Python': 'Python', 'Java': 'Java', 'Go': 'Go', 'Ruby': 'Ruby' }; const CODE_BLOCK_LANGUAGES = ['Javascript', 'Python', 'Go', 'Java', 'Ruby', 'cURL', 'PHP']; const SDK_PATH_MAP = { 'Python': 'python', 'Java': 'java', 'Go': 'go', 'Ruby': 'ruby' }; const NAVIGATION_MAP = { 'TypeScript': '/v3/first-steps/introduction', 'Python': '/v3/sdk/python', 'Java': '/v3/sdk/java', 'Go': '/v3/sdk/go', 'Ruby': '/v3/sdk/ruby' }; let currentSelectedLanguage = 'TypeScript'; let isSelecting = false; // 
============================================
  // UTILITIES
  // ============================================

  // Run callback after two nested requestAnimationFrame calls, i.e. on the
  // second upcoming animation frame rather than the very next one.
  const onNextFrame = (fn) => requestAnimationFrame(() => requestAnimationFrame(fn));

  // Inject the stylesheet that defines the helper classes this script toggles:
  // `stagehand-selecting`, `stagehand-hide-version-switcher` and
  // `stagehand-sdk-hidden`.
  const dropdownStyle = document.createElement('style');
  dropdownStyle.id = 'stagehand-language-style';
  dropdownStyle.textContent = ` /* Hide dropdown during programmatic selection */ .stagehand-selecting [role="menu"], .stagehand-selecting [role="listbox"] { opacity: 0 !important; pointer-events: none !important; transition: none !important; } /* Hide version switcher when non-TypeScript language is selected */ .stagehand-hide-version-switcher .stagehand-version-switcher { display: none !important; } /* Hide SDK reference items that don't match the selected language */ li[id^="/v3/sdk/"].stagehand-sdk-hidden { display: none !important; } `;
  document.head.appendChild(dropdownStyle);

  // ============================================
  // SDK REFERENCE FILTERING
  // ============================================

  /**
   * Toggle the `stagehand-sdk-hidden` class on every sidebar `<li>` whose id
   * starts with "/v3/sdk/".
   *
   * An item stays visible only when the selected language is not TypeScript
   * and the last path segment of the item's id equals that language's entry
   * in SDK_PATH_MAP; every other item is hidden.
   */
  function updateSDKReferenceVisibility() {
    // Get the SDK path for the current language
    // (undefined for TypeScript, which has no SDK_PATH_MAP entry)
    const currentSDKPath = SDK_PATH_MAP[currentSelectedLanguage];

    // Find all SDK reference items in the sidebar
    const sdkItems = document.querySelectorAll('li[id^="/v3/sdk/"]');

    sdkItems.forEach(item => {
      const itemId = item.getAttribute('id') || '';
      // Extract the language from the id (e.g., "/v3/sdk/python" -> "python")
      const itemLang = itemId.split('/').pop();

      if (currentSelectedLanguage === 'TypeScript') {
        // For TypeScript, hide all SDK references (they don't apply)
        item.classList.add('stagehand-sdk-hidden');
      } else if (currentSDKPath && itemLang === currentSDKPath) {
        // Show the SDK that matches the current language
        item.classList.remove('stagehand-sdk-hidden');
      } else {
        // Hide SDKs that don't match
        item.classList.add('stagehand-sdk-hidden');
      }
    });
  }

  // ============================================
  // VERSION SWITCHER VISIBILITY
  //
============================================

  /**
   * Locate the docs version switcher button.
   *
   * Scans every <button> for one whose trimmed, lower-cased text is exactly
   * "v" followed by digits and that contains a `.lucide-chevron-down` element.
   * Returns the first match, or null when none is found.
   */
  function getVersionSwitcher() {
    // Find the version switcher button (contains "v3" or "v2" and has chevron-down)
    const buttons = document.querySelectorAll('button');
    for (const btn of buttons) {
      const text = (btn.textContent || '').trim().toLowerCase();
      // Check if it's a version button (v2, v3, etc.) with chevron icon
      if (/^v\d+$/.test(text) && btn.querySelector('.lucide-chevron-down')) {
        return btn;
      }
    }
    return null;
  }

  /**
   * Tag the version switcher with `stagehand-version-switcher` and toggle the
   * body-level `stagehand-hide-version-switcher` class so the injected CSS
   * hides it for every language except TypeScript.
   * Does nothing when the switcher button is not currently in the DOM.
   */
  function updateVersionSwitcherVisibility() {
    const versionSwitcher = getVersionSwitcher();
    if (versionSwitcher) {
      // Mark the version switcher so we can target it with CSS
      versionSwitcher.classList.add('stagehand-version-switcher');

      // Show version switcher only for TypeScript
      if (currentSelectedLanguage === 'TypeScript') {
        document.body.classList.remove('stagehand-hide-version-switcher');
      } else {
        document.body.classList.add('stagehand-hide-version-switcher');
      }
    }
  }

  // ============================================
  // SIDEBAR DROPDOWN FUNCTIONS
  // ============================================

  /**
   * Find the sidebar language dropdown button: the first <button> whose
   * trimmed text is exactly one of DROPDOWN_LANGUAGES. Returns null if absent.
   */
  function getDropdownButton() {
    const buttons = document.querySelectorAll('button');
    for (const btn of buttons) {
      const text = (btn.textContent || '').trim();
      if (DROPDOWN_LANGUAGES.includes(text)) {
        return btn;
      }
    }
    return null;
  }

  /**
   * Return the first element in the document carrying role="menu"
   * (or null when no menu is rendered).
   */
  function getDropdownMenu() {
    return document.querySelector('menu[role="menu"], [role="menu"]');
  }

  /**
   * Overwrite the label of the language dropdown button by setting the text
   * of the first <p> inside it. No-op when the button or its <p> is missing.
   */
  function updateButtonText(newText) {
    const button = getDropdownButton();
    if (!button) return;
    const paragraph = button.querySelector('p');
    if (paragraph) {
      paragraph.textContent = newText;
    }
  }

  /**
   * Make the check mark in the open dropdown menu reflect
   * `currentSelectedLanguage`.
   *
   * Pass 1 maps each language to its menu item and that item's check icon (if
   * any), remembering any existing check icon as a clone source.
   * Pass 2 shows or hides existing icons via inline opacity/visibility; when
   * the selected item has no icon, a clone of a remembered one is appended,
   * unless the target already contains a check icon.
   */
  function updateDropdownCheckIndicator() {
    const menu = getDropdownMenu();
    if (!menu) return;
    const menuItems = menu.querySelectorAll('a, [role="menuitem"]');
    const checkIconsMap = new Map();
    let anyCheckIcon = null;

    for (const item of menuItems) {
      const text = (item.textContent || '').trim();
      const checkIcon = item.querySelector('.lucide-check, [class*="lucide-check"], svg[class*="check"]');
      // Substring match: the first language in DROPDOWN_LANGUAGES order that
      // occurs in the item text wins.
      for (const lang of DROPDOWN_LANGUAGES) {
        if (text.includes(lang)) {
          checkIconsMap.set(lang, { item, checkIcon });
          if (checkIcon) {
            anyCheckIcon = checkIcon;
          }
          break;
        }
      }
    }

    for (const [lang, { item, checkIcon }] of checkIconsMap) {
      const shouldBeSelected = lang === currentSelectedLanguage;
      if (checkIcon) {
        checkIcon.style.opacity = shouldBeSelected ? '1' : '0';
        checkIcon.style.visibility = shouldBeSelected ? 'visible' : 'hidden';
      } else if (shouldBeSelected && anyCheckIcon) {
        const clonedCheck = anyCheckIcon.cloneNode(true);
        clonedCheck.style.opacity = '1';
        clonedCheck.style.visibility = 'visible';
        const targetSpan = item.querySelector('span:last-child') || item;
        // Guard against appending a second clone on repeated calls
        if (targetSpan.querySelector('.lucide-check, [class*="lucide-check"]') === null) {
          targetSpan.appendChild(clonedCheck);
        }
      }
    }
  }

  // ============================================
  // CODE BLOCK LANGUAGE SELECTOR FUNCTIONS
  // ============================================

  /**
   * Dispatch a synthetic pointerdown, mousedown, pointerup, mouseup, click
   * sequence on `element`, positioned at the centre of its bounding rect.
   * Pointer* events use PointerEvent, the rest MouseEvent. No-op for a falsy
   * element.
   */
  function simulateClick(element) {
    if (!element) return;
    const rect = element.getBoundingClientRect();
    const x = rect.left + rect.width / 2;
    const y = rect.top + rect.height / 2;

    ['pointerdown', 'mousedown', 'pointerup', 'mouseup', 'click'].forEach(eventType => {
      const EventClass = eventType.startsWith('pointer') ? PointerEvent : MouseEvent;
      element.dispatchEvent(new EventClass(eventType, {
        view: window,
        bubbles: true,
        cancelable: true,
        clientX: x,
        clientY: y,
        button: 0,
        buttons: 1,
        isPrimary: true,
        pointerType: 'mouse'
      }));
    });
  }

  /**
   * Find the code-block language picker.
   *
   * Looks for a <p> whose trimmed text is one of CODE_BLOCK_LANGUAGES and
   * whose closest <div> ancestor contains a `.lucide-chevrons-up-down` icon.
   * Returns `{ element, language }` for the first match, or null.
   */
  function getCodeBlockLanguageDropdown() {
    const paragraphs = document.querySelectorAll('p');
    for (const p of paragraphs) {
      const text = (p.textContent || '').trim();
      if (CODE_BLOCK_LANGUAGES.includes(text)) {
        const parentDiv = p.closest('div');
        if (parentDiv && parentDiv.querySelector('.lucide-chevrons-up-down')) {
          return { element: parentDiv, language: text };
        }
      }
    }
    return null;
  }

  /**
   * Poll once per animation frame until a menu item / option whose text equals
   * `targetLanguage` exists, then click it and, two frames later, clear the
   * `stagehand-selecting` class and the `isSelecting` flag.
   *
   * Gives up once `attempts` exceeds 30: clears the selecting state and clicks
   * the body.
   */
  function waitForCodeBlockMenuAndSelect(targetLanguage, attempts = 0) {
    if (attempts > 30) {
      document.body.classList.remove('stagehand-selecting');
      // presumably dismisses the still-open menu — verify against the host UI
      document.body.click();
      isSelecting = false;
      return;
    }

    const menuItems = document.querySelectorAll('[role="menuitem"], [role="option"]');
    if (menuItems.length === 0) {
      // Menu not rendered yet; try again on the next frame
      requestAnimationFrame(() => waitForCodeBlockMenuAndSelect(targetLanguage, attempts + 1));
      return;
    }

    for (const item of menuItems) {
      const text = (item.textContent || '').trim();
      if (text === targetLanguage) {
        simulateClick(item);
        onNextFrame(() => {
          document.body.classList.remove('stagehand-selecting');
          isSelecting = false;
        });
        return;
      }
    }

    // Items exist but none matches yet; keep polling
    requestAnimationFrame(() => waitForCodeBlockMenuAndSelect(targetLanguage, attempts + 1));
  }

  /**
   * Programmatically switch the code-block language picker to
   * `targetLanguage`.
   *
   * Returns early when a selection is already in flight, when no picker is
   * found, or when the picker already shows the target. Otherwise sets
   * `isSelecting`, adds `stagehand-selecting` to the body (the injected CSS
   * hides menus while it is present), opens the picker with a synthetic click
   * and starts polling for the target item.
   */
  function selectCodeBlockLanguage(targetLanguage) {
    if (isSelecting) return;

    const current = getCodeBlockLanguageDropdown();
    if (!current) return;
    if (current.language === targetLanguage) return;

    isSelecting = true;
    document.body.classList.add('stagehand-selecting');

    simulateClick(current.element);
    requestAnimationFrame(() => waitForCodeBlockMenuAndSelect(targetLanguage));
  }

  /**
   * Select the code-block language that LANGUAGE_MAP associates with the
   * current sidebar language, if there is one.
   */
  function syncCodeBlockLanguage() {
    const codeBlockLang = LANGUAGE_MAP[currentSelectedLanguage];
    if (codeBlockLang) {
      selectCodeBlockLanguage(codeBlockLang);
    }
  }

  // ============================================
  // EVENT HANDLERS & OBSERVERS
  //
============================================

  /**
   * Watch the whole body for child-list mutations and, whenever a role="menu"
   * element is present, refresh the check indicator immediately and again two
   * frames later.
   */
  function setupDropdownMenuObserver() {
    const menuObserver = new MutationObserver(() => {
      const menu = getDropdownMenu();
      if (menu) {
        updateDropdownCheckIndicator();
        onNextFrame(updateDropdownCheckIndicator);
      }
    });
    menuObserver.observe(document.body, { subtree: true, childList: true });
  }

  /**
   * Install a capture-phase click listener on the document that reacts to
   * clicks on links inside a menu.
   *
   * When the clicked link's text contains one of DROPDOWN_LANGUAGES the
   * handler records the language, refreshes the indicator / version switcher /
   * SDK visibility, and persists the choice to sessionStorage. It then either
   * navigates to the language's NAVIGATION_MAP path (cancelling the click), or,
   * when the current path already ends with that target, updates the button
   * label and code-block language on a later frame.
   */
  function setupMenuClickHandler() {
    document.addEventListener('click', (e) => {
      const target = e.target;

      // Check if we clicked on a sidebar dropdown menu item
      const menuItem = target.closest('[role="menu"] a, menu a');
      if (!menuItem) return;

      const text = (menuItem.textContent || '').trim();

      // Check if it's one of our language options
      for (const lang of DROPDOWN_LANGUAGES) {
        if (text.includes(lang)) {
          currentSelectedLanguage = lang;

          // Update the check indicator immediately
          updateDropdownCheckIndicator();

          // Update version switcher visibility
          updateVersionSwitcherVisibility();

          // Update SDK reference visibility
          updateSDKReferenceVisibility();

          // Store in sessionStorage
          try {
            sessionStorage.setItem('stagehand-selected-language', lang);
          } catch (err) {
            // Ignore storage errors
          }

          // Navigate to the corresponding SDK page
          const targetPath = NAVIGATION_MAP[lang];
          // Strip one trailing slash so "/v3/sdk/go/" compares equal to "/v3/sdk/go"
          const normalizedPathname = window.location.pathname.replace(/\/$/, '');
          if (targetPath && !normalizedPathname.endsWith(targetPath)) {
            e.preventDefault();
            e.stopPropagation();
            window.location.href = targetPath;
            return;
          }

          // Update button text after menu closes
          onNextFrame(() => updateButtonText(lang));

          // Sync the code block language selector
          onNextFrame(syncCodeBlockLanguage);

          break;
        }
      }
    }, true);
  }

  /**
   * Re-apply the language stored in sessionStorage under
   * 'stagehand-selected-language', when it is one of DROPDOWN_LANGUAGES:
   * button label, version switcher, SDK visibility and (two frames later)
   * the code-block language. Storage errors are ignored. Visibility is
   * refreshed again two frames later whether or not a value was stored.
   */
  function restoreLanguageSelection() {
    try {
      const stored = sessionStorage.getItem('stagehand-selected-language');
      if (stored && DROPDOWN_LANGUAGES.includes(stored)) {
        currentSelectedLanguage = stored;
        updateButtonText(stored);
        updateVersionSwitcherVisibility();
        updateSDKReferenceVisibility();
        onNextFrame(syncCodeBlockLanguage);
      }
    } catch (err) {
      // Ignore storage errors
    }

    // Always update visibility on restore
    onNextFrame(() => {
      updateVersionSwitcherVisibility();
      updateSDKReferenceVisibility();
    });
  }

  /**
   * Observe body child-list mutations and repair state that a re-render may
   * have reset: the dropdown button label, the version switcher marker class,
   * and the visibility of SDK sidebar items not yet tagged
   * `stagehand-sdk-processed`.
   */
  function setupPageChangeObserver() {
    // True while an SDK visibility refresh is already queued, so a burst of
    // mutations schedules only one refresh.
    let sdkUpdatePending = false;

    const observer = new MutationObserver(() => {
      // Check if button needs updating
      const button = getDropdownButton();
      if (button) {
        const currentText = (button.textContent || '').trim();
        if (currentText !== currentSelectedLanguage && DROPDOWN_LANGUAGES.includes(currentSelectedLanguage)) {
          updateButtonText(currentSelectedLanguage);
        }
      }

      // Re-check version switcher visibility (DOM might have re-rendered)
      const versionSwitcher = getVersionSwitcher();
      if (versionSwitcher && !versionSwitcher.classList.contains('stagehand-version-switcher')) {
        updateVersionSwitcherVisibility();
      }

      // Check for SDK reference items that need to be hidden (debounced via rAF)
      const sdkItems = document.querySelectorAll('li[id^="/v3/sdk/"]:not(.stagehand-sdk-processed)');
      if (sdkItems.length > 0 && !sdkUpdatePending) {
        sdkUpdatePending = true;
        onNextFrame(() => {
          updateSDKReferenceVisibility();
          document.querySelectorAll('li[id^="/v3/sdk/"]').forEach(item => {
            item.classList.add('stagehand-sdk-processed');
          });
          sdkUpdatePending = false;
        });
      }
    });

    observer.observe(document.body, { subtree: true, childList: true });
  }

  // Watch for code block dropdowns appearing and sync them
  /**
   * Observe body child-list mutations; when the detected code-block picker
   * element differs from the last one seen and shows a language other than the
   * one mapped to the current sidebar language, switch it two frames later.
   */
  function setupCodeBlockObserver() {
    // Last picker element acted on, so the same element is not re-synced on
    // every mutation.
    let lastCodeBlockDropdown = null;

    const observer = new MutationObserver(() => {
      const dropdown = getCodeBlockLanguageDropdown();
      if (dropdown && dropdown.element !== lastCodeBlockDropdown) {
        lastCodeBlockDropdown = dropdown.element;
        // New code block dropdown appeared, sync it
        const targetLang = LANGUAGE_MAP[currentSelectedLanguage];
        if (targetLang && dropdown.language !== targetLang) {
          onNextFrame(() => selectCodeBlockLanguage(targetLang));
        }
      }
    });

    observer.observe(document.body, { subtree: true, childList: true });
  }

  // ============================================
  // INITIALIZATION
  //
============================================ function init() { setupMenuClickHandler(); setupDropdownMenuObserver(); setupPageChangeObserver(); setupCodeBlockObserver(); restoreLanguageSelection(); updateVersionSwitcherVisibility(); updateSDKReferenceVisibility(); } // Initialize on page load if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', init); } else { init(); } // Re-run when URL changes (SPA navigation) let lastUrl = location.href; const urlObserver = new MutationObserver(() => { if (location.href !== lastUrl) { lastUrl = location.href; // Remove processed class so SDK items get re-evaluated document.querySelectorAll('li[id^="/v3/sdk/"].stagehand-sdk-processed').forEach(item => { item.classList.remove('stagehand-sdk-processed'); }); onNextFrame(() => { restoreLanguageSelection(); syncCodeBlockLanguage(); updateVersionSwitcherVisibility(); updateSDKReferenceVisibility(); }); } }); urlObserver.observe(document.body, { subtree: true, childList: true }); })(); ================================================ FILE: packages/docs/package.json ================================================ { "name": "@browserbasehq/stagehand-docs", "version": "1.0.0", "description": "", "type": "module", "main": "index.js", "scripts": { "dev": "mintlify dev --no-open --port 3002", "upgrade": "mintlify upgrade", "sync-sdk": "node scripts/sync-sdk-docs.js" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { "mintlify": "^4.2.47", "zod": "^4.2.1" }, "packageManager": "pnpm@9.15.0+sha512.76e2379760a4328ec4415815bcd6628dee727af3779aaa4c914e3944156c4299921a89f976381ee107d41f12cfa4b66681ca9c718f0668fa0831ed4c6d8ba56c" } ================================================ FILE: packages/docs/scripts/runtimePaths.js ================================================ /** * Keep this file in sync with: * - /packages/core/lib/v3/runtimePaths.ts * - /packages/server-v3/scripts/runtimePaths.ts * - /packages/server-v4/scripts/runtimePaths.ts * - 
/packages/evals/runtimePaths.ts * - /packages/docs/scripts/runtimePaths.js */
import path from "node:path";
import { fileURLToPath } from "node:url";

// Path fragment identifying files that belong to this package; also the
// marker used to cut the repo root out of an absolute path.
const PACKAGE_SEGMENT = "/packages/docs/";
// File names these helpers treat as "no real file" (see readCallsitePath).
const EVAL_FRAMES = new Set(["[eval]", "[eval]-wrapper"]);
// Names of this module's own functions. Stack frames belonging to them are
// skipped when searching for the caller, so renaming or adding a helper
// requires updating this set.
const INTERNAL_FRAME_NAMES = new Set([
  "readCallsites",
  "readCallsitePath",
  "resolveCallerFilePath",
  "getCurrentFilePath",
  "getCurrentDirPath",
  "getRepoRootDir",
  "isMainModule",
]);

/**
 * Convert a filesystem path or `file://` URL into an absolute path that uses
 * forward slashes only.
 */
const normalizePath = (value) => {
  const input = value.startsWith("file://") ? fileURLToPath(value) : value;
  return path.resolve(input).replaceAll("\\", "/");
};

/**
 * Capture the current call stack as an array of callsite objects.
 *
 * Temporarily replaces `Error.prepareStackTrace` with a function that returns
 * the raw `stack` argument, reads `new Error().stack`, and restores the
 * previous hook in `finally`. Falls back to an empty array when `.stack` is
 * nullish.
 */
const readCallsites = () => {
  const previousPrepare = Error.prepareStackTrace;
  try {
    Error.prepareStackTrace = (_, stack) => stack;
    return new Error().stack ?? [];
  } finally {
    Error.prepareStackTrace = previousPrepare;
  }
};

/**
 * Return the normalized file path of a callsite, or null when the frame has
 * no file name, points at a `node:` built-in, or is an eval frame.
 */
const readCallsitePath = (callsite) => {
  const rawPath =
    callsite.getFileName?.() ?? callsite.getScriptNameOrSourceURL?.();
  if (!rawPath) return null;
  if (rawPath.startsWith("node:")) return null;
  if (EVAL_FRAMES.has(rawPath)) return null;
  return normalizePath(rawPath);
};

/**
 * Report whether a callsite belongs to one of this module's own helpers.
 *
 * Checks the frame's function name, then its method name, and finally falls
 * back to searching the frame's string form for "<name> (" / ".<name> (".
 */
const isInternalCallsite = (callsite) => {
  const functionName = callsite.getFunctionName?.();
  if (functionName && INTERNAL_FRAME_NAMES.has(functionName)) return true;

  const methodName = callsite.getMethodName?.();
  if (methodName && INTERNAL_FRAME_NAMES.has(methodName)) return true;

  const callsiteString = callsite.toString?.() ?? "";
  for (const frameName of INTERNAL_FRAME_NAMES) {
    if (callsiteString.includes(`${frameName} (`)) return true;
    if (callsiteString.includes(`.${frameName} (`)) return true;
  }
  return false;
};

/**
 * Walk the call stack and return the file path of the calling code.
 *
 * Frames without a usable path and frames belonging to this module's helpers
 * are skipped. The first remaining frame whose path contains PACKAGE_SEGMENT
 * wins; otherwise the first remaining frame of any path is used.
 *
 * @throws {Error} when no frame yields a usable path
 */
const resolveCallerFilePath = () => {
  const packageCandidates = [];
  const fallbackCandidates = [];

  for (const callsite of readCallsites()) {
    const filePath = readCallsitePath(callsite);
    if (!filePath) continue;
    if (isInternalCallsite(callsite)) continue;
    if (filePath.includes(PACKAGE_SEGMENT)) {
      packageCandidates.push(filePath);
      continue;
    }
    fallbackCandidates.push(filePath);
  }

  const packageCandidate = packageCandidates[0];
  if (packageCandidate) return packageCandidate;

  const fallbackCandidate = fallbackCandidates[0];
  if (fallbackCandidate) return fallbackCandidate;

  throw new Error("Unable to resolve caller file path.");
};

/** Absolute, forward-slash path of the file that called this function. */
export const getCurrentFilePath = () => resolveCallerFilePath();

/** Directory containing the file that called this function. */
export const getCurrentDirPath = () => path.dirname(getCurrentFilePath());

/**
 * Repository root: the part of the caller's file path that precedes the last
 * occurrence of PACKAGE_SEGMENT.
 *
 * @throws {Error} when the caller's path does not contain PACKAGE_SEGMENT
 */
export const getRepoRootDir = () => {
  const currentFilePath = getCurrentFilePath();
  const index = currentFilePath.lastIndexOf(PACKAGE_SEGMENT);
  if (index === -1) {
    throw new Error(
      `Unable to determine repo root from ${currentFilePath} (missing ${PACKAGE_SEGMENT}).`,
    );
  }
  return currentFilePath.slice(0, index);
};

/**
 * True when the calling file is the script Node was started with
 * (`process.argv[1]`, normalized). False when there is no entry script.
 */
export const isMainModule = () => {
  const entryScript = process.argv.at(1);
  if (!entryScript) return false;
  return normalizePath(entryScript) === getCurrentFilePath();
};
================================================ FILE: packages/docs/scripts/sync-sdk-docs.js ================================================
#!/usr/bin/env node /** * Script to sync SDK documentation from GitHub READMEs * * Usage: node scripts/sync-sdk-docs.js * * This script fetches README.md files from each language SDK repo * and generates MDX files for the docs. 
*/ import fs from "node:fs"; import path from "node:path"; import https from "node:https"; import { getCurrentDirPath } from "./runtimePaths.js"; const currentDir = getCurrentDirPath(); // SDK repos configuration const SDK_REPOS = { java: { repo: 'browserbase/stagehand-java', title: 'Java SDK', description: 'Official Stagehand SDK for Java', outputPath: 'v3/sdk/java.mdx' }, python: { repo: 'browserbase/stagehand-python', title: 'Python SDK', description: 'Official Stagehand SDK for Python', outputPath: 'v3/sdk/python.mdx' }, ruby: { repo: 'browserbase/stagehand-ruby', title: 'Ruby SDK', description: 'Official Stagehand SDK for Ruby', outputPath: 'v3/sdk/ruby.mdx' }, go: { repo: 'browserbase/stagehand-go', title: 'Go SDK', description: 'Official Stagehand SDK for Go', outputPath: 'v3/sdk/go.mdx' } }; /** * Fetch content from a URL */ function fetchUrl(url) { return new Promise((resolve, reject) => { https.get(url, { headers: { 'User-Agent': 'Stagehand-Docs-Sync' } }, (res) => { // Handle redirects if (res.statusCode === 301 || res.statusCode === 302) { fetchUrl(res.headers.location).then(resolve).catch(reject); return; } if (res.statusCode !== 200) { reject(new Error(`HTTP ${res.statusCode}: ${url}`)); return; } let data = ''; res.on('data', chunk => data += chunk); res.on('end', () => resolve(data)); res.on('error', reject); }).on('error', reject); }); } /** * Process README content for MDX compatibility */ function processReadmeContent(content, config) { let processed = content; // Remove HTML comments processed = processed.replace(//g, ''); // Remove entire HTML blocks with picture/source tags (badge sections) processed = processed.replace(/]*>[\s\S]*?<\/div>/gi, ''); processed = processed.replace(/]*align[^>]*>[\s\S]*?<\/p>/gi, ''); processed = processed.replace(/[\s\S]*?<\/picture>/gi, ''); // Remove standalone HTML tags processed = processed.replace(/]*>[\s]*]*>[\s]*<\/a>/gi, ''); processed = processed.replace(/]*badge[^>]*>/gi, ''); processed = 
processed.replace(/]*shields\.io[^>]*>/gi, ''); processed = processed.replace(/]*>\s*[\s\S]*?<\/picture>\s*<\/a>/gi, ''); // Remove badge images in markdown format processed = processed.replace(/^\s*(\[!\[.*?\]\(.*?\)\]\(.*?\)\s*)+/gm, ''); processed = processed.replace(/^\s*!\[.*?\]\(https:\/\/.*?badge.*?\)\s*/gm, ''); processed = processed.replace(/\[!\[.*?\]\(.*?badge.*?\)\]\(.*?\)/g, ''); // Remove standalone anchor img tags processed = processed.replace(/]*href[^>]*>]*><\/a>/gi, ''); // Clean up tags with backticks inside (common in Go docs) processed = processed.replace(/\\`([^`]*?)\\`<\/code>/g, '`$1`'); processed = processed.replace(/`([^`]*?)`<\/code>/g, '`$1`'); processed = processed.replace(/([^<]*?)<\/code>/g, '`$1`'); // Fix malformed links with parentheses in URL (Go docs issue) processed = processed.replace(/\[([^\]]+)\]\(([^)]+)\(([^)]+)\)([^)]*)\)/g, '[$1]($2)'); // Convert relative links to absolute GitHub links const repoUrl = `https://github.com/${config.repo}`; processed = processed.replace(/\]\((?!http)(?!#)(?!mailto)([^)]+)\)/g, `](${repoUrl}/blob/main/$1)`); // Fix code block language hints for MDX processed = processed.replace(/```kotlin/g, '```java'); // Remove the first H1 if it exists (we'll add our own title) processed = processed.replace(/^#\s+.*\n+/, ''); // Clean up excessive newlines processed = processed.replace(/\n{4,}/g, '\n\n\n'); // Remove any remaining inline HTML img tags processed = processed.replace(/]*>/gi, ''); // Remove any remaining tags that are empty or just whitespace processed = processed.replace(/]*>\s*<\/a>/gi, ''); // Clean up lines that are just whitespace processed = processed.replace(/^\s+$/gm, ''); return processed.trim(); } /** * Generate MDX frontmatter */ function generateFrontmatter(config) { return `--- title: "${config.title}" description: "${config.description}" --- This documentation is automatically synced from the [${config.title} GitHub repository](https://github.com/${config.repo}). 
`; } /** * Sync a single SDK's documentation */ async function syncSdk(language, config) { const rawUrl = `https://raw.githubusercontent.com/${config.repo}/main/README.md`; console.log(`Fetching ${language} SDK docs from ${rawUrl}...`); try { const readme = await fetchUrl(rawUrl); const processedContent = processReadmeContent(readme, config); const frontmatter = generateFrontmatter(config); const mdxContent = frontmatter + processedContent; // Ensure directory exists const outputDir = path.dirname(`${currentDir}/../${config.outputPath}`); if (!fs.existsSync(outputDir)) { fs.mkdirSync(outputDir, { recursive: true }); } // Write MDX file const outputFile = `${currentDir}/../${config.outputPath}`; fs.writeFileSync(outputFile, mdxContent, 'utf8'); console.log(`✓ ${language} SDK docs written to ${config.outputPath}`); return true; } catch (error) { console.error(`✗ Failed to sync ${language} SDK: ${error.message}`); return false; } } /** * Main function */ async function main() { console.log('Syncing SDK documentation from GitHub...\n'); const results = await Promise.all( Object.entries(SDK_REPOS).map(([lang, config]) => syncSdk(lang, config)) ); const successCount = results.filter(Boolean).length; const totalCount = results.length; console.log(`\nDone! ${successCount}/${totalCount} SDKs synced successfully.`); if (successCount < totalCount) { process.exit(1); } } main().catch(error => { console.error('Fatal error:', error); process.exit(1); }); ================================================ FILE: packages/docs/snippets/excalidraw.mdx ================================================ export const Excalidraw = ({ url, className = "w-full" }) => { return ( <>
) } ================================================ FILE: packages/docs/snippets/v3-banner.mdx ================================================ {/* V3Banner - Currently a no-op component This component is imported across 50+ pages in v3 docs. Keeping it as a no-op rather than removing allows us to easily add a new banner message in the future without editing every file. To add a banner, replace the null return with your JSX content. */} export const V3Banner = () => null; ================================================ FILE: packages/docs/v2/basics/act.mdx ================================================ --- title: Act description: 'Interact with a web page' --- ## What is `act()`? ``` typescript page.act("click on add to cart") ``` `act` enables Stagehand to perform **individual** actions on a web page. Use it to build self-healing and deterministic automations that adapt to website changes. ## Why use `act()`? Write automation in plain English. No selectors or complex syntax. Build automations step by step. Define exactly what happens at every moment. Actions automatically adapt when websites change. Cache actions to avoid LLM calls and ensure consistent execution across runs. ## Using `act()` Use `act` to perform single actions in your automation. Here's how to click a button: ```typescript TypeScript await page.goto("https://example-store.com"); await page.act("click the add to cart button"); ``` ```python Python await page.goto("https://example-store.com") await page.act("click the add to cart button") ``` With `act`, breaking complex actions into small, single-step actions works best. If you need to orchestrate multi-step flows, use multiple `act` commands or `agent`. 
| Action | Example instruction | |--------|---------------------| | Click | `click the button` | | Fill | `fill the field with <value>` | | Type | `type <text> into the search box` | | Press | `press <key> in the search field` | | Scroll | `scroll to <section>` | | Select from dropdown | `select <value> from the dropdown` | Break your task into single-step actions. ```typescript TypeScript // Break it into single-step actions await page.act("open the filters panel"); await page.act("choose 4-star rating"); await page.act("click the apply button"); ``` ```python Python # Break it into single-step actions await page.act("open the filters panel") await page.act("choose 4-star rating") await page.act("click the apply button") ``` For multi-step tasks, use [`agent()`](/v2/basics/agent) instead. ```typescript TypeScript // Too complex - trying to do multiple things at once await page.act("open the filters panel, choose 4-star rating, and click apply"); ``` ```python Python # Too complex - trying to do multiple things at once await page.act("open the filters panel, choose 4-star rating, and click apply") ``` ### Advanced Configuration For advanced scenarios, you can configure additional options: ```typescript TypeScript // Dynamic food search with advanced options const foodItem = "organic quinoa"; await page.act({ action: "Type %foodItem% in the search box and press enter", variables: { foodItem: foodItem }, modelName: "google/gemini-2.5-pro", modelClientOptions: { modelApiKey: process.env.GOOGLE_API_KEY, }, iframes: true, // Search within iframes if needed domSettleTimeoutMs: 45000, // Wait longer for dynamic content timeoutMs: 60000 // Extended timeout for slow-loading forms }); ``` ```python Python # Dynamic food search with advanced options food_item = "organic quinoa" await page.act({ "action": "Type %foodItem% in the search box and press enter", "variables": { "foodItem": food_item }, "modelName": "google/gemini-2.5-pro", "modelClientOptions": { "modelApiKey": os.environ.get("GOOGLE_API_KEY") }, 
"iframes": True, # Search within iframes if needed "domSettleTimeoutMs": 45000, # Wait longer for dynamic content "timeoutMs": 60000 # Extended timeout for slow-loading forms }) ``` Shadow DOM support is now available! Set `experimental: true` in your Stagehand configuration to enable it. See the [configuration guide](/v2/configuration/browser) for more details. ## Best practices ### Ensure reliable actions Use `observe()` to discover candidate actions on the current page and plan reliably. It returns a list of suggested actions (with selector, description, method, and arguments). You can pass an observed action directly to `act` to execute it. ```typescript TypeScript const [action] = await page.observe("click the login button"); if (action) { await page.act(action); } ``` ```python Python results = await page.observe("click the login button") if results: await page.act(results[0]) ``` Plan actions with `observe()` before executing with `act`. ### Reduce model costs Cache observed actions to avoid repeated LLM calls and ensure consistent execution. 
```typescript TypeScript // Cost-optimized actions with caching const actionCache = new Map(); const getCachedAction = async (instruction: string) => { if (actionCache.has(instruction)) { return actionCache.get(instruction); } const [action] = await page.observe(instruction); actionCache.set(instruction, action); return action; }; // Reuse cached actions const loginAction = await getCachedAction("click the login button"); await page.act(loginAction); ``` ```python Python # Cost-optimized actions with caching action_cache = {} async def get_cached_action(instruction: str): if instruction in action_cache: return action_cache[instruction] results = await page.observe(instruction) if results: action = results[0] action_cache[instruction] = action return action return None # Reuse cached actions login_action = await get_cached_action("click the login button") if login_action: await page.act(login_action) ``` Learn advanced caching techniques and patterns for optimal performance. ### Secure your automations Variables will not be shared with LLM providers. Use them for passwords, API keys, and other sensitive data. Load sensitive data from environment variables using `.env` files. Never hardcode API keys, passwords, or other secrets directly in your code. ```typescript TypeScript await page.act({ action: "enter %username% in the email field", variables: { username: "user@example.com" } }); await page.act({ action: "enter %password% in the password field", variables: { password: process.env.USER_PASSWORD } }); ``` ```python Python # If using Python, set `use_api: true` in your Stagehand configuration await page.act( "enter %username% in the email field", variables={ "username": "user@example.com" } ) await page.act( "enter %password% in the password field", variables={ "password": os.environ.get("USER_PASSWORD") } ) ``` When handling sensitive data, set `verbose: 0` in your Stagehand configuration to prevent secrets from appearing in logs. 
See the [configuration guide](/v2/configuration/browser) for more details. Complete guide to securing your browser automations with best practices and configurations. ## Troubleshooting **Problem**: `act` fails with "method not supported" error **Solutions**: - Use clear and detailed instructions for what you want to accomplish - Review our [evals](https://stagehand.dev/evals) to find the best models for your use case - Use [`observe()`](/v2/basics/observe) and verify the resulting action is within a list of expected actions **Solution 1: Validate with observe** ```typescript TypeScript const prompt = "click the submit button"; const expectedMethod = "click"; try { await page.act(prompt); } catch (error) { if (error.message.includes("method not supported")) { // Observe the same prompt to get the planned action const [action] = await page.observe(prompt); if (action && action.method === expectedMethod) { await page.act(action); } else { throw new Error(`Unsupported method: expected "${expectedMethod}", got "${action?.method}"`); } } else { throw error; } } ``` ```python Python prompt = "click the submit button" expected_method = "click" try: await page.act(prompt) except Exception as error: if "method not supported" in str(error): # Observe the same prompt to get the planned action results = await page.observe(prompt) if results and results[0].method == expected_method: await page.act(results[0]) else: method = results[0].method if results else "unknown" raise Exception(f'Unsupported method: expected "{expected_method}", got "{method}"') else: raise error ``` **Solution 2: Retry with exponential backoff** ```typescript TypeScript // Retry with exponential backoff for intermittent issues const prompt = "click the submit button"; const maxRetries = 3; for (let attempt = 0; attempt <= maxRetries; attempt++) { try { await page.act(prompt, { timeoutMs: 10000 + (attempt * 5000) }); break; // Success, exit retry loop } catch (error) { if (error.message.includes("method 
not supported") && attempt < maxRetries) { // Exponential backoff: wait 2^attempt seconds const delay = Math.pow(2, attempt) * 1000; console.log(`Retry ${attempt + 1}/${maxRetries} after ${delay}ms`); await new Promise(resolve => setTimeout(resolve, delay)); } else { throw error; } } } ``` ```python Python # Retry with exponential backoff for intermittent issues import asyncio prompt = "click the submit button" max_retries = 3 for attempt in range(max_retries + 1): try: timeout = 10000 + (attempt * 5000) await page.act(prompt, {"timeoutMs": timeout}) break # Success, exit retry loop except Exception as error: if "method not supported" in str(error) and attempt < max_retries: # Exponential backoff: wait 2^attempt seconds delay = 2 ** attempt print(f"Retry {attempt + 1}/{max_retries} after {delay}s") await asyncio.sleep(delay) else: raise error ``` **Problem**: `act` times out or fails to complete action (often due to element not found) **Solutions**: - Ensure page has fully loaded - Check if content is in iframes: [Learn more about working with iframes](/v2/best-practices/working-with-iframes) - Increase action timeout - Use `observe()` first to verify element exists ```typescript TypeScript // Handle timeout and element not found issues try { await page.act("click the submit button", { timeoutMs: 30000 }); } catch (error) { // Check if page is fully loaded await page.waitForLoadState('domcontentloaded'); // Use observe to check element state const [element] = await page.observe("find the submit button"); if (element) { console.log("Element found, trying more specific instruction"); await page.act("click the submit button at the bottom of the form"); } else { console.log("Element not found, trying alternative selector"); await page.act("click the button with text 'Submit'"); } } ``` ```python Python # Handle timeout and element not found issues try: await page.act("click the submit button", {"timeoutMs": 30000}) except Exception as error: # Check if page is fully loaded 
await page.wait_for_load_state('domcontentloaded') # Use observe to check element state results = await page.observe("find the submit button") if results: print("Element found, trying more specific instruction") await page.act("click the submit button at the bottom of the form") else: print("Element not found, trying alternative selector") await page.act("click the button with text 'Submit'") ``` **Problem**: `act` performs action on wrong element **Solutions**: - Be more specific in instructions: include visual cues, position, or context - Use `observe()` to preview which element will be selected - Add contextual information: "the search button in the header" - Use unique identifiers when available ```typescript TypeScript // More precise element targeting // Instead of: await page.act("click the button"); // Use specific context: await page.act("click the red 'Delete' button next to the user John Smith"); // Or preview with observe first: const [action] = await page.observe("click the submit button in the checkout form"); if (action.description.includes("checkout")) { await page.act(action); } ``` ```python Python # More precise element targeting # Instead of: await page.act("click the button") # Use specific context: await page.act("click the red 'Delete' button next to the user John Smith") # Or preview with observe first: results = await page.observe("click the submit button in the checkout form") if results and "checkout" in results[0].description: await page.act(results[0]) ``` ## Next steps Use `Agent` to autonomously execute multi-step tasks and complex workflows. Speed up repeated automations by caching actions. Use `extract` with a data schema to pull clean, typed data from any page. Learn best practices for interacting with elements inside iframes. 
================================================ FILE: packages/docs/v2/basics/agent.mdx ================================================ --- title: Agent description: 'Automate complex workflows with AI powered browser agents' --- ## What is `agent()`? ``` typescript agent.execute("apply for a job at browserbase") ``` `agent` turns high-level tasks into **fully autonomous** browser workflows. You can customize the agent by specifying the LLM provider and model, setting custom instructions for behavior, and configuring max steps. Agent ## Why use `agent()`? Execute complex sequences automatically. Sees and understands web interfaces like humans do using computer vision. ## Using `agent()` There are two ways to create agents in Stagehand: ### Computer Use Agents Use computer use agents with specialized models from OpenAI or Anthropic: ```typescript TypeScript const agent = stagehand.agent({ provider: "anthropic", model: "claude-sonnet-4-20250514", instructions: "You are a helpful assistant that can use a web browser.", options: { apiKey: process.env.ANTHROPIC_API_KEY, }, }); await agent.execute("apply for a job at Browserbase") ``` ```python Python agent = stagehand.agent( model="claude-sonnet-4-20250514", instructions="You are a helpful assistant that can use a web browser.", options={ "api_key": os.getenv("ANTHROPIC_API_KEY"), }, ) await agent.execute("apply for a job at Browserbase") ``` View or run the example template [here](https://www.browserbase.com/templates/gemini-cua) ### Use Stagehand Agent with Any LLM Use the agent without specifying a provider to utilize any model or LLM provider: Non-CUA agents are currently only supported in TypeScript ```typescript TypeScript const agent = stagehand.agent(); await agent.execute("apply for a job at Browserbase") ``` ## MCP Integrations Agents can be enhanced with external tools and services through MCP (Model Context Protocol) integrations. 
This allows your agent to access external APIs and data sources beyond just browser interactions. ```typescript TypeScript (Pass URL) const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, ], instructions: `You have access to web search through Exa. Use it to find current information before browsing.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); ``` ```typescript TypeScript (Create Connection) import { connectToMCPServer } from "@browserbasehq/stagehand"; const supabaseClient = await connectToMCPServer( `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` ); const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [supabaseClient], instructions: `You can interact with Supabase databases. Use these tools to store and retrieve data.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for restaurants and save the first result to the database"); ``` MCP integrations enable agents to be more powerful by combining browser automation with external APIs, databases, and services. The agent can intelligently decide when to use browser actions versus external tools. Stagehand uses a 1288x711 viewport by default (the optimal size for Computer Use Agents). Other viewport sizes may reduce performance. If you need to modify the viewport, you can edit in the [Browser Configuration](/v2/configuration/browser). ## Available Models Use specialized computer use models (e.g., `computer-use-preview` from OpenAI or `claude-sonnet-4-20250514` from Anthropic) Check out the guide on how to use different models with Stagehand. 
## Agent Execution Configuration Control the maximum number of steps the agent can take to complete the task using the `maxSteps` parameter. ```typescript TypeScript // Set maxSteps to control how many actions the agent can take await agent.execute({ instruction: "Sign me up for a library card", maxSteps: 15 // Agent will stop after 15 steps if task isn't complete }); ``` ```python Python # Set max_steps to control how many actions the agent can take result = await agent.execute({ "instruction": "Sign me up for a library card", "max_steps": 15 # Agent will stop after 15 steps if task isn't complete }) ``` For complex tasks, increase the `maxSteps` limit and check task success. ```typescript TypeScript // Complex multi-step task requiring more actions const result = await agent.execute({ instruction: "Find and apply for software engineering jobs, filtering by remote work and saving 3 applications", maxSteps: 30, // Higher limit for complex workflows }); // Check if the task completed successfully if (result.success === true) { console.log("Task completed successfully!"); } else { console.log("Task failed or was incomplete"); } ``` ```python Python # Complex multi-step task requiring more actions result = await agent.execute({ "instruction": "Find and apply for software engineering jobs, filtering by remote work and saving 3 applications", "max_steps": 30 # Higher limit for complex workflows }) # Check if the task completed successfully if result.success == True: print("Task completed successfully!") else: print("Task failed or was incomplete") ``` ## Best Practices Following these best practices will improve your agent's success rate, reduce execution time, and minimize unexpected errors during task completion. 
### Start on the Right Page Navigate to your target page before executing tasks: ```typescript TypeScript await page.goto('https://github.com/browserbase/stagehand'); await agent.execute('Get me the latest PR on the stagehand repo'); ``` ```python Python await page.goto("https://github.com/browserbase/stagehand") result = await agent.execute("Get me the latest PR on the stagehand repo") ``` ```typescript TypeScript await agent.execute('Go to GitHub and find the latest PR on browserbase/stagehand'); ``` ```python Python result = await agent.execute("Go to GitHub and find the latest PR on browserbase/stagehand") ``` ### Be Specific Provide detailed instructions for better results: ```typescript TypeScript await agent.execute("Find Italian restaurants in Brooklyn that are open after 10pm and have outdoor seating"); ``` ```python Python result = await agent.execute("Find Italian restaurants in Brooklyn that are open after 10pm and have outdoor seating") ``` ```typescript TypeScript await agent.execute("Find a restaurant"); ``` ```python Python result = await agent.execute("Find a restaurant") ``` ## Troubleshooting **Problem**: Agent stops before finishing the requested task **Solutions**: - Check if the agent is hitting the maxSteps limit (default is 20) - Increase maxSteps for complex tasks: `maxSteps: 30` or higher - Break very complex tasks into smaller sequential executions ```typescript // Increase maxSteps for complex tasks await agent.execute({ instruction: "Complete the multi-page registration form with all required information", maxSteps: 40 // Increased limit for complex task }); // Or break into smaller tasks with success checking const firstResult = await agent.execute({ instruction: "Fill out page 1 of the registration form", maxSteps: 15 }); // Only proceed if the first task was successful if (firstResult.success === true) { await agent.execute({ instruction: "Navigate to page 2 and complete remaining fields", maxSteps: 15 }); } else { console.log("First 
task failed, stopping execution"); } ``` **Problem**: Agent clicks on wrong elements or fails to interact with the correct UI components **Solutions**: - Ensure proper viewport size: Stagehand uses `1288x711` by default (optimal for Computer Use models) - Avoid changing viewport dimensions as other sizes may reduce performance ## Next steps Execute actions efficiently using observe results Extract structured data from observed elements ================================================ FILE: packages/docs/v2/basics/extract.mdx ================================================ --- title: Extract description: Extract structured data from a webpage --- ## What is `extract()`? ```typescript page.extract("extract the name of the repository"); ``` `extract` grabs structured data from a webpage. You can define your schema with [zod](https://github.com/colinhacks/zod) (TypeScript) or [pydantic](https://github.com/pydantic/pydantic) (Python). If you do not want to define a schema, you can also call `extract` with just a [natural language prompt](#prompt-only-extraction), or call `extract` [with no parameters](#extract-with-no-parameters). ## Why use `extract()`? Turn messy webpage data into clean objects that follow a schema. Build resilient extractions that don't break when the website changes For TypeScript, the extract schemas are defined using zod schemas. For Python, the extract schemas are defined using pydantic models. 
## Using `extract()` ### Single object Extraction Here is how an `extract` call might look for a single object: ```typescript TypeScript import { z } from 'zod/v3'; const item = await page.extract({ instruction: "extract the price of the item", schema: z.object({ price: z.number(), }), }); ``` ```python Python from pydantic import BaseModel class Extraction(BaseModel): price: float item = await page.extract( "extract the price of the item", schema=Extraction ) ``` Your output schema will look like: ```Example { price: number } ``` ### List of objects Extraction Here is how an `extract` call might look for a list of objects. ```typescript TypeScript import { z } from 'zod/v3'; const apartments = await page.extract({ instruction: "Extract ALL the apartment listings and their details, including address, price, and square feet.", schema: z.object({ list_of_apartments: z.array( z.object({ address: z.string(), price: z.string(), square_feet: z.string(), }), ), }) }) console.log("the apartment list is: ", apartments); ``` ```python Python from pydantic import BaseModel class Apartment(BaseModel): address: str price: str square_feet: str class Apartments(BaseModel): list_of_apartments: list[Apartment] apartments = await page.extract( "Extract ALL the apartment listings and their details as a list, including address, price, and square feet for each apartment", schema=Apartments ) print("the apartment list is: ", apartments) ``` Your output schema will look like: ```Example list_of_apartments: [ { address: "street address here", price: "$1234.00", square_feet: "700" }, { address: "another address here", price: "1010.00", square_feet: "500" }, ... 
] ``` ### Prompt-only Extraction You can call `extract` with just a natural language prompt: ```typescript TypeScript const result = await page.extract("extract the name of the repository"); ``` ```python Python result = await page.extract("extract the name of the repository") ``` When you call `extract` with just a prompt, your output schema will look like: ```Example { extraction: string } ``` ### Extract with no parameters Here is how you can call `extract` with no parameters. ```typescript TypeScript const pageText = await page.extract(); ``` ```python Python pageText = await page.extract() ``` Output schema: ```Example { pageText: string } ``` Calling `extract` with no parameters will return a hierarchical tree representation of the root DOM. This will not be passed through an LLM. It will look something like this: ``` Accessibility Tree: [0-2] RootWebArea: What is Stagehand? - 🤘 Stagehand [0-37] scrollable [0-118] body [0-241] scrollable [0-242] div [0-244] link: 🤘 Stagehand home page light logo [0-245] span [0-246] StaticText: 🤘 Stagehand [0-247] StaticText: home page ``` ## Best practices ### Extract with Context You can provide additional context to your schema to help the model extract the data more accurately. 
```typescript TypeScript import { z } from 'zod/v3'; const apartments = await page.extract({ instruction: "Extract ALL the apartment listings and their details, including address, price, and square feet.", schema: z.object({ list_of_apartments: z.array( z.object({ address: z.string().describe("the address of the apartment"), price: z.string().describe("the price of the apartment"), square_feet: z.string().describe("the square footage of the apartment"), }), ), }) }) ``` ```python Python from pydantic import BaseModel, Field class Apartment(BaseModel): address: str = Field(..., description="the address of the apartment") price: str = Field(..., description="the price of the apartment") square_feet: str = Field(..., description="the square footage of the apartment") class Apartments(BaseModel): list_of_apartments: list[Apartment] apartments = await page.extract( "Extract ALL the apartment listings and their details as a list. For each apartment, include: the address of the apartment, the price of the apartment, and the square footage of the apartment", schema=Apartments ) ``` ### Link Extraction To extract links or URLs, in the TypeScript version of Stagehand, you'll need to define the relevant field as `z.string().url()`. In Python, you'll need to define it as `HttpUrl`. Here is how an `extract` call might look for extracting a link or URL. This also works for image links. 
```typescript TypeScript import { z } from 'zod/v3'; const extraction = await page.extract({ instruction: "extract the link to the 'contact us' page", schema: z.object({ link: z.string().url(), // note the usage of z.string().url() here }), }); console.log("the link to the contact us page is: ", extraction.link); ``` ```python Python from pydantic import BaseModel, HttpUrl class Extraction(BaseModel): link: HttpUrl # note the usage of HttpUrl here extraction = await page.extract( "extract the link to the 'contact us' page", schema=Extraction ) print("the link to the contact us page is: ", extraction.link) ``` Inside Stagehand, extracting links works by asking the LLM to select an ID. Stagehand looks up that ID in a mapping of IDs -> URLs. When logging the LLM trace, you should expect to see IDs. The actual URLs will be included in the final `ExtractResult`. ## Troubleshooting **Problem**: `extract()` returns empty or incomplete data **Solutions**: - **Check your instruction clarity**: Make sure your instruction is specific and describes exactly what data you want to extract - **Verify the data exists**: Use `page.observe()` first to confirm the data is present on the page - **Wait for dynamic content**: If the page loads content dynamically, use `page.act("wait for the content to load")` before extracting **Solution: Wait for content before extracting** ```typescript TypeScript // Wait for content before extracting await page.act("wait for the product listings to load"); const products = await page.extract({ instruction: "extract all product names and prices", schema: z.object({ products: z.array(z.object({ name: z.string(), price: z.string() })) }) }); ``` ```python Python # Wait for content before extracting await page.act("wait for the product listings to load") products = await page.extract( "extract all product names and prices", schema=ProductList ) ``` **Problem**: Getting schema validation errors or type mismatches **Solutions**: - **Use optional fields**: 
Make fields optional with `z.optional()` (TypeScript) or `Optional[type]` (Python) if the data might not always be present - **Use flexible types**: Consider using `z.string()` instead of `z.number()` for prices that might include currency symbols - **Add descriptions**: Use `.describe()` (TypeScript) or `Field(description="...")` (Python) to help the model understand field requirements **Solution: More flexible schema** ```typescript TypeScript const schema = z.object({ price: z.string().describe("price including currency symbol, e.g., '$19.99'"), availability: z.string().optional().describe("stock status if available"), rating: z.number().optional() }); ``` ```python Python class FlexibleProduct(BaseModel): price: str = Field(description="price including currency symbol, e.g., '$19.99'") availability: Optional[str] = Field(default=None, description="stock status if available") rating: Optional[float] = None ``` **Problem**: Extraction results vary between runs **Solutions**: - **Be more specific in instructions**: Instead of "extract prices", use "extract the numerical price value for each item" - **Use context in schema descriptions**: Add field descriptions to guide the model - **Combine with observe**: Use `page.observe()` to understand the page structure first **Solution: Validate with observe first** ```typescript TypeScript // First observe to understand the page structure const elements = await page.observe("find all product listings"); console.log("Found elements:", elements.map(e => e.description)); // Then extract with specific targeting const products = await page.extract({ instruction: "extract name and price from each product listing shown on the page", schema: z.object({ products: z.array(z.object({ name: z.string().describe("the product title or name"), price: z.string().describe("the price as displayed, including currency") })) }) }); ``` ```python Python # First observe to understand the page structure elements = await page.observe("find all 
product listings") print("Found elements:", [e.description for e in elements]) # Then extract with specific targeting products = await page.extract( "extract name and price from each product listing shown on the page", schema=ProductSchema ) ``` **Problem**: Extraction is slow or timing out **Solutions**: - **Reduce scope**: Extract smaller chunks of data in multiple calls rather than everything at once - **Use targeted instructions**: Be specific about which part of the page to focus on - **Consider pagination**: For large datasets, extract one page at a time - **Increase timeout**: Use `timeoutMs` parameter for complex extractions **Solution: Break down large extractions** ```typescript TypeScript // Instead of extracting everything at once const allData = []; const pageNumbers = [1, 2, 3, 4, 5]; for (const pageNum of pageNumbers) { await page.act(`navigate to page ${pageNum}`); const pageData = await page.extract({ instruction: "extract product data from the current page only", schema: ProductPageSchema, timeoutMs: 60000 // 60 second timeout }); allData.push(...pageData.products); } ``` ```python Python # Instead of extracting everything at once all_data = [] page_numbers = [1, 2, 3, 4, 5] for page_num in page_numbers: await page.act(f"navigate to page {page_num}") page_data = await page.extract( "extract product data from the current page only", schema=ProductPageSchema, timeout_ms=60000 # 60 second timeout ) all_data.extend(page_data.products) ``` ## Next steps Execute actions efficiently using observe results Analyze pages with observe() ================================================ FILE: packages/docs/v2/basics/observe.mdx ================================================ --- title: Observe sidebarTitle: Observe description: 'Find suggested actions for your workflows' --- ## What is `observe()`? ``` typescript page.observe("Find the login button") ``` `observe` allows you to turn any page into a checklist of reliable, executable actions. 
It discovers key elements, ranks likely next steps, and returns structured actions (selector, method, args) you can run instantly with `act` or use to precisely target `extract` so workflows are faster, cheaper, and more resilient. ## Why use `observe()`? When you're unsure what's on a page or need to discover available actions When building complex workflows, plan ahead all the actions you'll need to take When you want to remember actions for the future and avoid LLM calls Before performing critical actions to ensure elements exist ## Using `observe()` Calling `observe` supercharges other Stagehand methods. Use it to plan workflows, speed up `act`, and precisely target `extract`. Using `observe` helps you explore what's possible on a page by giving you a list of suggested actions. ```typescript TypeScript // Plan & validate const buttons = await page.observe("Find the log in / sign up buttons"); ``` ```python Python # Plan & validate buttons = await page.observe("Find the log in / sign up buttons") ``` This will return a list of suggestions with the following structure ```json { "selector": "xpath=/html/body/header/div/button[1]", "description": "Log in button in the top right corner", "method": "click", "arguments": [] } ``` ### Observe with Act You can **validate** the action (method, selector, arguments...) and then pass it to `act` to **avoid extra LLM inference**. **Performance Tip**: Acting on multiple `observe` suggestions will minimize the number of LLM calls for multi-step actions and speed up your workflow 2-3x. ```typescript TypeScript await page.act(buttons[0]); // No LLM! ``` ```python Python await page.act(buttons[0]) # No LLM! ``` #### Plan ahead You can use multiple suggestions from `observe` to preview a batch of actions. For example, when filling a form you could ask `observe` to find all the fields and then pass them in to `act`. **Call the LLM once, act multiple times**. 
```typescript TypeScript const fields = await page.observe("Find all the fields in the form"); for (const field of fields) { await page.act(field); // No LLM! } ``` ```python Python fields = await page.observe("Find all the fields in the form") for field in fields: await page.act(field) # No LLM! ``` ### Observe and Extract Using `observe` to focus `extract` on a specific section of the page (like a table, a form, a list...) minimizes the context needed for an extraction. **Savings Tip**: Pass the selector to `extract` to reduce LLM token usage by 10x for verbose websites! ```typescript TypeScript // Use observe to validate elements before extraction const [ table ] = await page.observe("Find the data table"); const { data } = await page.extract({ instruction: "Extract data from the table", schema: z.object({ data: z.string() }), selector: table.selector // Reduce context scope needed for extraction }); ``` ```python Python # Use observe to validate elements before extraction [ table ] = await page.observe("Find the data table") extraction = await page.extract( "Extract data from the table", schema=Data, # Pydantic schema selector=table.selector # Reduce context scope needed for extraction ) ``` ## Best Practices ### Choose the right commands - Use `observe` when a yes/no answer will gate an action (e.g., "Find the Submit button"), then conditionally `act`. - Use `extract` for information-only questions (e.g., "What’s the page title?", "How many results are listed?"). - Don’t call `extract` to locate elements you plan to click next. - Don’t call `observe` to answer info-only questions that won’t lead to an action. - **Discover and plan with `observe`**: Use `observe("Find…")` to map actionable elements and preview next steps. - **Scope `extract` with selectors from `observe`**: First `observe("Find the data table")`, then pass `selector` to `extract` to reduce tokens and boost accuracy. 
### Conserve LLM tokens Optimize performance by directly passing `ObserveResult` to `act` (e.g., `await page.act(results[0])`) to save LLM tokens. Batch operations by using `observe` once to find elements, then act on each. Cache and reuse stable `observe` results for familiar pages, using self-healing if layouts change. Check out the guide on how to build your own action cache ### Improve Accuracy Be precise with instructions, e.g., "Find the primary CTA in the hero" for better results. For iframes, set `iframes: true` and wait for `networkidle`. Use `observe` selectors in `extract` to limit context. Check out the guide on how to improve the accuracy of your results ### Action Validation Before performing critical actions, validate the suggestion's `method`, `selector`, and `arguments` to prevent misclicks. If a direct `act` fails, use `observe` with the same prompt to verify the method, then proceed with the suggested action. ```typescript TypeScript const prompt = "click the submit button"; const expectedMethod = "click"; try { await page.act(prompt); } catch (error) { if (error.message.includes("method not supported")) { // Observe the same prompt to get the planned action const [action] = await page.observe(prompt); if (action && action.method === expectedMethod) { await page.act(action); } else { throw new Error(`Unsupported method: expected "${expectedMethod}", got "${action?.method}"`); } } else { throw error; } } ``` ```python Python prompt = "click the submit button" expected_method = "click" try: await page.act(prompt) except Exception as error: if "method not supported" in str(error): # Observe the same prompt to get the planned action results = await page.observe(prompt) if results and results[0].method == expected_method: await page.act(results[0]) else: method = results[0].method if results else "unknown" raise Exception(f'Unsupported method: expected "{expected_method}", got "{method}"') else: raise error ``` ## Troubleshooting **Problem**: 
`observe` returns empty array **Solutions**: - Make sure the element exists on the page - Use explicit instructions to find the element - Ensure page has fully loaded - Look at the [debugging logs](/v2/configuration/logging), if the element is there then the LLM might be hallucinating/not catching it. **Problem**: Descriptions don't match actual elements **Solutions**: - Use more capable models: check [evals](https://stagehand.dev/evals) for the best models for your use case - Provide more specific instructions - Log inference to file (see [debugging logs](/v2/configuration/logging#llm-inference-logging)) to get an LLM trace **Problem**: The method identified is not valid **Solutions**: - Check the [supported actions](/v2/basics/act) - Provide more specific instructions - Validate the method, if invalid override with one of the supported ones ## Next Steps Execute actions efficiently using `observe` results Extract structured data from observed elements Monitor and debug observation performance Advanced patterns and optimization techniques ================================================ FILE: packages/docs/v2/best-practices/agent-fallbacks.mdx ================================================ --- title: Agent Fallbacks description: "A failsafe when unexpected page changes add extra steps" --- ## When to use Use an agent fallback as a failsafe when a one step action unexpectedly becomes a multi-step flow. ## How it works 1. [`act()`](/v2/basics/act) is attempted for the direct action 2. If it fails, [`agent()`](/v2/basics/agent) figures out the new path 3. Agent completes all needed steps (open menu → click button) ### Example scenario **Before**: Sign in button was in the header **After**: Sign in now requires: Click account menu → Click "Sign in" option A single `act("click sign in")` can't handle this change. The agent fallback can discover and execute both steps. 
```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; try { await page.act("click the 'Sign In' button"); } catch (err) { console.log("Agent fallback triggered"); const agent = stagehand.agent({ provider: "anthropic", model: "claude-sonnet-4-20250514", instructions: "You are a helpful assistant that can use a web browser.", }); const result = await agent.execute({ instruction: "Find and click Sign In button", maxSteps: 10, }); console.log(result.success ? "Agent fallback success" : "Agent fallback failed"); if (!result.success) throw err; } ``` ```python Python from stagehand import Stagehand try: await page.act("click the 'Sign In' button") except Exception as err: print("Agent fallback triggered") agent = stagehand.agent({ "provider": "anthropic", "model": "claude-sonnet-4-20250514", "instructions": "Complete the action, handling any new steps required.", }) result = await agent.execute({ "instruction": "Find and click Sign In button", "max_steps": 10, }) print("Agent fallback success" if result.success else "Agent fallback failed") if not result.success: raise err ``` ================================================ FILE: packages/docs/v2/best-practices/build-agent.mdx ================================================ --- title: 'Build a web browsing agent' description: 'Build an AI agent that can autonomously control a browser with Stagehand' --- import { Excalidraw } from '/snippets/excalidraw.mdx'; Stagehand gives AI agents powerful tools to control a browser completely autonomously. Watch below as a Stagehand agent autonomously navigates to a URL, takes actions on the page, and extracts structured data to answer a question. There's quite a few ways to build an agent with Stagehand. Let's look at a few of them. ![Agent](/media/stagehand-agent.gif) ## Stagehand MCP The above example is a Claude agent that uses Stagehand to control a browser. 
At the time of writing, [multimodal tool calling](https://sdk.vercel.ai/docs/ai-sdk-core/tools-and-tool-calling#multi-modal-tool-results) is only supported in Claude 3.5/3.7 Sonnet.
Check out a live demo of a Browserbase browser controlled by OpenAI's Computer Using Agent (CUA) model. ## Sequential Tool Calling (Open Operator) In January 2025, Browserbase released [Open Operator](https://operator.browserbase.com). Open Operator is able to reason about the browser state and take actions accordingly to accomplish larger tasks like "order me a pizza". It works by calling Stagehand tools in sequence: 1. If there's no URL, go to a default URL. 1. Examine the browser state. Ask an LLM to reason about what to do next. 1. Use `page.act()` to execute the LLM-suggested action. 1. Repeat Incorporating `stagehand.agent` into your browser automation is as easy as adding a single line of code: Python currently supports `stagehand.agent` with Computer Use Agent (CUA) models. The default implementation is coming soon. ```typescript TypeScript await stagehand.page.goto("https://github.com/browserbase/stagehand"); // Open Operator will use the default LLM from Stagehand config const operator = stagehand.agent(); const { message, actions } = await operator.execute( "Extract the top contributor's username" ); console.log(message); ``` ### Replay the agent's actions You can replay the agent's actions exactly the same way you would with a regular Stagehand agent. You can even automatically cache the actions to avoid unnecessary LLM calls on a repeated run. Let's use the `replay` function below to save the actions to a Stagehand script file, which will reproduce the same actions the agent did, with cached actions built in. 
```typescript import { AgentAction, AgentResult } from "@browserbasehq/stagehand"; import { exec } from "child_process"; import fs from "fs/promises"; export async function replay(result: AgentResult) { const history = result.actions; const replay = history .map((action: AgentAction) => { switch (action.type) { case "act": if (!action.playwrightArguments) { throw new Error("No playwright arguments provided"); } return `await page.act(${JSON.stringify( action.playwrightArguments )})`; case "extract": return `await page.extract("${action.parameters}")`; case "goto": return `await page.goto("${action.parameters}")`; case "wait": return `await page.waitForTimeout(${parseInt( action.parameters as string )})`; case "navback": return `await page.goBack()`; case "refresh": return `await page.reload()`; case "close": return `await stagehand.close()`; default: return `await stagehand.oops()`; } }) .join("\n"); console.log("Replay:"); const boilerplate = ` import { Page, BrowserContext, Stagehand } from "@browserbasehq/stagehand"; export async function main(stagehand: Stagehand) { const page = stagehand.page ${replay} } `; await fs.writeFile("replay.ts", boilerplate); // Format the replay file with prettier await new Promise((resolve, reject) => { exec( "npx prettier --write replay.ts", (error: any, stdout: any, stderr: any) => { if (error) { console.error(`Error formatting replay.ts: ${error}`); reject(error); return; } resolve(stdout); } ); }); } ``` Here's the replay output of an instruction like `"Get me the stock price of NVDA"`: ```typescript {14-22} replay.ts import { Page, BrowserContext, Stagehand } from "@browserbasehq/stagehand"; export async function main({ page, context, stagehand, }: { page: Page; // Playwright Page with act, extract, and observe methods context: BrowserContext; // Playwright BrowserContext stagehand: Stagehand; // Stagehand instance }) { await page.goto("https://www.google.com"); // Replay will default to Playwright first to avoid unnecessary 
LLM calls! // If the Playwright action fails, Stagehand AI will take over and self-heal await page.act({ description: "The search combobox where users can type their queries.", method: "fill", arguments: ["NVDA stock price"], selector: "xpath=/html/body[1]/div[1]/div[3]/form[1]/div[1]/div[1]/div[1]/div[1]/div[2]/textarea[1]", }); await page.extract( "the displayed NVDA stock price in the search suggestions", ); await stagehand.close(); } ``` ================================================ FILE: packages/docs/v2/best-practices/caching.mdx ================================================ --- title: Caching Actions description: You can cache actions in Stagehand to avoid redundant LLM calls. --- Caching actions in Stagehand is useful for actions that are expensive to run, or when the underlying DOM structure is not expected to change. ## Using `observe` to preview an action `observe` lets you preview an action before taking it. If you are satisfied with the action preview, you can run it in `page.act` with no further LLM calls. ```typescript TypeScript const [actionPreview] = await page.observe("Click the quickstart link"); /** actionPreview is a JSON-ified version of a Playwright action: { description: "The quickstart link", method: "click", selector: "/html/body/div[1]/div[1]/a", arguments: [], } **/ // NO LLM INFERENCE when calling act on the preview await page.act(actionPreview) ``` ```python Python actions = await page.observe("Click the quickstart link") action_preview = actions[0] # action_preview is a dictionary version of a Playwright action: # { # "description": "The quickstart link", # "method": "click", # "selector": "/html/body/div[1]/div[1]/a", # "arguments": [], # } # NO LLM INFERENCE when calling act on the preview await page.act(action_preview) ``` ## Simple caching Let's use a simple file-based cache for this example. 
We'll write getter and setter functions that read from and write to a JSON file:
```typescript TypeScript // Check the cache, get the action, and run it // If selfHeal is true, we'll attempt to self-heal if the action fails async function actWithCache(page: Page, key: string, prompt: string, selfHeal = false) { try { const cacheExists = await getCache(key); let action: ObserveResult; if (cacheExists) { // Get the cached action action = await getCache(prompt); } else { // Get the observe result (the action) [action] = await page.observe(prompt); // Cache the action await setCache(prompt, action); } // Run the action (no LLM inference) await page.act(action); } catch (e) { console.error(e); // in selfHeal mode, we'll retry the action if (selfHeal) { console.log("Attempting to self-heal..."); await page.act(prompt); } else { throw e; } } } ``` ```python Python # Check the cache, get the action, and run it # If self_heal is true, we'll attempt to self-heal if the action fails async def act_with_cache(page, key: str, prompt: str, self_heal: bool = False): try: cache_exists = await get_cache(key) if cache_exists: # Get the cached action action = await get_cache(prompt) else: # Get the observe result (the action) actions = await page.observe(prompt) action = actions[0] # Cache the action await set_cache(prompt, action) # Run the action (no LLM inference) await page.act(action) except Exception as e: print(f"Error: {e}") # in self_heal mode, we'll retry the action if self_heal: print("Attempting to self-heal...") await page.act(prompt) else: raise e ``` You can now use `actWithCache` to run an action with caching: ```typescript TypeScript const prompt = "Click the quickstart link"; const key = prompt; // Simple cache key // Attempt cached action or self-heal await actWithCache(page, key, prompt); ``` ```python Python prompt = "Click the quickstart link" key = prompt # Simple cache key # Attempt cached action or self-heal await act_with_cache(page, key, prompt) ``` ## Advanced caching The above example is simple, but you may want to cache actions based 
on the page contents. Also, if you have duplicate prompts, you should use a more unique key. We want to leave caching logic up to you, but give you all the tools you need to implement your own caching strategy. You can directly access the DOM and accessibility tree from Playwright's page object. Here's an example of how to access the page content: ```typescript TypeScript // Get the page content const pageContent = await page.content(); ``` ```python Python # Get the page content page_content = await page.content() ``` You may also want to use the accessibility tree, the DOM, or any other information to create a more unique key. You can do this as you please, with very similar logic to the above example. ================================================ FILE: packages/docs/v2/best-practices/computer-use.mdx ================================================ --- title: Computer Use Agents description: Incorporate Computer Use APIs from Anthropic and OpenAI with one line of code in Stagehand. --- ## What is a Computer Use Agent? You might've heard of [Claude Computer Use](https://www.anthropic.com/news/3-5-models-and-computer-use) or [OpenAI's Computer Using Agent](https://openai.com/index/computer-using-agent/). These are powerful tools that can convert natural language into actions on the computer. However, you'd otherwise need to write your own code to convert these actions into Playwright commands. Stagehand not only handles the execution of Computer Use outputs, but also lets you hot-swap between OpenAI and Anthropic models with one line of code. ## How to use a Computer Use Agent in Stagehand Stagehand lets you use Computer Use Agents with one line of code: **IMPORTANT! Configure your browser dimensions** Computer Use Agents will often return XY-coordinates to click on the screen, so you'll need to configure your browser dimensions. If not specified, the default browser dimensions are 1024x768. 
You can also configure the browser dimensions in the `browserbaseSessionCreateParams` or `localBrowserLaunchOptions` options. ### Configuring browser dimensions Browser configuration differs by environment: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY /* API key for authentication */, projectId: process.env.BROWSERBASE_PROJECT_ID /* Project identifier */, browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, browserSettings: { blockAds: true, viewport: { width: 1024, height: 768, }, }, }, }); await stagehand.init(); ``` ```python Python import os from stagehand import Stagehand, StagehandConfig stagehand = Stagehand(StagehandConfig( env="BROWSERBASE", api_key=os.getenv("BROWSERBASE_API_KEY"), # API key for authentication project_id=os.getenv("BROWSERBASE_PROJECT_ID"), # Project identifier browserbase_session_create_params={ "projectId": os.getenv("BROWSERBASE_PROJECT_ID"), "browserSettings": { "blockAds": True, "viewport": { "width": 1024, "height": 768, }, }, }, )) await stagehand.init() ``` ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { headless: false, viewport: { width: 1024, height: 768, }, } }); await stagehand.init(); ``` ```python Python from stagehand import Stagehand, StagehandConfig stagehand = Stagehand(StagehandConfig( env="LOCAL", local_browser_launch_options={ "headless": False, "viewport": { "width": 1024, "height": 768, }, } )) await stagehand.init() ``` ### Direct your Computer Use Agent Call `execute` on the agent to assign a task to the agent. 
```typescript TypeScript // Navigate to a website await stagehand.page.goto("https://www.google.com"); const agent = stagehand.agent({ // You can use either OpenAI or Anthropic provider: "anthropic", // The model to use (computer-use-preview for OpenAI) model: "claude-sonnet-4-20250514", // Customize the system prompt instructions: `You are a helpful assistant that can use a web browser. Do not ask follow up questions, the user will trust your judgement.`, // Customize the API key options: { apiKey: process.env.ANTHROPIC_API_KEY, }, }); // Execute the agent await agent.execute("Apply for a library card at the San Francisco Public Library"); ``` ```python Python import os # Navigate to a website await stagehand.page.goto("https://www.google.com") agent = stagehand.agent({ # The model to use model="computer-use-preview", # Customize the system prompt instructions="You are a helpful assistant that can use a web browser. Do not ask follow up questions, the user will trust your judgement.", # Customize the API key options={ "apiKey": os.getenv("ANTHROPIC_API_KEY"), }, }) # Execute the agent await agent.execute("Apply for a library card at the San Francisco Public Library") ``` You can also define the maximum number of steps the agent can take with: ```typescript TypeScript await agent.execute({ instructions: "Apply for a library card at the San Francisco Public Library", maxSteps: 10, }); ``` ```python Python await agent.execute( "Apply for a library card at the San Francisco Public Library", max_steps=10, ) ``` View or run the example templates [here](https://www.browserbase.com/templates?category=Computer+Use+Agents) ================================================ FILE: packages/docs/v2/best-practices/contributing.mdx ================================================ --- title: 'Contribute to Stagehand' description: 'Best practices for making a meaningful contribution to Stagehand' --- # Codeowners and Subject-Matter Experts Any contribution must be explicitly 
approved by a codeowner. Officially, Stagehand codeowners are as follows: - [**Paul Klein**](https://github.com/pkiv) - [**Miguel Gonzalez**](https://github.com/miguelg719) - [**Sean McGuire**](https://github.com/seanmcguire12) - [**Anirudh Kamath**](https://github.com/kamath) - [**Sameel Arif**](https://github.com/sameelarif) - [**Filip Michalsky**](https://github.com/filip-michalsky) Special thanks to [Jeremy Press](https://github.com/jeremypress), [Navid Pour](https://github.com/navidkpr), and [all the contributors](https://github.com/browserbase/stagehand/graphs/contributors) for your help in making Stagehand the best browser automation framework. ***Please do not hesitate to reach out to anyone listed here in the [public Discord server](https://stagehand.dev/discord)*** ## General Workflow Get listed as [one of our beloved contributors](https://github.com/browserbase/stagehand/graphs/contributors)! 1. **Discuss your proposed contribution before starting.** Not doing this runs you the risk of entirely discarding something you put considerable time and effort into. You can DM Miguel on [Discord](https://stagehand.dev/discord) for a 1on1 call. 2. **Open a Pull Request.** Create a fork of this repository, and follow [GitHub’s instructions to create a Pull Request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). This allows our team to review your contribution and leave comments. 3. **Wait for Review**. We'll do our best to get to your contribution as soon as possible. If it's been 2-3 days and you have yet to receive any comments, DM Miguel on [Discord](https://stagehand.dev/discord) 4. **Merge into `evals` branch.** We don’t let external contributors [run our CI via GitHub Actions](https://github.com/browserbase/stagehand/blob/main/.github/workflows/ci.yml) to prevent spam and misuse. 
If your contribution passes an initial screen, we’ll run our evals on it 1. By default, all PRs run the following tests that you can also run from the repo source: 1. Lint (`npm run lint`) - Runs `prettier` and `eslint`. If this fails, you can most likely run `npm run format` to fix some simple linting errors. 2. Build (`npm run build`) - Lints and builds TS → JS in `dist/` 3. End-to-End (`npm run e2e`) - These are deterministic end-to-end Playwright tests to ensure the integrity of basic Playwright functionality of [`stagehand.page`](http://stagehand.page) and `stagehand.context` as well as compatibility with the Browserbase API 4. Combination (`npm run evals category combination`) - This runs AI-based end-to-end tests using combinations of `act`, `extract`, and `observe` 2. If you’re changing anything about `act`, `extract`, or `observe` itself, we might also run specific act/extract/observe evals to ensure existing functionality doesn’t significantly drop. ![CI](/images/CI.png) 5. **Cleanup and merge to main**. Once it’s in `evals`, unfortunately the original contributor can’t make any further changes. The internal Stagehand team will be responsible for cleaning up the code and bringing it into main. ## Contribution Guidelines 1. **Use draft PRs.** If your PR is a work in progress, please convert it to a draft (see below) while you’re working on it, and mark it for review/add reviewers when you’re ready. This helps us prevent clutter in the review queue. ![Draft PR](/images/pr_draft.png) 2. **Provide a reproducible test plan.** Include an eval (preferred) or example. We can’t merge your PR if we can’t run anything that specifically highlights your contribution. 1. Write a script in [`evals/tasks`](https://github.com/browserbase/stagehand/tree/v2/evals/tasks) as `someTask.ts` 2. 
Add your script to [`evals.config.json`](https://github.com/browserbase/stagehand/blob/v2/evals/evals.config.json) with default category `combination` (*or act/extract/observe if you’re only testing act/extract/observe*).
Learn the basics in our [Caching Guide](/v2/best-practices/caching): ```typescript TypeScript // Cache successful actions const [action] = await page.observe("Click the sign in button"); await setCache("sign_in_button", action); // Reuse cached action (no LLM cost) const cachedAction = await getCache("sign_in_button"); if (cachedAction) { await page.act(cachedAction); } else { await page.act(action); } ``` ```python Python # Cache successful actions actions = await page.observe("Click the sign in button") action = actions[0] await set_cache("sign_in_button", action) # Reuse cached action (no LLM cost) cached_action = await get_cache("sign_in_button") if cached_action: await page.act(cached_action) else: await page.act(action) ``` Reduce costs with smart action caching and observe patterns ### 3. Optimize Browser Sessions Reuse sessions when possible and set appropriate timeouts. See [Browser Configuration](/v2/configuration/browser) for details: ```typescript TypeScript const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionCreateParams: { timeout: 1800, // 30 minutes instead of default 1 hour keepAlive: true, // Keep session alive between tasks } }); ``` ```python Python stagehand = Stagehand( env="BROWSERBASE", browserbase_session_create_params={ "timeout": 1800, # 30 minutes instead of default 1 hour "keep_alive": True, # Keep session alive between tasks } ) ``` Optimize Browserbase infrastructure costs and session management ## Advanced Strategies ### Intelligent Model Switching Automatically fall back to cheaper models for simple tasks: ```typescript TypeScript // Use models from least to most expensive based on task complexity // See stagehand.dev/evals for performance comparisons async function smartAct(page: Page, prompt: string) { const models = ["cheaper-model", "premium-model"]; for (const model of models) { try { const stagehand = new Stagehand({ modelName: model }); await stagehand.init(); const [action] = await 
stagehand.page.observe(prompt); await stagehand.page.act(action); return; } catch (error) { console.log(`Falling back to ${model}...`); } } } ``` ```python Python # Use models from least to most expensive based on task complexity # See stagehand.dev/evals for performance comparisons async def smart_act(page, prompt: str): models = ["cheaper-model", "premium-model"] for model in models: try: stagehand = Stagehand(model_name=model) await stagehand.init() actions = await stagehand.page.observe(prompt) action = actions[0] await stagehand.page.act(action) return except Exception: print(f"Falling back to {model}...") ``` ### Session Pooling Reuse browser sessions across multiple tasks: ```typescript TypeScript class SessionManager { private sessions = new Map(); async getSession(taskType: string): Promise { if (this.sessions.has(taskType)) { return this.sessions.get(taskType)!; } const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); this.sessions.set(taskType, stagehand); return stagehand; } } ``` ```python Python class SessionManager: def __init__(self): self.sessions = {} async def get_session(self, task_type: str): if task_type in self.sessions: return self.sessions[task_type] stagehand = Stagehand(env="BROWSERBASE") await stagehand.init() self.sessions[task_type] = stagehand return stagehand ``` ## Cost Monitoring Track your spending to identify optimization opportunities. 
See our [Observability Guide](/configuration/observability) for detailed metrics: ```typescript TypeScript // Monitor token usage const metrics = stagehand.metrics; console.log(`Total tokens: ${metrics.totalPromptTokens + metrics.totalCompletionTokens}`); console.log(`Estimated cost: $${(metrics.totalPromptTokens + metrics.totalCompletionTokens) * 0.00001}`); ``` ```python Python # Monitor token usage metrics = stagehand.metrics total_tokens = metrics['total_prompt_tokens'] + metrics['total_completion_tokens'] print(f"Total tokens: {total_tokens}") print(f"Estimated cost: ${total_tokens * 0.00001:.4f}") ``` Monitor usage patterns and track costs in real-time ## Budget Controls Set spending limits to prevent unexpected costs: ```typescript TypeScript class BudgetGuard { private dailySpend = 0; private maxDailyBudget: number; constructor(maxDailyBudget: number = 25) { this.maxDailyBudget = maxDailyBudget; } checkBudget(estimatedCost: number): void { if (this.dailySpend + estimatedCost > this.maxDailyBudget) { throw new Error(`Daily budget exceeded: $${this.maxDailyBudget}`); } this.dailySpend += estimatedCost; } } ``` ```python Python class BudgetGuard: def __init__(self, max_daily_budget: float = 25.0): self.daily_spend = 0 self.max_daily_budget = max_daily_budget def check_budget(self, estimated_cost: float) -> None: if self.daily_spend + estimated_cost > self.max_daily_budget: raise Exception(f"Daily budget exceeded: ${self.max_daily_budget}") self.daily_spend += estimated_cost ``` ## Related Resources Choose the right LLM for your budget and accuracy requirements Reduce costs with smart action caching and observe patterns Monitor usage patterns and track costs in real-time Optimize Browserbase infrastructure costs and session management ================================================ FILE: packages/docs/v2/best-practices/deployments.mdx ================================================ --- title: 'Deploying Stagehand' description: 'Deploy your AI agents and 
automations to the cloud' --- **🌟 Preview: Browser Functions** - Deploy your web automation code directly on Browserbase with browser functions. Scale your `act()` automations in the cloud with zero infrastructure setup. Reach out to hello@browserbase.com to get beta access. ## Deploy on Vercel Securely run Stagehand on Browserbase inside a Vercel Function. This guide shows a minimal, production-safe HTTP endpoint you can call directly or on a schedule. ### 1. Install Vercel CLI To download and install Vercel CLI, run one of the following commands: ```bash pnpm pnpm i -g vercel ``` ```bash yarn yarn global add vercel ``` ```bash npm npm i -g vercel ``` ```bash bun bun add -g vercel ``` ### 2. Project layout ```text your-project/ api/ run.ts package.json tsconfig.json vercel.json ``` Create the structure with: ```bash mkdir -p api touch api/run.ts package.json vercel.json tsconfig.json ``` ### 3. `api/run.ts` (Node.js runtime) ```typescript // api/run.ts import type { VercelRequest, VercelResponse } from "@vercel/node"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod/v3"; export default async function handler(req: VercelRequest, res: VercelResponse): Promise<void> { try { const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY!, projectId: process.env.BROWSERBASE_PROJECT_ID!, disablePino: true, modelName: "google/gemini-2.5-flash", modelClientOptions: { apiKey: process.env.GOOGLE_API_KEY!, }, // optional session params browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, region: "us-west-2", browserSettings: { blockAds: true, }, }, }); await stagehand.init(); const page = stagehand.page; await page.goto("https://www.stagehand.dev/"); await page.act("click the evals button"); const { extraction } = await page.extract("extract the fastest model"); const data = { model: extraction ?? 
"" }; await stagehand.close(); res.status(200).json({ ok: true, data: data.model }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); res.status(500).json({ ok: false, error: msg }); } } ``` ### 4. `package.json` ```json { "name": "bb-stagehand-on-vercel", "private": true, "type": "module", "engines": { "node": ">=18" }, "dependencies": { "@browserbasehq/stagehand": "^2.4.3", "zod": "^3.25.0" }, "devDependencies": { "typescript": "^5.6.0", "@types/node": "^20.12.12", "@vercel/node": "^3.2.20" } } ``` ### 5. `tsconfig.json` ```json { "compilerOptions": { "target": "ES2022", "module": "ES2022", "moduleResolution": "node", "outDir": ".vercel/output/functions", "strict": true, "esModuleInterop": true, "skipLibCheck": true, "types": ["node"] }, "include": ["api/**/*.ts"] } ``` ### 6. `vercel.json` ```json { "$schema": "https://openapi.vercel.sh/vercel.json", "functions": { "api/run.ts": { "maxDuration": 60 } } } ``` See Vercel's [configuring functions](https://vercel.com/docs/functions/configuring-functions) docs for more details. ### 7. Link your project Link your local folder to a Vercel project before configuring environment variables: ```bash # authenticate if needed vercel login # link the current directory to a Vercel project (interactive) vercel link ``` ### 8. Environment variables Do not commit `.env` in production. Add variables via Vercel CLI: ```bash vercel env add BROWSERBASE_API_KEY vercel env add BROWSERBASE_PROJECT_ID # (and your model key if needed) vercel env add GOOGLE_API_KEY ``` See also: [Browser Environment](/configuration/environment) for details on required variables. ### 9. Test locally Replicate the Vercel environment locally to exercise your Function before deploying. Run from the project root. ```bash # ensure dependencies are installed npm install # start the local Vercel dev server vercel dev --listen 5005 ``` ### 10. 
Deploy ```bash vercel vercel --prod ``` ### Execute the function #### Configure Protection Bypass for Automation Before invoking the production URL, create a Protection Bypass for Automation: 1. Generate a 32-character secret (you can use `openssl rand -hex 16`) 2. Go to your project in Vercel 3. Navigate to Settings → Deployment Protection 4. Add the secret to "Protection Bypass for Automation" Then invoke the function with the bypass header: ```bash curl -X POST \ -H "x-vercel-protection-bypass: <your-bypass-secret>" \ https://<your-deployment-domain>/api/run ``` ### Optional: Cron on Vercel Hit the same endpoint on a schedule by extending `vercel.json`: ```json { "$schema": "https://openapi.vercel.sh/vercel.json", "functions": { "api/run.ts": { "maxDuration": 60 } }, "crons": [ { "path": "/api/run", "schedule": "0 * * * *" } ] } ``` ### Features - **No local browsers needed** with `env: "BROWSERBASE"`. [Browserbase](https://www.browserbase.com/) provides the browsers. - **Fast functionality**: Offload browser work to Browserbase and return JSON promptly. - **Long-running tasks**: Raise `maxDuration` and/or consider Edge runtime limits depending on plan. ================================================ FILE: packages/docs/v2/best-practices/mcp-integrations.mdx ================================================ --- title: "MCP Integrations" description: "Using Model Context Protocol (MCP) integrations to enhance agent capabilities" --- ## What are MCP Integrations? MCP (Model Context Protocol) integrations allow you to connect your Stagehand agents to external tools, APIs, and services. This enables agents to perform actions beyond browser automation, such as web search, database operations, and API calls. MCP integrations make your agents more powerful by combining browser automation with external capabilities. The agent can intelligently decide when to use browser actions versus external tools. ## Connection Options There are two options for connecting to MCP servers: 1. 
**Pass a URL directly** - The simplest approach for quick setup 2. **Create a connection first** - Gives you more control over the connection MCP client support is currently only available in TypeScript. ## Passing a URL The simplest way to add MCP integrations is by providing server URLs directly in the agent configuration: ```typescript const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, ], instructions: `You have access to web search through Exa. Use it to find current information before browsing.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); ``` ## Creating a Connection First Alternatively, you can establish MCP connections first and then pass the client objects: ```typescript import { connectToMCPServer } from "@browserbasehq/stagehand"; // Connect to MCP server const supabaseClient = await connectToMCPServer( `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` ); // You can also pass the config to start a local MCP server const notionClient = await connectToMCPServer({ command: "npx", args: ["-y", "@notionhq/notion-mcp-server"], env: { NOTION_TOKEN: process.env.NOTION_TOKEN, }, }); // Use the connected client const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [supabaseClient, notionClient], instructions: `You can interact with Supabase databases and Notion. 
Use these tools to store and retrieve data.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for restaurants in New Brunswick, NJ and save the first result to the database"); ``` ## Multiple Integrations You can combine multiple MCP integrations in a single agent: ```typescript const databaseClient = await connectToMCPServer(/* database config */); const agent = stagehand.agent({ integrations: [ `https://search-service.example.com/mcp?apiKey=${process.env.SEARCH_API_KEY}`, databaseClient ], instructions: `You have access to external tools for search and data storage. Use these tools strategically to complete tasks efficiently.` }); ``` ## Best Practices ### Choose the Right Connection Approach **When to use:** - Simple setup requirements - Standard API configurations - Getting started quickly **Benefits:** - Minimal code required - Automatic connection handling - Easy to configure **When to use:** - Custom connection options - Connection reuse across agents - Advanced error handling **Benefits:** - Full control over connections - Better error handling - Connection pooling capabilities ### Environment Variables Always use environment variables for API keys and sensitive information: ```bash # .env file SEARCH_API_KEY=your_search_service_key MCP_SERVICE_API_KEY=your_mcp_service_key OPENAI_API_KEY=your_openai_key DATABASE_URL=your_database_url DATABASE_API_KEY=your_database_key ``` ### Instructions Best Practices Provide clear instructions about available tools: ```typescript instructions: `You have access to: 1. Web search tools - Use to find current information 2. Database tools - Use to store/retrieve data 3. Browser automation - Use for web interactions Always search for current information before making decisions. Store important data for later reference.` ``` ```typescript instructions: "You can search and save data." 
``` ### Error Handling Implement proper error handling for MCP connections: ```typescript try { const client = await connectToMCPServer(serverUrl); const agent = stagehand.agent({ integrations: [client], // ... other config }); const result = await agent.execute(instruction); } catch (error) { console.error("MCP integration failed:", error); // Handle fallback behavior } ``` ## Troubleshooting **Problem:** MCP server connections timing out **Solutions:** - Verify server URLs are correct and accessible - Check network connectivity - Ensure API keys are valid and have proper permissions - Try connecting to servers individually to isolate issues **Problem:** Agent not using available MCP tools **Solutions:** - Make instructions more specific about when to use tools - Ensure API keys are properly configured - Check that the MCP server supports the expected tools - Verify tool descriptions are clear and actionable **Problem:** API key or authentication failures **Solutions:** - Verify all required environment variables are set - Check API key validity and permissions - Ensure URLs include necessary authentication parameters - Test MCP connections independently before using in agents ## Examples ### Web Search + Browser Automation ```typescript const agent = stagehand.agent({ integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], instructions: `First search for current information, then use the browser to complete tasks based on what you find.` }); await agent.execute("Find the best laptop deals for 2025 and navigate to purchase the top recommendation"); ``` ### Data Extraction + Storage ```typescript const supabaseClient = await connectToMCPServer(/* config */); const agent = stagehand.agent({ integrations: [supabaseClient], instructions: `Extract data from websites and store it using available database tools.` }); await agent.execute("Extract all restaurant information from this directory and save it to the database"); ``` ### Multi-tool 
Workflow ```typescript const agent = stagehand.agent({ integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, supabaseClient ], instructions: `Use all available tools strategically: search for current info, browse websites, and store important data.` }); await agent.execute("Research competitor pricing, compare with our site, and store the analysis"); ``` ## Further Reading Learn the fundamentals of Stagehand agents Set up your own MCP server Create custom MCP tools ================================================ FILE: packages/docs/v2/best-practices/playwright-interop.mdx ================================================ --- title: 'Playwright Interoperability' description: 'How Stagehand interacts with Playwright' --- Stagehand is built on top of [Playwright](https://playwright.dev/), so you can use Playwright methods directly through the Stagehand instance. ## `page` and `context` `stagehand.page` and `stagehand.context` are instances of Playwright's `Page` and `BrowserContext` respectively. Use these methods to interact with the Playwright instance that Stagehand is using. ```TypeScript TypeScript const page = stagehand.page; // Base Playwright methods work await page.goto("https://github.com/browserbase/stagehand"); // Stagehand overrides Playwright objects await page.act("click on the contributors") ``` ```python Python page = stagehand.page # Base Playwright methods work await page.goto("https://github.com/browserbase/stagehand") # Stagehand overrides Playwright objects await page.act("click on the contributors") ``` ## Stagehand v. Playwright Below is an example of how to extract a list of companies from the AI Grant website using both Stagehand and Playwright. Stagehand v. Playwright The above example with Stagehand can be easily reused to extract data from other websites, whereas the Playwright example would need to be rewritten for each new website. 
================================================ FILE: packages/docs/v2/best-practices/prompting-best-practices.mdx ================================================ --- title: Prompting Best Practices description: "Write effective prompts for reliable Stagehand automation" --- Good prompts make Stagehand reliable. Bad prompts cause failures. Here's how to write prompts that work consistently. ## Act Method Use `act()` for single actions on web pages. Each action should be focused and clear. ```typescript TypeScript // Good - Single, specific actions await page.act("click the 'Add to Cart' button"); await page.act("type 'user@example.com' into the email field"); // Bad - Multiple actions combined await page.act("fill out the form and submit it"); await page.act("login with credentials and navigate to dashboard"); ``` ```python Python # Good - Single, specific actions await page.act("click the 'Add to Cart' button") await page.act("type 'user@example.com' into the email field") # Bad - Multiple actions combined await page.act("fill out the form and submit it") await page.act("login with credentials and navigate to dashboard") ``` ### Use Element Types, Not Colors Describe elements by their type and function rather than visual attributes like color. 
```typescript TypeScript // Good - Element types and descriptive text await page.act("click the 'Sign In' button"); await page.act("type into the email input field"); // Bad - Color-based descriptions await page.act("click the blue button"); await page.act("type into the white input"); ``` ```python Python # Good - Element types and descriptive text await page.act("click the 'Sign In' button") await page.act("type into the email input field") # Bad - Color-based descriptions await page.act("click the blue button") await page.act("type into the white input") ``` ### Use Descriptive Language ```typescript TypeScript // Good - Clear element identification await page.act("click the 'Next' button at the bottom of the form"); await page.act("type into the search bar at the top of the page"); // Bad - Vague descriptions await page.act("click next"); await page.act("type into search"); ``` ```python Python # Good - Clear element identification await page.act("click the 'Next' button at the bottom of the form") await page.act("type into the search bar at the top of the page") # Bad - Vague descriptions await page.act("click next") await page.act("type into search") ``` ### Choose the Right Action Verbs - **Click** for buttons, links, checkboxes - **Type** for text inputs - **Select** for dropdowns - **Check/uncheck** for checkboxes - **Upload** for file inputs ```typescript TypeScript // Good await page.act("click the submit button"); await page.act("select 'Option 1' from dropdown"); // Bad await page.act("click submit"); await page.act("choose option 1"); ``` ```python Python # Good await page.act("click the submit button") await page.act("select 'Option 1' from dropdown") # Bad await page.act("click submit") await page.act("choose option 1") ``` ### Protect Sensitive Data Variables keep sensitive information out of prompts and logs. 
```typescript TypeScript // Good - Secure approach await page.act({ action: "enter %username% in the email field", variables: { username: "user@example.com" } }); await page.act({ action: "enter %password% in the password field", variables: { password: process.env.USER_PASSWORD } }); // Bad - Insecure approach await page.act("type 'mySecretPassword123' into the password field"); ``` ```python Python import os # Good - Secure approach await page.act( "enter %username% in the email field", variables={ "username": "user@example.com" } ) await page.act( "enter %password% in the password field", variables={ "password": os.environ.get("USER_PASSWORD") } ) # Bad - Insecure approach await page.act("type 'mySecretPassword123' into the password field") ``` Set `verbose: 0` in your Stagehand config to prevent secrets from appearing in logs. ## Extract Method Use `extract()` to pull structured data from pages. Define clear schemas and provide context. ### Schema Best Practices Use descriptive field names, correct types, and detailed descriptions. Field descriptions provide context that helps the agent understand exactly what to extract. 
```typescript TypeScript // Good - Descriptive names, correct types, and helpful descriptions const productData = await page.extract({ instruction: "Extract product information", schema: z.object({ productTitle: z.string().describe("The main product name displayed on the page"), priceInDollars: z.number().describe("Current selling price as a number, without currency symbol"), isInStock: z.boolean().describe("Whether the product is available for purchase") }) }); // Bad - Generic names, wrong types, no descriptions const data = await page.extract({ instruction: "Get product details", schema: z.object({ name: z.string(), // Too generic, no context price: z.string(), // Should be number stock: z.string() // Should be boolean, no context }) }); ``` ```python Python from pydantic import BaseModel, Field # Good - Descriptive names, correct types, and helpful descriptions class ProductData(BaseModel): productTitle: str = Field(description="The main product name displayed on the page") priceInDollars: float = Field(description="Current selling price as a number, without currency symbol") isInStock: bool = Field(description="Whether the product is available for purchase") productData = await page.extract( "Extract product information", schema=ProductData ) # Bad - Generic names, wrong types, no descriptions class Data(BaseModel): name: str # Too generic, no context price: str # Should be float, no context stock: str # Should be bool, no context data = await page.extract( "Get product details", schema=Data ) ``` ### Handle Arrays Correctly Always wrap schemas in objects for reliable extraction. 
```typescript TypeScript // Good - Array wrapped in object const listings = await page.extract({ instruction: "Extract all apartment listings", schema: z.object({ apartments: z.array(z.object({ address: z.string(), rent: z.number() })) }) }); // Bad - Bare array const listings = await page.extract({ instruction: "Extract apartment listings", schema: z.array(z.string()) // Don't do this }); ``` ```python Python from pydantic import BaseModel from typing import List # Good - Array wrapped in object class Apartment(BaseModel): address: str rent: float class Listings(BaseModel): apartments: List[Apartment] listings = await page.extract( "Extract all apartment listings", schema=Listings ) # Bad - Bare array (not supported) # Don't do this - arrays must be wrapped in objects ``` ### Use Proper URL Types Specify URL types to tell Stagehand to extract URLs. Without proper URL types, Stagehand won't extract URLs. ```typescript TypeScript // Good - Tells Stagehand to extract URLs const links = await page.extract({ instruction: "Extract navigation links", schema: z.object({ links: z.array(z.object({ text: z.string(), url: z.string().url() // Required for URL extraction })) }) }); ``` ```python Python from pydantic import BaseModel, HttpUrl from typing import List # Good - Tells Stagehand to extract URLs class Link(BaseModel): text: str url: HttpUrl # Required for URL extraction class Links(BaseModel): links: List[Link] links = await page.extract( "Extract navigation links", schema=Links ) ``` ## Observe Method Use `observe()` to discover actionable elements before acting on them. ### Check Elements First Verify elements exist before taking action to avoid errors. 
```typescript TypeScript // Check for elements first const loginButtons = await page.observe("Find the login button"); if (loginButtons.length > 0) { await page.act(loginButtons[0]); } else { console.log("No login button found"); } ``` ```python Python # Check for elements first login_buttons = await page.observe("Find the login button") if len(login_buttons) > 0: await page.act(login_buttons[0]) else: print("No login button found") ``` ### Be Specific About Element Types ```typescript TypeScript // Good - Specific element types const submitButtons = await page.observe("Find submit button in the form"); const dropdowns = await page.observe("Find the state dropdown menu"); // Bad - Too vague const elements = await page.observe("Find submit stuff"); const things = await page.observe("Find state selection"); ``` ```python Python # Good - Specific element types submit_buttons = await page.observe("Find submit button in the form") dropdowns = await page.observe("Find the state dropdown menu") # Bad - Too vague elements = await page.observe("Find submit") things = await page.observe("Find state selection") ``` ## Agent Method Use `agent()` for complex, multi-step workflows. Provide detailed instructions and set appropriate limits. ### Navigate First Don't include navigation in agent tasks. Handle it separately. ```typescript TypeScript // Good - Navigate first await page.goto('https://amazon.com'); await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart'); // Bad - Navigation in task await agent.execute('Go to Amazon, search for headphones, and add one to cart'); ``` ```python Python # Good - Navigate first await page.goto('https://amazon.com') await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart') # Bad - Navigation in task await agent.execute('Go to Amazon, search for headphones, and add one to cart') ``` ### Be Highly Specific Detailed instructions lead to better results. 
```typescript TypeScript // Good - Detailed instructions await agent.execute({ instruction: "Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", maxSteps: 25 }); // Bad - Vague instructions await agent.execute("Find some good restaurants"); ``` ```python Python # Good - Detailed instructions await agent.execute( instruction="Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", max_steps=25 ) # Bad - Vague instructions await agent.execute("Find some good restaurants") ``` ### Set Appropriate Step Limits Match step limits to task complexity. ```typescript TypeScript // Simple task - fewer steps await agent.execute({ instruction: "Subscribe to the newsletter with email 'user@example.com'", maxSteps: 10 }); // Complex task - more steps await agent.execute({ instruction: "Research and compare 5 project management tools with pricing and features", maxSteps: 50 }); ``` ```python Python # Simple task - fewer steps await agent.execute( instruction="Subscribe to the newsletter with email 'user@example.com'", max_steps=10 ) # Complex task - more steps await agent.execute( instruction="Research and compare 5 project management tools with pricing and features", max_steps=50 ) ``` ### Include Success Criteria Tell the agent how to know when it's done. 
```typescript TypeScript // Good - Clear success criteria await agent.execute({ instruction: "Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", maxSteps: 20 }); // Bad - No validation await agent.execute("Add some items to cart"); ``` ```python Python # Good - Clear success criteria await agent.execute( instruction="Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", max_steps=20 ) # Bad - No validation await agent.execute("Add some items to cart") ``` ## Common Mistakes to Avoid - **Combining multiple actions** - Keep each `act()` call to one action - **Using vague descriptions** - Be specific about which elements to interact with - **Exposing sensitive data** - Always use variables for credentials - **Skipping validation** - Check results before proceeding ## Testing Your Prompts 1. **Start simple** - Test basic functionality first 2. **Add complexity gradually** - Build up to complex workflows 3. **Monitor results** - Use logging to understand what's happening 4. **Iterate based on failures** - Refine prompts when they don't work Remember: Good prompting is iterative. When in doubt, be more specific rather than less. ================================================ FILE: packages/docs/v2/best-practices/speed-optimization.mdx ================================================ --- title: Speed Optimization sidebarTitle: Speed Optimization description: Optimize Stagehand performance for faster automation and reduced latency --- Stagehand performance depends on several factors: DOM processing speed, LLM inference time, browser operations, and network latency. This guide provides proven strategies to maximize automation speed. ## Quick Performance Wins ### 1. 
Plan Ahead with Observe Use a single `observe()` call to plan multiple actions, then execute them efficiently: ```typescript TypeScript // Instead of sequential operations with multiple LLM calls await page.act("Fill name field"); // LLM call #1 await page.act("Fill email field"); // LLM call #2 await page.act("Select country dropdown"); // LLM call #3 // Use single observe to plan all form fields - one LLM call const formFields = await page.observe("Find all form fields to fill"); // Execute all actions without LLM inference for (const field of formFields) { await page.act(field); // No LLM calls! } ``` ```python Python import asyncio # Instead of sequential operations with multiple LLM calls await page.act("Fill name field") # LLM call #1 await page.act("Fill email field") # LLM call #2 await page.act("Select country dropdown") # LLM call #3 # Use single observe to plan all form fields - one LLM call form_fields = await page.observe("Find all form fields to fill") # Execute all actions without LLM inference for field in form_fields: await page.act(field) # No LLM calls! ``` **Performance Tip**: Acting on `observe` results avoids LLM inference entirely. This approach is 2-3x faster than direct `act()` calls and is the recommended pattern for multi-step workflows. Learn advanced caching patterns and cache invalidation strategies ### 2. 
Optimize DOM Processing Reduce DOM complexity before Stagehand processes the page: ```typescript TypeScript // Remove heavy elements that slow down processing await page.evaluate(() => { // Remove video elements document.querySelectorAll('video, iframe').forEach(el => el.remove()); // Hide complex animations document.querySelectorAll('[style*="animation"]').forEach(el => { (el as HTMLElement).style.animation = 'none'; }); }); // Then perform Stagehand operations await page.act("Click the submit button"); ``` ```python Python # Remove heavy elements that slow down processing await page.evaluate(""" () => { // Remove video elements document.querySelectorAll('video, iframe').forEach(el => el.remove()); // Hide complex animations document.querySelectorAll('[style*="animation"]').forEach(el => { el.style.animation = 'none'; }); } """) # Then perform Stagehand operations await page.act("Click the submit button") ``` ### 3. Set Appropriate Timeouts Use shorter timeouts for simple operations and longer ones for complex page loads: ```typescript TypeScript // Simple actions - reduce action timeout await page.act({ instruction: "Click the login button", actTimeout: 5000 // Default is 30000ms, reduce for simple clicks }); // Complex page loads - optimize navigation await page.goto("https://heavy-spa.com", { waitUntil: "domcontentloaded", // Don't wait for all resources timeout: 15000 // Shorter than default 30s }); ``` ```python Python # Simple actions - reduce action timeout await page.act("Click button", act_timeout=5000) # Complex page loads - optimize navigation await page.goto("https://heavy-spa.com", wait_until="domcontentloaded", timeout=15000 ) ``` ## Advanced Performance Strategies ### Smart Model Selection Use faster models for simple tasks, premium models only when needed: ```typescript TypeScript class SpeedOptimizedStagehand { private fastModel: Stagehand; private premiumModel: Stagehand; async smartAct(page: Page, prompt: string, complexity: 'simple' | 
'complex') { const model = complexity === 'simple' ? this.fastModel : this.premiumModel; return await model.page.act(prompt); } } // Use fast model for simple clicks/forms await stagehand.smartAct(page, "Click submit", 'simple'); // Use premium model for complex reasoning await stagehand.smartAct(page, "Find the cheapest flight option", 'complex'); ``` ```python Python class SpeedOptimizedStagehand: def __init__(self): self.fast_model = Stagehand(model_name="fast-model") self.premium_model = Stagehand(model_name="premium-model") async def smart_act(self, page, prompt: str, complexity: str): model = self.fast_model if complexity == 'simple' else self.premium_model return await model.page.act(prompt) # Use fast model for simple clicks/forms await stagehand.smart_act(page, "Click submit", 'simple') # Use premium model for complex reasoning await stagehand.smart_act(page, "Find the cheapest flight option", 'complex') ``` Compare model performance and costs ### Page Load Optimization Skip unnecessary resources during page loads: ```typescript TypeScript // Block heavy resources globally await context.route('**/*', (route) => { const resourceType = route.request().resourceType(); if (['image', 'font', 'media'].includes(resourceType)) { route.abort(); } else { route.continue(); } }); // Use faster navigation await page.goto(url, { waitUntil: 'domcontentloaded', // Don't wait for images/fonts timeout: 10000 }); ``` ```python Python # Block heavy resources globally async def handle_route(route): resource_type = route.request.resource_type if resource_type in ['image', 'font', 'media']: await route.abort() else: await route.continue_() await context.route('**/*', handle_route) # Use faster navigation await page.goto(url, wait_until='domcontentloaded', # Don't wait for images/fonts timeout=10000 ) ``` Balance speed with cost considerations ## Performance Monitoring and Benchmarking Track performance metrics and measure optimization impact: ### Performance Tracking 
```typescript TypeScript class PerformanceTracker { private speedMetrics: Map<string, number[]> = new Map(); async timedAct(page: Page, prompt: string): Promise<ActResult> { const start = Date.now(); const result = await page.act(prompt); const duration = Date.now() - start; if (!this.speedMetrics.has(prompt)) { this.speedMetrics.set(prompt, []); } this.speedMetrics.get(prompt)!.push(duration); console.log(`Action "${prompt}" took ${duration}ms`); return result; } getAverageTime(prompt: string): number { const times = this.speedMetrics.get(prompt) || []; return times.reduce((a, b) => a + b, 0) / times.length; } } ``` ```python Python import time from collections import defaultdict class PerformanceTracker: def __init__(self): self.speed_metrics = defaultdict(list) async def timed_act(self, page, prompt: str): start = time.time() result = await page.act(prompt) duration = (time.time() - start) * 1000 # Convert to ms self.speed_metrics[prompt].append(duration) print(f'Action "{prompt}" took {duration:.0f}ms') return result def get_average_time(self, prompt: str) -> float: times = self.speed_metrics[prompt] return sum(times) / len(times) if times else 0 ``` Example Output: ``` Action "Fill form" took 1000ms Action "Click submit" took 2000ms Action "Confirm submission" took 5000ms ``` ### Before vs After Benchmarking ```typescript TypeScript // Before optimization console.time("workflow"); await page.act("Fill form"); await page.act("Click submit"); await page.act("Confirm submission"); console.timeEnd("workflow"); // 8000ms // After optimization with observe planning console.time("workflow-optimized"); const workflowActions = await page.observe("Find form, submit, and confirm elements"); // Execute actions sequentially to avoid conflicts for (const action of workflowActions) { await page.act(action); } console.timeEnd("workflow-optimized"); // 500ms ``` ```python Python import time # Before optimization start = time.time() await page.act("Fill form") await page.act("Click submit") await
page.act("Confirm submission") print(f"Workflow took {(time.time() - start) * 1000:.0f}ms") # 8000ms # After optimization with observe planning start = time.time() workflow_actions = await page.observe("Find form, submit, and confirm elements") # Execute actions sequentially to avoid conflicts for action in workflow_actions: await page.act(action) print(f"Optimized workflow took {(time.time() - start) * 1000:.0f}ms") # 500ms ``` Example Output: ``` Workflow took 8000ms Optimized workflow took 500ms ``` Set up comprehensive performance monitoring ## Related Resources Advanced caching patterns for maximum performance Balance speed improvements with cost considerations Optimize Browserbase settings for speed Choose the right model for speed vs accuracy ================================================ FILE: packages/docs/v2/best-practices/usecase-observe.mdx ================================================ --- sidebarTitle: Use Cases --- ## Real-World Use Cases ### E-commerce Product Discovery ```typescript // Discover product interaction elements const productActions = await page.observe({ instruction: "Find add to cart buttons, size selectors, and product images" }); // Categorize actions by type const cartButtons = productActions.filter(a => a.description.toLowerCase().includes('cart') ); const sizeOptions = productActions.filter(a => a.description.toLowerCase().includes('size') ); // Execute purchase workflow if (sizeOptions.length > 0) { await page.act(sizeOptions[0]); // Select size first } if (cartButtons.length > 0) { await page.act(cartButtons[0]); // Then add to cart } ``` ### Form Handling & Validation ```typescript // Analyze form structure before filling const formElements = await page.observe({ instruction: "Find form fields, validation messages, and submit buttons" }); // Check for required fields const requiredFields = formElements.filter(e => e.description.includes('required') || e.description.includes('*') ); console.log(`Found 
${requiredFields.length} required fields to complete`); // Fill form systematically for (const field of requiredFields) { await page.act(field); // Add appropriate input based on field type } ``` ### Dynamic Content & SPA Navigation ```typescript // Wait for and discover dynamically loaded content await page.waitForLoadState('networkidle'); const dynamicElements = await page.observe({ instruction: "Find newly loaded content, infinite scroll triggers, or loading indicators", domSettleTimeoutMs: 15000 // Wait longer for dynamic content }); // Handle infinite scroll const scrollTriggers = dynamicElements.filter(e => e.description.toLowerCase().includes('load more') || e.description.toLowerCase().includes('scroll') ); if (scrollTriggers.length > 0) { await page.act(scrollTriggers[0]); // Recursively observe new content const newContent = await page.observe("Find additional items"); } ``` ### Multi-Step Workflow Planning ```typescript // Plan entire checkout flow upfront async function planCheckoutWorkflow() { // Step 1: Cart page analysis await page.goto('/cart'); const cartActions = await page.observe("Find checkout and cart modification options"); // Step 2: Checkout page analysis const checkoutButton = cartActions.find(a => a.description.includes('checkout')); if (checkoutButton) await page.act(checkoutButton); const checkoutActions = await page.observe("Find payment forms and shipping options"); // Step 3: Plan execution order const shippingFields = checkoutActions.filter(a => a.description.includes('shipping')); const paymentFields = checkoutActions.filter(a => a.description.includes('payment')); const submitButton = checkoutActions.find(a => a.description.includes('complete order')); return { shippingFields, paymentFields, submitButton }; } // Execute planned workflow const workflow = await planCheckoutWorkflow(); // Fill shipping → payment → submit ``` ================================================ FILE: packages/docs/v2/best-practices/user-data.mdx 
================================================ --- title: User Data Directory sidebarTitle: User Data description: Persist browser data between sessions --- ### User Data Directory Persist browser data between sessions using a custom user data directory: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; // For Browserbase sessions const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionCreateParams: { userDataDir: "/path/to/user/data/directory", }, }); // For Local sessions const localStagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { userDataDir: "./browser-data", }, }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ```python Python from stagehand import Stagehand # For Browserbase sessions stagehand = Stagehand( env="BROWSERBASE", browserbase_session_create_params={ "user_data_dir": "/path/to/user/data/directory", }, ) # For Local sessions local_stagehand = Stagehand( env="LOCAL", local_browser_launch_options={ "user_data_dir": "./browser-data", }, ) await stagehand.init() print(f"Session ID: {stagehand.session_id}") ``` ================================================ FILE: packages/docs/v2/best-practices/using-multiple-tabs.mdx ================================================ --- title: 'Using Multiple Tabs' description: 'Act on multiple tabs with Stagehand' --- Many modern web applications open new tabs when users click certain buttons or links. Without proper multitab support, automation scripts break when expected content appears in a new tab rather than the current one. Stagehand's multitab capabilities ensure your automations work seamlessly across multitab workflows. ## The Stagehand Page Stagehand automatically adapts to multitab workflows. The `stagehand.page` object always points to the most recently opened or active tab, ensuring your automations continue working even when new tabs are created. 
This means you can continue using familiar patterns: ```typescript TypeScript const page = stagehand.page; await page.goto("https://example.com"); await page.act("click the button that opens a new tab"); // page now automatically points to the new tab await page.extract("get data from new tab"); ``` ```python Python page = stagehand.page await page.goto("https://example.com") await page.act("click the button that opens a new tab") # page now automatically points to the new tab await page.extract("get data from new tab") ``` **Important**: [Stagehand Agent](/v2/basics/agent) will always operate on the `stagehand.page`. If you need an agent to work across specific tabs, you'll need to manage page switching manually. ## Manual Page Management For more control or multitab workflows, you can manage multiple tabs explicitly: ```typescript TypeScript // Create a second page await stagehand.context.newPage(); const pages = stagehand.context.pages(); const githubPage = pages[0]; const pythonPage = pages[1]; // Navigate each page to different repositories await githubPage.goto("https://github.com/browserbase/stagehand"); await pythonPage.goto("https://github.com/browserbase/stagehand-python"); // Extract data from both pages simultaneously const [stagehandStars, stagehandPythonStars] = await Promise.all([ githubPage.extract("extract the repository stars"), pythonPage.extract("extract the repository stars") ]); console.log(`Stagehand stars: ${stagehandStars}`); console.log(`Stagehand-Python stars: ${stagehandPythonStars}`); ``` ```python Python # Create a second page await stagehand.context.new_page() pages = stagehand.context.pages() github_page = pages[0] python_page = pages[1] # Navigate each page to different repositories await github_page.goto("https://github.com/browserbase/stagehand") await python_page.goto("https://github.com/browserbase/stagehand-python") # Extract data from both pages stagehand_stars = await github_page.extract("extract the repository stars") 
stagehand_python_stars = await python_page.extract("extract the repository stars") print(f"Stagehand stars: {stagehand_stars}") print(f"Stagehand-Python stars: {stagehand_python_stars}") ``` ## Handling Tab Events You can also listen for tab events to control what happens when new tabs are opened: ```typescript TypeScript const page = stagehand.page; await page.goto("https://browserbase.github.io/stagehand-eval-sites/sites/five-tab/"); // close the new tab after it's opened page.on("popup", async () => { const newPage = stagehand.context.pages()[1]; await newPage.close(); }); await page.act("click the button to open the other page"); const page_number = await page.extract("extract the page number"); console.log(`You're on page ${page_number}`); ``` ```python Python page = stagehand.page await page.goto("https://browserbase.github.io/stagehand-eval-sites/sites/five-tab/") # Close the new tab after it's opened async def handle_popup(): new_page = stagehand.context.pages()[1] await new_page.close() page.on("popup", handle_popup) await page.act("click the button to open the other page") page_number = await page.extract("extract the page number") print(f"You're on page {page_number}") ``` ## Next Steps Use `Agent` to autonomously execute multi-step tasks and complex workflows. Learn best practices for interacting with elements inside iframes. Manage browser contexts and sessions for complex automation scenarios. Handle errors gracefully and debug automation issues effectively. ================================================ FILE: packages/docs/v2/best-practices/working-with-iframes.mdx ================================================ --- title: Working with iframes --- ### What is an iframe? Iframes embed other pages within your current page. Sites use them for consent banners, payment widgets, chat bubbles, and third-party content. Elements inside iframes exist in a separate context from the main page.
### Enable iframe support Set `iframes: true` in your `act()`, `observe()`, and `extract()` commands. ```typescript TypeScript // Act within iframes await page.act({ action: "click the accept cookies button", iframes: true }); // Observe within iframes const results = await page.observe({ instruction: "Find the primary action button", iframes: true, }); // Extract from iframes const data = await page.extract({ instruction: "Extract the product price from the payment widget", schema: z.object({ price: z.string(), }), iframes: true, }); ``` ```python Python # Act within iframes await page.act( "click the accept cookies button", iframes=True ) # Observe within iframes results = await page.observe({ "instruction": "Find the primary action button", "iframes": True, }) # Extract from iframes data = await page.extract({ "instruction": "Extract the product price from the payment widget", "schema": { "type": "object", "properties": { "price": {"type": "string"} } }, "iframes": True, }) ``` ### Tips - Iframes can increase processing time. For best performance, use the iframe option only when necessary. - When you are unsure whether an element will be in an iframe, you can verify the presence of iframes in Stagehand logs. - If an element intermittently fails to be found, it may be inside a lazy‑loaded iframe. Add small waits between steps or re‑run your action. You can enable experimental features (like Shadow DOM support) via your Stagehand configuration. See the [configuration guide](/v2/configuration/browser). ## Next steps Use `observe()` to plan precise, single-step actions before executing them. Use `extract()` with a data schema to pull clean, typed data from any page. Speed up repeated automations by caching actions. Learn how to perform single-step actions reliably with `act()`. 
================================================ FILE: packages/docs/v2/configuration/browser.mdx ================================================ --- title: Browser sidebarTitle: Browser description: Configure Stagehand on Browserbase or locally --- Stagehand supports two primary environments: - **Browserbase** - Cloud-managed browser infrastructure optimized for production web automation at scale - **Local** - Run browsers directly on your machine for development and debugging ## Browserbase Environment Browserbase provides managed cloud browser infrastructure optimized for web automation at scale. It offers advanced features like stealth mode, proxy support, and persistent contexts. Discover the power of cloud-managed browser infrastructure with Browserbase. ### Environment Variables Before getting started, set up the required environment variables: ```bash .env BROWSERBASE_API_KEY=your_api_key_here BROWSERBASE_PROJECT_ID=your_project_id_here ``` Get your API key and Project ID from the [Browserbase Dashboard](https://browserbase.com/overview) ### Using Stagehand with Browserbase #### Basic Setup The simplest way to get started is with default settings: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", }); await stagehand.init(); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( env="BROWSERBASE", ) await stagehand.init() ``` #### Advanced Configuration Configure browser settings, proxy support, and other session parameters: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", // Optional: API Key and Project ID will be pulled directly from your environment apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, browserbaseSessionCreateParams: { proxies: true, region: "us-west-2", browserSettings: { viewport: { width: 1920, height: 
1080 }, blockAds: true, }, }, }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( env="BROWSERBASE", # Optional: API Key and Project ID will be pulled directly from your environment api_key=os.getenv("BROWSERBASE_API_KEY"), project_id=os.getenv("BROWSERBASE_PROJECT_ID"), browserbase_session_create_params={ "proxies": True, "region": "us-west-2", "browser_settings": { "viewport": {"width": 1920, "height": 1080}, "block_ads": True, }, }, ) ``` ```typescript TypeScript const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, proxies: true, region: "us-west-2", timeout: 3600, // 1 hour session timeout keepAlive: true, // Available on Startup plan browserSettings: { advancedStealth: false, // this is a Scale Plan feature - reach out to support@browserbase.com to enable blockAds: true, solveCaptchas: true, recordSession: false, viewport: { width: 1920, height: 1080, }, fingerprint: { browsers: ["chrome", "edge"], devices: ["desktop"], operatingSystems: ["windows", "macos"], locales: ["en-US", "en-GB"], httpVersion: 2, }, }, userMetadata: { userId: "automation-user-123", environment: "production", }, }, }); ``` ```python Python stagehand = Stagehand( env="BROWSERBASE", api_key=os.getenv("BROWSERBASE_API_KEY"), project_id=os.getenv("BROWSERBASE_PROJECT_ID"), browserbase_session_create_params={ "project_id": os.getenv("BROWSERBASE_PROJECT_ID"), "proxies": True, "region": "us-west-2", "timeout": 3600, # 1 hour session timeout "keep_alive": True, # Available on Startup plan "browser_settings": { "advanced_stealth": False, # this is a Scale Plan feature - reach out to support@browserbase.com to enable "block_ads": True, "solve_captchas": True, "record_session": False, "viewport": { "width": 1920, 
"height": 1080, }, "fingerprint": { "browsers": ["chrome", "edge"], "devices": ["desktop"], "operating_systems": ["windows", "macos"], "locales": ["en-US", "en-GB"], "http_version": 2, }, }, "user_metadata": { "user_id": "automation-user-123", "environment": "production", }, }, ) ``` #### Initialization Result After calling `stagehand.init()`, the method returns configuration information about the initialized session: ```typescript TypeScript const result = await stagehand.init(); console.log(result); ``` ```python Python result = await stagehand.init() print(result) ``` The returned object contains: ```Example { debugUrl: 'https://www.browserbase.com/devtools/inspector.html?wss=connect.browserbase.com/debug/f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0/devtools/page/5474B0E0510C5B6E629BEB06E799CD70?debug=true', sessionUrl: 'https://www.browserbase.com/sessions/f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0', sessionId: 'f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0' } ``` **Open the Browserbase [session live view](https://docs.browserbase.com/features/session-live-view)** to include a human-in-the-loop. **Open the [session replay](https://docs.browserbase.com/features/session-replay)** to see the full session recording. **Unique identifier** for the [Browserbase session](https://docs.browserbase.com/introduction/what-is-browserbase). This is used to identify the session in the Browserbase dashboard and to connect to the session. ### Alternative: Browserbase SDK If you prefer to manage sessions directly, you can use the Browserbase SDK: ```typescript TypeScript import { Browserbase } from "@browserbasehq/sdk"; const bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY! 
}); const session = await bb.sessions.create({ projectId: process.env.BROWSERBASE_PROJECT_ID!, // Add configuration options here }); ``` ```python Python from browserbase import Browserbase bb = Browserbase(api_key=os.environ["BROWSERBASE_API_KEY"]) session = bb.sessions.create( project_id=os.environ["BROWSERBASE_PROJECT_ID"], # Add configuration options here ) ``` #### Connecting to an Existing Session Connect to a previously created Browserbase session using its session ID: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionID: "existing-session-uuid-here", }); await stagehand.init(); console.log("Resumed Session ID:", stagehand.sessionId); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( env="BROWSERBASE", browserbase_session_id="existing-session-uuid-here", ) await stagehand.init() print(f"Resumed Session ID: {stagehand.session_id}") ``` ## Local Environment The local environment runs browsers directly on your machine, providing full control over browser instances and configurations. Ideal for development, debugging, and scenarios requiring custom browser setups. 
### Environment Comparison | Feature | Browserbase | Local | | --- | --- | --- | | **Scalability** | High (cloud-managed) | Limited (local resources) | | **Stealth Features** | Advanced fingerprinting | Basic stealth | | **Proxy Support** | Built-in residential proxies | Manual configuration | | **Session Persistence** | Cloud context storage | File-based user data | | **Geographic Distribution** | Multi-region deployment | Single machine | | **Debugging** | Session recordings & logs | Direct DevTools access | | **Setup Complexity** | Environment variables only | Browser installation required | | **Cost** | Usage-based pricing | Infrastructure & maintenance | | **Best For** | Production, scale, compliance | Development, debugging | ### Basic Local Setup ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL" }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ```python Python from stagehand import Stagehand stagehand = Stagehand( env="LOCAL" ) await stagehand.init() print(f"Session ID: {stagehand.session_id}") ``` ### Advanced Local Configuration Customize browser launch options for local development: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { headless: false, // Show browser window devtools: true, // Open developer tools viewport: { width: 1280, height: 720 }, executablePath: '/opt/google/chrome/chrome', // Custom Chrome path args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-web-security', '--allow-running-insecure-content', ], env: { NODE_ENV: "development", DEBUG: "true", }, }, }); await stagehand.init(); ``` ```python Python from stagehand import Stagehand stagehand = Stagehand( env="LOCAL", headless=False, # Show browser window local_browser_launch_options={ "devtools": True, # Open developer tools "viewport": {"width": 1280, "height": 
720}, "executable_path": "/opt/google/chrome/chrome", # Custom Chrome path "args": [ "--no-sandbox", "--disable-setuid-sandbox", "--disable-web-security", "--allow-running-insecure-content", ], "env": { "NODE_ENV": "development", "DEBUG": "true", }, }, ) await stagehand.init() ``` ### Connecting to your local browser Connect to your existing local Chrome/Chromium browser instead of launching a new one. This lets you automate your normal browser with all your existing tabs, extensions and settings. ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { cdpUrl: 'http://localhost:9222' } }); await stagehand.init(); ``` ```python Python from stagehand import Stagehand stagehand = Stagehand( env="LOCAL", local_browser_launch_options={ "cdp_url": "http://localhost:9222" } ) await stagehand.init() ``` ## Troubleshooting ### Common Issues - Verify your `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set correctly - Check that your API key has the necessary permissions - Ensure your Browserbase account has sufficient credits - Install Chrome or Chromium on your system - Set the correct `executablePath` for your Chrome installation - Check that required dependencies are installed (Linux: `libnss3-dev libatk-bridge2.0-dev libgtk-3-dev libxss1 libasound2`) - Increase session timeout in `browserbaseSessionCreateParams.timeout` - Use `keepAlive: true` for long-running sessions - Monitor session usage to avoid unexpected terminations ================================================ FILE: packages/docs/v2/configuration/evals.mdx ================================================ --- title: Evaluations & Metrics sidebarTitle: Evaluations description: Monitor performance, optimize costs, and evaluate LLM effectiveness --- Evaluations help you understand how well your automation performs, which models work best for your use cases, and how to optimize for cost and reliability. 
This guide covers both monitoring your own workflows and running comprehensive evaluations. ## Why Evaluations Matter - **Performance Optimization**: Identify which models and settings work best for your specific automation tasks - **Cost Control**: Track token usage and inference time to optimize spending - **Reliability**: Measure success rates and identify failure patterns - **Model Selection**: Compare different LLMs on real-world tasks to make informed decisions View real-time performance comparisons across different LLMs on the [Stagehand Evals Dashboard](https://www.stagehand.dev/evals) ## Comprehensive Evaluations Evaluations help you systematically test and improve your automation workflows. Stagehand provides both built-in evaluations and tools to create your own. We have 2 types of evals: 1. **Deterministic Evals** - These include unit tests, integration tests, and E2E tests that can be run without any LLM inference. 2. **LLM-based Evals** - These are evals that test the underlying functionality of Stagehand's AI primitives. ### Evals CLI ![Evals CLI](/media/evals-cli.png) To run evals, you'll need to clone the [Stagehand repo](https://github.com/browserbase/stagehand) and set up the CLI. We recommend using [Braintrust](https://www.braintrust.dev/docs/) to help visualize evals results and metrics. The Stagehand CLI provides a powerful interface for running evaluations. You can run specific evals, categories, or external benchmarks with customizable settings. Evals are grouped into: 1. **Act Evals** - These are evals that test the functionality of the `act` method. 2. **Extract Evals** - These are evals that test the functionality of the `extract` method. 3. **Observe Evals** - These are evals that test the functionality of the `observe` method. 4. **Combination Evals** - These are evals that test the functionality of the `act`, `extract`, and `observe` methods together. 5. 
**Experimental Evals** - These are experimental custom evals that test the functionality of the stagehand primitives. 6. **Agent Evals** - These are evals that test the functionality of `agent`. 7. **(NEW) External Benchmarks** - Run external benchmarks like WebBench, GAIA, WebVoyager, OnlineMind2Web, and OSWorld. #### Installation ```bash # From the stagehand root directory pnpm install ``` ```bash pnpm run build:cli ``` ```bash evals help ``` #### CLI Commands and Options ##### Basic Commands ```bash # Run all evals evals run all # Run specific category evals run act evals run extract evals run observe evals run agent # Run specific eval evals run extract/extract_text # List available evals evals list evals list --detailed # Configure defaults evals config evals config set env browserbase evals config set trials 5 ``` ##### Command Options - **`-e, --env`**: Environment (`local` or `browserbase`) - **`-t, --trials`**: Number of trials per eval (default: 3) - **`-c, --concurrency`**: Max parallel sessions (default: 10) - **`-m, --model`**: Model override - **`-p, --provider`**: Provider override - **`--api`**: Use Stagehand API instead of SDK ##### Running External Benchmarks The CLI supports several industry-standard benchmarks: ```bash # WebBench with filters evals run benchmark:webbench -l 10 -f difficulty=easy -f category=READ # GAIA benchmark evals run b:gaia -s 100 -l 25 -f level=1 # WebVoyager evals run b:webvoyager -l 50 # OnlineMind2Web evals run b:onlineMind2Web # OSWorld evals run b:osworld -f source=Mind2Web ``` #### Configuration Files You can view the specific evals in [`evals/tasks`](https://github.com/browserbase/stagehand/tree/v2/evals/tasks). Each eval is grouped into eval categories based on [`evals/evals.config.json`](https://github.com/browserbase/stagehand/blob/main/evals/evals.config.json). #### Viewing eval results ![Eval results](/images/evals.png) Eval results are viewable on Braintrust. 
You can view the results of a specific eval by going to the Braintrust URL specified in the terminal when you run `npm run evals`. By default, each eval will run five times per model. The "Exact Match" column shows the percentage of times the eval was correct. The "Error Rate" column shows the percentage of times the eval errored out. You can use the Braintrust UI to filter by model/eval and aggregate results across all evals. ### Deterministic Evals To run deterministic evals, you can run `npm run e2e` from within the Stagehand repo. This will test the functionality of Playwright within Stagehand to make sure it's working as expected. These tests are in [`evals/deterministic`](https://github.com/browserbase/stagehand/tree/v2/evals/deterministic) and test on both Browserbase browsers and local headless Chromium browsers. ## Creating Custom Evaluations ### Step-by-Step Guide Create a new file in `evals/tasks/your-eval.ts`: ```typescript import { EvalTask } from '../types'; export const customEvalTask: EvalTask = { name: 'custom_task_name', description: 'Test specific automation workflow', // Test setup setup: async ({ page }) => { await page.goto('https://example.com'); }, // The actual test task: async ({ stagehand, page }) => { // Your automation logic await page.act({ action: 'click the login button' }); const result = await page.extract({ instruction: 'Get the user name', schema: { username: 'string' } }); return result; }, // Validation validate: (result, expected) => { return result.username === expected.username; }, // Test cases testCases: [ { input: { /* test input */ }, expected: { username: 'john_doe' } } ], // Evaluation criteria scoring: { exactMatch: true, timeout: 30000, retries: 2 } }; ``` Update `evals/evals.config.json`: ```json { "categories": { "custom": ["custom_task_name"], "existing_category": ["custom_task_name"] } } ``` ```bash # Test your custom evaluation evals run custom_task_name # Run the entire custom category evals run custom # Run 
with specific settings evals run custom_task_name -e browserbase -t 5 -m gpt-4o ``` ## Best Practices for Custom Evals - **Atomic**: Each test should validate one specific capability - **Deterministic**: Tests should produce consistent results - **Realistic**: Use real-world scenarios and websites - **Measurable**: Define clear success/failure criteria - **Parallel Execution**: Design tests to run independently - **Resource Management**: Clean up after each test - **Timeout Handling**: Set appropriate timeouts for operations - **Error Recovery**: Handle failures gracefully - **Ground Truth**: Establish reliable expected outcomes - **Edge Cases**: Test boundary conditions and error scenarios - **Statistical Significance**: Run multiple iterations for reliability - **Version Control**: Track changes to test cases over time ### Troubleshooting Evaluations **Symptoms**: Tests fail with timeout errors **Solutions**: - Increase timeout in `taskConfig.ts` - Use faster models (Gemini 2.5 Flash, GPT-4o Mini) - Optimize test scenarios to be less complex - Check network connectivity to LLM providers **Symptoms**: Same test passes/fails randomly **Solutions**: - Set temperature to 0 for deterministic outputs - Increase repetitions for statistical significance - Use more capable models for complex tasks - Check for dynamic website content affecting tests **Symptoms**: Token usage exceeding budget **Solutions**: - Use cost-effective models (Gemini 2.0 Flash, GPT-4o Mini) - Reduce repetitions for initial testing - Focus on specific evaluation categories - Use local browser environment to reduce Browserbase costs **Symptoms**: Results not uploading to dashboard **Solutions**: - Check Braintrust API key configuration - Verify internet connectivity - Update Braintrust SDK to latest version - Check project permissions in Braintrust dashboard ================================================ FILE: packages/docs/v2/configuration/logging.mdx 
================================================ --- title: Logging & Debugging sidebarTitle: Logging description: Set up logging, debugging, and error tracking for Stagehand workflows --- Stagehand provides comprehensive logging capabilities to help you debug automation workflows, track execution, and diagnose issues. Configure logging levels, structured output, and debugging tools for both development and production environments. ## Logging Configuration ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", // or "LOCAL" verbose: 1, // 0 = errors only, 1 = info, 2 = debug }); ``` ```python Python from stagehand import Stagehand stagehand = Stagehand( env="BROWSERBASE", # or "LOCAL" verbose=1, # 0 = errors only, 1 = info, 2 = debug ) ``` ### Verbose Levels - **Level 0**: Errors only - minimal output for production - **Level 1**: Info - includes successful operations and important events - **Level 2**: Debug - comprehensive logging including internal operations ## Structured Logging ### Log Line Format Each log entry contains structured information: ```typescript TypeScript interface LogLine { category: 'browser' | 'action' | 'llm' | 'error' | 'stagehand' | 'cache'; message: string; level: 0 | 1 | 2; // error | info | debug timestamp: string; auxiliary?: { executionTime?: { value: string; unit: string }; sessionId?: string; url?: string; [key: string]: any; }; } ``` ```python Python # Log line structure in Python { "category": "browser" | "action" | "llm" | "error" | "stagehand" | "cache", "message": str, "level": 0 | 1 | 2, # error | info | debug "timestamp": str, "auxiliary": { "execution_time": {"value": str, "unit": str}, "session_id": str, "url": str, # ... 
other context data } } ``` ### Custom Logger ```typescript TypeScript class AdvancedLogger { private logFile?: string; constructor(logFile?: string) { this.logFile = logFile; } log = (logLine: any) => { const timestamp = new Date().toISOString(); const colors = { browser: '\x1b[34m', // blue action: '\x1b[32m', // green llm: '\x1b[35m', // magenta error: '\x1b[31m', // red stagehand: '\x1b[36m', // cyan cache: '\x1b[33m', // yellow }; const color = colors[logLine.category] || '\x1b[0m'; const reset = '\x1b[0m'; // Console output with colors console.log(`${color}[${logLine.category}]${reset} ${logLine.message}`); // Log execution time if available if (logLine.auxiliary?.executionTime) { console.log(` ${logLine.auxiliary.executionTime.value}${logLine.auxiliary.executionTime.unit}`); } // Log additional context if (logLine.auxiliary && Object.keys(logLine.auxiliary).length > 0) { console.log(' Context:', JSON.stringify(logLine.auxiliary, null, 2)); } // File logging (optional) if (this.logFile) { const logEntry = { timestamp, ...logLine }; require('fs').appendFileSync(this.logFile, JSON.stringify(logEntry) + '\n'); } } } // Usage const logger = new AdvancedLogger('./automation.log'); const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 2, logger: logger.log }); ``` ```python Python import json import os from datetime import datetime from typing import Dict, Any, Optional class AdvancedLogger: def __init__(self, log_file: Optional[str] = None): self.log_file = log_file def log(self, log_line: Dict[str, Any]): timestamp = datetime.now().isoformat() colors = { 'browser': '\033[34m', # blue 'action': '\033[32m', # green 'llm': '\033[35m', # magenta 'error': '\033[31m', # red 'stagehand': '\033[36m', # cyan 'cache': '\033[33m', # yellow } color = colors.get(log_line.get('category', ''), '\033[0m') reset = '\033[0m' # Console output with colors print(f"{color}[{log_line.get('category')}]{reset} {log_line.get('message')}") # Log execution time if available if 
log_line.get('auxiliary', {}).get('execution_time'): exec_time = log_line['auxiliary']['execution_time'] print(f"{exec_time['value']}{exec_time['unit']}") # Log additional context auxiliary = log_line.get('auxiliary', {}) if auxiliary and len(auxiliary) > 0: print(' Context:', json.dumps(auxiliary, indent=2)) # File logging (optional) if self.log_file: log_entry = { 'timestamp': timestamp, **log_line } with open(self.log_file, 'a') as f: f.write(json.dumps(log_entry) + '\n') # Usage logger = AdvancedLogger('./automation.log') stagehand = Stagehand( env="BROWSERBASE", verbose=2, logger=logger.log ) ``` ## Detailed Logging Features ### LLM Inference Logging Enable detailed logging of all LLM interactions: ```typescript TypeScript const stagehand = new Stagehand({ env: "BROWSERBASE", logInferenceToFile: true, // Creates inference_summary/ directory verbose: 2 }); ``` ```python Python stagehand = Stagehand( env="BROWSERBASE", log_inference_to_file=True, # Creates inference_summary/ directory verbose=2 ) ``` The `inference_summary/` directory structure: ``` inference_summary/ ├── act_summary/ │ ├── 20240329_080446068.json │ ├── 20240329_080447019.json │ └── act_summary.json ├── extract_summary/ │ ├── 20240329_081205123.json │ └── extract_summary.json └── observe_summary/ ├── 20240329_081634891.json └── observe_summary.json ``` ## Log Analysis & Debugging ### Common Log Patterns ```json { "category": "action", "message": "act completed successfully", "level": 1, "auxiliary": { "executionTime": {"value": "1250", "unit": "ms"}, "url": "https://example.com", "sessionId": "session-123" } } ``` ```json { "category": "llm", "message": "inference completed", "level": 1, "auxiliary": { "model": "gpt-4o", "tokens": {"prompt": 3451, "completion": 45}, "executionTime": {"value": "951", "unit": "ms"} } } ``` ```json { "category": "action", "message": "action failed: element not found", "level": 0, "auxiliary": { "selector": "button[data-testid='submit']", "url": 
"https://example.com/form", "sessionId": "session-123" } } ``` ## Best Practices - Use `verbose: 2` with visual debugging - Enable browser DevTools for element inspection - Use `logInferenceToFile: true` to capture LLM decisions - Implement structured logging early - Use `verbose: 1` to balance visibility with performance - Implement error tracking and alerting - Use structured JSON logging - Monitor session success rates and execution times - Never log credentials or sensitive data - Implement log retention policies - Secure log files and dashboards ================================================ FILE: packages/docs/v2/configuration/models.mdx ================================================ --- title: Models sidebarTitle: Models description: Enhance Stagehand with LLMs for optimal performance, cost, and reliability --- Stagehand uses Large Language Models (LLMs) to understand web pages, plan actions, and interact with complex interfaces. The choice of LLM significantly impacts your automation's accuracy, speed, and cost. Find more details about how to choose the right model on our Model Evaluation page. ## Why LLM Choice Matters - **Accuracy**: Better models provide more reliable element detection and action planning - **Speed**: Faster models reduce automation latency - **Cost**: Different providers offer varying pricing structures - **Reliability**: Structured output support ensures consistent automation behavior Find more details about how to choose the right model on our [Model Evaluation](https://www.stagehand.dev/evals) page. Small models on **Ollama** struggle with consistent structured outputs. While technically supported, we don't recommend them for production Stagehand workflows. 
## Environment Variables Setup Set up your API keys before configuring Stagehand: ```bash .env # Choose one or more providers OPENAI_API_KEY=your_openai_key_here ANTHROPIC_API_KEY=your_anthropic_key_here GOOGLE_API_KEY=your_google_key_here GROQ_API_KEY=your_groq_key_here ``` ## Supported Providers Stagehand supports major LLM providers with structured output capabilities: ### Production-Ready Providers | Provider | Best Models | Strengths | Use Case | |----------|-------------|-----------|----------| | **OpenAI** | `gpt-4.1`, `gpt-4.1-mini` | High accuracy, reliable | Production, complex sites | | **Anthropic** | `claude-sonnet-4-6` | Excellent reasoning | Complex automation tasks | | **Google** | `gemini-2.5-flash`, `gemini-2.5-pro` | Fast, cost-effective | High-volume automation | ### Additional Providers - **Groq** - `llama-3.3-70b-versatile` (Good for speed-critical applications) - **xAI** - `grok-beta` (Good for complex reasoning) - **Azure** - Enterprise OpenAI deployment - **Cerebras** - High-speed inference - **TogetherAI** - Open-source models - **Mistral** - `mixtral-8x7b-32768` (European option) - **DeepSeek** - Cost-effective alternative - **Perplexity** - Real-time web data - **Ollama** - Local deployment (limited accuracy) - **Run any model included in the AI SDK** - Find supported models in the [Vercel AI SDK](https://sdk.vercel.ai/providers/ai-sdk-providers) (Follow the guide [here](#vercel-ai-sdk) to get started.) ## Basic Configuration ### Model Name Format Stagehand uses the format `provider/model-name` for model specification.
**Examples:** - OpenAI: `openai/gpt-4.1` - Anthropic: `anthropic/claude-sonnet-4-6` - Google: `google/gemini-2.5-flash` (Recommended) ### Quick Start Examples ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ modelName: "google/gemini-2.5-flash", modelClientOptions: { apiKey: process.env.GOOGLE_API_KEY, }, }); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( model_name="google/gemini-2.5-flash", model_api_key=os.getenv("GOOGLE_API_KEY") ) ``` ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ modelName: "openai/gpt-4.1", modelClientOptions: { apiKey: process.env.OPENAI_API_KEY, }, }); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( model_name="openai/gpt-4.1", model_api_key=os.getenv("OPENAI_API_KEY") ) ``` ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ modelName: "anthropic/claude-sonnet-4-6", modelClientOptions: { apiKey: process.env.ANTHROPIC_API_KEY, }, }); ``` ```python Python import os from stagehand import Stagehand stagehand = Stagehand( model_name="anthropic/claude-sonnet-4-6", model_api_key=os.getenv("ANTHROPIC_API_KEY") ) ``` ## Custom LLM Integration Custom LLMs are currently only supported in TypeScript. Integrate any LLM with Stagehand using custom clients. The only requirement is **structured output support** for consistent automation behavior. ### Vercel AI SDK The [Vercel AI SDK](https://sdk.vercel.ai/providers/ai-sdk-providers) is a popular library for interacting with LLMs. You can use any of the providers supported by the Vercel AI SDK to create a client for your model, **as long as they support structured outputs**. Vercel AI SDK supports providers for OpenAI, Anthropic, and Google, along with support for **Amazon Bedrock** and **Azure OpenAI**. 
To get started, you'll need to install the `ai` package and the provider you want to use. For example, to use Amazon Bedrock, you'll need to install the `@ai-sdk/amazon-bedrock` package. You'll also need to use the [Vercel AI SDK external client](https://github.com/browserbase/stagehand/blob/v2/examples/external_clients/aisdk.ts) as a template to create a client for your model. ```bash npm install ai @ai-sdk/amazon-bedrock ``` ```bash pnpm install ai @ai-sdk/amazon-bedrock ``` ```bash yarn add ai @ai-sdk/amazon-bedrock ``` To get started, you can use the [Vercel AI SDK external client](https://github.com/browserbase/stagehand/blob/84f810b4631291307a32a47addad7e26e9c1deb3/examples/external_clients/aisdk.ts) as a template to create a client for your model. ```ts // Install/import the provider you want to use. // For example, to use OpenAI, import `openai` from @ai-sdk/openai import { bedrock } from "@ai-sdk/amazon-bedrock"; import { AISdkClient } from "./external_clients/aisdk"; const stagehand = new Stagehand({ llmClient: new AISdkClient({ model: bedrock("anthropic.claude-sonnet-4-6-v1:0"), }), }); ``` ## Troubleshooting ### Common Issues **Error**: `Model does not support structured outputs` **Solution**: Use models that support function calling/structured outputs. The minimum requirements are: - Model must support JSON/structured outputs - Model must have strong reasoning capabilities - Model must be able to handle complex instructions For each provider, use their latest models that meet these requirements. Some examples: - **OpenAI**: GPT-4 series or newer - **Anthropic**: Claude 3 series or newer - **Google**: Gemini 2 series or newer - **Other providers**: Latest models with structured output support **Note**: Avoid base language models without structured output capabilities or fine-tuning for instruction following. When in doubt, check our [Model Evaluation](https://www.stagehand.dev/evals) page for up-to-date recommendations. 
**Error**: `Invalid API key` or `Unauthorized` **Solution**: - Verify your environment variables are set correctly - Check API key permissions and quotas - Ensure you're using the correct API key for the provider - For Anthropic, make sure you have access to the Claude API **Symptoms**: Actions work sometimes but fail other times **Causes & Solutions**: - **Weak models**: Use more capable models - check our [Model Evaluation](https://www.stagehand.dev/evals) page for current recommendations - **High temperature**: Set temperature to 0 for deterministic outputs - **Complex pages**: Switch to models with higher accuracy scores on our [Model Evaluation](https://www.stagehand.dev/evals) page - **Rate limits**: Implement retry logic with exponential backoff - **Context limits**: Reduce page complexity or use models with larger context windows - **Prompt clarity**: Ensure your automation instructions are clear and specific **Issue**: Automation takes too long to respond **Solutions**: - **Use fast models**: Choose models optimized for speed - Any model with < 1s response time - Models with "fast" or "flash" variants - **Optimize settings**: - Use `verbose: 0` to minimize logging overhead - Set temperature to 0 for fastest processing - Keep max tokens as low as possible - **Consider local deployment**: Local models can provide the lowest latency - **Batch operations**: Group multiple actions when possible **Issue**: LLM usage costs are too high **Cost Optimization Strategies**: 1. **Switch to cost-effective models**: - Check our [Model Evaluation](https://www.stagehand.dev/evals) page for current cost-performance benchmarks - Choose models with lower cost per token that still meet accuracy requirements - Consider models optimized for speed to reduce total runtime costs 2. **Optimize token usage**: - Set `verbose: 0` to reduce logging overhead - Use concise prompts and limit response length 3.
**Smart model selection**: Start with cheaper models and fall back to premium ones only when needed 4. **Cache responses**: Implement LLM response caching for repeated automation patterns 5. **Monitor usage**: Set up billing alerts and track costs per automation run 6. **Batch processing**: Process multiple similar tasks together ### Next Steps See our Model Evaluation page Evaluate performance on your specific use cases in our Model Evaluation guide Monitor token usage and set alerts using our Observability tools Store successful patterns using our Caching Guide ================================================ FILE: packages/docs/v2/configuration/observability.mdx ================================================ --- title: Observability sidebarTitle: Observability description: Track Stagehand automation with session visibility and analytics --- Stagehand provides powerful observability features to help you monitor, track performance, and analyze your browser automation workflows. Focus on session monitoring, resource usage, and operational insights for both Browserbase and local environments. ## Browserbase Session Monitoring When running on Browserbase, you gain access to comprehensive cloud-based monitoring and session management through the Browserbase API and dashboard.
Browserbase Session Observability
### Live Session Visibility Browserbase provides real-time visibility into your automation sessions: **Session Dashboard Features** - Real-time browser screen recording and replay - Network request monitoring with detailed timing - JavaScript console logs and error tracking - CPU and memory usage metrics - Session status and duration tracking **Session Management & API Access** ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; import { Browserbase } from "@browserbasehq/sdk"; const browserbase = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const sessionInfo = await browserbase.sessions.retrieve(stagehand.sessionId); console.log("Session status:", sessionInfo.status); console.log("Session region:", sessionInfo.region); console.log("CPU usage:", sessionInfo.avgCpuUsage); console.log("Memory usage:", sessionInfo.memoryUsage); console.log("Proxy bytes:", sessionInfo.proxyBytes); ``` ```python Python import os from stagehand import Stagehand from browserbase import Browserbase browserbase = Browserbase( api_key=os.getenv("BROWSERBASE_API_KEY"), ) stagehand = Stagehand( env="BROWSERBASE", ) await stagehand.init() session_info = browserbase.sessions.retrieve(stagehand.session_id) print(f"Session status: {session_info['status']}") print(f"Session region: {session_info['region']}") print(f"CPU usage: {session_info['avgCpuUsage']}") print(f"Memory usage: {session_info['memoryUsage']}") print(f"Proxy bytes: {session_info['proxyBytes']}") ``` ### Session Analytics & Insights Monitor live session status, resource usage, and geographic distribution. Scale and manage concurrent sessions with real-time insights. Review complete session recordings with frame-by-frame playback. Analyze network requests and debug browser interactions visually. 
Programmatically access session data, automate lifecycle management, and integrate with monitoring systems through our API. Track resource consumption, session duration, and API usage. Get detailed breakdowns of costs and utilization across your automation. ### Session Monitoring & Filtering Query and monitor sessions by status and metadata: ```typescript TypeScript import { Browserbase } from "@browserbasehq/sdk"; const browserbase = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); // List sessions with filtering async function getFilteredSessions() { const sessions = await browserbase.sessions.list({ status: 'RUNNING' }); return sessions.map(session => ({ id: session.id, status: session.status, // RUNNING, COMPLETED, ERROR, TIMED_OUT startedAt: session.startedAt, endedAt: session.endedAt, region: session.region, avgCpuUsage: session.avgCpuUsage, memoryUsage: session.memoryUsage, proxyBytes: session.proxyBytes, userMetadata: session.userMetadata })); } // Query sessions by metadata async function querySessionsByMetadata(query: string) { const sessions = await browserbase.sessions.list({ q: query }); return sessions; } ``` ```python Python import os from browserbase import Browserbase browserbase = Browserbase( api_key=os.getenv("BROWSERBASE_API_KEY"), ) def get_filtered_sessions(): sessions = browserbase.sessions.list(status="RUNNING") return [{ 'id': session['id'], 'status': session['status'], # RUNNING, COMPLETED, ERROR, TIMED_OUT 'started_at': session['startedAt'], 'ended_at': session['endedAt'], 'region': session['region'], 'avg_cpu_usage': session['avgCpuUsage'], 'memory_usage': session['memoryUsage'], 'proxy_bytes': session['proxyBytes'], 'user_metadata': session['userMetadata'] } for session in sessions] def query_sessions_by_metadata(query): sessions = browserbase.sessions.list(q=query) return sessions ``` ## Local Environment Monitoring For local development, Stagehand provides performance monitoring and resource tracking capabilities 
directly on your machine. ### Performance Tracking ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, // Monitor performance without debug noise }); // Track local automation metrics const startTime = Date.now(); const initialMetrics = stagehand.metrics; // ... perform automation tasks const finalMetrics = stagehand.metrics; const executionTime = Date.now() - startTime; console.log('Local Performance Summary:', { executionTime: `${executionTime}ms`, totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, averageResponseTime: finalMetrics.totalInferenceTimeMs / 3, // Assuming 3 operations tokensPerSecond: (finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / (executionTime / 1000) }); ``` ```python Python from stagehand import Stagehand import time stagehand = Stagehand( env="LOCAL", verbose=1, # Monitor performance without debug noise ) # Track local automation metrics start_time = time.time() initial_metrics = stagehand.metrics # ... 
perform automation tasks final_metrics = stagehand.metrics execution_time = (time.time() - start_time) * 1000 # Convert to ms print('Local Performance Summary:', { 'execution_time': f"{execution_time:.0f}ms", 'total_tokens': final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens'], 'average_response_time': final_metrics['total_inference_time_ms'] / 3, # Assuming 3 operations 'tokens_per_second': (final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens']) / (execution_time / 1000) }) ``` ## Resource Usage Monitoring When running locally, monitor system resource usage and browser performance: ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; import * as os from 'os'; import { performance } from 'perf_hooks'; class LocalResourceMonitor { private cpuUsage: number[] = []; private memoryUsage: number[] = []; startMonitoring() { const interval = setInterval(() => { // Track system resources const memUsage = process.memoryUsage(); this.memoryUsage.push(memUsage.heapUsed / 1024 / 1024); // MB // Track CPU (simplified) const loadAvg = os.loadavg()[0]; this.cpuUsage.push(loadAvg); }, 1000); return interval; } getResourceSummary() { return { avgMemoryUsage: this.memoryUsage.reduce((a, b) => a + b, 0) / this.memoryUsage.length, peakMemoryUsage: Math.max(...this.memoryUsage), avgCpuLoad: this.cpuUsage.reduce((a, b) => a + b, 0) / this.cpuUsage.length, totalDataPoints: this.cpuUsage.length }; } } const monitor = new LocalResourceMonitor(); const interval = monitor.startMonitoring(); const stagehand = new Stagehand({ env: "LOCAL" }); // ... 
run automation clearInterval(interval); console.log('Resource Usage:', monitor.getResourceSummary()); ``` ```python Python import psutil import time from typing import List from stagehand import Stagehand class LocalResourceMonitor: def __init__(self): self.cpu_usage: List[float] = [] self.memory_usage: List[float] = [] self.monitoring = False def start_monitoring(self): self.monitoring = True import threading def monitor_resources(): while self.monitoring: # Track CPU and memory usage cpu_percent = psutil.cpu_percent(interval=1) memory_info = psutil.virtual_memory() self.cpu_usage.append(cpu_percent) self.memory_usage.append(memory_info.percent) time.sleep(1) thread = threading.Thread(target=monitor_resources) thread.daemon = True thread.start() return thread def stop_monitoring(self): self.monitoring = False def get_resource_summary(self): if not self.cpu_usage or not self.memory_usage: return {'error': 'No monitoring data collected'} return { 'avg_cpu_usage': sum(self.cpu_usage) / len(self.cpu_usage), 'peak_cpu_usage': max(self.cpu_usage), 'avg_memory_usage': sum(self.memory_usage) / len(self.memory_usage), 'peak_memory_usage': max(self.memory_usage), 'total_data_points': len(self.cpu_usage) } monitor = LocalResourceMonitor() monitor.start_monitoring() stagehand = Stagehand(env="LOCAL") # ... run automation monitor.stop_monitoring() print('Resource Usage:', monitor.get_resource_summary()) ``` Monitor token usage, costs, and speed. Set up automated alerting for critical failures. Implement cost tracking across different environments. Use session analytics to optimize automation workflows. ## Real-Time Metrics & Monitoring ### Basic Usage Tracking Monitor your automation's resource usage in real-time with `stagehand.metrics`: ```typescript TypeScript // Get current metrics console.log(stagehand.metrics); // Monitor during automation const startTime = Date.now(); const initialMetrics = stagehand.metrics; // ... 
perform automation tasks const finalMetrics = stagehand.metrics; const executionTime = Date.now() - startTime; console.log('Automation Summary:', { totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, totalCost: calculateCost(finalMetrics), executionTime, efficiency: (finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / executionTime }); ``` ```python Python # Get current metrics print(stagehand.metrics) # Monitor during automation import time start_time = time.time() initial_metrics = stagehand.metrics # ... perform automation tasks final_metrics = stagehand.metrics execution_time = (time.time() - start_time) * 1000 # Convert to ms print('Automation Summary:', { 'total_tokens': final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens'], 'total_cost': calculate_cost(final_metrics), 'execution_time': execution_time, 'efficiency': (final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens']) / execution_time }) ``` ### Understanding Metrics Data The metrics object provides detailed breakdown by Stagehand operation: ```typescript TypeScript { actPromptTokens: 4011, actCompletionTokens: 51, actInferenceTimeMs: 1688, extractPromptTokens: 4200, extractCompletionTokens: 243, extractInferenceTimeMs: 4297, observePromptTokens: 347, observeCompletionTokens: 43, observeInferenceTimeMs: 903, totalPromptTokens: 8558, totalCompletionTokens: 337, totalInferenceTimeMs: 6888 } ``` ```python Python { "act_prompt_tokens": 4011, "act_completion_tokens": 51, "act_inference_time_ms": 1688, "extract_prompt_tokens": 4200, "extract_completion_tokens": 243, "extract_inference_time_ms": 4297, "observe_prompt_tokens": 347, "observe_completion_tokens": 43, "observe_inference_time_ms": 903, "total_prompt_tokens": 8558, "total_completion_tokens": 337, "total_inference_time_ms": 6888 } ``` ### Log Inference to File You can also log inference to a file by setting `logInferenceToFile` to `true`. 
This will create a directory called `inference_summary` in your project's root directory. ```typescript TypeScript const stagehand = new Stagehand({ logInferenceToFile: true, }); ``` ```python Python stagehand = Stagehand( log_inference_to_file=True, ) ``` The `inference_summary` directory provides granular analysis data: ``` inference_summary/ ├── act_summary/ │ ├── {timestamp}.json │ ├── {timestamp}.json │ └── ... │ └── act_summary.json ├── extract_summary/ │ ├── {timestamp}.json │ ├── {timestamp}.json │ └── ... │ └── extract_summary.json ├── observe_summary/ │ ├── {timestamp}.json │ ├── {timestamp}.json │ └── ... │ └── observe_summary.json ``` ### Log File Structure Each operation creates detailed logs for analysis: ```typescript { "act_summary": [ { "act_inference_type": "act", "timestamp": "20250329_080446068", "LLM_input_file": "20250329_080446068_act_call.txt", "LLM_output_file": "20250329_080447019_act_response.txt", "prompt_tokens": 3451, "completion_tokens": 45, "inference_time_ms": 951 }, ... 
], } ``` ## Best Practices - Track session success rates and failure patterns - Monitor resource usage and scaling requirements - Set up automated alerting for critical failures - Implement cost tracking across different environments - Use session analytics to optimize automation workflows - Compare Browserbase vs local execution times - Monitor token usage and inference costs across models - Track geographic performance differences - Identify bottlenecks in automation workflows - Optimize for cost-effectiveness and speed - Track session distribution across regions - Monitor concurrent session limits and scaling - Analyze failure patterns and common error scenarios - Use session recordings for root cause analysis - Implement custom metadata for workflow categorization - Integrate session APIs with monitoring dashboards - Set up automated notifications for session failures - Track SLA compliance and performance benchmarks - Monitor resource costs and usage patterns - Use analytics data for capacity planning and optimization For detailed logging and debugging capabilities, see [Logging](/v2/configuration/logging). ================================================ FILE: packages/docs/v2/first-steps/ai-rules.mdx ================================================ --- title: AI Rules description: Using AI to write Stagehand code faster and better. --- You're likely using AI to write code, and there's a **right and wrong way to do it.** This page is a collection of rules, configs, and copy‑paste snippets to allow your AI agents/assistants to write performant Stagehand code as fast as possible. ## Quickstart Configure Browserbase (Stagehand), Context7, DeepWiki, and Stagehand Docs in your MCP client. Drop in `.cursorrules` and `claude.md` so AI agents/assistants always emit Stagehand patterns. ## Using MCP Servers MCP (Model Context Protocol) servers act as intermediaries that connect AI systems to external data sources and tools.
These servers enable your coding assistant to access real-time information, execute tasks, and retrieve structured data to enhance code generation accuracy. The following **MCP servers** provide specialized access to Stagehand documentation and related resources: Provides semantic search across documentation and codebase context. Context7 enables AI assistants to find relevant code patterns, examples, and implementation details from your project history. It maintains contextual understanding of your development workflow and can surface related solutions from previous work. **Installation:** ```json { "mcpServers": { "context7": { "command": "npx", "args": ["-y", "@upstash/context7-mcp"] } } } ``` Offers deep indexing of GitHub repositories and documentation. DeepWiki allows AI agents to understand project architecture, API references, and best practices from the entire Stagehand ecosystem. It provides comprehensive knowledge about repository structure, code relationships, and development patterns. **Installation:** ```json { "mcpServers": { "deepwiki": { "url": "https://mcp.deepwiki.com/mcp" } } } ``` Direct access to official Stagehand documentation. This MCP server provides AI assistants with up-to-date API references, configuration options, and usage examples for accurate code generation. Mintlify auto-generates this server from the official docs, ensuring your AI assistant always has the latest information. 
**Usage:** ```json { "mcpServers": { "stagehand-docs": { "url": "https://docs.stagehand.dev/mcp" } } } ``` **How MCP Servers Enhance Your Development:** - **Real-time Documentation Access**: AI assistants can query the latest Stagehand docs, examples, and best practices - **Context-Aware Code Generation**: Servers provide relevant code patterns and configurations based on your specific use case - **Reduced Integration Overhead**: Standardized protocol eliminates the need for custom integrations with each documentation source - **Enhanced Accuracy**: AI agents receive structured, up-to-date information rather than relying on potentially outdated training data **Prompting tip:** Explicitly ask your coding agent/assistant to use these MCP servers to fetch relevant information from the docs so they have better context and know how to write proper Stagehand code. For example: **"Use the stagehand-docs MCP to fetch the act/observe guidelines, then generate code that follows them. Prefer cached observe results."** ## Editor rule files (copy‑paste) Drop these in `.cursorrules`, `.windsurfrules`, `claude.md`, or any agent rule framework: ``````md # Stagehand Project This is a project that uses [Stagehand](https://github.com/browserbase/stagehand), which amplifies Playwright with AI-powered `act`, `extract`, and `observe` methods added to the Page class.
`Stagehand` is a class that provides configuration and browser automation capabilities with: - `stagehand.page`: A StagehandPage object (extends Playwright Page) - `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) - `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows - `stagehand.init()`: Initialize the browser session - `stagehand.close()`: Clean up resources `Page` extends Playwright's Page class with AI-powered methods: - `act()`: Perform actions on web elements using natural language - `extract()`: Extract structured data from pages using schemas - `observe()`: Plan actions and get selectors before executing `Agent` provides autonomous Computer Use Agent capabilities: - `execute()`: Perform complex multi-step tasks using natural language instructions `Context` extends Playwright's BrowserContext class for browser session management. Use the following rules to write code for this project. - To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. ```typescript const results = await page.observe("Click the sign in button"); ``` You can also pass in the following params: ```typescript await page.observe({ instruction: "the instruction to execute", returnAction: true }); ``` - The result of `observe` is an array of `ObserveResult` objects that can directly be used as params for `act` like this: ```typescript const results = await page.observe({ instruction: "the instruction to execute", returnAction: true, // return the action to execute }); await page.act(results[0]); ``` - When writing code that needs to extract data from the page, use Stagehand `extract`. 
Explicitly pass the following params by default: ```typescript const { someValue } = await page.extract({ instruction: "the instruction to execute", schema: z.object({ someValue: z.string(), }), // The schema to extract }); ``` ## Initialize ```typescript import { Stagehand, Page, BrowserContext } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const page = stagehand.page; // Playwright Page with act, extract, and observe methods const context = stagehand.context; // Playwright BrowserContext ``` ### Configuration Options ```typescript const StagehandConfig = { env: "BROWSERBASE" | "LOCAL", // Environment to run in apiKey: process.env.BROWSERBASE_API_KEY, // Browserbase API key projectId: process.env.BROWSERBASE_PROJECT_ID, // Browserbase project ID debugDom: true, // Enable DOM debugging features headless: false, // Run browser in headless mode domSettleTimeoutMs: 30_000, // Timeout for DOM to settle enableCaching: true, // Enable action caching modelName: "gpt-4o", // AI model to use modelClientOptions: { apiKey: process.env.OPENAI_API_KEY, // OpenAI API key }, }; ``` ## Act You can act directly with string instructions: ```typescript await page.act("Click the sign in button"); ``` Use variables for dynamic form filling: ```typescript await page.act({ action: `Enter the following information: Name: %name% Email: %email% Phone: %phone%`, variables: { name: "John Doe", email: "john@example.com", phone: "+1-555-0123" } }); ``` **Best Practices:** - Cache the results of `observe` to avoid unexpected DOM changes - Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the website") - Use variable substitution for dynamic data entry Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". AVOID actions that are more than one step, i.e. "Order me pizza" or "Send an email to Paul asking him to call me". 
## Extract ### Simple String Extraction ```typescript const signInButtonText = await page.extract("extract the sign in button text"); ``` ### Structured Extraction with Schema (Recommended) Always use Zod schemas for structured data extraction: ```typescript import { z } from "zod/v3"; const data = await page.extract({ instruction: "extract the sign in button text", schema: z.object({ text: z.string(), }), }); ``` ### Array Extraction To extract multiple items, wrap the array in a single object: ```typescript const data = await page.extract({ instruction: "extract the text inside all buttons", schema: z.object({ buttons: z.array(z.string()), }) }); ``` ### Complex Object Extraction For more complex data structures: ```typescript const productData = await page.extract({ instruction: "extract product information from this page", schema: z.object({ title: z.string(), price: z.number(), description: z.string(), features: z.array(z.string()), availability: z.boolean(), }), }); ``` ### Schema Validation ```typescript import { validateZodSchema } from "./utils.js"; import { z } from "zod/v3"; const schema = z.object({ name: z.string() }); const isValid = validateZodSchema(schema, { name: "John" }); // true ``` ## Agent System Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). Agents execute multi-step workflows using natural language instructions. 
### Creating Agents ```typescript // Basic agent (default) const agent = stagehand.agent(); // OpenAI agent const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", instructions: "You are a helpful assistant that can use a web browser.", options: { apiKey: process.env.OPENAI_API_KEY } }); // Anthropic agent const agent = stagehand.agent({ provider: "anthropic", model: "claude-sonnet-4-20250514", instructions: "You are a helpful assistant that can use a web browser.", options: { apiKey: process.env.ANTHROPIC_API_KEY } }); ``` ### Agent Execution ```typescript // Simple task const result = await agent.execute("Extract the title from this webpage"); // Complex multi-step task const result = await agent.execute({ instruction: "Apply for the first engineer position with mock data", maxSteps: 20, autoScreenshot: true }); ``` ### Best Practices - Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` - Break down complex tasks into smaller steps - Use error handling with try/catch blocks - Combine agents for navigation with traditional methods for precise data extraction ```typescript // Good: Specific instructions await agent.execute("Navigate to products page and filter by 'Electronics'"); // Avoid: Vague instructions await agent.execute("Do some stuff on this page"); ``` ## Project Structure Best Practices - Store configurations in `stagehand.config.ts` - Use environment variables for API keys (see `.env.example`) - Implement main automation logic in functions that accept `{ page, context, stagehand }` - Use TypeScript with proper imports from `@browserbasehq/stagehand` `````` ``````md # Stagehand Python Project This is a project that uses [Stagehand Python](https://github.com/browserbase/stagehand-python), which provides AI-powered browser automation with `act`, `extract`, and `observe` methods. 
`Stagehand` is a class that provides configuration and browser automation capabilities with: - `stagehand.page`: A StagehandPage object (extends Playwright Page) - `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) - `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows - `stagehand.init()`: Initialize the browser session - `stagehand.close()`: Clean up resources `Page` extends Playwright's Page class with AI-powered methods: - `act()`: Perform actions on web elements using natural language - `extract()`: Extract structured data from pages using schemas - `observe()`: Plan actions and get selectors before executing `Agent` provides autonomous Computer Use Agent capabilities: - `execute()`: Perform complex multi-step tasks using natural language instructions Use the following rules to write code for this project. - To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. ```python results = await page.observe("Click the sign in button") ``` You can also pass in the following params: ```python await page.observe( instruction="the instruction to execute", draw_overlay=True # Show visual overlay on observed elements ) ``` - The result of `observe` is a list of `ObserveResult` objects that can directly be used as params for `act` like this: ```python results = await page.observe("Click the sign in button") await page.act(results[0]) ``` - When writing code that needs to extract data from the page, use Stagehand `extract`. 
Use Pydantic models for schemas: ```python from pydantic import BaseModel class ExtractedData(BaseModel): some_value: str result = await page.extract( instruction="the instruction to execute", schema=ExtractedData ) ``` ## Initialize ```python from stagehand import Stagehand, StagehandConfig import asyncio import os from dotenv import load_dotenv load_dotenv() async def main(): config = StagehandConfig( env="BROWSERBASE", # or "LOCAL" api_key=os.getenv("BROWSERBASE_API_KEY"), project_id=os.getenv("BROWSERBASE_PROJECT_ID"), model_name="google/gemini-2.5-flash-preview-05-20", model_api_key=os.getenv("MODEL_API_KEY"), ) # Recommended: Use as async context manager async with Stagehand(config) as stagehand: page = stagehand.page # Your automation code here # Alternative: Manual initialization stagehand = Stagehand(config) await stagehand.init() page = stagehand.page # Your automation code here await stagehand.close() if __name__ == "__main__": asyncio.run(main()) ``` ### Configuration Options Key configuration options in `StagehandConfig`: ```python config = StagehandConfig( env="BROWSERBASE", # or "LOCAL" api_key=os.getenv("BROWSERBASE_API_KEY"), project_id=os.getenv("BROWSERBASE_PROJECT_ID"), model_name="google/gemini-2.5-flash-preview-05-20", model_api_key=os.getenv("MODEL_API_KEY"), verbose=1, # 0=minimal, 1=medium, 2=detailed dom_settle_timeout_ms=30000, self_heal=True, # Enable self-healing functionality ) ``` ## Act You can act directly with string instructions: ```python await page.act("Click the sign in button") ``` Use variables for dynamic form filling: ```python await page.act( "Enter the following information: Name: John Doe, Email: john@example.com" ) ``` **Best Practices:** - Cache the results of `observe` to avoid unexpected DOM changes - Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the website") - Use specific, descriptive instructions Act `action` should be as atomic and specific as possible, i.e. 
"Click the sign in button" or "Type 'hello' into the search input". AVOID actions that are more than one step, i.e. "Order me pizza" or "Send an email to Paul asking him to call me". ## Extract ### Simple String Extraction ```python sign_in_button_text = await page.extract("extract the sign in button text") ``` ### Structured Extraction with Schema (Recommended) Always use Pydantic models for structured data extraction: ```python from pydantic import BaseModel, Field from typing import List class ButtonData(BaseModel): text: str = Field(..., description="Button text content") data = await page.extract( instruction="extract the sign in button text", schema=ButtonData ) ``` ### Array Extraction For arrays, use List types: ```python from pydantic import BaseModel, Field from typing import List class ButtonsData(BaseModel): buttons: List[str] = Field(..., description="List of button texts") data = await page.extract( instruction="extract the text inside all buttons", schema=ButtonsData ) ``` ### Complex Object Extraction For more complex data structures: ```python from pydantic import BaseModel, Field from typing import List class Company(BaseModel): name: str = Field(..., description="Company name") description: str = Field(..., description="Brief company description") class Companies(BaseModel): companies: List[Company] = Field(..., description="List of companies") companies_data = await page.extract( "Extract names and descriptions of 5 companies", schema=Companies ) ``` ## Agent System Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). 
### Creating Agents ```python # Basic agent (uses default model) agent = stagehand.agent() # OpenAI agent agent = stagehand.agent( model="computer-use-preview", instructions="You are a helpful web navigation assistant.", options={"apiKey": os.getenv("OPENAI_API_KEY")} ) # Anthropic agent agent = stagehand.agent( model="claude-sonnet-4-20250514", instructions="You are a helpful web navigation assistant.", options={"apiKey": os.getenv("ANTHROPIC_API_KEY")} ) ``` ### Agent Execution ```python # Simple task result = await agent.execute("Play a game of 2048") # Complex multi-step task with options result = await agent.execute( instruction="Apply for the first engineer position with mock data", max_steps=20, auto_screenshot=True, wait_between_actions=1000 # milliseconds ) ``` **Best Practices:** - Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` - Break down complex tasks into smaller steps - Use error handling with try/except blocks - Combine agents for navigation with traditional methods for precise data extraction ```python # Good: Specific instructions await agent.execute("Navigate to products page and filter by 'Electronics'") # Avoid: Vague instructions await agent.execute("Do some stuff on this page") ``` ## Project Structure Best Practices - Store configurations in environment variables or config files - Use async/await patterns consistently - Implement main automation logic in async functions - Use async context managers for resource management - Use type hints and Pydantic models for data validation - Handle exceptions appropriately with try/except blocks `````` ## Security notes - Do not embed secrets in docs or rule files; use env vars in MCP configs. - Avoid broad actions that may trigger unintended navigation; prefer `observe` first. 
## Resources/references - Context7 MCP (Upstash) - https://github.com/upstash/context7 - DeepWiki MCP - https://mcp.deepwiki.com/ - Stagehand Docs MCP (Mintlify) - https://docs.stagehand.dev/mcp ================================================ FILE: packages/docs/v2/first-steps/installation.mdx ================================================ --- title: Installation description: Integrate Stagehand into an existing project. --- Install Stagehand in your current app with the TypeScript or Python SDK. For TypeScript/Node.js: We highly recommend using the Node.js runtime environment to run Stagehand scripts, as opposed to newer alternatives like Deno or Bun. **Bun does not support Stagehand** since it doesn't support [Playwright](https://github.com/search?q=repo:oven-sh/bun+playwright&type=issues). For Python: We require Python 3.9+ and recommend using [uv](https://docs.astral.sh/uv/) to manage your virtual environment. ### Install dependencies ```bash npm npm install @browserbasehq/stagehand playwright zod ``` ```bash pnpm pnpm add @browserbasehq/stagehand playwright zod ``` ```bash yarn yarn add @browserbasehq/stagehand playwright zod ``` If you plan to run locally, install browsers once: `npx playwright install`. For cloud browser sessions, skip this. ### Configure environment Set environment variables (or a `.env` via your framework): ```bash Bash OPENAI_API_KEY=your_api_key BROWSERBASE_API_KEY=your_api_key BROWSERBASE_PROJECT_ID=your_project_id ``` ### Use in your codebase Add Stagehand where you need browser automation. 
```typescript TypeScript import "dotenv/config"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod/v3"; async function main() { const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const page = stagehand.page; await page.goto("https://example.com"); // Act on the page await page.act("Click the sign in button"); // Extract structured data const { title } = await page.extract({ instruction: "extract the page title", schema: z.object({ title: z.string(), }), }); console.log(title); await stagehand.close(); } main().catch((err) => { console.error(err); process.exit(1); }); ``` ### Add dependencies ```bash uv uv add stagehand ``` ```bash pip pip install stagehand ``` ### Configure environment Set environment variables (or a `.env` via your framework): ```bash Bash MODEL_API_KEY=your_api_key BROWSERBASE_API_KEY=your_api_key BROWSERBASE_PROJECT_ID=your_project_id ``` ### Use in your codebase ```python Python import os import asyncio from stagehand import Stagehand async def main(): stagehand = Stagehand( env="BROWSERBASE", model_api_key=os.getenv("MODEL_API_KEY") ) await stagehand.init() page = stagehand.page await page.goto("https://example.com") # Act on the page await page.act("Click the sign in button") # Extract structured data result = await page.extract({ "instruction": "extract the page title", "schema": { "title": { "type": "string" } } }) print(result["title"]) await stagehand.close() if __name__ == "__main__": asyncio.run(main()) ``` ## Next steps Environment, Browserbase vs Local, logging, timeouts, LLM customization Perform precise actions with natural language Typed data extraction with Zod schemas Discover elements and suggested actions ================================================ FILE: packages/docs/v2/first-steps/introduction.mdx ================================================ --- title: Introducing Stagehand sidebarTitle: Introduction description: Developers use Stagehand to reliably automate 
the web. --- Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. ## The Problem with Browser Automation Traditional frameworks like Playwright and Puppeteer force you to write brittle scripts that break with every UI change. Web agents promise to solve this with AI, but leave you at the mercy of unpredictable behavior. **You're stuck between two bad options:** - **Too brittle**: Traditional selectors break when websites change - **Too agentic**: AI agents are unpredictable and impossible to debug ## Enter Stagehand Stagehand gives you the best of both worlds through four powerful primitives that let you choose exactly how much AI to use: Execute actions using natural language Pull structured data with schemas Discover available actions on any page Automate entire workflows autonomously ```typescript TypeScript // Act - Execute natural language actions await page.act("click the login button"); // Extract - Pull structured data const { price } = await page.extract({ schema: z.object({ price: z.number() }) }); // Observe - Discover available actions const actions = await page.observe("find submit buttons"); // Agent - Automate entire workflows const agent = stagehand.agent({ provider: "anthropic", model: "claude-sonnet-4-20250514", options: { apiKey: process.env.ANTHROPIC_API_KEY, }, }) await agent.execute("apply for this job"); ``` ```python Python # Act - Execute natural language actions await page.act("click the login button") # Extract - Pull structured data result = await page.extract( schema={"price": float} ) # Observe - Discover available actions actions = await page.observe("find submit buttons") # Agent - Automate entire workflows await agent.execute("apply for this job") ``` ## Why Developers Choose Stagehand - **Precise Control**: Mix AI-powered actions with deterministic 
code. You decide exactly how much AI to use. - **Actually Repeatable**: Save and replay actions exactly. No more "it worked on my machine" with browser automations. - **Maintainable at Scale**: One script can automate multiple websites. When sites change, your automations adapt. - **Composable Tools**: Choose your level of automation with Act, Extract, Observe, and Agent. ## Built for Modern Development Stagehand is designed for developers building production browser automations and AI agents that need reliable web access. Use any Playwright API alongside Stagehand. You're never locked into our abstractions. First-class support for both ecosystems with type safety and IDE autocomplete. Compatible with all Chromium-based browsers: Chrome, Edge, Arc, Brave, and more. Created and maintained by the team behind enterprise browser infrastructure. ## Get Started in 60 Seconds **Pro tip**: For best results, we recommend using Stagehand with [Browserbase](https://www.browserbase.com) for reliable cloud browser infrastructure. Build your first automation in under a minute Generate Stagehand scripts with AI See real-world automation examples Get help from the community ================================================ FILE: packages/docs/v2/first-steps/quickstart.mdx ================================================ --- title: Quickstart description: 'Stagehand allows you to build web automations with natural language and code.' --- If this is your **first time using Stagehand**, you should try [Director](https://director.ai) first. It's an agent that allows you to build Stagehand workflows using natural language. You can also try Stagehand using our [MCP server](/v2/integrations/mcp/introduction). Otherwise, the quickest way to start with Stagehand is with our CLI. It scaffolds a ready‑to‑run Stagehand app with sensible defaults, and an example script. This quickstart is for **TypeScript**. For **Python**, see the [installation guide](/v2/first-steps/installation). 
## 1) Create a sample project ```bash Bash npx create-browser-app ``` ## 2) Run it Follow the CLI prompts to enter the project directory and add your API keys. Then run the example script. ```bash Bash cd my-stagehand-app # Enter the project directory cp .env.example .env # Add your API keys npm start # Run the example script ``` ## 3) Use Stagehand (act, extract, observe) The scaffold includes an index.ts file that contains the example script. Here's what it looks like: ```typescript TypeScript import "dotenv/config"; import { Stagehand } from "@browserbasehq/stagehand"; async function main() { const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); console.log(`Stagehand Session Started`); console.log(`Watch live: https://browserbase.com/sessions/${stagehand.browserbaseSessionID}`); const page = stagehand.page; await page.goto("https://stagehand.dev"); const extractResult = await page.extract("Extract the value proposition from the page."); console.log(`Extract result:\n`, extractResult); const actResult = await page.act("Click the 'Evals' button."); console.log(`Act result:\n`, actResult); const observeResult = await page.observe("What can I click on this page?"); console.log(`Observe result:\n`, observeResult); const agent = await stagehand.agent({ instructions: "You're a helpful assistant that can control a web browser.", }); const agentResult = await agent.execute("What is the most accurate model to use in Stagehand?"); console.log(`Agent result:\n`, agentResult); await stagehand.close(); } main().catch((err) => { console.error(err); process.exit(1); }); ``` To use, set provider keys in `.env` (e.g., `OPENAI_API_KEY`). For cloud browsers, add `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`. ## Next steps Learn about the Stagehand primitives: act, extract, observe, and agent. 
Perform actions on web pages with natural language Get structured data with Zod schemas Discover available elements and actions Autonomous multi-step browser workflows ================================================ FILE: packages/docs/v2/integrations/crew-ai/configuration.mdx ================================================ --- title: "Use CrewAI to Automate Browser Tasks" sidebarTitle: Configuration description: "Create intelligent agents that can interact with websites and automate browser tasks using natural language instructions" --- This guide walks you through setting up CrewAI with Browserbase to create agents that can perform web automation tasks using natural language instructions. ## Step 1: Install Dependencies Install the required packages for CrewAI and Stagehand integration: ```bash pip install stagehand-py crewai crewai-tools ``` ## Step 2: Configure Environment Variables You'll need credentials from two services: 1. **Browserbase API Key and Project ID**: Get these from your [Browserbase dashboard](https://www.browserbase.com/) 2. 
**LLM API Key**: Get an API key from [OpenAI](https://platform.openai.com/api-keys) or [Anthropic](https://console.anthropic.com/) Store your API keys securely as environment variables: ```bash BROWSERBASE_API_KEY="your-browserbase-api-key" BROWSERBASE_PROJECT_ID="your-browserbase-project-id" OPENAI_API_KEY="your-openai-api-key" ANTHROPIC_API_KEY="your-anthropic-api-key" ``` ## Step 3: Create Your First Agent Create a Python script with a basic CrewAI agent: ```python import os from crewai import Agent, Task, Crew from crewai_tools import StagehandTool from stagehand.schemas import AvailableModel # Get API keys from environment browserbase_api_key = os.environ.get("BROWSERBASE_API_KEY") browserbase_project_id = os.environ.get("BROWSERBASE_PROJECT_ID") model_api_key = os.environ.get("OPENAI_API_KEY") # or ANTHROPIC_API_KEY # Initialize the StagehandTool stagehand_tool = StagehandTool( api_key=browserbase_api_key, project_id=browserbase_project_id, model_api_key=model_api_key, model_name=AvailableModel.GPT_4O, # or AvailableModel.CLAUDE_3_7_SONNET_LATEST ) # Create an agent with the tool researcher = Agent( role="Web Researcher", goal="Find and summarize information from websites", backstory="I'm an expert at finding information online.", verbose=True, tools=[stagehand_tool], ) ``` ## Step 4: Create and Run a Task Define a task for your agent and execute it: ```python # Create a task that uses the tool research_task = Task( description="Go to https://www.example.com and tell me what you see on the homepage.", agent=researcher, ) # Run the crew crew = Crew( agents=[researcher], tasks=[research_task], verbose=True, ) try: result = crew.kickoff() print(result) finally: # Clean up resources stagehand_tool.close() ``` ## Step 5: Run Your Script Execute your Python script: ```bash python your_crew_script.py ``` ## Advanced Configuration Customize the StagehandTool behavior with additional parameters: ```python stagehand_tool = StagehandTool( api_key=browserbase_api_key, 
project_id=browserbase_project_id, model_api_key=model_api_key, model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, dom_settle_timeout_ms=5000, # Wait longer for DOM to settle headless=True, # Run browser in headless mode self_heal=True, # Attempt to recover from errors wait_for_captcha_solves=True, # Wait for CAPTCHA solving verbose=1, # Control logging verbosity (0-3) ) ``` ## Example Tasks ```python form_task = Task( description=""" Submit a contact form: 1. Go to https://example.com/contact 2. Fill out the form with name 'John Doe', email 'john@example.com' 3. Submit and confirm success """, agent=researcher, ) ``` ```python extraction_task = Task( description=""" Extract product information: 1. Go to the products page 2. Extract all product names, prices, and descriptions 3. Format as structured data """, agent=researcher, ) ``` ```python navigation_task = Task( description=""" Navigate and analyze: 1. Start at homepage 2. Navigate to products section 3. Filter by 'Electronics' category 4. Find and extract details of highest-rated product """, agent=researcher, ) ``` Dive into the CrewAI documentation to learn more about its capabilities and integrations. Access the Browserbase documentation for comprehensive guides and resources. ================================================ FILE: packages/docs/v2/integrations/crew-ai/introduction.mdx ================================================ --- title: "CrewAI Introduction" sidebarTitle: Introduction description: "Automate browser tasks using natural language instructions with CrewAI" --- ## Overview This guide shows you how to use CrewAI with Browserbase to create intelligent agents that can automate web interactions. 
By the end of this guide, you'll know how to: - Set up CrewAI with the StagehandTool - Create agents that can interact with websites - Automate browser tasks using natural language instructions - Extract structured data from web pages ## When You'd Use This The CrewAI integration is perfect for scenarios where you need intelligent web automation: - **Research automation**: Have agents research information across multiple websites - **Data collection**: Extract structured data from e-commerce sites, job boards, or news sites - **Form automation**: Automatically fill out and submit forms based on specific criteria - **Multi-step workflows**: Execute complex browser workflows that require decision-making The StagehandTool wraps the Stagehand Python SDK to provide CrewAI agents with the ability to control a real web browser and interact with websites using three core primitives: 1. **Act**: Perform actions like clicking, typing, or navigating 2. **Extract**: Extract structured data from web pages 3. **Observe**: Identify and analyze elements on the page Learn how to configure and use the StagehandTool with CrewAI agents for web automation tasks ================================================ FILE: packages/docs/v2/integrations/langchain/configuration.mdx ================================================ --- title: "LangChain JS Configuration" sidebarTitle: Configuration description: "Set up Stagehand with LangChain JS to create intelligent web automation agents" --- This guide walks you through integrating Stagehand with LangChain JS to build powerful web automation workflows using natural language instructions. 
## Step 1: Install Dependencies Install the required packages for LangChain JS and Stagehand integration: ```bash npm install @langchain/langgraph @langchain/community @langchain/core @browserbasehq/stagehand ``` ## Step 2: Configure Environment Variables For remote browser automation, set up your Browserbase credentials: ```bash BROWSERBASE_API_KEY="your-browserbase-api-key" BROWSERBASE_PROJECT_ID="your-browserbase-project-id" ``` ## Step 3: Create a Stagehand Instance Initialize Stagehand with your preferred configuration: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; // For local development const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, enableCaching: false, }); // For production with Browserbase const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, enableCaching: true, }); ``` ## Step 4: Generate the StagehandToolkit Create the toolkit that provides LangChain-compatible tools: ```typescript import { StagehandToolkit } from '@langchain/community/agents/toolkits/stagehand'; const stagehandToolkit = await StagehandToolkit.fromStagehand(stagehand); ``` ## Step 5: Use Individual Tools The toolkit provides four specialized tools for web automation: ### Available Tools - **stagehand_navigate**: Navigate to specific URLs - **stagehand_act**: Perform browser actions (clicking, typing, etc.) 
- **stagehand_extract**: Extract structured data using schemas - **stagehand_observe**: Analyze page elements and possible actions ### Basic Tool Usage ```typescript import { z } from "zod"; // Navigate to a website const navigateTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_navigate" ); await navigateTool.invoke("https://www.google.com"); // Perform an action const actionTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_act" ); await actionTool.invoke('Search for "OpenAI"'); // Observe the page const observeTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_observe" ); const result = await observeTool.invoke( "What actions can be performed on the current page?" ); console.log(JSON.parse(result)); // Extract structured data const extractTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_extract" ); const extractResult = await extractTool.invoke({ instruction: "Extract the main heading and description", schema: z.object({ heading: z.string(), description: z.string(), }), }); console.log(extractResult); ``` ## Step 6: Build LangGraph Agents Integrate with LangGraph for complex automation workflows: ```typescript import { createReactAgent } from "@langchain/langgraph/prebuilt"; // Create an LLM const llm = new ChatOpenAI({ model: "gpt-4", temperature: 0, }); // Create an agent with Stagehand tools const agent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); // Execute a complex workflow const result = await agent.invoke({ messages: [ { role: "user", content: "Go to example.com, find the contact form, and extract all the form fields" } ] }); ``` ## Advanced Configuration ### Custom Stagehand Configuration ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 2, enableCaching: true, headless: true, domSettleTimeoutMs: 5000, }); ``` ### Error Handling ```typescript try { const result = await agent.invoke({ messages: [{ role: "user", content: "Navigate to invalid-url.com" }] 
}); } catch (error) { console.error("Automation failed:", error); } finally { // Clean up resources await stagehand.close(); } ``` ## Example Workflows ```typescript const extractionAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await extractionAgent.invoke({ messages: [{ role: "user", content: ` Go to news-website.com and extract: 1. All article headlines 2. Publication dates 3. Author names Format as structured JSON ` }] }); ``` ```typescript const formAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await formAgent.invoke({ messages: [{ role: "user", content: ` Navigate to contact-form.com and: 1. Fill out the contact form with: - Name: John Doe - Email: john@example.com - Message: Inquiry about services 2. Submit the form 3. Confirm submission success ` }] }); ``` ```typescript const researchAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await researchAgent.invoke({ messages: [{ role: "user", content: ` Research product pricing by: 1. Visit competitor1.com and extract pricing info 2. Visit competitor2.com and extract pricing info 3. Compare features and prices 4. Provide summary analysis ` }] }); ``` Official LangChain JS documentation for the Stagehand integration ================================================ FILE: packages/docs/v2/integrations/langchain/introduction.mdx ================================================ --- title: "Langchain JS Introduction" sidebarTitle: Introduction description: "Integrate Stagehand with Langchain JS for intelligent web automation" --- ## Overview This guide shows you how to use Stagehand with Langchain JS to create intelligent agents that can automate web interactions. 
By the end of this guide, you'll know how to: - Set up the StagehandToolkit with Langchain JS - Create agents that can navigate and interact with websites - Extract structured data using natural language instructions - Build complex automation workflows with LangGraph ## When You'd Use This The Langchain JS integration is perfect for scenarios where you need intelligent web automation with advanced reasoning: - **AI-driven research**: Create agents that can research information across multiple websites and synthesize findings - **Dynamic form filling**: Automatically fill out complex forms based on contextual requirements - **Data extraction workflows**: Extract and transform data from multiple sources with intelligent navigation - **Multi-step web processes**: Execute complex browser workflows that require decision-making and adaptation Learn how to set up and configure the StagehandToolkit with Langchain JS agents ================================================ FILE: packages/docs/v2/integrations/mcp/configuration.mdx ================================================ --- title: "Browserbase MCP Server Configuration" sidebarTitle: "Configuration" description: "Configure your browser automation with command-line flags, environment variables, and advanced options" --- ## Configuration Overview The Browserbase MCP server supports extensive configuration options through command-line flags and environment variables. Configure browser behavior, proxy settings, stealth modes, model selection, and more to customize your browser automation workflows. Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). 
## Environment Variables Configure the essential Browserbase credentials and optional debugging settings: Your Browserbase API key for authentication Your Browserbase project ID ## Command-Line Flags ### Available Flags | Flag | Description | |------|-------------| | `--proxies` | Enable Browserbase proxies for the session | | `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | | `--keepAlive` | Enable Browserbase Keep Alive Session | | `--contextId ` | Specify a Browserbase Context ID to use | | `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | | `--port ` | Port to listen on for HTTP/SHTTP transport | | `--host ` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | | `--cookies [json]` | JSON array of cookies to inject into the browser | | `--browserWidth ` | Browser viewport width (default: 1024) | | `--browserHeight ` | Browser viewport height (default: 768) | | `--modelName ` | The model to use for Stagehand (default: google/gemini-2.5-flash-lite) | | `--modelApiKey ` | API key for the custom model provider (required when using custom models) | | `--experimental` | Enable experimental features (default: false) | ## Configuration Examples ### Basic Configuration ```json Direct SHTTP { "mcpServers": { "browserbase": { "url": "your-smithery-url.com" } } } ``` When using our remote hosted server, we provide the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). 
```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ```json { "mcpServers": { "browserbase": { "command": "node", "args": ["/path/to/mcp-server-browserbase/cli.js"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ```bash # Start server node cli.js --port 8931 ``` ```json { "mcpServers": { "browserbase": { "url": "http://localhost:8931/mcp", "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ### Advanced Features Enable Browserbase proxies for IP rotation and geo-location testing. [Learn more about Browserbase Proxies](https://docs.browserbase.com/features/proxies) ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--proxies"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Enable advanced anti-detection features for enhanced stealth browsing. [Learn more about Advanced Stealth](https://docs.browserbase.com/features/stealth-mode#advanced-stealth-mode) **Note:** Advanced Stealth is only available for Scale Plan users. ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--advancedStealth"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Use persistent browser contexts to maintain authentication and state across sessions. 
[Learn more about Browserbase Contexts](https://docs.browserbase.com/features/contexts) ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--contextId", "your_context_id"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ### Browser Customization Customize browser window dimensions. Default is 1024x768. Recommended aspect ratios: 16:9. ```json { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--browserWidth", "1920", "--browserHeight", "1080" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` **Common Resolutions:** - Desktop: 1920x1080, 1280x720, 1024x768 - Mobile: 375x667 (iPhone), 360x640 (Android) - Tablet: 768x1024 (iPad) Inject session cookies for authentication. Useful when persistent contexts don't handle session cookies. Cookies must be in [Playwright Cookie format](https://playwright.dev/docs/api/class-browsercontext#browser-context-cookies). ```json { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--cookies", "[{\"name\": \"session\", \"value\": \"abc123\", \"domain\": \".example.com\", \"path\": \"/\", \"httpOnly\": true, \"secure\": true}]" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ## Model Configuration Configure AI models for enhanced browser automation. Stagehand defaults to Google's Gemini 2.5 Flash Lite but supports multiple providers. When using any custom model (non-default), you must provide your own API key for that model provider using the `--modelApiKey` flag. 
**Google Gemini** (Default) - `google/gemini-2.5-flash-lite` (default) - `google/gemini-1.5-pro` - `google/gemini-1.5-flash` **OpenAI** - `openai/gpt-4o` - `openai/gpt-4o-mini` - `openai/o1-mini` - `openai/o1-preview` - `openai/o3-mini` **Anthropic Claude** - `anthropic/claude-sonnet-4-6` - `anthropic/claude-sonnet-4-5-20250929` [View full list of supported models](https://docs.stagehand.dev/examples/custom_llms#supported-llms) ```json OpenAI GPT-4o { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--modelName", "openai/gpt-4o", "--modelApiKey", "your_openai_api_key" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ```json Claude Sonnet { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--modelName", "anthropic/claude-sonnet-4-6", "--modelApiKey", "your_anthropic_api_key" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ## Development Configuration Enable detailed logging for troubleshooting and development. ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key", "DEBUG": "true" } } } } ``` Configure custom host and port for SHTTP transport. 
```json { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--host", "0.0.0.0", "--port", "8080" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ## Best Practices - Use appropriate viewport sizes for your use case - Enable proxies only when needed for geo-location - Choose efficient models (Gemini Flash for speed, GPT-4o for accuracy) - Reuse contexts for authentication persistence - Store API keys securely in environment variables - Use Advanced Stealth for sensitive operations - Implement proper session management - Rotate cookies and contexts regularly - Enable debug mode during development - Use context persistence for faster iteration - Test with different viewport sizes - Monitor session usage and quotas - Use NPM installation for reliability - Configure appropriate timeouts - Implement error handling and retries - Monitor performance and resource usage ## Further Reading Complete platform documentation AI-powered browser automation Get help from our team ================================================ FILE: packages/docs/v2/integrations/mcp/introduction.mdx ================================================ --- title: "Browserbase MCP Server" sidebarTitle: "Introduction" description: "AI-powered browser automation through Model Context Protocol integration with Stagehand" --- ## Overview The Browserbase MCP Server brings powerful browser automation capabilities to MCP clients through the Model Context Protocol (MCP). Built on top of [Stagehand](https://docs.stagehand.dev/), this integration provides AI-powered web automation using natural language commands. The hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint is served on Browserbase infrastructure. 
You can also run the MCP server locally with STDIO, but we recommend the hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint for most users. ## Key Features Control browsers using plain English commands like "click the login button" or "fill out the contact form" Navigate, click, and fill forms with ease Extract structured data from any website automatically Create, reuse, and close browser sessions with explicit MCP tools ## Core Benefits No need to learn complex selectors or automation syntax. Simply describe what you want to do in natural language. Get started in minutes with either hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) or local STDIO. Stagehand's AI understands web page context and can adapt to different layouts and designs. Navigate, click, type, scroll, and interact with any web element. Extract structured information from complex web pages automatically. Maintain authentication states and cookies across multiple interactions. Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) runs on Browserbase infrastructure for consistent performance. Handle multiple concurrent sessions and high-volume automation tasks. Stealth mode, proxy support, and advanced anti-detection capabilities. Detailed session recordings and debugging information. 
## Use Cases Track product prices, availability, and competitor information Gather data from multiple sources for analysis and reporting Collect articles, posts, and media from various websites Extract contact information and business data from directories Create comprehensive test suites for web applications Test functionality across different browser environments Simulate real user interactions and workflows Track page load times and user experience metrics Automatically fill and submit complex web forms Extract data and generate automated reports Schedule posts and monitor engagement across platforms Automate repetitive web-based business processes ## Getting Started Choose hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) (recommended) or local STDIO based on your needs. Set up your Browserbase API credentials in MCP configuration. Get API keys from the [Browserbase Dashboard](https://www.browserbase.com/overview). Begin using natural language commands to control browsers through your MCP client. Ready to get started? Check out the [Setup Guide](/v2/integrations/mcp/setup). 
## Further Reading Get started with installation and configuration Learn more about the MCP protocol Explore Browserbase features and capabilities ================================================ FILE: packages/docs/v2/integrations/mcp/setup.mdx ================================================ --- title: "Browserbase MCP Server Setup" sidebarTitle: "Setup" description: "Add the Browserbase MCP Server to your MCP client" --- ## Quick Installation One-click installation directly in Cursor You can also add Browserbase MCP to Claude Code with a single command: ```bash claude mcp add --transport http browserbase "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY" ``` We support both local STDIO and hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) (SHTTP). We recommend hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) for most users. ## Endpoint Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint (served on Browserbase infrastructure): ```text https://mcp.browserbase.com/mcp ``` ## Prerequisites Get your Browserbase API key from the [Browserbase Dashboard](https://www.browserbase.com/overview). Browserbase API Key settings Then copy your API Key directly from the input. ## Query Parameters (Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http)) ### Required for tool calls Browserbase API key. 
### Optional | Query Param | Type | Behavior | | ----------------- | -------------- | ------------------------------------------ | | `modelName` | string | Defaults to `google/gemini-2.5-flash-lite` | | `modelApiKey` | string | Required when `modelName` is non-default | | `keepAlive` | boolean string | `"true"` or `"false"` | | `proxies` | boolean string | `"true"` or `"false"` | | `advancedStealth` | boolean string | `"true"` or `"false"` | Boolean query values must be exact strings: `"true"` or `"false"`. ## Available Tools Navigate to any URL in the browser The URL to navigate to Perform an action on the web page using natural language The action to perform (e.g., "click the login button", "fill form field") Observe and find actionable elements on the page. Specific instruction for observation (e.g., "find the login button", "locate search form") Extract data from the current page. Optional extraction instruction. Create or reuse a Browserbase session and set it as active for the current MCP transport session. No input parameters required. Browserbase session ID. Close the active Browserbase session for the current MCP transport session. No input parameters required. ## Local Command-Line Flags Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). 
| Flag | Description | |------|-------------| | `--proxies` | Enable Browserbase proxies for the session | | `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | | `--keepAlive` | Enable Browserbase Keep Alive Session | | `--contextId ` | Specify a Browserbase Context ID to use | | `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | | `--port ` | Port to listen on for HTTP or [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) transport | | `--host ` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | | `--browserWidth ` | Browser viewport width (default: 1024) | | `--browserHeight ` | Browser viewport height (default: 768) | | `--modelName ` | The model to use for Stagehand (default: google/gemini-2.5-flash-lite) | | `--modelApiKey ` | API key for the custom model provider (required when using custom models) | | `--experimental` | Enable experimental features (default: false) | ## Installation Methods Use your MCP client config: ```json { "mcpServers": { "browserbase": { "url": "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY" } } } ``` For custom models, include `modelName` and `modelApiKey`: ```json { "mcpServers": { "browserbase": { "url": "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY&modelName=openai/gpt-4.1&modelApiKey=YOUR_MODEL_API_KEY" } } } ``` The easiest way to get started locally is using our NPM package. If you would like to use a different model, you have to pass the model name and keys in the args. More info in the [Local Command-Line Flags](#local-command-line-flags) section. 
Go into your MCP Config JSON and add the Browserbase Server: ```json Claude Desktop { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` That's it! Reload your MCP client and you will be able to use Browserbase. For local development or customization, you can run the server locally. ```bash # Clone the Repo git clone https://github.com/browserbase/mcp-server-browserbase.git cd mcp-server-browserbase # Install the dependencies and build the project npm install && npm run build ``` You can run locally using either STDIO or [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http). Add the following to your MCP Config JSON file: ```json { "mcpServers": { "browserbase": { "command": "node", "args": ["/path/to/mcp-server-browserbase/cli.js"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` First, run the server: ```bash node cli.js --port 8931 ``` Then add this to your MCP Config JSON file: ```json { "mcpServers": { "browserbase": { "url": "http://localhost:8931/mcp", "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Reload your MCP client and you should be good to go! ## Verify Installation Restart/refresh your MCP client app and verify tools are available. Get started using our MCP Server by asking your MCP client to navigate to any page and see your Browserbase Browser in action on the [dashboard](https://www.browserbase.com/sessions). 
Try: "Navigate to example.com and extract the main heading" ## Further Reading Learn more about the MCP protocol Explore Browserbase features and capabilities Get help from our support team ================================================ FILE: packages/docs/v2/integrations/mcp/tools.mdx ================================================ --- title: "Browserbase MCP Server Tools" sidebarTitle: "Tools" description: "This guide covers the specialized tools available in the Browserbase MCP server for browser automation and interaction." --- ## Overview The Browserbase MCP server provides tools for browser automation and session management through a transport-scoped active session. ## Core Browser Automation Tools These are the primary tools for modern web automation using natural language commands. Navigate to any URL in the browser The URL to navigate to Perform an action on the web page using natural language The action to perform (e.g., "click the login button", "fill form field") Observe and find actionable elements on the page Specific instruction for observation (e.g., "find the login button", "locate search form") Extract data from the current page. Optional extraction instruction. ## Session Management Create or reuse a Browserbase session and set it as active for the current MCP transport session. No input parameters required. Browserbase session ID. Close the active Browserbase session for the current MCP transport session. No input parameters required. ## Further Reading Learn more about the MCP protocol Explore Stagehand's AI-powered browser automation Get help from our support team ================================================ FILE: packages/docs/v2/integrations/vercel/configuration.mdx ================================================ --- title: Use Stagehand in Next.js sidebarTitle: Configuration description: Next.js is a popular framework for developing web-based applications in production. 
It powers Stagehand apps like [Director](https://director.ai), [Brainrot](https://brainrot.run) and [Open Operator](https://operator.browserbase.com). --- Clone our [GitHub repo](https://github.com/browserbase/stagehand-nextjs-quickstart) to get started with Stagehand in Next.js. ## Add Stagehand to an existing Next.js project If you'd like to add Stagehand to an existing Next.js project, you can do so by installing the dependencies: ```bash npm install @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ```bash pnpm add @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ```bash yarn add @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ### Write a server action Next, let's define our `main` function as a server action in `app/stagehand/main.ts`. This file will have the following three functions: 1. **`main`: Run the main Stagehand script** 2. **`runStagehand`: Initialize and run the `main` function** 3. **`startBBSSession`: Start a Browserbase session** ```ts app/stagehand/main.ts // 🤘 Welcome to Stagehand! // This file is from the [Stagehand docs](https://docs.stagehand.dev/sections/examples/nextjs). 
"use server"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod/v3"; import { Browserbase } from "@browserbasehq/sdk"; /** * Run the main Stagehand script */ async function main(stagehand: Stagehand) { // You can use the `page` instance to write any Playwright code // For more info: https://playwright.dev/docs/pom const page = stagehand.page; // In this example, we'll get the title of the Stagehand quickstart page await page.goto("https://docs.stagehand.dev/"); await page.act("click the quickstart link"); const { title } = await page.extract({ instruction: "extract the main heading of the page", schema: z.object({ title: z.string(), }), }); return title; } /** * Initialize and run the main() function */ export async function runStagehand(sessionId?: string) { const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, verbose: 1, logger: console.log, browserbaseSessionID: sessionId, disablePino: true, }); await stagehand.init(); await main(stagehand); await stagehand.close(); } /** * Start a Browserbase session */ export async function startBBSSession() { const browserbase = new Browserbase(); const session = await browserbase.sessions.create({ projectId: process.env.BROWSERBASE_PROJECT_ID!, }); const debugUrl = await browserbase.sessions.debug(session.id); return { sessionId: session.id, debugUrl: debugUrl.debuggerFullscreenUrl, }; } ``` ### Create a client component Next, let's create a client component that will start a Browserbase session and run the `main` function with the server actions we just defined. We'll first create a Browserbase session and embed the session in an iframe before running the `main` function. 
```tsx app/components/stagehandEmbed.tsx "use client"; import { useCallback, useState } from "react"; import { runStagehand, startBBSSession } from "@/app/stagehand/main"; export function StagehandEmbed() { const [sessionId, setSessionId] = useState<string | null>(null); const [debugUrl, setDebugUrl] = useState<string | null>(null); const startSession = useCallback(async () => { const { sessionId, debugUrl } = await startBBSSession(); setSessionId(sessionId); setDebugUrl(debugUrl); await runStagehand(sessionId); }, []); return (
{!sessionId && } {sessionId && debugUrl && ( You might've heard of [Gemini Computer Use](https://blog.google/technology/google-deepmind/gemini-computer-use-model/), [Claude Computer Use](https://www.anthropic.com/news/3-5-models-and-computer-use), or [OpenAI's Computer Using Agent](https://openai.com/index/computer-using-agent/). These are powerful tools that can convert natural language into actions on the computer. However, you'd otherwise need to write your own code to convert these actions into Playwright commands. Stagehand not only handles the execution of Computer Use outputs, but also lets you hot-swap between Google, OpenAI, Anthropic, and Microsoft models with one line of code. You can find more information on the performance of different computer use models by visiting our [evals page](https://www.stagehand.dev/agent-evals). ## How to use a Computer Use Agent in Stagehand Stagehand lets you use Computer Use Agents with one line of code: **Deprecation Notice:** The `cua: true` option is deprecated and will be removed in a future version. Use `mode: "cua"` instead. **IMPORTANT! Configure your browser dimensions** Computer Use Agents will often return XY-coordinates to click on the screen, so you'll need to configure your browser dimensions. If not specified, the default browser dimensions are 1288 x 711. You can also configure the browser dimensions in the `browserbaseSessionCreateParams` or `localBrowserLaunchOptions` options. 
### Configuring browser dimensions Browser configuration differs by environment: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "google/gemini-2.5-flash", browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, browserSettings: { blockAds: true, viewport: { width: 1288, height: 711, }, }, }, }); await stagehand.init(); ``` ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { headless: false, viewport: { width: 1288, height: 711, }, } }); await stagehand.init(); ``` ### Direct your Computer Use Agent Call `execute` on the agent to assign a task to the agent. ```typescript Google await page.goto("https://www.google.com/"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }, systemPrompt: "You are a helpful assistant...", }); await agent.execute({ instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", maxSteps: 20, highlightCursor: true }) ``` ```typescript OpenAI await page.goto("https://www.google.com/"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "openai/computer-use-preview", apiKey: process.env.OPENAI_API_KEY }, systemPrompt: "You are a helpful assistant...", }); await agent.execute({ instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", maxSteps: 20, highlightCursor: true }) ``` ```typescript Anthropic await page.goto("https://www.google.com/"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "anthropic/claude-sonnet-4-20250514", apiKey: process.env.ANTHROPIC_API_KEY }, systemPrompt: "You are a helpful assistant...", }); await agent.execute({ 
instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", maxSteps: 20, highlightCursor: true }) ``` You can define the maximum number of steps the agent can take with `maxSteps`: ```typescript await agent.execute({ instruction: "Apply for a library card at the San Francisco Public Library", maxSteps: 10, }); ``` ### Select Your Computer Use Model Stagehand supports computer use models from Google, Anthropic, OpenAI, and Microsoft. You can find all supported models on the [models page](/v3/configuration/models#agent-models-with-cua-support). ```typescript const agent = stagehand.agent({ mode: "cua", model: "google/gemini-2.5-computer-use-preview-10-2025", // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded - set in your .env }); ``` ```typescript const agent = stagehand.agent({ mode: "cua", model: "anthropic/claude-sonnet-4-20250514", // ANTHROPIC_API_KEY is auto-loaded - set in your .env }); ``` ```typescript const agent = stagehand.agent({ mode: "cua", model: "openai/computer-use-preview", // OPENAI_API_KEY is auto-loaded - set in your .env }); ``` View or run the example templates [here](https://www.browserbase.com/templates?category=Computer+Use+Agents) ================================================ FILE: packages/docs/v3/best-practices/cost-optimization.mdx ================================================ --- title: Cost Optimization sidebarTitle: Cost Optimization description: Minimize costs while maintaining automation performance --- import { V3Banner } from '/snippets/v3-banner.mdx'; Cost optimization in Stagehand involves balancing LLM inference costs and browser infrastructure costs. This guide provides practical strategies to reduce your automation expenses. ## Quick Wins Start with these simple optimizations that can reduce costs: ### 1. Use the Right Model for the Job We don't recommend using larger, more premium models for simple tasks. 
See our [evaluation results](https://stagehand.dev/evals) for model performance and cost comparisons across different task types. Choose the right LLM for your budget and accuracy requirements See how different models perform on different tasks ### 2. Implement Caching Enable automatic action caching to eliminate redundant LLM calls. Simply specify a `cacheDir` when initializing Stagehand: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "action-cache", // Enable automatic caching }); await stagehand.init(); // First run: uses LLM inference and caches // Subsequent runs: reuses cached action (no LLM cost) await stagehand.act("Click the sign in button"); ``` Learn how to organize caches and manage cache directories ### 3. Optimize Browser Sessions Reuse sessions when possible and set appropriate timeouts. See [Browser Configuration](/configuration/browser) for details: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionCreateParams: { timeout: 1800, // 30 minutes instead of default 1 hour keepAlive: true, // Keep session alive between tasks } }); ``` Optimize Browserbase infrastructure costs and session management ## Advanced Strategies ### Intelligent Model Switching Try the cheapest model first and fall back to a more expensive one only when it fails: ```typescript // Use models from least to most expensive based on task complexity // See stagehand.dev/evals for performance comparisons async function smartAct(prompt: string) { const models = ["google/gemini-2.5-flash", "openai/gpt-4o"]; for (const model of models) { const stagehand = new Stagehand({ env: "LOCAL", model: model }); try { await stagehand.init(); const [action] = await stagehand.observe(prompt); await stagehand.act(action); return; } catch (error) { console.log(`${model} failed, trying the next model...`); } finally { await stagehand.close(); } } } ``` ### Session Pooling Reuse browser sessions across multiple tasks: ```typescript class SessionManager { private 
sessions = new Map<string, Stagehand>(); async getSession(taskType: string): Promise<Stagehand> { if (this.sessions.has(taskType)) { return this.sessions.get(taskType)!; } const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); this.sessions.set(taskType, stagehand); return stagehand; } } ``` ## Cost Monitoring Track your spending to identify optimization opportunities. See our [Observability Guide](/configuration/observability) for detailed metrics: ```typescript // Monitor token usage const metrics = await stagehand.metrics; console.log(`Total tokens: ${metrics.totalPromptTokens + metrics.totalCompletionTokens}`); console.log(`Estimated cost: $${(metrics.totalPromptTokens + metrics.totalCompletionTokens) * 0.00001}`); ``` Monitor usage patterns and track costs in real-time ## Budget Controls Set spending limits to prevent unexpected costs: ```typescript class BudgetGuard { private dailySpend = 0; private maxDailyBudget: number; constructor(maxDailyBudget: number = 25) { this.maxDailyBudget = maxDailyBudget; } checkBudget(estimatedCost: number): void { if (this.dailySpend + estimatedCost > this.maxDailyBudget) { throw new Error(`Daily budget exceeded: $${this.maxDailyBudget}`); } this.dailySpend += estimatedCost; } } ``` ## Related Resources Choose the right LLM for your budget and accuracy requirements Reduce costs with smart action caching and observe patterns Monitor usage patterns and track costs in real-time Optimize Browserbase infrastructure costs and session management ================================================ FILE: packages/docs/v3/best-practices/deployments.mdx ================================================ --- title: 'Deploying Stagehand' description: 'Deploy your AI agents and automations to the cloud' --- import { V3Banner } from '/snippets/v3-banner.mdx'; **🌟 Preview: Browser Functions** - Deploy your web automation code directly on Browserbase with browser functions. Scale your `act()` automations in the cloud with zero infrastructure setup. 
Reach out to hello@browserbase.com to get beta access. ## Deploy on Vercel Securely run Stagehand on Browserbase inside a Vercel Function. This guide shows a minimal, production-safe HTTP endpoint you can call directly or on a schedule. ### 1. Install Vercel CLI To download and install Vercel CLI, run one of the following commands: ```bash pnpm pnpm i -g vercel ``` ```bash yarn yarn global add vercel ``` ```bash npm npm i -g vercel ``` ```bash bun bun add -g vercel ``` ### 2. Project layout ```text your-project/ api/ run.ts package.json tsconfig.json vercel.json ``` Create the structure with: ```bash mkdir -p api touch api/run.ts package.json vercel.json tsconfig.json ``` ### 3. `api/run.ts` (Node.js runtime) ```typescript // api/run.ts import type { VercelRequest, VercelResponse } from "@vercel/node"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod"; export default async function handler(req: VercelRequest, res: VercelResponse): Promise<void> { try { const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY!, projectId: process.env.BROWSERBASE_PROJECT_ID!, disablePino: true, model: { modelName: "google/gemini-2.5-flash", apiKey: process.env.GOOGLE_API_KEY!, }, // optional session params browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, region: "us-west-2", browserSettings: { blockAds: true, }, }, }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://www.stagehand.dev/"); await stagehand.act("click the evals button"); const fastestModel = await stagehand.extract("extract the fastest model", z.string()); await stagehand.close(); res.status(200).json({ ok: true, data: fastestModel }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); res.status(500).json({ ok: false, error: msg }); } } ``` ### 4. 
`package.json` ```json { "name": "bb-stagehand-on-vercel", "private": true, "type": "module", "engines": { "node": ">=18" }, "dependencies": { "@browserbasehq/stagehand": "^3.0.0" }, "devDependencies": { "@types/node": "^20.12.12", "@vercel/node": "^3.2.20", "typescript": "^5.2.2" } } ``` ### 5. `tsconfig.json` ```json { "compilerOptions": { "target": "ES2022", "module": "ES2022", "moduleResolution": "node", "outDir": ".vercel/output/functions", "strict": true, "esModuleInterop": true, "skipLibCheck": true, "types": ["node"] }, "include": ["api/**/*.ts"] } ``` ### 6. `vercel.json` ```json { "$schema": "https://openapi.vercel.sh/vercel.json", "functions": { "api/run.ts": { "maxDuration": 60 } } } ``` See Vercel's [configuring functions](https://vercel.com/docs/functions/configuring-functions) docs for more details. ### 7. Link your project Link your local folder to a Vercel project before configuring environment variables: ```bash # authenticate if needed vercel login # link the current directory to a Vercel project (interactive) vercel link ``` ### 8. Environment variables Do not commit `.env` in production. Add variables via Vercel CLI: ```bash vercel env add BROWSERBASE_API_KEY vercel env add BROWSERBASE_PROJECT_ID # (and your model key if needed) vercel env add GOOGLE_API_KEY ``` See also: [Browser Environment](/configuration/environment) for details on required variables. ### 9. Test locally Replicate the Vercel environment locally to exercise your Function before deploying. Run from the project root. ```bash # ensure dependencies are installed npm install # start the local Vercel dev server vercel dev --listen 5005 ``` ### 10. Deploy ```bash vercel vercel --prod ``` ### Execute the function #### Configure Protection Bypass for Automation Before invoking the production URL, create a Protection Bypass for Automation: 1. Generate a 32-character secret (you can use `openssl rand -hex 16`) 2. Go to your project in Vercel 3. 
Navigate to Settings → Deployment Protection 4. Add the secret to "Protection Bypass for Automation" Then invoke the function with the bypass header: ```bash curl -X POST \ -H "x-vercel-protection-bypass: <your-bypass-secret>" \ https://<your-deployment-url>/api/run ``` ### Optional: Cron on Vercel Hit the same endpoint on a schedule by extending `vercel.json`: ```json { "$schema": "https://openapi.vercel.sh/vercel.json", "functions": { "api/run.ts": { "maxDuration": 60 } }, "crons": [ { "path": "/api/run", "schedule": "0 * * * *" } ] } ``` ### Features - **No local browsers needed** with `env: "BROWSERBASE"`. [Browserbase](https://www.browserbase.com/) provides the browsers. - **Fast functionality**: Offload browser work to Browserbase and return JSON promptly. - **Long-running tasks**: Raise `maxDuration` and/or consider Edge runtime limits depending on plan. ================================================ FILE: packages/docs/v3/best-practices/deterministic-agent.mdx ================================================ --- title: Deterministic Agent Scripts sidebarTitle: Deterministic Agent description: Use auto-caching to convert agent workflows into fast, deterministic scripts --- import { V3Banner } from '/snippets/v3-banner.mdx'; Agent workflows are powerful for exploring and automating complex tasks, but they can be slow and non-deterministic. This guide shows you how to use Stagehand's built-in auto-caching to convert agent-discovered workflows into fast, deterministic scripts that run 10-100x faster. ## Why Use Auto-Caching with Agent? Cached agent workflows run 10-100x faster by skipping LLM inference on subsequent runs Eliminate repeated LLM calls—first run uses inference, subsequent runs use cache Cached actions are deterministic and more predictable than fresh agent exploration Works automatically—just specify `cacheDir` and Stagehand handles everything ## How Auto-Caching Works When you specify a `cacheDir`: 1. **First run**: Agent explores and executes workflow using LLM inference 2. 
**Actions cached**: All actions are automatically saved to local cache 3. **Subsequent runs**: Same workflow reuses cached actions (no LLM calls) 4. **Performance**: 10-100x faster execution, zero LLM tokens The cache key is automatically generated based on: - Agent instruction - Start URL - Agent execution options - Agent configuration ## Basic Auto-Caching with Agent Simply add `cacheDir` when initializing Stagehand: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; // Enable auto-caching const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "agent-cache" // Automatic caching enabled }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }, systemPrompt: "You are a helpful assistant that can use a web browser.", }); // First run: Uses LLM inference (~20-30 seconds, ~50,000 tokens) // Subsequent runs: Uses cached actions (~2-3 seconds, 0 tokens) const result = await agent.execute({ instruction: "Find the login form, fill in username 'demo' and password 'test123', then click submit", maxSteps: 10 }); console.log("Completed:", result.success); console.log("Actions taken:", result.actions.length); await stagehand.close(); ``` That's it! The second time you run this script, it will reuse the cached agent actions automatically. 
## Organizing Caches by Workflow Use descriptive cache directories for different workflows: ```typescript // Login workflow const loginStagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/login-workflow" }); // Checkout workflow const checkoutStagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/checkout-workflow" }); // Data extraction workflow const extractStagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/extraction-workflow" }); ``` ## Complete Example: First vs Subsequent Runs ### First Run (Exploration Mode) ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/github-search" // Enable caching }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://github.com"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }, systemPrompt: "You are a helpful assistant that can use a web browser.", }); console.log("First run: Exploring with agent..."); const startTime = Date.now(); const result = await agent.execute({ instruction: "Search for 'stagehand' and click the first repository result", maxSteps: 10 }); const duration = Date.now() - startTime; console.log(`First run completed in ${duration}ms`); console.log(`Actions: ${result.actions.length}`); console.log(`Status: ${result.success}`); await stagehand.close(); // Output (example): // First run completed in 25000ms // Actions: 8 // Status: true ``` ### Subsequent Runs (Cached Mode) Run the **exact same script** again: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/github-search" // Same cache directory }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://github.com"); const agent = stagehand.agent({ mode: "cua", 
model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }, systemPrompt: "You are a helpful assistant that can use a web browser.", }); console.log("Subsequent run: Using cached actions..."); const startTime = Date.now(); const result = await agent.execute({ instruction: "Search for 'stagehand' and click the first repository result", maxSteps: 10 }); const duration = Date.now() - startTime; console.log(`Subsequent run completed in ${duration}ms`); console.log(`Actions: ${result.actions.length}`); console.log(`Status: ${result.success}`); await stagehand.close(); // Output (example): // Subsequent run completed in 2500ms ← 10x faster! // Actions: 8 // Status: true ``` ## Using History for Analysis While caching handles execution automatically, you can still use `stagehand.history` to analyze what happened: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import fs from "fs/promises"; const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow" }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY }, systemPrompt: "You are a helpful assistant that can use a web browser.", }); await agent.execute({ instruction: "Complete the login process", maxSteps: 10 }); // Analyze what the agent did const history = await stagehand.history; console.log(`\nWorkflow Analysis:`); console.log(`Total operations: ${history.length}`); const agentOps = history.filter(e => e.method === 'agent'); const actOps = history.filter(e => e.method === 'act'); const navOps = history.filter(e => e.method === 'navigate'); console.log(`- Agent executions: ${agentOps.length}`); console.log(`- Act operations: ${actOps.length}`); console.log(`- Navigate operations: ${navOps.length}`); // 
Save for documentation await fs.writeFile( 'workflow-analysis.json', JSON.stringify(history, null, 2) ); await stagehand.close(); ``` ## Cache Management ### Clear Cache When Site Changes If the website structure changes, clear the cache to force fresh exploration: ```typescript import { rmSync } from 'fs'; // Clear specific workflow cache rmSync('cache/login-workflow', { recursive: true, force: true }); // Then run with fresh exploration const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/login-workflow" // Will rebuild cache }); ``` ### Programmatic Cache Control ```typescript import { rmSync, existsSync, statSync } from 'fs'; function clearCacheIfNeeded(cacheDir: string, maxAge: number = 7 * 24 * 60 * 60 * 1000) { if (!existsSync(cacheDir)) { return; // No cache to clear } const stats = statSync(cacheDir); const age = Date.now() - stats.mtimeMs; if (age > maxAge) { console.log(`Cache older than ${maxAge}ms, clearing...`); rmSync(cacheDir, { recursive: true, force: true }); } } // Clear cache if older than 7 days clearCacheIfNeeded('cache/workflow'); const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow" }); ``` ## Advanced Patterns ### Fallback to Fresh Exploration Combine caching with fallback for resilience: ```typescript async function executeWithFallback() { const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow", selfHeal: true // Enable self-healing }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); const agent = stagehand.agent({ model: "anthropic/claude-sonnet-4-20250514" }); try { // Try with cache const result = await agent.execute({ instruction: "Complete the checkout process", maxSteps: 15 }); console.log("Execution successful:", result.success); } catch (error) { console.error("Cached workflow failed:", error); // Clear cache and retry with fresh exploration rmSync('cache/workflow', { recursive: true, force: true }); 
console.log("Retrying with fresh exploration..."); const retryResult = await agent.execute({ instruction: "Complete the checkout process", maxSteps: 15 }); console.log("Retry successful:", retryResult.success); } await stagehand.close(); } ``` ### Version Control for Caches Commit cache directories to ensure consistent behavior across environments: ```gitignore # .gitignore # Commit cache directories for deterministic CI/CD !cache/ !cache/**/*.json ``` ```typescript // CI/CD pipeline will use pre-generated cache const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/production-workflow" // Committed to repo }); ``` ## Best Practices Organize caches by workflow or feature: ```typescript // Good: descriptive cache names cacheDir: "cache/user-registration" cacheDir: "cache/product-search" cacheDir: "cache/checkout-flow" // Avoid: generic names cacheDir: "cache" cacheDir: "my-cache" ``` Implement a strategy for refreshing caches: ```typescript // Option 1: Time-based invalidation if (isCacheOlderThan('cache/workflow', 7)) { clearCache('cache/workflow'); } // Option 2: Version-based invalidation const CACHE_VERSION = 'v2'; const cacheDir = `cache/workflow-${CACHE_VERSION}`; // Option 3: Manual invalidation flag if (process.env.CLEAR_CACHE === 'true') { clearCache('cache/workflow'); } ``` Always test cached workflows in staging before production: ```typescript const env = process.env.NODE_ENV === 'production' ? 
'production' : 'staging'; const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: `cache/${env}-workflow` }); ``` Track cache usage for optimization: ```typescript const cacheHit = existsSync('cache/workflow') && statSync('cache/workflow').mtimeMs < Date.now(); if (cacheHit) { console.log("Cache hit - using cached workflow"); } else { console.log("Cache miss - exploring with agent"); } // Log metrics metrics.recordCacheHit(cacheHit); ``` ## Performance Comparison **Without Caching (Every Run):** ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE" }); // No cacheDir specified const result = await agent.execute({ instruction: "Complete workflow", maxSteps: 10 }); // Every run: ~20-30 seconds, ~50,000 tokens ``` **With Auto-Caching (First Run):** ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow" }); const result = await agent.execute({ instruction: "Complete workflow", maxSteps: 10 }); // First run: ~20-30 seconds, ~50,000 tokens (cached for next time) ``` **With Auto-Caching (Subsequent Runs):** ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow" // Reuses cache }); const result = await agent.execute({ instruction: "Complete workflow", maxSteps: 10 }); // Subsequent runs: ~2-3 seconds, 0 tokens ← 10-100x faster! ``` Cached agent workflows run **10-100x faster** and consume **zero LLM tokens** on subsequent runs. The first run pays the exploration cost, every run after is nearly instant. 
## Troubleshooting **Problem**: Workflow still slow on subsequent runs **Solutions**: - Verify `cacheDir` path is correct and consistent across runs - Ensure instruction, URL, and agent config are identical - Check file permissions on cache directory - Look for cache hit/miss logs in verbose mode ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow", verbose: 2 // Enable debug logs }); ``` **Problem**: Cached actions fail on subsequent runs **Solutions**: - Website may have changed—clear cache to re-explore - Enable self-healing to adapt to minor changes - Implement fallback logic to retry with fresh exploration ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", cacheDir: "cache/workflow", selfHeal: true // Adapt to changes }); ``` **Problem**: Cache directories growing uncontrolled **Solutions**: - Use version prefixes for cache directories - Implement automatic cleanup of old caches - Share cache directories for similar workflows ```typescript // Versioned caches const CACHE_VERSION = '2024-01'; const cacheDir = `cache/workflow-${CACHE_VERSION}`; // Cleanup old versions rmSync('cache/workflow-2023-12', { recursive: true, force: true }); ``` ## Next Steps Learn more about agent capabilities and configuration Complete guide to auto-caching with act() and agent() Monitor and track history and metrics Additional techniques for faster automation ================================================ FILE: packages/docs/v3/best-practices/history.mdx ================================================ --- title: History Tracking sidebarTitle: History Tracking description: Track and analyze Stagehand operations with the history API --- import { V3Banner } from '/snippets/v3-banner.mdx'; The history API captures every Stagehand operation for debugging, auditing, and workflow analysis. 
## Basic Usage ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); await stagehand.act("click login button"); // Get complete history const history = await stagehand.history; console.log(`Total operations: ${history.length}`); history.forEach((entry, i) => { console.log(`${i + 1}. ${entry.method} at ${entry.timestamp}`); }); await stagehand.close(); ``` ## History Entry Structure ```typescript interface HistoryEntry { method: "act" | "extract" | "observe" | "navigate" | "agent"; parameters: unknown; // Input parameters result: unknown; // Output/result timestamp: string; // ISO 8601 timestamp } ``` ## Common Use Cases ### Debugging Failures ```typescript try { await stagehand.act("click login button"); } catch (error) { const history = await stagehand.history; history.forEach((entry, i) => { const status = entry.result && 'error' in entry.result ? "FAILED" : "SUCCESS"; console.log(`${i + 1}. 
${status} - ${entry.method}`); }); } ``` ### Analyzing Timing ```typescript const history = await stagehand.history; const timings = history.map((entry, i) => { if (i === 0) return null; const duration = new Date(entry.timestamp).getTime() - new Date(history[i - 1].timestamp).getTime(); return { operation: entry.method, duration }; }).filter(Boolean); console.log("Slowest operations:", timings.sort((a, b) => b.duration - a.duration).slice(0, 3) ); ``` ### Operation Statistics ```typescript const history = await stagehand.history; const stats = history.reduce((acc, entry) => { acc[entry.method] = (acc[entry.method] || 0) + 1; return acc; }, {} as Record<string, number>); console.log("Operations:", stats); // { act: 5, extract: 2, observe: 3, navigate: 1 } ``` ### Saving History ```typescript import fs from "fs/promises"; const history = await stagehand.history; const metrics = await stagehand.metrics; await fs.writeFile( `workflow-report.json`, JSON.stringify({ history, totalOps: history.length, totalTokens: metrics.totalPromptTokens + metrics.totalCompletionTokens }, null, 2) ); ``` ## Filtering by Operation Type ```typescript const history = await stagehand.history; const actions = history.filter(e => e.method === 'act'); const extractions = history.filter(e => e.method === 'extract'); const agentOps = history.filter(e => e.method === 'agent'); console.log(`Actions: ${actions.length}`); console.log(`Extractions: ${extractions.length}`); console.log(`Agent executions: ${agentOps.length}`); ``` ## Combining with Metrics ```typescript const history = await stagehand.history; const metrics = await stagehand.metrics; const report = { totalOps: history.length, successful: history.filter(e => !e.result || !('error' in e.result)).length, failed: history.filter(e => e.result && 'error' in e.result).length, totalTokens: metrics.totalPromptTokens + metrics.totalCompletionTokens, avgTimePerOp: `${(metrics.totalInferenceTimeMs / history.length).toFixed(0)}ms` }; console.log(report); ``` Learn 
more about metrics, logging, and monitoring ## What's Tracked? Only Stagehand methods are tracked in history: ```typescript // Tracked await stagehand.act("click button"); // ✓ await stagehand.extract({ instruction: "..." }); // ✓ await stagehand.observe("find elements"); // ✓ await page.goto("https://example.com"); // ✓ // Not tracked await page.locator("button").click(); // ✗ Native Playwright await page.click("button"); // ✗ Native Playwright ``` ## Best Practices - **Save history for critical workflows** - Maintain audit trails for production - **Inspect history when debugging** - Check the last operations to identify failures - **Analyze timing periodically** - Find slow operations and optimize - **Combine with metrics** - Get complete visibility into performance and cost ## Next Steps Build fast, cached agent workflows Combine history with metrics Speed up workflows with caching Configure detailed execution traces ================================================ FILE: packages/docs/v3/best-practices/mcp-integrations.mdx ================================================ --- title: "MCP Integrations" description: "Using Model Context Protocol (MCP) integrations to enhance agent capabilities" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## What are MCP Integrations? MCP (Model Context Protocol) integrations allow you to connect your Stagehand agents to external tools, APIs, and services. This enables agents to perform actions beyond browser automation, such as web search, database operations, and API calls. MCP integrations make your agents more powerful by combining browser automation with external capabilities. The agent can intelligently decide when to use browser actions versus external tools. ## Connection Options There are two options for connecting to MCP servers: 1. **Pass a URL directly** - The simplest approach for quick setup 2. 
**Create a connection first** - Gives you more control over the connection MCP client support is currently only available in TypeScript. ## Passing a URL The simplest way to add MCP integrations is by providing server URLs directly in the agent configuration: ```typescript const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, ], systemPrompt: `You have access to web search through Exa. Use it to find current information before browsing.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); ``` ## Creating a Connection First Alternatively, you can establish MCP connections first and then pass the client objects: ```typescript import { connectToMCPServer } from "@browserbasehq/stagehand"; // Connect to MCP server const supabaseClient = await connectToMCPServer( `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` ); // You can also pass the config to start a local MCP server const notionClient = await connectToMCPServer({ command: "npx", args: ["-y", "@notionhq/notion-mcp-server"], env: { NOTION_TOKEN: process.env.NOTION_TOKEN, }, }); // Use the connected clients (example with Supabase + Notion) const agent = stagehand.agent({ provider: "openai", model: "computer-use-preview", integrations: [supabaseClient, notionClient], systemPrompt: `You can interact with Supabase databases and Notion. Use these tools to store and retrieve data.`, options: { apiKey: process.env.OPENAI_API_KEY, }, }); await agent.execute("Search for restaurants in New Brunswick, NJ and save the first result to the database"); ``` ## Authenticated MCP Servers Some MCP servers require authentication via HTTP request headers. 
You can pass request headers through `requestOptions`: ```typescript const authenticatedClient = await connectToMCPServer({ serverUrl: "https://mcp-server.example.com/mcp", requestOptions: { requestInit: { headers: { Authorization: `Bearer ${process.env.MCP_SERVER_API_KEY}`, }, }, }, }); ``` ## Multiple Integrations You can combine multiple MCP integrations in a single agent: ```typescript const databaseClient = await connectToMCPServer(/* database config */); const agent = stagehand.agent({ integrations: [ `https://search-service.example.com/mcp?apiKey=${process.env.SEARCH_API_KEY}`, databaseClient ], systemPrompt: `You have access to external tools for search and data storage. Use these tools strategically to complete tasks efficiently.` }); ``` ## Best Practices ### Choose the Right Connection Approach **When to use:** - Simple setup requirements - Standard API configurations - Getting started quickly **Benefits:** - Minimal code required - Automatic connection handling - Easy to configure **When to use:** - Custom connection options - Connection reuse across agents - Advanced error handling **Benefits:** - Full control over connections - Better error handling - Connection pooling capabilities ### Environment Variables Always use environment variables for API keys and sensitive information: ```bash # .env file SEARCH_API_KEY=your_search_service_key MCP_SERVICE_API_KEY=your_mcp_service_key OPENAI_API_KEY=your_openai_key DATABASE_URL=your_database_url DATABASE_API_KEY=your_database_key ``` ### Instructions Best Practices Provide clear instructions about available tools: ```typescript systemPrompt: `You have access to: 1. Web search tools - Use to find current information 2. Database tools - Use to store/retrieve data 3. Browser automation - Use for web interactions Always search for current information before making decisions. Store important data for later reference.` ``` ```typescript systemPrompt: "You can search and save data." 
``` ### Error Handling Implement proper error handling for MCP connections: ```typescript try { const client = await connectToMCPServer(serverUrl); const agent = stagehand.agent({ integrations: [client], // ... other config }); const result = await agent.execute(instruction); } catch (error) { console.error("MCP integration failed:", error); // Handle fallback behavior } ``` ## Troubleshooting **Problem:** MCP server connections timing out **Solutions:** - Verify server URLs are correct and accessible - Check network connectivity - Ensure API keys are valid and have proper permissions - Try connecting to servers individually to isolate issues **Problem:** Agent not using available MCP tools **Solutions:** - Make instructions more specific about when to use tools - Ensure API keys are properly configured - Check that the MCP server supports the expected tools - Verify tool descriptions are clear and actionable **Problem:** API key or authentication failures **Solutions:** - Verify all required environment variables are set - Check API key validity and permissions - Ensure URLs include necessary authentication parameters - Test MCP connections independently before using in agents ## Examples ### Web Search + Browser Automation ```typescript const agent = stagehand.agent({ integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], systemPrompt: `First search for current information, then use the browser to complete tasks based on what you find.` }); await agent.execute("Find the best laptop deals for 2025 and navigate to purchase the top recommendation"); ``` ### Data Extraction + Storage ```typescript const supabaseClient = await connectToMCPServer(/* config */); const agent = stagehand.agent({ integrations: [supabaseClient], systemPrompt: `Extract data from websites and store it using available database tools.` }); await agent.execute("Extract all restaurant information from this directory and save it to the database"); ``` ### Multi-tool 
Workflow ```typescript const agent = stagehand.agent({ integrations: [ `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, supabaseClient ], systemPrompt: `Use all available tools strategically: search for current info, browse websites, and store important data.` }); await agent.execute("Research competitor pricing, compare with our site, and store the analysis"); ``` ## Further Reading Learn the fundamentals of Stagehand agents Set up your own MCP server Create custom MCP tools ================================================ FILE: packages/docs/v3/best-practices/prompting-best-practices.mdx ================================================ --- title: Prompting Best Practices description: "Write effective prompts for reliable Stagehand automation" --- import { V3Banner } from '/snippets/v3-banner.mdx'; Good prompts make Stagehand reliable. Bad prompts cause failures. Here's how to write prompts that work consistently. ## Act Method Use `act()` for single actions on web pages. Each action should be focused and clear. ```typescript // Good - Single, specific actions await stagehand.act("click the 'Add to Cart' button"); await stagehand.act("type 'user@example.com' into the email field"); // Bad - Multiple actions combined await stagehand.act("fill out the form and submit it"); await stagehand.act("login with credentials and navigate to dashboard"); ``` ### Use Element Types, Not Colors Describe elements by their type and function rather than visual attributes like color. 
```typescript // Good - Element types and descriptive text await stagehand.act("click the 'Sign In' button"); await stagehand.act("type into the email input field"); // Bad - Color-based descriptions await stagehand.act("click the blue button"); await stagehand.act("type into the white input"); ``` ### Use Descriptive Language ```typescript // Good - Clear element identification await stagehand.act("click the 'Next' button at the bottom of the form"); await stagehand.act("type into the search bar at the top of the page"); // Bad - Vague descriptions await stagehand.act("click next"); await stagehand.act("type into search"); ``` ### Choose the Right Action Verbs - **Click** for buttons, links, checkboxes - **Type** for text inputs - **Select** for dropdowns - **Check/uncheck** for checkboxes - **Upload** for file inputs ```typescript // Good await stagehand.act("click the submit button"); await stagehand.act("select 'Option 1' from dropdown"); // Bad await stagehand.act("click submit"); await stagehand.act("choose option 1"); ``` ### Protect Sensitive Data Variables keep sensitive information out of prompts and logs. ```typescript // Use variables for sensitive data await stagehand.act("type %username% into the email field", { variables: { username: "user@example.com" } }); await stagehand.act("type %password% into the password field", { variables: { password: process.env.USER_PASSWORD } }); ``` Set `verbose: 0` in your Stagehand config to prevent secrets from appearing in logs. ## Extract Method Use `extract()` to pull structured data from pages. Define clear schemas and provide context. ### Schema Best Practices Use descriptive field names, correct types, and detailed descriptions. Field descriptions provide context that helps the model understand exactly what to extract. 
```typescript // Good - Descriptive names, correct types, and helpful descriptions const productData = await stagehand.extract( "Extract product information", z.object({ productTitle: z.string().describe("The main product name displayed on the page"), priceInDollars: z.number().describe("Current selling price as a number, without currency symbol"), isInStock: z.boolean().describe("Whether the product is available for purchase") }) ); // Bad - Generic names, wrong types, no descriptions const data = await stagehand.extract( "Get product details", z.object({ name: z.string(), // Too generic, no context price: z.string(), // Should be number stock: z.string() // Should be boolean, no context }) ); ``` ### Use Proper URL Types Specify URL types with `z.string().url()` to tell Stagehand to extract URLs. ```typescript // Good - Tells Stagehand to extract URLs const links = await stagehand.extract( "Extract navigation links", z.array(z.object({ text: z.string(), url: z.string().url() // Required for URL extraction })) ); // Single URL extraction const contactUrl = await stagehand.extract( "extract the contact page URL", z.string().url() ); ``` ## Observe Method Use `observe()` to discover actionable elements before acting on them. ### Check Elements First Verify elements exist before taking action to avoid errors. 
```typescript // Check for elements first const loginButtons = await stagehand.observe("Find the login button"); if (loginButtons.length > 0) { await stagehand.act(loginButtons[0]); } else { console.log("No login button found"); } ``` ### Be Specific About Element Types ```typescript // Good - Specific element types const submitButtons = await stagehand.observe("Find submit button in the form"); const dropdowns = await stagehand.observe("Find the state dropdown menu"); // Bad - Too vague const elements = await stagehand.observe("Find submit stuff"); const things = await stagehand.observe("Find state selection"); ``` ## Agent Method Use `agent()` for complex, multi-step workflows. Provide detailed instructions and set appropriate limits. ### Navigate First Don't include navigation in agent tasks. Handle it separately. ```typescript // Good - Navigate first await page.goto('https://amazon.com'); await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart'); // Bad - Navigation in task await agent.execute('Go to Amazon, search for headphones, and add one to cart'); ``` ### Be Highly Specific Detailed instructions lead to better results. ```typescript // Good - Detailed instructions await agent.execute({ instruction: "Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", maxSteps: 25 }); // Bad - Vague instructions await agent.execute("Find some good restaurants"); ``` ### Set Appropriate Step Limits Match step limits to task complexity. ```typescript // Simple task - fewer steps await agent.execute({ instruction: "Subscribe to the newsletter with email 'user@example.com'", maxSteps: 10 }); // Complex task - more steps await agent.execute({ instruction: "Research and compare 5 project management tools with pricing and features", maxSteps: 50 }); ``` ### Include Success Criteria Tell the agent how to know when it's done. 
```typescript // Good - Clear success criteria await agent.execute({ instruction: "Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", maxSteps: 20 }); // Bad - No validation await agent.execute("Add some items to cart"); ``` ## Common Mistakes to Avoid - **Combining multiple actions** - Keep each `act()` call to one action - **Using vague descriptions** - Be specific about which elements to interact with - **Exposing sensitive data** - Always use variables for credentials - **Skipping validation** - Check results before proceeding ## Testing Your Prompts 1. **Start simple** - Test basic functionality first 2. **Add complexity gradually** - Build up to complex workflows 3. **Monitor results** - Use logging to understand what's happening 4. **Iterate based on failures** - Refine prompts when they don't work Remember: Good prompting is iterative. When in doubt, be more specific rather than less. ================================================ FILE: packages/docs/v3/best-practices/speed-optimization.mdx ================================================ --- title: Speed Optimization sidebarTitle: Speed Optimization description: Optimize Stagehand performance for faster automation and reduced latency --- import { V3Banner } from '/snippets/v3-banner.mdx'; Stagehand performance depends on several factors: DOM processing speed, LLM inference time, browser operations, and network latency. This guide provides proven strategies to maximize automation speed. ## Quick Performance Wins ### 1. 
Plan Ahead with Observe Use a single `observe()` call to plan multiple actions, then execute them efficiently: ```typescript // Instead of sequential operations with multiple LLM calls await stagehand.act("Fill name field"); // LLM call #1 await stagehand.act("Fill email field"); // LLM call #2 await stagehand.act("Select country dropdown"); // LLM call #3 // Use single observe to plan all form fields - one LLM call const formFields = await stagehand.observe("Find all form fields to fill"); // Execute all actions without LLM inference for (const field of formFields) { await stagehand.act(field); // No LLM calls! } ``` **Performance Tip**: Acting on `observe` results avoids LLM inference entirely. This approach is 2-3x faster than direct `act()` calls and is the recommended pattern for multi-step workflows. Learn advanced caching patterns and cache invalidation strategies ### 2. Optimize DOM Processing Reduce DOM complexity before Stagehand processes the page: ```typescript // Remove heavy elements that slow down processing await page.evaluate(() => { // Remove video and iframe elements document.querySelectorAll('video, iframe').forEach(el => el.remove()); // Disable inline CSS animations document.querySelectorAll('[style*="animation"]').forEach(el => { (el as HTMLElement).style.animation = 'none'; }); }); // Then perform Stagehand operations await stagehand.act("Click the submit button"); ``` ### 3. 
Set Appropriate Timeouts Use shorter timeouts for simple operations and longer ones for complex page loads: ```typescript // Simple actions - reduce action timeout await stagehand.act("Click the login button", { timeout: 5000 // Default is 30000ms, reduce for simple clicks }); // Complex page loads - optimize navigation const page = stagehand.context.pages()[0]; await page.goto("https://heavy-spa.com", { waitUntil: "domcontentloaded", // Don't wait for all resources timeout: 15000 // Shorter than default 30s }); ``` ## Performance Monitoring and Benchmarking Track performance metrics and measure optimization impact: ### Performance Tracking ```typescript class PerformanceTracker { private speedMetrics: Map<string, number[]> = new Map(); async timedAct(page: Page, prompt: string): Promise<ActResult> { const start = Date.now(); const result = await stagehand.act(prompt); const duration = Date.now() - start; if (!this.speedMetrics.has(prompt)) { this.speedMetrics.set(prompt, []); } this.speedMetrics.get(prompt)!.push(duration); console.log(`Action "${prompt}" took ${duration}ms`); return result; } getAverageTime(prompt: string): number { const times = this.speedMetrics.get(prompt) || []; return times.reduce((a, b) => a + b, 0) / times.length; } } ``` Example Output: ``` Action "Fill form" took 1000ms Action "Click submit" took 2000ms Action "Confirm submission" took 5000ms ``` ### Before vs After Benchmarking ```typescript // Before optimization console.time("workflow"); await stagehand.act("Fill form"); await stagehand.act("Click submit"); await stagehand.act("Confirm submission"); console.timeEnd("workflow"); // 8000ms // After optimization with observe planning console.time("workflow-optimized"); const workflowActions = await stagehand.observe("Find form, submit, and confirm elements"); // Execute actions sequentially to avoid conflicts for (const action of workflowActions) { await stagehand.act(action); } console.timeEnd("workflow-optimized"); // 500ms ``` Example Output: ``` Workflow took 
8000ms Optimized workflow took 500ms ``` Set up comprehensive performance monitoring ## Related Resources Advanced caching patterns for maximum performance Balance speed improvements with cost considerations Optimize Browserbase settings for speed Choose the right model for speed vs accuracy ================================================ FILE: packages/docs/v3/best-practices/usecase-observe.mdx ================================================ --- sidebarTitle: Use Cases --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Real-World Use Cases ### E-commerce Product Discovery ```typescript // Discover product interaction elements const productActions = await stagehand.observe({ instruction: "Find add to cart buttons, size selectors, and product images" }); // Categorize actions by type const cartButtons = productActions.filter(a => a.description.toLowerCase().includes('cart') ); const sizeOptions = productActions.filter(a => a.description.toLowerCase().includes('size') ); // Execute purchase workflow if (sizeOptions.length > 0) { await stagehand.act(sizeOptions[0]); // Select size first } if (cartButtons.length > 0) { await stagehand.act(cartButtons[0]); // Then add to cart } ``` ### Form Handling & Validation ```typescript // Analyze form structure before filling const formElements = await stagehand.observe({ instruction: "Find form fields, validation messages, and submit buttons" }); // Check for required fields const requiredFields = formElements.filter(e => e.description.includes('required') || e.description.includes('*') ); console.log(`Found ${requiredFields.length} required fields to complete`); // Fill form systematically for (const field of requiredFields) { await stagehand.act(field); // Add appropriate input based on field type } ``` ### Dynamic Content & SPA Navigation ```typescript // Wait for and discover dynamically loaded content await page.waitForLoadState('networkidle'); const dynamicElements = await stagehand.observe({ instruction: "Find 
newly loaded content, infinite scroll triggers, or loading indicators", domSettleTimeoutMs: 15000 // Wait longer for dynamic content }); // Handle infinite scroll const scrollTriggers = dynamicElements.filter(e => e.description.toLowerCase().includes('load more') || e.description.toLowerCase().includes('scroll') ); if (scrollTriggers.length > 0) { await stagehand.act(scrollTriggers[0]); // Recursively observe new content const newContent = await stagehand.observe("Find additional items"); } ``` ### Multi-Step Workflow Planning ```typescript // Plan entire checkout flow upfront async function planCheckoutWorkflow() { // Step 1: Cart page analysis await page.goto('/cart'); const cartActions = await stagehand.observe("Find checkout and cart modification options"); // Step 2: Checkout page analysis const checkoutButton = cartActions.find(a => a.description.includes('checkout')); if (checkoutButton) await stagehand.act(checkoutButton); const checkoutActions = await stagehand.observe("Find payment forms and shipping options"); // Step 3: Plan execution order const shippingFields = checkoutActions.filter(a => a.description.includes('shipping')); const paymentFields = checkoutActions.filter(a => a.description.includes('payment')); const submitButton = checkoutActions.find(a => a.description.includes('complete order')); return { shippingFields, paymentFields, submitButton }; } // Execute planned workflow const workflow = await planCheckoutWorkflow(); // Fill shipping → payment → submit ``` ================================================ FILE: packages/docs/v3/best-practices/user-data.mdx ================================================ --- title: User Data Directory sidebarTitle: User Data description: Persist browser data between sessions --- import { V3Banner } from '/snippets/v3-banner.mdx'; ### User Data Directory Persist browser data between sessions. 
#### Local Sessions For local sessions, use the `userDataDir` option: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { userDataDir: "./browser-data", }, }); await stagehand.init(); ``` #### Browserbase Sessions For Browserbase sessions, use [contexts](https://docs.browserbase.com/features/contexts) to persist browser data: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionCreateParams: { browserSettings: { context: { id: "my-context-id", persist: true, }, }, }, }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ================================================ FILE: packages/docs/v3/best-practices/using-multiple-tabs.mdx ================================================ --- title: 'Using Multiple Tabs' description: 'Act on multiple tabs with Stagehand' --- import { V3Banner } from '/snippets/v3-banner.mdx'; Many modern web applications open new tabs when users click certain buttons or links. Without proper multitab support, automation scripts break when expected content appears in a new tab rather than the current one. Stagehand's multitab capabilities ensure your automations work seamlessly across multitab workflows. ## The Stagehand Page Stagehand automatically adapts to multitab workflows. The active page (accessed via `context.activePage()`) always points to the most recently opened or active tab, ensuring your automations continue working even when new tabs are created. 
This means you can continue using familiar patterns: ```typescript const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); await stagehand.act("click the button that opens a new tab"); // the active page now automatically points to the new tab await stagehand.extract("get data from new tab"); ``` **Important**: [Stagehand Agent](/v3/basics/agent) will always operate on the active page. If you need an agent to work across specific tabs, you'll need to manage page switching manually. ## Manual Page Management For more control or multitab workflows, you can manage multiple tabs explicitly: ```typescript // Create a second page await stagehand.context.newPage(); const pages = stagehand.context.pages(); const githubPage = pages[0]; const pythonPage = pages[1]; // Navigate each page to different repositories await githubPage.goto("https://github.com/browserbase/stagehand"); await pythonPage.goto("https://github.com/browserbase/stagehand-python"); // Extract data from both pages simultaneously const [stagehandStars, stagehandPythonStars] = await Promise.all([ stagehand.extract("extract the repository stars", { page: githubPage }), stagehand.extract("extract the repository stars", { page: pythonPage }) ]); console.log(`Stagehand stars: ${stagehandStars.extraction}`); console.log(`Stagehand-Python stars: ${stagehandPythonStars.extraction}`); ``` ## Next Steps Use `Agent` to autonomously execute multi-step tasks and complex workflows. Learn best practices for interacting with elements inside iframes. Manage browser contexts and sessions for complex automation scenarios. Handle errors gracefully and debug automation issues effectively. 
================================================ FILE: packages/docs/v3/configuration/browser.mdx ================================================ --- title: Browser sidebarTitle: Browser description: Configure Stagehand on Browserbase or locally --- import { V3Banner } from '/snippets/v3-banner.mdx'; Stagehand supports two primary environments: - **Browserbase** - Cloud-managed browser infrastructure optimized for production web automation at scale - **Local** - Run browsers directly on your machine for development and debugging ## Browserbase Environment Browserbase provides managed cloud browser infrastructure optimized for web automation at scale. It offers advanced features like stealth mode, proxy support, and persistent contexts. Discover the power of cloud-managed browser infrastructure with Browserbase. ### Multi-Region Support Stagehand API is available in multiple regions to optimize latency and support data residency requirements. The SDK automatically routes requests to the correct regional API endpoint based on your browser session's region. | Region | API Endpoint | | --- | --- | | **us-west-2** (Default) | https://api.stagehand.browserbase.com | | **us-east-1** | https://api.use1.stagehand.browserbase.com | | **eu-central-1** | https://api.euc1.stagehand.browserbase.com | | **ap-southeast-1** | https://api.apse1.stagehand.browserbase.com | Configure your browser session region in `browserbaseSessionCreateParams`: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionCreateParams: { region: "eu-central-1", // Browser runs in Frankfurt }, }); await stagehand.init(); ``` The API endpoint must match your browser session region. If there's a mismatch, you'll receive an error: `Session is in region 'X' but this API instance serves 'Y'. 
Please route your request to the X Stagehand API endpoint.` ### Disabling Stagehand API If you want to use Stagehand purely as a local library without routing through the Stagehand API, you can disable API mode: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", disableAPI: true, // Disable Stagehand API - runs locally with Browserbase }); await stagehand.init(); ``` Disabling the API is useful when you want to manage browser sessions directly while still using Stagehand's automation features locally. ### Environment Variables Before getting started, set up the required environment variables: ```bash .env BROWSERBASE_API_KEY=your_api_key_here BROWSERBASE_PROJECT_ID=your_project_id_here ``` Get your API key and Project ID from the [Browserbase Dashboard](https://browserbase.com/overview) ### Using Stagehand with Browserbase #### Basic Setup The simplest way to get started is with default settings: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", }); await stagehand.init(); ``` #### Advanced Configuration Configure browser settings, proxy support, and other session parameters: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", // Optional: API Key and Project ID will be pulled directly from your environment apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, browserbaseSessionCreateParams: { proxies: true, region: "us-west-2", browserSettings: { viewport: { width: 1920, height: 1080 }, blockAds: true, }, }, }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, browserbaseSessionCreateParams: { projectId: process.env.BROWSERBASE_PROJECT_ID!, 
proxies: true, region: "us-west-2", timeout: 3600, // 1 hour session timeout keepAlive: true, // Available on Startup plan browserSettings: { advancedStealth: false, // this is a Scale Plan feature - reach out to support@browserbase.com to enable blockAds: true, solveCaptchas: true, recordSession: false, viewport: { width: 1920, height: 1080, }, }, userMetadata: { userId: "automation-user-123", environment: "production", }, }, }); ``` ### Alternative: Browserbase SDK If you prefer to manage sessions directly, you can use the Browserbase SDK: ```typescript import { Browserbase } from "@browserbasehq/sdk"; const bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY! }); const session = await bb.sessions.create({ projectId: process.env.BROWSERBASE_PROJECT_ID!, // Add configuration options here }); ``` #### Connecting to an Existing Session Connect to a previously created Browserbase session using its session ID: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionID: "existing-session-uuid-here", }); await stagehand.init(); console.log("Resumed Session ID:", stagehand.sessionId); ``` ## Local Environment The local environment runs browsers directly on your machine, providing full control over browser instances and configurations. Ideal for development, debugging, and scenarios requiring custom browser setups. 
### Environment Comparison | Feature | Browserbase | Local | | --- | --- | --- | | **Scalability** | High (cloud-managed) | Limited (local resources) | | **Stealth Features** | Advanced fingerprinting | Basic stealth | | **Proxy Support** | Built-in residential proxies | Manual configuration | | **Session Persistence** | Cloud context storage | File-based user data | | **Geographic Distribution** | Multi-region deployment | Single machine | | **Debugging** | Session recordings & logs | Direct DevTools access | | **Setup Complexity** | Environment variables only | Browser installation required | | **Cost** | Usage-based pricing | Infrastructure & maintenance | | **Best For** | Production, scale, compliance | Development, debugging | ### Basic Local Setup ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL" }); await stagehand.init(); console.log("Session ID:", stagehand.sessionId); ``` ### Advanced Local Configuration Customize browser launch options for local development: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { headless: false, // Show browser window devtools: true, // Open developer tools viewport: { width: 1280, height: 720 }, executablePath: '/opt/google/chrome/chrome', // Custom Chrome path port: 9222, // Fixed CDP debugging port args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-web-security', '--allow-running-insecure-content', ], userDataDir: './chrome-user-data', // Persist browser data preserveUserDataDir: true, // Keep data after closing chromiumSandbox: false, // Disable sandbox (adds --no-sandbox) ignoreHTTPSErrors: true, // Ignore certificate errors locale: 'en-US', // Set browser language deviceScaleFactor: 1.0, // Display scaling proxy: { server: 'http://proxy.example.com:8080', username: 'user', password: 'pass' }, downloadsPath: './downloads', // Download directory 
acceptDownloads: true, // Allow downloads connectTimeoutMs: 30000, // Connection timeout }, }); await stagehand.init(); ``` ## Advanced Configuration ### Keep Alive The `keepAlive` option controls whether the browser remains running after `stagehand.close()` is called or when the parent process exits unexpectedly (e.g., crash, `SIGTERM`, `SIGINT`). By default, Stagehand terminates the browser and cleans up all resources when it shuts down. Setting `keepAlive: true` keeps the browser running independently so you can reconnect to it later. ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", keepAlive: true, }); await stagehand.init(); // The browser session continues running after close() await stagehand.close(); // Later, reconnect to the same session const stagehand2 = new Stagehand({ env: "BROWSERBASE", browserbaseSessionID: stagehand.browserbaseSessionID, }); await stagehand2.init(); ``` #### Behavior by Environment | Behavior | `keepAlive: true` | `keepAlive: false` (default) | | --- | --- | --- | | **Browserbase** | Session stays active after `close()` | Session is terminated via API | | **Local** | Chrome process continues running | Chrome process is killed and temp profile is removed | | **On crash/signal** | Browser is left running | Browser is automatically cleaned up | #### Local Environment When running locally with `keepAlive: true`, the Chrome process is detached from the Node.js event loop, allowing your script to exit while the browser stays open. This is useful for debugging or for handing off a browser session to another process. 
```typescript const stagehand = new Stagehand({ env: "LOCAL", keepAlive: true, localBrowserLaunchOptions: { headless: false, }, }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); // Browser window stays open after the script exits await stagehand.close(); ``` #### Browserbase Environment On Browserbase, `keepAlive: true` keeps the cloud session active so you can reconnect later using `browserbaseSessionID`. This is useful for long-running workflows that span multiple script executions. The top-level `keepAlive` option overrides `browserbaseSessionCreateParams.keepAlive` when both are provided. ### Fixed CDP Debugging Port Specify a fixed Chrome DevTools Protocol (CDP) debugging port instead of using a randomly assigned one. ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", localBrowserLaunchOptions: { port: 9222, }, }); await stagehand.init(); ``` If no `port` is specified, a random port will be assigned. ### DOM Settle Timeout Configure how long Stagehand waits for the DOM to stabilize before taking actions. ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", domSettleTimeout: 3000 // Wait up to 3 seconds for DOM to settle }); ``` #### What is DOM Settling? 
DOM settling ensures that: - **Animations complete** before interacting with elements - **Lazy-loaded content** has time to appear - **JavaScript updates** finish before actions are taken - **Dynamic content** is fully rendered #### When to Adjust Increase `domSettleTimeout` for pages with: - Heavy animations or transitions - Lazy-loading or infinite scroll - Dynamic JavaScript frameworks (React, Vue, Angular) - Complex single-page applications ```typescript // For fast, static pages const stagehand = new Stagehand({ env: "BROWSERBASE", domSettleTimeout: 500 // Minimal wait }); // For dynamic, animated pages const stagehand = new Stagehand({ env: "BROWSERBASE", domSettleTimeout: 5000 // Longer wait for stability }); ``` Setting `domSettleTimeout` too low may cause actions to fail on elements that aren't ready. Setting it too high increases execution time unnecessarily. ## Troubleshooting - Verify your `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set correctly - Check that your API key has the necessary permissions - Ensure your Browserbase account has sufficient credits - Install Chrome or Chromium on your system - Set the correct `executablePath` for your Chrome installation - Check that required dependencies are installed (Linux: `libnss3-dev libatk-bridge2.0-dev libgtk-3-dev libxss1 libasound2`) - Increase session timeout in `browserbaseSessionCreateParams.timeout` - Use `keepAlive: true` for long-running sessions - Monitor session usage to avoid unexpected terminations ================================================ FILE: packages/docs/v3/configuration/logging.mdx ================================================ --- title: Logging sidebarTitle: Logging description: Set up logging, debugging, and error tracking for Stagehand workflows --- import { V3Banner } from '/snippets/v3-banner.mdx'; Stagehand provides comprehensive logging capabilities to help you debug automation workflows, track execution, and diagnose issues. 
Configure logging levels, structured output, and debugging tools for both development and production environments. ## Quick Start Choose your logging setup based on your environment: ```typescript Development import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, // Full debug output // restOfYourConfiguration... }); ``` ```typescript Production import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, // Standard logging - less noise disablePino: true, // Disable default console logging - no console spam // logger: yourProductionLogger, // Send to observability platform like Sentry or DataDog // restOfYourConfiguration... }); ``` ```typescript Testing import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, // Pino automatically disabled in test environments - no worker thread issues // logger: yourTestLogger, // Send to test logging framework like Jest // restOfYourConfiguration... }); ``` --- ## Operational Logging Real-time event logging during automation execution. ### Verbosity Level Control how much detail you see in logs: **Use for:** Development, debugging specific issues ```typescript const stagehand = new Stagehand({ verbose: 2, // Maximum detail // restOfYourConfiguration... }); ``` ``` [12:34:56] DEBUG: Capturing DOM snapshot [12:34:57] DEBUG: DOM contains 847 elements [12:34:58] DEBUG: LLM inference started [12:34:59] DEBUG: LLM response: {"selector": "#btn-submit", "method": "click"} [12:35:00] INFO: act completed successfully ``` **Use for:** Standard operations, staging, production ```typescript const stagehand = new Stagehand({ verbose: 1, // Default level // restOfYourConfiguration... 
}); ``` ``` [12:34:56] INFO: act started [12:35:00] INFO: act completed successfully [12:35:01] INFO: extract started [12:35:03] INFO: extract completed ``` **Use for:** Production with external monitoring, minimal noise ```typescript const stagehand = new Stagehand({ verbose: 0, // Errors only // restOfYourConfiguration... }); ``` ``` [12:35:05] ERROR: act failed: element not found [12:35:10] ERROR: navigation timeout exceeded ``` --- ### Log Destinations Logs can be sent to different destinations, including your console and external observability platforms: Fast, structured, colorized JSON logger with console output. **When to use:** Development, staging, or production without external observability; can manage multiple Stagehand instances ```typescript // Enabled by default - Pino handles console output automatically const stagehand = new Stagehand({ verbose: 1, // restOfYourConfiguration... }); ``` - `process.env.NODE_ENV === "test"` - `process.env.JEST_WORKER_ID !== undefined` (Jest tests) - `process.env.PLAYWRIGHT_TEST_BASE_DIR !== undefined` (Playwright tests) - `process.env.CI === "true"` (CI/CD environments) **Why auto-disable?** Pino uses worker threads for pretty-printing, which can cause issues in test runners. Simple console.log/error output. **When to use:** Automatically activated in tests, or when `disablePino: true` without setting an external logger ```typescript const stagehand = new Stagehand({ verbose: 1, disablePino: true, // Set to true automatically when a test is detected // restOfYourConfiguration... }); ``` - `process.env.NODE_ENV === "test"` - `process.env.JEST_WORKER_ID !== undefined` (Jest tests) - `process.env.PLAYWRIGHT_TEST_BASE_DIR !== undefined` (Playwright tests) - `process.env.CI === "true"` (CI/CD environments) **Why auto-disable?** Pino uses worker threads for pretty-printing, which can cause issues in test runners. Your custom logging function to receive all logs. 
Works independently of Pino - receives logs regardless of Pino setting. **When to use:** Development, debugging, or when you don't need querying capabilities. ```typescript // Simple logger without parsing (for basic console output) const simpleLogger = (logLine: LogLine) => { console.log(`[${logLine.level}] ${logLine.message}`); // Optional: log raw auxiliary data if (logLine.auxiliary) { console.log(' Context:', logLine.auxiliary); } }; ``` Then pass the logger in your Stagehand instance: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, logger: simpleLogger, disablePino: true, // Avoid duplicate processing // restOfYourConfiguration... }) ``` Your custom logging function to receive all logs. Works independently of Pino - receives logs regardless of Pino setting. **When to use:** Production with DataDog, Sentry, CloudWatch, or custom observability platforms for centralized monitoring and error alerting. Here are examples using Sentry and DataDog: ```typescript import * as Sentry from "@sentry/node"; const productionLogger = (logLine: LogLine) => { const aux = parseAuxiliary(logLine.auxiliary); // Send errors to Sentry if (logLine.level === 0) { Sentry.captureMessage(logLine.message, { level: 'error', extra: aux, }); } } // Helper to parse auxiliary data to be flat, numeric, and filterable function parseAuxiliary(aux?: LogLine['auxiliary']): Record<string, unknown> { if (!aux) return {}; const parsed: Record<string, unknown> = {}; for (const [key, entry] of Object.entries(aux)) { parsed[key] = entry.type === 'object' ? JSON.parse(entry.value) : entry.value; } return parsed; } ``` ```typescript import { datadogLogs } from "@datadog/browser-logs"; const productionLogger = (logLine: LogLine) => { const aux = parseAuxiliary(logLine.auxiliary); // Send all logs to DataDog datadogLogs.logger.log(logLine.message, { status: logLine.level === 0 ?
'error' : 'info', service: 'stagehand-automation', category: logLine.category, ...aux, }); } // Helper to parse auxiliary data to be flat, numeric, and filterable function parseAuxiliary(aux?: LogLine['auxiliary']): Record<string, unknown> { if (!aux) return {}; const parsed: Record<string, unknown> = {}; for (const [key, entry] of Object.entries(aux)) { parsed[key] = entry.type === 'object' ? JSON.parse(entry.value) : entry.value; } return parsed; } ``` ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, logger: productionLogger, disablePino: true, // Avoid duplicate processing // restOfYourConfiguration... }) ``` --- ## File-Based Session Logging Enable detailed file-based logging for all Stagehand operations by setting a config directory. This creates comprehensive logs for `agent.execute`, `act`, `observe`, `extract`, CDP events, and LLM requests/responses. ### Setup Add to your shell configuration (`~/.zshrc`, `~/.bashrc`, etc.): ```bash export BROWSERBASE_CONFIG_DIR=~/.config/browserbase ``` Then reload your shell or run `source ~/.zshrc`. ### Usage Run your Stagehand script as normal: ```bash tsx run_some_script_that_imports_stagehand.ts ``` Logs are written to `~/.config/browserbase/sessions/<session-id>/` with a `latest` symlink pointing to the most recent session.
### Viewing Logs Follow all logs as they happen: ```bash tail -f ~/.config/browserbase/sessions/latest/*.log ``` Or watch specific log types: ```bash # LLM requests and responses only tail -f ~/.config/browserbase/sessions/latest/llm_events.log # CDP (Chrome DevTools Protocol) events only tail -f ~/.config/browserbase/sessions/latest/cdp_events.log ``` View unified output sorted by timestamp: ```bash cat ~/.config/browserbase/sessions/latest/*.log | sort ``` Browse previous session logs: ```bash ls ~/.config/browserbase/sessions/ # Output: 2025-01-06_14-30-45_abc123 2025-01-06_15-45-12_def456 latest cat ~/.config/browserbase/sessions/2025-01-06_14-30-45_abc123/*.log | sort ``` ### Log Files Each session directory contains: | File | Contents | |------|----------| | `llm_events.log` | LLM requests and responses for act, extract, observe, and agent operations | | `cdp_events.log` | Chrome DevTools Protocol calls and events | | `stagehand.log` | General Stagehand operations and state changes | This is especially useful for debugging agent workflows where you need to trace the full sequence of LLM decisions, browser actions, and CDP interactions. --- ## LLM Inference Debugging **Development only** - Creates large files and contains page content. Do not use in production. Save complete LLM request/response dumps to disk for offline analysis. See exactly what DOM was sent to the LLM and why it chose the wrong element. 
```typescript const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, logInferenceToFile: true, // Writes files to ./inference_summary/ }); ``` Creates timestamped files for each LLM call: ``` ./inference_summary/ ├── act_summary/ │ ├── act_summary.json # Aggregate metrics │ ├── 20250127_123456_act_call.txt # LLM request │ ├── 20250127_123456_act_response.txt # LLM response │ ├── 20250127_123501_act_call.txt │ └── 20250127_123501_act_response.txt ├── extract_summary/ │ ├── extract_summary.json │ ├── 20250127_123510_extract_call.txt │ ├── 20250127_123510_extract_response.txt │ ├── 20250127_123511_metadata_call.txt │ └── 20250127_123511_metadata_response.txt └── observe_summary/ ├── observe_summary.json └── ... ``` **File Types:** Contains the complete LLM request: ```json { "modelCall": "act", "messages": [ { "role": "system", "content": "You are a browser automation assistant. You have access to these actions:\n- click\n- type\n- scroll\n..." }, { "role": "user", "content": "Click the sign in button\n\nDOM:\n\n \n \n \n \n" } ] } ``` Contains the LLM output: ```json { "modelResponse": "act", "rawResponse": { "selector": "#btn-1", "method": "click", "reasoning": "Found sign in button with ID btn-1" } } ``` Aggregates all calls with metrics: ```json { "act_summary": [ { "act_inference_type": "act", "timestamp": "20250127_123456", "LLM_input_file": "20250127_123456_act_call.txt", "LLM_output_file": "20250127_123456_act_response.txt", "prompt_tokens": 3451, "completion_tokens": 45, "inference_time_ms": 951 }, { "act_inference_type": "act", "timestamp": "20250127_123501", "LLM_input_file": "20250127_123501_act_call.txt", "LLM_output_file": "20250127_123501_act_response.txt", "prompt_tokens": 2890, "completion_tokens": 38, "inference_time_ms": 823 } ] } ``` --- ## Reference ### Logging Configuration All logging options are passed to the Stagehand constructor: ```typescript const stagehand = new Stagehand({ // ... your other configurations (env, model, etc.) 
// Logging options: verbose?: 0 | 1 | 2; // Log level (default: 1) logger?: (line: LogLine) => void; // External logger function disablePino?: boolean; // Disable Pino backend (default: false) logInferenceToFile?: boolean; // Save LLM requests to disk (default: false) }); ``` | Option | Default | Description | |--------|---------|-------------| | `verbose` | `1` | Log level: `0` = errors only, `1` = info, `2` = debug | | `logger` | `undefined` | Custom logger function for external platforms | | `disablePino` | `false` | Disable Pino (auto `true` in tests) | | `logInferenceToFile` | `false` | Save LLM requests to disk (default: false) | ### Log Structure Each log entry follows a structured format: ```typescript interface LogLine { message: string; // "act completed successfully" level?: 0 | 1 | 2; // error | info | debug category?: string; // "action", "llm", "browser", "cache" timestamp?: string; // ISO 8601 timestamp auxiliary?: { // Additional structured metadata [key: string]: { value: string; // Serialized value type: "object" | "string" | "integer" | "float" | "boolean"; }; }; } ``` ```json { "category": "action", "message": "act completed successfully", "level": 1, "timestamp": "2025-01-27T12:35:00.123Z", "auxiliary": { "selector": { "value": "#btn-submit", "type": "string" }, "executionTime": { "value": "1250", "type": "integer" } } } ``` ```json { "category": "llm", "message": "inference completed", "level": 1, "timestamp": "2025-01-27T12:34:58.456Z", "auxiliary": { "model": { "value": "gpt-4o", "type": "string" }, "promptTokens": { "value": "3451", "type": "integer" }, "completionTokens": { "value": "45", "type": "integer" } } } ``` ```json { "category": "action", "message": "action failed: element not found", "level": 0, "timestamp": "2025-01-27T12:35:05.789Z", "auxiliary": { "selector": { "value": "#missing-btn", "type": "string" }, "url": { "value": "https://example.com/form", "type": "string" } } } ``` --- ## Next Steps Now that you have logging 
configured, explore additional debugging and monitoring tools in [the Observability guide](/v3/configuration/observability): Track all LLM operations (act, extract, observe, agent) with parameters, results, and timestamps. Perfect for debugging sequences and replaying workflows. Monitor token usage and performance in real-time. Track costs per operation, identify expensive calls, and optimize resource usage. Save complete LLM request/response dumps to disk. See exactly what DOM was sent to the LLM and why it made specific decisions. Watch your automation visually with session recordings, network monitoring, and real-time browser inspection (Browserbase only). ================================================ FILE: packages/docs/v3/configuration/models.mdx ================================================ --- title: Models sidebarTitle: Models description: Use any LLM model with Stagehand for optimal performance --- import { V3Banner } from '/snippets/v3-banner.mdx'; Understand web pages, plan actions, and interact with complex interfaces with Google, OpenAI, Anthropic, xAI, DeepSeek, Perplexity, Azure, Ollama, the [Vercel AI Gateway](https://vercel.com/docs/ai-gateway), or any other LLM model from [the Vercel AI SDK](https://sdk.vercel.ai/providers). --- ## Configuration Setup ### Quick Start Set your API key in `.env` and Stagehand handles the rest. No explicit configuration needed! Get started with Google Gemini (recommended for speed and cost): ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "google/gemini-2.5-flash" // API key auto-loads from GOOGLE_GENERATIVE_AI_API_KEY - set in your .env }); await stagehand.init(); ``` --- ### First Class Models Use any model from the following supported providers. 
```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "google/gemini-2.5-flash" // API key auto-loads from GOOGLE_GENERATIVE_AI_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Google models →](https://ai.google.dev/gemini-api/docs/models) Google Vertex requires `experimental: true` in the Stagehand constructor. ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", experimental: true, // required for Vertex model: { modelName: "vertex/gemini-3-flash-preview", project: "your-gcp-project-id", location: "us-central1", googleAuthOptions: { credentials: { client_email: "your-sa@project.iam.gserviceaccount.com", private_key: process.env.GOOGLE_SERVICE_ACCOUNT_PRIVATE_KEY, }, }, }, }); await stagehand.init(); ``` The `model` object accepts: - `modelName` — The Vertex model, prefixed with `vertex/` (e.g. `vertex/gemini-3-flash-preview`) - `project` — Your GCP project ID - `location` — Your Vertex AI region (e.g. 
`us-central1`) - `googleAuthOptions.credentials` — Service account credentials with `client_email` and `private_key` [View all supported Vertex AI models →](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "anthropic/claude-haiku-4-5" // API key auto-loads from ANTHROPIC_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Anthropic models →](https://docs.anthropic.com/en/docs/models-overview) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "openai/gpt-5" // API key auto-loads from OPENAI_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported OpenAI models →](https://platform.openai.com/docs/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "azure/gpt-5" // API key auto-loads from AZURE_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Azure models →](https://ai.azure.com/catalog) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "cerebras/llama-4-scout" // API key auto-loads from CEREBRAS_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Cerebras models →](https://inference-docs.cerebras.ai/models/overview) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "deepseek/deepseek-chat" // API key auto-loads from DEEPSEEK_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported DeepSeek models →](https://api-docs.deepseek.com/quick_start/pricing) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; 
const stagehand = new Stagehand({ env: "BROWSERBASE", model: "groq/llama-3.1-8b-instant" // API key auto-loads from GROQ_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Groq models →](https://console.groq.com/docs/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "mistral/codestral-2508" // API key auto-loads from MISTRAL_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Mistral models →](https://docs.mistral.ai/getting-started/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "ollama/llama3.2" // No API key required }); await stagehand.init(); ``` [View all supported Ollama models →](https://ollama.com/library) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "perplexity/sonar-reasoning" // API key auto-loads from PERPLEXITY_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported Perplexity models →](https://docs.perplexity.ai/getting-started/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "togetherai/Qwen/Qwen3-235B-A22B-Instruct-2507-tput" // API key auto-loads from TOGETHER_AI_API_KEY - set in your .env }); await stagehand.init(); ``` [View all supported TogetherAI models →](https://www.together.ai/models) ```typescript TypeScript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "xai/grok-4-fast-reasoning" // API key auto-loads from XAI_API_KEY - set in your .env }); await stagehand.init(); ``` [View all xAI models →](https://docs.x.ai/docs/models) --- ### Custom Models Amazon Bedrock, Cohere, all [first class 
models](/v3/configuration/models#first-class-models), and any model from [the Vercel AI SDK](https://sdk.vercel.ai/providers) are supported. Use this configuration for custom endpoints and custom retry or caching logic. We'll use Amazon Bedrock and Google as examples below. Install the Vercel AI SDK for your provider. ```bash npm install @ai-sdk/amazon-bedrock ``` ```bash pnpm add @ai-sdk/amazon-bedrock ``` ```bash yarn add @ai-sdk/amazon-bedrock ``` ```bash bun add @ai-sdk/amazon-bedrock ``` ```typescript import { createAmazonBedrock } from '@ai-sdk/amazon-bedrock'; import { AISdkClient } from '@browserbasehq/stagehand'; const bedrockProvider = createAmazonBedrock({ region: 'us-east-1', accessKeyId: 'xxxxxxxxx', secretAccessKey: 'xxxxxxxxx', sessionToken: 'xxxxxxxxx', }); const bedrockClient = new AISdkClient({ model: bedrockProvider("amazon/nova-pro-latest"), }); ``` ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", llmClient: bedrockClient }); await stagehand.init(); ``` Install the Vercel AI SDK for your provider. ```bash npm install @ai-sdk/google ``` ```bash pnpm add @ai-sdk/google ``` ```bash yarn add @ai-sdk/google ``` ```bash bun add @ai-sdk/google ``` ```typescript import { createGoogleGenerativeAI } from '@ai-sdk/google'; import { AISdkClient } from '@browserbasehq/stagehand'; const googleProvider = createGoogleGenerativeAI({ apiKey: process.env.GEMINI_API_KEY, }); const googleClient = new AISdkClient({ model: googleProvider("google/gemini-2.5-flash"), }); ``` ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", llmClient: googleClient }); await stagehand.init(); ``` To implement a custom model, follow the steps for the provider you are using. See the Amazon Bedrock and Google examples above. All supported providers and models are in [the Vercel AI SDK](https://sdk.vercel.ai/providers). Install the Vercel AI SDK for your provider.
```typescript import { createProvider } from '@ai-sdk/provider'; import { AISdkClient } from '@browserbasehq/stagehand'; const provider = createProvider({ apiKey: 'xxxxxxxxx', }); const providerClient = new AISdkClient({ model: provider("model/name"), }); ``` ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", llmClient: providerClient }); await stagehand.init(); ``` --- ## Choose a Model Different models excel at different tasks. Consider speed, accuracy, and cost for your use case. Find detailed model comparisons and recommendations on our Model Evaluation page. **Quick Recommendations** | Use Case | Recommended Model | Why | | ------------------------- | ------------------------------------ | ------------------------------ | | **Production** | `google/gemini-2.5-flash` | Fast, accurate, cost-effective | | **Intelligence** | `google/gemini-3-pro-preview` | Best accuracy on hard tasks | | **Speed** | `google/gemini-2.5-flash` | Fastest response times | | **Cost** | `google/gemini-2.5-flash` | Best value per token | | **Local/offline** | `ollama/qwen3` | No API costs, full control | --- ## Advanced Options ### Agent Models (with CUA Support) **Default** The Stagehand agent by default uses the same model passed to Stagehand. All models ([first class](/v3/configuration/models#first-class-models) and [custom](/v3/configuration/models#custom-models)) are supported. Here's an example with Gemini: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", model: "google/gemini-2.5-flash", // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env // ... other stagehand options }); // Agent will use google/gemini-2.5-flash const agent = stagehand.agent(); ``` **Override (with CUA support)** However, the stagehand agent also accepts a `model` parameter, which accepts any [first class](/v3/configuration/models#first-class-models) model, including [computer use agents (CUA)](/v3/configuration/models#agent-models-with-cua-support). 
This is useful when you'd like the agent to use a different model than the one passed to Stagehand. To use a CUA model, you must pass the `mode: "cua"` parameter to the `agent()` method. If a non-CUA model is used, whether specified in Stagehand or overridden in the `agent()` method, an error will be thrown. **Deprecation Notice:** The `cua: true` option is deprecated and will be removed in a future version. Use `mode: "cua"` instead. ```typescript const agent = stagehand.agent({ mode: "cua", model: "google/gemini-2.5-computer-use-preview-10-2025", // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env // ... other agent options }); ``` ```typescript const agent = stagehand.agent({ mode: "cua", model: "anthropic/claude-sonnet-4-6", // ANTHROPIC_API_KEY is auto-loaded from .env // ... other agent options }); ``` ```typescript const agent = stagehand.agent({ mode: "cua", model: "openai/computer-use-preview", // OPENAI_API_KEY is auto-loaded from .env // ... other agent options }); ``` All [first class models](/v3/configuration/models#first-class-models) are supported. Here's an example with Gemini: ```typescript const agent = stagehand.agent({ model: "google/gemini-2.5-pro", // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env // ... 
other agent options }); ``` | Provider | Model | | -------- | ----- | | Anthropic | `anthropic/claude-haiku-4-5-20251001` | | Anthropic | `anthropic/claude-sonnet-4-6` | | Anthropic | `anthropic/claude-sonnet-4-5-20250929` | | Anthropic | `anthropic/claude-opus-4-5-20251101` | | Anthropic | `anthropic/claude-opus-4-6` | | Google | `google/gemini-2.5-computer-use-preview-10-2025` | | Google | `google/gemini-3-flash-preview` | | Google | `google/gemini-3-pro-preview` | | Microsoft | `microsoft/fara-7b` | | OpenAI | `openai/computer-use-preview` | | OpenAI | `openai/computer-use-preview-2025-03-11` | For overriding the agent API key, using a corporate proxy, adding provider-specific options, or other advanced use cases, the agent model can also take the form of an object. To learn more, see the [Agent Reference](/v3/references/agent). --- ### Custom Endpoints Use a custom endpoint if you need Azure OpenAI deployments or other enterprise deployments. For OpenAI, you can pass configuration directly through the `model` parameter, without using `llmClient`: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", model: { modelName: "openai/gpt-5", apiKey: process.env.OPENAI_API_KEY, baseURL: "https://custom-openai-endpoint.com/v1" } }); ``` For Anthropic, you can pass configuration directly through the `model` parameter, without using `llmClient`: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", model: { modelName: "anthropic/claude-haiku-4-5", apiKey: process.env.ANTHROPIC_API_KEY, baseURL: "https://custom-anthropic-endpoint.com", }, }); ``` For all other providers, use `llmClient`.
Here's an example with Hugging Face: ```typescript // pnpm add @ai-sdk/huggingface import { createHuggingFace } from "@ai-sdk/huggingface"; import { AISdkClient } from "@browserbasehq/stagehand"; const huggingFaceProvider = createHuggingFace({ apiKey: process.env.HUGGINGFACE_API_KEY, baseURL: "https://custom-huggingface-endpoint.com", }); const huggingFaceClient = new AISdkClient({ model: huggingFaceProvider("meta-llama/Llama-3.1-8B-Instruct"), }); const stagehand = new Stagehand({ env: "BROWSERBASE", llmClient: huggingFaceClient, }); ``` --- ### AI Gateway The [Vercel AI Gateway](https://vercel.com/docs/ai-gateway) lets you access models from multiple providers (OpenAI, Anthropic, Google, and more) through a single API key and interface. No extra provider SDKs or per-provider API keys needed. The AI Gateway is built into the `ai` package that Stagehand already uses -- no additional dependencies required. **Key benefits:** - Access models from all major providers with a single `AI_GATEWAY_API_KEY` - Automatic provider fallback and dynamic routing based on uptime and latency - Usage tracking and observability through the Vercel dashboard - Bring Your Own Key (BYOK) support for existing provider credentials Use the `gateway/` prefix followed by the provider and model name: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: "gateway/openai/gpt-5" // API key auto-loads from AI_GATEWAY_API_KEY - set in your .env }); await stagehand.init(); ``` Works with any model available on the gateway: ```typescript // Anthropic via gateway model: "gateway/anthropic/claude-sonnet-4.5" // Google via gateway model: "gateway/google/gemini-3-flash-preview" ``` Pass the API key and optional base URL explicitly using the model object format: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE", model: { modelName: "gateway/openai/gpt-5", apiKey: 
process.env.AI_GATEWAY_API_KEY, baseURL: "https://ai-gateway.vercel.sh/v3/ai" // optional custom endpoint } }); await stagehand.init(); ``` [View all available AI Gateway models →](https://vercel.com/docs/ai-gateway/models-and-providers) --- ### Extending the AI SDK Client For advanced use cases like custom retries or caching logic, you can extend the `AISdkClient`: ```typescript import { AISdkClient } from "@browserbasehq/stagehand"; class CustomRetryClient extends AISdkClient { async createChatCompletion(options) { let retries = 3; while (retries > 0) { try { return await super.createChatCompletion(options); } catch (error) { retries--; if (retries === 0) throw error; await new Promise((r) => setTimeout(r, 1000 * (4 - retries))); } } } } ``` Need custom caching? Consider using the built-in [caching feature](/v3/best-practices/caching). --- ### Legacy Model Format **Recommendation:** Use `provider/model` format. Example: - `model: "openai/gpt-4o"` (recommended) - `model: "gpt-4o"` (legacy) The following models work without the `provider/` prefix in the model parameter as part of legacy support: - `gemini-2.5-flash-preview-04-17` - `gemini-2.5-pro-preview-03-25` - `gemini-2.0-flash` - `gemini-2.0-flash-lite` - `gemini-1.5-flash` - `gemini-1.5-flash-8b` - `gemini-1.5-pro` - `claude-sonnet-4-6` - `claude-sonnet-4-5-20250929` - `claude-haiku-4-5-20251001` - `gpt-4o` - `gpt-4o-mini` - `o1` - `o1-mini` - `o3` - `o3-mini` - `gpt-4.1` - `gpt-4.1-mini` - `gpt-4.1-nano` - `o4-mini` - `gpt-4.5-preview` - `gpt-4o-2024-08-06` - `o1-preview` - `cerebras-llama-3.3-70b` - `cerebras-llama-3.1-8b` - `groq-llama-3.3-70b-versatile` - `groq-llama-3.3-70b-specdec` - `moonshotai/kimi-k2-instruct` --- ## Troubleshooting **Error:** `API key not found` **Solutions:** - Check that your `.env` file has the correct variable name for the provider you are using - Ensure environment variables are loaded (use `dotenv`) - Restart your application after updating your `.env` file | Provider | Environment Variable | |
---------- | ------------------------------ | | Google | `GOOGLE_GENERATIVE_AI_API_KEY` or `GEMINI_API_KEY` | | Vertex | Service account credentials (see [setup](#first-class-models)) | | Anthropic | `ANTHROPIC_API_KEY` | | OpenAI | `OPENAI_API_KEY` | | Azure | `AZURE_API_KEY` | | Cerebras | `CEREBRAS_API_KEY` | | DeepSeek | `DEEPSEEK_API_KEY` | | Groq | `GROQ_API_KEY` | | Mistral | `MISTRAL_API_KEY` | | Ollama | None (local) | | Perplexity | `PERPLEXITY_API_KEY` | | TogetherAI | `TOGETHER_AI_API_KEY` | | xAI | `XAI_API_KEY` | | AI Gateway | `AI_GATEWAY_API_KEY` | **Error:** `Unsupported model` **Solutions:** - Use the `provider/model` format: `openai/gpt-5` - Verify the model name exists in the provider's documentation - Check model name is spelled correctly - Ensure your Model API key can access the model **Error:** `Model does not support structured outputs` **Solutions:** - Check our [Model Evaluation page](https://www.stagehand.dev/evals) for recommended models **Symptoms:** Automation is expensive or slow **Solutions:** - Switch to cost-effective models (check [evals](https://www.stagehand.dev/evals) for comparisons) - Use faster models for simple tasks, powerful ones for complex tasks - Implement [caching](/v3/best-practices/caching) for repeated patterns Python is now supported in Stagehand v3! The Python SDK uses a BYOB (Bring Your Own Browser) architecture. **Solutions:** - See the [Python SDK documentation](/v3/sdk/python) for installation and usage - Check the [Python migration guide](/v3/migrations/python) if upgrading from v2 ### Need Help? Contact Support Can't find a solution? Have a question? 
Reach out to our support team: Email us at support@browserbase.com --- ## Next Steps Learn how to prompt LLMs for optimal results Test which models work best for your specific use case Cache responses to reduce costs and improve speed Reduce LLM spending with caching and smart model selection ================================================ FILE: packages/docs/v3/configuration/observability.mdx ================================================ --- title: Observability sidebarTitle: Observability description: Track Stagehand automation with session visibility and analytics --- import { V3Banner } from '/snippets/v3-banner.mdx'; Stagehand provides powerful observability features to help you monitor, track performance, and analyze your browser automation workflows. Focus on session monitoring, resource usage, and operational insights for both Browserbase and local environments. ## Browserbase Session Monitoring When running on Browserbase, you gain access to comprehensive cloud-based monitoring and session management through the Browserbase API and dashboard.
Browserbase Session Observability
### Live Session Visibility Browserbase provides real-time visibility into your automation sessions: **Session Dashboard Features** - Real-time browser screen recording and replay - Network request monitoring with detailed timing - JavaScript console logs and error tracking - CPU and memory usage metrics - Session status and duration tracking **Session Management & API Access** ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import { Browserbase } from "@browserbasehq/sdk"; const browserbase = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const sessionInfo = await browserbase.sessions.retrieve(stagehand.sessionId); console.log("Session status:", sessionInfo.status); console.log("Session region:", sessionInfo.region); console.log("CPU usage:", sessionInfo.avgCpuUsage); console.log("Memory usage:", sessionInfo.memoryUsage); console.log("Proxy bytes:", sessionInfo.proxyBytes); ``` ### Session Analytics & Insights Monitor live session status, resource usage, and geographic distribution. Scale and manage concurrent sessions with real-time insights. Review complete session recordings with frame-by-frame playback. Analyze network requests and debug browser interactions visually. Programmatically access session data, automate lifecycle management, and integrate with monitoring systems through our API. Track resource consumption, session duration, and API usage. Get detailed breakdowns of costs and utilization across your automation. 
### Session Monitoring & Filtering Query and monitor sessions by status and metadata: ```typescript import { Browserbase } from "@browserbasehq/sdk"; const browserbase = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); // List sessions with filtering async function getFilteredSessions() { const sessions = await browserbase.sessions.list({ status: 'RUNNING' }); return sessions.map(session => ({ id: session.id, status: session.status, // RUNNING, COMPLETED, ERROR, TIMED_OUT startedAt: session.startedAt, endedAt: session.endedAt, region: session.region, avgCpuUsage: session.avgCpuUsage, memoryUsage: session.memoryUsage, proxyBytes: session.proxyBytes, userMetadata: session.userMetadata })); } // Query sessions by metadata async function querySessionsByMetadata(query: string) { const sessions = await browserbase.sessions.list({ q: query }); return sessions; } ``` ## Local Environment Monitoring For local development, Stagehand provides performance monitoring and resource tracking capabilities directly on your machine. ### Performance Tracking ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", verbose: 1, // Monitor performance without debug noise }); await stagehand.init(); // Track local automation metrics const startTime = Date.now(); const initialMetrics = await stagehand.metrics; // ... 
perform automation tasks const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); await stagehand.act("click button"); await stagehand.extract({ instruction: "get data", schema: DataSchema }); const finalMetrics = await stagehand.metrics; const executionTime = Date.now() - startTime; console.log('Local Performance Summary:', { executionTime: `${executionTime}ms`, totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, totalInferenceTime: `${finalMetrics.totalInferenceTimeMs}ms`, tokensPerSecond: ((finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / (executionTime / 1000)).toFixed(2) }); ``` ## Resource Usage Monitoring When running locally, monitor system resource usage and browser performance: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import * as os from 'os'; import { performance } from 'perf_hooks'; class LocalResourceMonitor { private cpuUsage: number[] = []; private memoryUsage: number[] = []; startMonitoring() { const interval = setInterval(() => { // Track system resources const memUsage = process.memoryUsage(); this.memoryUsage.push(memUsage.heapUsed / 1024 / 1024); // MB // Track CPU (simplified) const loadAvg = os.loadavg()[0]; this.cpuUsage.push(loadAvg); }, 1000); return interval; } getResourceSummary() { return { avgMemoryUsage: this.memoryUsage.reduce((a, b) => a + b, 0) / this.memoryUsage.length, peakMemoryUsage: Math.max(...this.memoryUsage), avgCpuLoad: this.cpuUsage.reduce((a, b) => a + b, 0) / this.cpuUsage.length, totalDataPoints: this.cpuUsage.length }; } } const monitor = new LocalResourceMonitor(); const interval = monitor.startMonitoring(); const stagehand = new Stagehand({ env: "LOCAL" }); // ... run automation clearInterval(interval); console.log('Resource Usage:', monitor.getResourceSummary()); ``` Monitor token usage, costs, and speed. Set up automated alerting for critical failures. Implement cost tracking across different environments. 
Use session analytics to optimize automation workflows. ## Real-Time Metrics & Monitoring ### Basic Usage Tracking Monitor your automation's resource usage in real-time with `stagehand.metrics`: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); // Metrics are async in V3 const metrics = await stagehand.metrics; console.log(metrics); // Monitor during automation const startTime = Date.now(); const initialMetrics = await stagehand.metrics; // ... perform automation tasks const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); await stagehand.act("click the login button"); const data = await stagehand.extract({ instruction: "extract user info", schema: UserSchema }); const finalMetrics = await stagehand.metrics; const executionTime = Date.now() - startTime; console.log('Automation Summary:', { totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, executionTime: `${executionTime}ms`, avgInferenceTime: `${finalMetrics.totalInferenceTimeMs / 3}ms`, }); ``` ### Understanding Metrics Data The metrics object provides detailed breakdown by Stagehand operation: ```typescript interface StagehandMetrics { // Act operation metrics actPromptTokens: number; actCompletionTokens: number; actReasoningTokens: number; actCachedInputTokens: number; actInferenceTimeMs: number; // Extract operation metrics extractPromptTokens: number; extractCompletionTokens: number; extractReasoningTokens: number; extractCachedInputTokens: number; extractInferenceTimeMs: number; // Observe operation metrics observePromptTokens: number; observeCompletionTokens: number; observeReasoningTokens: number; observeCachedInputTokens: number; observeInferenceTimeMs: number; // Agent operation metrics agentPromptTokens: number; agentCompletionTokens: number; agentReasoningTokens: number; agentCachedInputTokens: number; agentInferenceTimeMs: number; // Cumulative 
totals totalPromptTokens: number; totalCompletionTokens: number; totalReasoningTokens: number; totalCachedInputTokens: number; totalInferenceTimeMs: number; } ``` **Example metrics output:** ```typescript const metrics = await stagehand.metrics; console.log(metrics); // { // actPromptTokens: 4011, // actCompletionTokens: 51, // actReasoningTokens: 12, // actCachedInputTokens: 0, // actInferenceTimeMs: 1688, // extractPromptTokens: 4200, // extractCompletionTokens: 243, // extractReasoningTokens: 18, // extractCachedInputTokens: 0, // extractInferenceTimeMs: 4297, // observePromptTokens: 347, // observeCompletionTokens: 43, // observeReasoningTokens: 5, // observeCachedInputTokens: 0, // observeInferenceTimeMs: 903, // agentPromptTokens: 0, // agentCompletionTokens: 0, // agentReasoningTokens: 0, // agentCachedInputTokens: 0, // agentInferenceTimeMs: 0, // totalPromptTokens: 8558, // totalCompletionTokens: 337, // totalReasoningTokens: 35, // totalCachedInputTokens: 0, // totalInferenceTimeMs: 6888 // } ``` ## Best Practices - Track session success rates and failure patterns - Monitor resource usage and scaling requirements - Set up automated alerting for critical failures - Implement cost tracking across different environments - Use session analytics to optimize automation workflows - Compare Browserbase vs local execution times - Monitor token usage and inference costs across models - Track geographic performance differences - Identify bottlenecks in automation workflows - Optimize for cost-effectiveness and speed - Track session distribution across regions - Monitor concurrent session limits and scaling - Analyze failure patterns and common error scenarios - Use session recordings for root cause analysis - Implement custom metadata for workflow categorization - Integrate session APIs with monitoring dashboards - Set up automated notifications for session failures - Track SLA compliance and performance benchmarks - Monitor resource costs and usage patterns - Use 
analytics data for capacity planning and optimization ## Next Steps Track all LLM operations with parameters, results, and timestamps for debugging. Configure logging levels, custom loggers, and file-based session logging. ================================================ FILE: packages/docs/v3/first-steps/ai-rules.mdx ================================================ --- title: AI Rules description: Using AI to write Stagehand code faster, and better. --- import { V3Banner } from '/snippets/v3-banner.mdx'; You're likely using AI to write code, and there's a **right and wrong way to do it.** This page is a collection of rules, configs, and copy‑paste snippets to allow your AI agents/assistants to write performant Stagehand code as fast as possible. ## Quickstart Configure Browserbase (Stagehand), Context7, DeepWiki, and Stagehand Docs in your MCP client. Drop in `.cursorrules` and `claude.md` so AI agents/assistants always emit Stagehand patterns. ## Using MCP Servers MCP (Model Context Protocol) servers act as intermediaries that connect AI systems to external data sources and tools. These servers enable your coding assistant to access real-time information, execute tasks, and retrieve structured data to enhance code generation accuracy. The following **MCP servers** provide specialized access to Stagehand documentation and related resources: Provides semantic search across documentation and codebase context. Context7 enables AI assistants to find relevant code patterns, examples, and implementation details from your project history. It maintains contextual understanding of your development workflow and can surface related solutions from previous work. **Installation:** ```json { "mcpServers": { "context7": { "command": "npx", "args": ["-y", "@upstash/context7-mcp"] } } } ``` Offers deep indexing of GitHub repositories and documentation.
DeepWiki allows AI agents to understand project architecture, API references, and best practices from the entire Stagehand ecosystem. It provides comprehensive knowledge about repository structure, code relationships, and development patterns. **Installation:** ```json { "mcpServers": { "deepwiki": { "url": "https://mcp.deepwiki.com/mcp" } } } ``` Direct access to official Stagehand documentation. This MCP server provides AI assistants with up-to-date API references, configuration options, and usage examples for accurate code generation. Mintlify auto-generates this server from the official docs, ensuring your AI assistant always has the latest information. **Usage:** ```json { "mcpServers": { "stagehand-docs": { "url": "https://docs.stagehand.dev/mcp" } } } ``` **How MCP Servers Enhance Your Development:** - **Real-time Documentation Access**: AI assistants can query the latest Stagehand docs, examples, and best practices - **Context-Aware Code Generation**: Servers provide relevant code patterns and configurations based on your specific use case - **Reduced Integration Overhead**: Standardized protocol eliminates the need for custom integrations with each documentation source - **Enhanced Accuracy**: AI agents receive structured, up-to-date information rather than relying on potentially outdated training data **Prompting tip:** Explicitly ask your coding agent/assistant to use these MCP servers to fetch relevant information from the docs so they have better context and know how to write proper Stagehand code. For example: **"Use the stagehand-docs MCP to fetch the act/observe guidelines, then generate code that follows them. Prefer cached observe results."** ## Editor rule files (copy‑paste) Drop these in `.cursorrules`, `.windsurfrules`, `claude.md`, or any agent rule framework: ``````md # Stagehand Project This is a project that uses Stagehand V3, a browser automation framework with AI-powered `act`, `extract`, `observe`, and `agent` methods.
The main class can be imported as `Stagehand` from `@browserbasehq/stagehand`. **Key Classes:** - `Stagehand`: Main orchestrator class providing `act`, `extract`, `observe`, and `agent` methods - `context`: A `V3Context` object that manages browser contexts and pages - `page`: Individual page objects accessed via `stagehand.context.pages()[i]` or created with `stagehand.context.newPage()` ## Initialize ```typescript import { Stagehand } from "@browserbasehq/stagehand"; const stagehand = new Stagehand({ env: "LOCAL", // or "BROWSERBASE" verbose: 2, // 0, 1, or 2 model: "openai/gpt-4.1-mini", // or any supported model }); await stagehand.init(); // Access the browser context and pages const page = stagehand.context.pages()[0]; const context = stagehand.context; // Create new pages if needed const page2 = await stagehand.context.newPage(); ``` ## Act Actions are called on the `stagehand` instance (not the page). Use atomic, specific instructions: ```typescript // Act on the current active page await stagehand.act("click the sign in button"); // Act on a specific page (when you need to target a page that isn't currently active) await stagehand.act("click the sign in button", { page: page2 }); ``` **Important:** Act instructions should be atomic and specific: - ✅ Good: "Click the sign in button" or "Type 'hello' into the search input" - ❌ Bad: "Order me pizza" or "Type in the search bar and hit enter" (multi-step) ### Observe + Act Pattern (Recommended) Cache the results of `observe` to avoid unexpected DOM changes: ```typescript const instruction = "Click the sign in button"; // Get candidate actions const actions = await stagehand.observe(instruction); // Execute the first action await stagehand.act(actions[0]); ``` To target a specific page: ```typescript const actions = await stagehand.observe("select blue as the favorite color", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Extract Extract data from pages using natural language 
instructions. The `extract` method is called on the `stagehand` instance. ### Basic Extraction (with schema) ```typescript import { z } from "zod"; // Extract with explicit schema const data = await stagehand.extract( "extract all apartment listings with prices and addresses", z.object({ listings: z.array( z.object({ price: z.string(), address: z.string(), }), ), }), ); console.log(data.listings); ``` ### Simple Extraction (without schema) ```typescript // Extract returns a default object with 'extraction' field const result = await stagehand.extract("extract the sign in button text"); console.log(result); // Output: { extraction: "Sign in" } // Or destructure directly const { extraction } = await stagehand.extract( "extract the sign in button text", ); console.log(extraction); // "Sign in" ``` ### Targeted Extraction Extract data from a specific element using a selector: ```typescript const reason = await stagehand.extract( "extract the reason why script injection fails", z.string(), { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" }, ); ``` ### URL Extraction When extracting links or URLs, use `z.string().url()`: ```typescript const { links } = await stagehand.extract( "extract all navigation links", z.object({ links: z.array(z.string().url()), }), ); ``` ### Extracting from a Specific Page ```typescript // Extract from a specific page (when you need to target a page that isn't currently active) const data = await stagehand.extract( "extract the placeholder text on the name field", { page: page2 }, ); ``` ## Observe Plan actions before executing them. 
Returns an array of candidate actions: ```typescript // Get candidate actions on the current active page const [action] = await stagehand.observe("Click the sign in button"); // Execute the action await stagehand.act(action); ``` Observing on a specific page: ```typescript // Target a specific page (when you need to target a page that isn't currently active) const actions = await stagehand.observe("find the next page button", { page: page2, }); await stagehand.act(actions[0], { page: page2 }); ``` ## Agent Use the `agent` method to autonomously execute complex, multi-step tasks. ### Basic Agent Usage ```typescript const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); const agent = stagehand.agent({ model: "google/gemini-2.0-flash", executionModel: "google/gemini-2.0-flash", }); const result = await agent.execute({ instruction: "Search for the stock price of NVDA", maxSteps: 20, }); console.log(result.message); ``` ### Computer Use Agent (CUA) For more advanced scenarios using computer-use models: ```typescript const agent = stagehand.agent({ mode: "cua", // Enable Computer Use Agent mode model: "anthropic/claude-sonnet-4-20250514", // or "google/gemini-2.5-computer-use-preview-10-2025" systemPrompt: `You are a helpful assistant that can use a web browser. 
Do not ask follow up questions, the user will trust your judgement.`, }); await agent.execute({ instruction: "Apply for a library card at the San Francisco Public Library", maxSteps: 30, }); ``` ### Agent with Custom Model Configuration ```typescript const agent = stagehand.agent({ mode: "cua", model: { modelName: "google/gemini-2.5-computer-use-preview-10-2025", apiKey: process.env.GEMINI_API_KEY, }, systemPrompt: `You are a helpful assistant.`, }); ``` ### Agent with Integrations (MCP/External Tools) ```typescript const agent = stagehand.agent({ integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], systemPrompt: `You have access to the Exa search tool.`, }); ``` ## Advanced Features ### DeepLocator (XPath Targeting) Target specific elements across shadow DOM and iframes: ```typescript await page .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p") .highlight({ durationMs: 5000, contentColor: { r: 255, g: 0, b: 0 }, }); ``` ### Multi-Page Workflows ```typescript const page1 = stagehand.context.pages()[0]; await page1.goto("https://example.com"); const page2 = await stagehand.context.newPage(); await page2.goto("https://example2.com"); // Act/extract/observe operate on the current active page by default // Pass { page } option to target a specific page await stagehand.act("click button", { page: page1 }); await stagehand.extract("get title", { page: page2 }); ``` `````` ``````md # Stagehand Python Project This is a project that uses [Stagehand Python](https://github.com/browserbase/stagehand-python), which provides AI-powered browser automation with `act`, `extract`, and `observe` methods. 
`Stagehand` is a class that provides configuration and browser automation capabilities with: - Pages accessed via `stagehand.context.pages()` or `stagehand.context.activePage()` - `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) - `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows - `stagehand.init()`: Initialize the browser session - `stagehand.close()`: Clean up resources `Page` extends Playwright's Page class with AI-powered methods: - `act()`: Perform actions on web elements using natural language - `extract()`: Extract structured data from pages using schemas - `observe()`: Plan actions and get selectors before executing `Agent` provides autonomous Computer Use Agent capabilities: - `execute()`: Perform complex multi-step tasks using natural language instructions Use the following rules to write code for this project. - To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. You can also pass in the following params: - The result of `observe` is a list of `ObserveResult` objects that can directly be used as params for `act` like this: - When writing code that needs to extract data from the page, use Stagehand `extract`. Use Pydantic models for schemas: ## Initialize ### Configuration Options Key configuration options in `StagehandConfig`: ## Act You can act directly with string instructions: Use variables for dynamic form filling: **Best Practices:** - Cache the results of `observe` to avoid unexpected DOM changes - Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the website") - Use specific, descriptive instructions Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". AVOID actions that are more than one step, i.e. "Order me pizza" or "Send an email to Paul asking him to call me". 
## Extract ### Simple String Extraction ### Structured Extraction with Schema (Recommended) Always use Pydantic models for structured data extraction: ### Array Extraction For arrays, use List types: ### Complex Object Extraction For more complex data structures: ## Agent System Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). ### Creating Agents ### Agent Execution **Best Practices:** - Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` - Break down complex tasks into smaller steps - Use error handling with try/except blocks - Combine agents for navigation with traditional methods for precise data extraction ## Project Structure Best Practices - Store configurations in environment variables or config files - Use async/await patterns consistently - Implement main automation logic in async functions - Use async context managers for resource management - Use type hints and Pydantic models for data validation - Handle exceptions appropriately with try/except blocks `````` ## Security notes - Do not embed secrets in docs or rule files; use env vars in MCP configs. - Avoid broad actions that may trigger unintended navigation; prefer `observe` first. ## Resources/references - Context7 MCP (Upstash) - https://github.com/upstash/context7 - DeepWiki MCP - https://mcp.deepwiki.com/ - Stagehand Docs MCP (Mintlify) - https://docs.stagehand.dev/mcp ================================================ FILE: packages/docs/v3/first-steps/installation.mdx ================================================ --- title: Installation description: Integrate Stagehand into an existing project. --- import { V3Banner } from '/snippets/v3-banner.mdx'; Install Stagehand in your current app with the TypeScript SDK. We recommend using the Node.js runtime environment to run Stagehand scripts. **Bun is now supported** as long as you do not integrate Stagehand with Playwright. 
Playwright is not compatible with Bun. ### Install dependencies ```bash npm npm install @browserbasehq/stagehand ``` ```bash pnpm pnpm add @browserbasehq/stagehand ``` ```bash yarn yarn add @browserbasehq/stagehand ``` ```bash bun icon="sparkles" bun add @browserbasehq/stagehand ``` If you plan to run locally, you need to have [Chrome](https://www.google.com/chrome/) installed on your machine. For cloud browser sessions, skip this. ### Configure environment Set environment variables (or a `.env` via your framework): ```bash Bash OPENAI_API_KEY=your_api_key BROWSERBASE_API_KEY=your_api_key BROWSERBASE_PROJECT_ID=your_project_id ``` Stagehand does not auto-load `.env` files. If you use a `.env` file, install and initialize `dotenv` in your own app code: ```bash npm install dotenv ``` ```typescript import dotenv from "dotenv"; dotenv.config({ path: ".env" }); ``` ### Use in your codebase Add Stagehand where you need browser automation. ```typescript import dotenv from "dotenv"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod"; dotenv.config({ path: ".env" }); // if needed async function main() { const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); const page = stagehand.context.pages()[0]; await page.goto("https://example.com"); // Act on the page await stagehand.act("Click the learn more button"); // Extract structured data const description = await stagehand.extract("extract the description", z.string()); console.log(description); await stagehand.close(); } main().catch((err) => { console.error(err); process.exit(1); }); ``` For Python and other language SDKs, use the **language selector** in the top left corner of the sidebar to view the SDK documentation for your language. 
## Next steps Environment, Browserbase vs Local, logging, timeouts, LLM customization Perform precise actions with natural language Typed data extraction with Zod schemas Discover elements and suggested actions ================================================ FILE: packages/docs/v3/first-steps/introduction.mdx ================================================ --- title: Introducing Stagehand sidebarTitle: Introduction description: Developers use Stagehand to reliably automate the web. --- import { V3Banner } from '/snippets/v3-banner.mdx'; Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. ## The Problem with Browser Automation Traditional frameworks like Playwright and Puppeteer force you to write brittle scripts that break with every UI change. Web agents promise to solve this with AI, but leave you at the mercy of unpredictable behavior. 
**You're stuck between two bad options:** - **Too brittle**: Traditional selectors break when websites change - **Too agentic**: AI agents are unpredictable and impossible to debug ## Enter Stagehand Stagehand gives you the best of both worlds through four powerful primitives that let you choose exactly how much AI to use: Execute actions using natural language Pull structured data with schemas Discover available actions on any page Automate entire workflows autonomously ```typescript // Act - Execute natural language actions await stagehand.act("click the login button"); // Extract - Pull structured data const price = await stagehand.extract( "extract the price", z.number() ); // Observe - Discover available actions const actions = await stagehand.observe("find submit buttons"); // Agent - Automate entire workflows const agent = stagehand.agent({ mode: "cua", model: "google/gemini-2.5-computer-use-preview-10-2025", }); await agent.execute("apply for this job"); ``` ## Why Developers Choose Stagehand - **Precise Control**: Mix AI-powered actions with deterministic code. You decide exactly how much AI to use. - **Actually Repeatable**: Save and replay actions exactly. No more "it worked on my machine" with browser automations. - **Maintainable at Scale**: One script can automate multiple websites. When sites change, your automations adapt. - **Composable Tools**: Choose your level of automation with Act, Extract, Observe, and Agent. ## Built for Modern Development Stagehand is designed for developers building production browser automations and AI agents that need reliable web access. Compatible with all Chromium-based browsers: Chrome, Edge, Arc, Brave, and more. Created and maintained by the team behind enterprise browser infrastructure. ## Get Started in 60 Seconds **Pro tip**: For best results, we recommend using Stagehand with [Browserbase](https://www.browserbase.com) for reliable cloud browser infrastructure. 
Build your first automation in under a minute Generate Stagehand scripts with AI See real-world automation examples Get help from the community ================================================ FILE: packages/docs/v3/first-steps/quickstart.mdx ================================================ --- title: Quickstart description: 'Stagehand allows you to build web automations with natural language and code.' --- import { V3Banner } from '/snippets/v3-banner.mdx'; If this is your **first time using Stagehand**, you should try [Director](https://director.ai) first. It's an agent that allows you to build Stagehand workflows using natural language. You can also try Stagehand using our [MCP server](/v3/integrations/mcp/introduction). Otherwise, the quickest way to start with Stagehand is with our CLI. It scaffolds a ready‑to‑run Stagehand app with sensible defaults, and an example script. This quickstart is for **TypeScript**. For other languages, change the language selector in the top left corner. ## 1) Create a sample project ```bash Bash npx create-browser-app ``` ## 2) Run it Follow the CLI prompts to enter the project directory and add your API keys. Then run the example script. ```bash Bash cd my-stagehand-app # Enter the project directory cp .env.example .env # Add your API keys npm start # Run the example script ``` ## 3) Use Stagehand (act, extract, observe) The scaffold includes an index.ts file that contains the example script. 
Here's what it looks like: ```typescript TypeScript import "dotenv/config"; import { Stagehand } from "@browserbasehq/stagehand"; async function main() { const stagehand = new Stagehand({ env: "BROWSERBASE" }); await stagehand.init(); console.log(`Stagehand Session Started`); console.log(`Watch live: https://browserbase.com/sessions/${stagehand.browserbaseSessionID}`); const page = stagehand.context.pages()[0]; await page.goto("https://stagehand.dev"); const extractResult = await stagehand.extract("Extract the value proposition from the page."); console.log(`Extract result:\n`, extractResult); await stagehand.act("Click the 'Evals' button."); const observeResult = await stagehand.observe("What can I click on this page?"); console.log(`Observe result:\n`, observeResult); const agent = stagehand.agent({ mode: "cua", model: "google/gemini-2.5-computer-use-preview-10-2025", systemPrompt: "You're a helpful assistant that can control a web browser.", }); const agentResult = await agent.execute("What is the most accurate model to use in Stagehand?"); console.log(`Agent result:\n`, agentResult); await stagehand.close(); } main().catch((err) => { console.error(err); process.exit(1); }); ``` To use, set provider keys in `.env` (e.g., `OPENAI_API_KEY`). For cloud browsers, add `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`. ## Next steps Learn about the Stagehand primitives: act, extract, observe, and agent. 
Perform actions on web pages with natural language Get structured data with Zod schemas Discover available elements and actions Autonomous multi-step browser workflows ================================================ FILE: packages/docs/v3/integrations/convex/configuration.mdx ================================================ --- title: "Use Stagehand in Convex" sidebarTitle: Configuration description: "Set up AI-powered browser automation in your Convex application" --- import { V3Banner } from '/snippets/v3-banner.mdx'; Clone the [GitHub repo](https://github.com/browserbase/convex-stagehand) to get started with Stagehand in Convex. ## Installation Install the convex-stagehand component and Zod for schema validation: ```bash npm install @browserbasehq/convex-stagehand zod ``` ## Configuration Add the Stagehand component to your `convex/convex.config.ts`: ```typescript convex/convex.config.ts import { defineApp } from "convex/server"; import stagehand from "@browserbasehq/convex-stagehand/convex.config"; const app = defineApp(); app.use(stagehand, { name: "stagehand" }); export default app; ``` ## Environment Variables Set the following environment variables in your [Convex Dashboard](https://dashboard.convex.dev): | Variable | Description | |----------|-------------| | `BROWSERBASE_API_KEY` | Your Browserbase API key | | `BROWSERBASE_PROJECT_ID` | Your Browserbase project ID | | `MODEL_API_KEY` | API key for your LLM provider (OpenAI, Anthropic, etc.) 
| ## Basic Usage ### Initialize the Client Create a Stagehand instance in your Convex action: ```typescript convex/actions.ts "use node"; import { Stagehand } from "@browserbasehq/convex-stagehand"; import { components } from "./_generated/api"; import { action } from "./_generated/server"; import { z } from "zod"; const stagehand = new Stagehand(components.stagehand, { browserbaseApiKey: process.env.BROWSERBASE_API_KEY!, browserbaseProjectId: process.env.BROWSERBASE_PROJECT_ID!, modelApiKey: process.env.MODEL_API_KEY!, }); ``` ### Extract Data Extract structured data from a web page using natural language instructions and Zod schemas: ```typescript export const extractProducts = action({ handler: async (ctx) => { const data = await stagehand.extract(ctx, { url: "https://example.com/products", instruction: "Extract all product names and prices", schema: z.object({ products: z.array(z.object({ name: z.string(), price: z.string(), })) }) }); return data.products; } }); ``` ### Perform Actions Execute browser interactions using plain English: ```typescript export const loginToSite = action({ handler: async (ctx) => { const result = await stagehand.act(ctx, { url: "https://example.com/login", action: "Click the login button and wait for the page to load" }); return result; } }); ``` ### Observe Elements Identify interactive elements on a page: ```typescript export const findNavLinks = action({ handler: async (ctx) => { const actions = await stagehand.observe(ctx, { url: "https://example.com", instruction: "Find all clickable navigation links" }); return actions; } }); ``` ### Run Autonomous Tasks Use the agent API for complex multi-step workflows: ```typescript export const searchAndExtract = action({ handler: async (ctx) => { const result = await stagehand.agent(ctx, { url: "https://google.com", instruction: "Search for 'convex database' and extract the top 3 results", options: { maxSteps: 10 } }); return result; } }); ``` ## Session Management For workflows that span 
multiple operations, you can reuse browser sessions: ```typescript export const multiStepWorkflow = action({ handler: async (ctx) => { // Start a session const session = await stagehand.startSession(ctx, { url: "https://example.com", options: { timeout: 30000, waitUntil: "networkidle" } }); // Perform multiple operations with the same session await stagehand.act(ctx, { sessionId: session.sessionId, action: "Click the login button" }); const data = await stagehand.extract(ctx, { sessionId: session.sessionId, instruction: "Extract the user profile information", schema: z.object({ name: z.string(), email: z.string(), }) }); // End the session await stagehand.endSession(ctx, { sessionId: session.sessionId }); return data; } }); ``` Session persistence allows you to preserve authentication state and cookies between operations. ## Model Configuration The default model is `openai/gpt-4o`. You can configure alternative providers: ```typescript const stagehand = new Stagehand(components.stagehand, { browserbaseApiKey: process.env.BROWSERBASE_API_KEY!, browserbaseProjectId: process.env.BROWSERBASE_PROJECT_ID!, modelApiKey: process.env.ANTHROPIC_API_KEY!, modelName: "anthropic/claude-sonnet-4-5-20250929", }); ``` ## Requirements - Convex 1.29.3 or later - A [Browserbase](https://browserbase.com) account with API credentials - An API key from a supported LLM provider (OpenAI, Anthropic, etc.) ## References Browse the complete repository on GitHub Learn more about Convex ================================================ FILE: packages/docs/v3/integrations/convex/introduction.mdx ================================================ --- title: "Convex" sidebarTitle: Introduction description: "AI-powered browser automation for Convex applications" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview This guide shows you how to use Stagehand with Convex to create AI-powered browser automation within your Convex applications. 
By the end of this guide, you'll know how to: - Set up the convex-stagehand component in your Convex app - Extract structured data from web pages using natural language - Execute browser actions via plain English instructions - Build autonomous multi-step workflows with the agent API ## When You'd Use This The Convex integration is perfect for scenarios where you need browser automation in serverless Convex functions: - **Data extraction pipelines**: Extract structured data from websites and store it directly in your Convex database - **Automated workflows**: Build background jobs that interact with web pages on behalf of users - **Form automation**: Automatically fill out and submit forms based on data from your Convex app - **Multi-step web processes**: Execute complex browser workflows that require decision-making and adaptation The integration wraps the Stagehand REST API to provide Convex actions with the ability to control cloud browsers via Browserbase: 1. **Act**: Perform actions like clicking, typing, or navigating using natural language 2. **Extract**: Extract structured data from web pages with Zod schemas 3. **Observe**: Identify and analyze interactive elements on the page 4. **Agent**: Run autonomous multi-step tasks with AI decision-making Browse the repository on GitHub Learn how to set up and configure convex-stagehand ================================================ FILE: packages/docs/v3/integrations/crew-ai/configuration.mdx ================================================ --- title: "Use CrewAI to Automate Browser Tasks" sidebarTitle: Configuration description: "Create intelligent agents that can interact with websites and automate browser tasks using natural language instructions" --- import { V3Banner } from '/snippets/v3-banner.mdx'; This guide walks you through setting up CrewAI with Browserbase to create agents that can perform web automation tasks using natural language instructions. 
## Step 1: Install Dependencies Install the required packages for CrewAI and Stagehand integration: ```bash pip install stagehand crewai crewai-tools ``` ## Step 2: Configure Environment Variables You'll need credentials from two services: 1. **Browserbase API Key and Project ID**: Get these from your [Browserbase dashboard](https://www.browserbase.com/) 2. **LLM API Key**: Get an API key from [OpenAI](https://platform.openai.com/api-keys) or [Anthropic](https://console.anthropic.com/) Store your API keys securely as environment variables: ```bash BROWSERBASE_API_KEY="your-browserbase-api-key" BROWSERBASE_PROJECT_ID="your-browserbase-project-id" OPENAI_API_KEY="your-openai-api-key" ANTHROPIC_API_KEY="your-anthropic-api-key" ``` ## Step 3: Create Your First Agent Create a Python script with a basic CrewAI agent: ```python import os from crewai import Agent, Task, Crew from crewai_tools import StagehandTool from stagehand.schemas import AvailableModel # Get API keys from environment browserbase_api_key = os.environ.get("BROWSERBASE_API_KEY") browserbase_project_id = os.environ.get("BROWSERBASE_PROJECT_ID") model_api_key = os.environ.get("OPENAI_API_KEY") # or ANTHROPIC_API_KEY # Initialize the StagehandTool stagehand_tool = StagehandTool( api_key=browserbase_api_key, project_id=browserbase_project_id, model_api_key=model_api_key, model_name=AvailableModel.GPT_4O, # or AvailableModel.CLAUDE_3_7_SONNET_LATEST ) # Create an agent with the tool researcher = Agent( role="Web Researcher", goal="Find and summarize information from websites", backstory="I'm an expert at finding information online.", verbose=True, tools=[stagehand_tool], ) ``` ## Step 4: Create and Run a Task Define a task for your agent and execute it: ```python # Create a task that uses the tool research_task = Task( description="Go to https://www.example.com and tell me what you see on the homepage.", agent=researcher, ) # Run the crew crew = Crew( agents=[researcher], tasks=[research_task], verbose=True, ) 
try: result = crew.kickoff() print(result) finally: # Clean up resources stagehand_tool.close() ``` ## Step 5: Run Your Script Execute your Python script: ```bash python your_crew_script.py ``` ## Advanced Configuration Customize the StagehandTool behavior with additional parameters: ```python stagehand_tool = StagehandTool( api_key=browserbase_api_key, project_id=browserbase_project_id, model_api_key=model_api_key, model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, dom_settle_timeout_ms=5000, # Wait longer for DOM to settle headless=True, # Run browser in headless mode self_heal=True, # Attempt to recover from errors wait_for_captcha_solves=True, # Wait for CAPTCHA solving verbose=1, # Control logging verbosity (0-3) ) ``` ## Example Tasks ```python form_task = Task( description=""" Submit a contact form: 1. Go to https://example.com/contact 2. Fill out the form with name 'John Doe', email 'john@example.com' 3. Submit and confirm success """, agent=researcher, ) ``` ```python extraction_task = Task( description=""" Extract product information: 1. Go to the products page 2. Extract all product names, prices, and descriptions 3. Format as structured data """, agent=researcher, ) ``` ```python navigation_task = Task( description=""" Navigate and analyze: 1. Start at homepage 2. Navigate to products section 3. Filter by 'Electronics' category 4. Find and extract details of highest-rated product """, agent=researcher, ) ``` Dive into the CrewAI documentation to learn more about its capabilities and integrations. Access the Browserbase documentation for comprehensive guides and resources. 
================================================ FILE: packages/docs/v3/integrations/crew-ai/introduction.mdx ================================================ --- title: "CrewAI Introduction" sidebarTitle: Introduction description: "Automate browser tasks using natural language instructions with CrewAI" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview This guide shows you how to use CrewAI with Browserbase to create intelligent agents that can automate web interactions. By the end of this guide, you'll know how to: - Set up CrewAI with the StagehandTool - Create agents that can interact with websites - Automate browser tasks using natural language instructions - Extract structured data from web pages ## When You'd Use This The CrewAI integration is perfect for scenarios where you need intelligent web automation: - **Research automation**: Have agents research information across multiple websites - **Data collection**: Extract structured data from e-commerce sites, job boards, or news sites - **Form automation**: Automatically fill out and submit forms based on specific criteria - **Multi-step workflows**: Execute complex browser workflows that require decision-making The StagehandTool wraps the Stagehand Python SDK to provide CrewAI agents with the ability to control a real web browser and interact with websites using three core primitives: 1. **Act**: Perform actions like clicking, typing, or navigating 2. **Extract**: Extract structured data from web pages 3. 
**Observe**: Identify and analyze elements on the page Learn how to configure and use the StagehandTool with CrewAI agents for web automation tasks ================================================ FILE: packages/docs/v3/integrations/langchain/configuration.mdx ================================================ --- title: "LangChain JS Configuration" sidebarTitle: Configuration description: "Set up Stagehand with LangChain JS to create intelligent web automation agents" --- import { V3Banner } from '/snippets/v3-banner.mdx'; This guide walks you through integrating Stagehand with LangChain JS to build powerful web automation workflows using natural language instructions. ## Step 1: Install Dependencies Install the required packages for LangChain JS and Stagehand integration: ```bash npm install @langchain/langgraph @langchain/community @langchain/core @browserbasehq/stagehand ``` ## Step 2: Configure Environment Variables For remote browser automation, set up your Browserbase credentials: ```bash BROWSERBASE_API_KEY="your-browserbase-api-key" BROWSERBASE_PROJECT_ID="your-browserbase-project-id" ``` ## Step 3: Create a Stagehand Instance Initialize Stagehand with your preferred configuration: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; // For local development const stagehand = new Stagehand({ env: "LOCAL", verbose: 2, enableCaching: false, }); // For production with Browserbase const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 1, enableCaching: true, }); ``` ## Step 4: Generate the StagehandToolkit Create the toolkit that provides LangChain-compatible tools: ```typescript import { StagehandToolkit } from '@langchain/community/agents/toolkits/stagehand'; const stagehandToolkit = await StagehandToolkit.fromStagehand(stagehand); ``` ## Step 5: Use Individual Tools The toolkit provides four specialized tools for web automation: ### Available Tools - **stagehand_navigate**: Navigate to specific URLs - **stagehand_act**: Perform browser 
actions (clicking, typing, etc.) - **stagehand_extract**: Extract structured data using schemas - **stagehand_observe**: Analyze page elements and possible actions ### Basic Tool Usage ```typescript import { z } from "zod"; // Navigate to a website const navigateTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_navigate" ); await navigateTool.invoke("https://www.google.com"); // Perform an action const actionTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_act" ); await actionTool.invoke('Search for "OpenAI"'); // Observe the page const observeTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_observe" ); const result = await observeTool.invoke( "What actions can be performed on the current page?" ); console.log(JSON.parse(result)); // Extract structured data const extractTool = stagehandToolkit.tools.find( (t) => t.name === "stagehand_extract" ); const extractResult = await extractTool.invoke({ instruction: "Extract the main heading and description", schema: z.object({ heading: z.string(), description: z.string(), }), }); console.log(extractResult); ``` ## Step 6: Build LangGraph Agents Integrate with LangGraph for complex automation workflows: ```typescript import { createReactAgent } from "@langchain/langgraph/prebuilt"; // Create an LLM const llm = new ChatOpenAI({ model: "gpt-4", temperature: 0, }); // Create an agent with Stagehand tools const agent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); // Execute a complex workflow const result = await agent.invoke({ messages: [ { role: "user", content: "Go to example.com, find the contact form, and extract all the form fields" } ] }); ``` ## Advanced Configuration ### Custom Stagehand Configuration ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", verbose: 2, enableCaching: true, headless: true, domSettleTimeoutMs: 5000, }); ``` ### Error Handling ```typescript try { const result = await agent.invoke({ messages: [{ role: "user", content: 
"Navigate to invalid-url.com" }] }); } catch (error) { console.error("Automation failed:", error); } finally { // Clean up resources await stagehand.close(); } ``` ## Example Workflows ```typescript const extractionAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await extractionAgent.invoke({ messages: [{ role: "user", content: ` Go to news-website.com and extract: 1. All article headlines 2. Publication dates 3. Author names Format as structured JSON ` }] }); ``` ```typescript const formAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await formAgent.invoke({ messages: [{ role: "user", content: ` Navigate to contact-form.com and: 1. Fill out the contact form with: - Name: John Doe - Email: john@example.com - Message: Inquiry about services 2. Submit the form 3. Confirm submission success ` }] }); ``` ```typescript const researchAgent = createReactAgent({ llm, tools: stagehandToolkit.tools, }); const result = await researchAgent.invoke({ messages: [{ role: "user", content: ` Research product pricing by: 1. Visit competitor1.com and extract pricing info 2. Visit competitor2.com and extract pricing info 3. Compare features and prices 4. Provide summary analysis ` }] }); ``` Official LangChain JS documentation for the Stagehand integration ================================================ FILE: packages/docs/v3/integrations/langchain/introduction.mdx ================================================ --- title: "LangChain JS Introduction" sidebarTitle: Introduction description: "Integrate Stagehand with LangChain JS for intelligent web automation" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview This guide shows you how to use Stagehand with LangChain JS to create intelligent agents that can automate web interactions. 
By the end of this guide, you'll know how to: - Set up the StagehandToolkit with LangChain JS - Create agents that can navigate and interact with websites - Extract structured data using natural language instructions - Build complex automation workflows with LangGraph ## When You'd Use This The LangChain JS integration is perfect for scenarios where you need intelligent web automation with advanced reasoning: - **AI-driven research**: Create agents that can research information across multiple websites and synthesize findings - **Dynamic form filling**: Automatically fill out complex forms based on contextual requirements - **Data extraction workflows**: Extract and transform data from multiple sources with intelligent navigation - **Multi-step web processes**: Execute complex browser workflows that require decision-making and adaptation Learn how to set up and configure the StagehandToolkit with LangChain JS agents ================================================ FILE: packages/docs/v3/integrations/mcp/configuration.mdx ================================================ --- title: "Browserbase MCP Server Configuration" sidebarTitle: "Configuration" description: "Configure your browser automation with command-line flags, environment variables, and advanced options" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Configuration Overview The Browserbase MCP server supports extensive configuration options through command-line flags and environment variables. Configure browser behavior, proxy settings, stealth modes, model selection, and more to customize your browser automation workflows. Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). 
## Environment Variables Configure the essential Browserbase credentials and optional debugging settings: Your Browserbase API key for authentication Your Browserbase project ID ## Command-Line Flags ### Available Flags | Flag | Description | |------|-------------| | `--proxies` | Enable Browserbase proxies for the session | | `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | | `--keepAlive` | Enable Browserbase Keep Alive Session | | `--contextId <contextId>` | Specify a Browserbase Context ID to use | | `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | | `--port <port>` | Port to listen on for HTTP/SHTTP transport | | `--host <host>` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | | `--browserWidth <width>` | Browser viewport width (default: 1024) | | `--browserHeight <height>` | Browser viewport height (default: 768) | | `--modelName <model>` | The model to use for Stagehand (default: google/gemini-2.5-flash-lite) | | `--modelApiKey <key>` | API key for the custom model provider (required when using custom models) | | `--experimental` | Enable experimental features (default: false) | ## Configuration Examples ### Basic Configuration ```json Direct SHTTP { "mcpServers": { "browserbase": { "url": "your-smithery-url.com" } } } ``` When using our remote hosted server, we cover the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). 
```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ```json { "mcpServers": { "browserbase": { "command": "node", "args": ["/path/to/mcp-server-browserbase/cli.js"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ```bash # Start server node cli.js --port 8931 ``` ```json { "mcpServers": { "browserbase": { "url": "http://localhost:8931/mcp", "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ### Advanced Features Enable Browserbase proxies for IP rotation and geo-location testing. [Learn more about Browserbase Proxies](https://docs.browserbase.com/features/proxies) ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--proxies"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Enable advanced anti-detection features for enhanced stealth browsing. [Learn more about Advanced Stealth](https://docs.browserbase.com/features/stealth-mode#advanced-stealth-mode) **Note:** Advanced Stealth is only available for Scale Plan users. ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--advancedStealth"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Use persistent browser contexts to maintain authentication and state across sessions. 
[Learn more about Browserbase Contexts](https://docs.browserbase.com/features/contexts) ```json { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase", "--contextId", "your_context_id"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ### Browser Customization Customize browser window dimensions. Default is 1024x768. Recommended aspect ratio: 16:9. ```json { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--browserWidth", "1920", "--browserHeight", "1080" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` **Common Resolutions:** - Desktop: 1920x1080, 1280x720, 1024x768 - Mobile: 375x667 (iPhone), 360x640 (Android) - Tablet: 768x1024 (iPad) ## Model Configuration Configure AI models for enhanced browser automation. Stagehand defaults to Google's Gemini 2.5 Flash Lite but supports multiple providers. When using any custom model (non-default), you must provide your own API key for that model provider using the `--modelApiKey` flag. 
**Google Gemini** (Default) - `google/gemini-2.5-flash-lite` (default) - `google/gemini-2.5-pro` - `google/gemini-2.5-flash` **OpenAI** - `gpt-5-2025-08-07` - `gpt-4.1-2025-04-14` - `gpt-4o` - `gpt-4o-mini` **Anthropic Claude** - `claude-sonnet-4-5` - `claude-haiku-4-5` [View full list of supported models](https://docs.stagehand.dev/v3/configuration/models#models) ```json OpenAI GPT-4o { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--modelName", "gpt-4o", "--modelApiKey", "your_openai_api_key" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ```json Claude Sonnet { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--modelName", "claude-sonnet-4-6", "--modelApiKey", "your_anthropic_api_key" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id" } } } } ``` ## Development Configuration Configure custom host and port for SHTTP transport. 
```json { "mcpServers": { "browserbase": { "command": "npx", "args": [ "@browserbasehq/mcp-server-browserbase", "--host", "0.0.0.0", "--port", "8080" ], "env": { "BROWSERBASE_API_KEY": "your_api_key", "BROWSERBASE_PROJECT_ID": "your_project_id", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` ## Best Practices - Use appropriate viewport sizes for your use case - Enable proxies only when needed for geo-location - Choose efficient models (Gemini Flash for speed, GPT-4o for accuracy) - Reuse contexts for authentication persistence - Store API keys securely in environment variables - Use Advanced Stealth for sensitive operations - Implement proper session management - Rotate cookies and contexts regularly - Enable debug mode during development - Use context persistence for faster iteration - Test with different viewport sizes - Monitor session usage and quotas - Use NPM installation for reliability - Configure appropriate timeouts - Implement error handling and retries - Monitor performance and resource usage ## Further Reading Complete platform documentation AI-powered browser automation Get help from our team ================================================ FILE: packages/docs/v3/integrations/mcp/introduction.mdx ================================================ --- title: "Browserbase MCP Server" sidebarTitle: "Introduction" description: "AI-powered browser automation through Model Context Protocol integration with Stagehand" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview The Browserbase MCP Server brings powerful browser automation capabilities to MCP clients through the Model Context Protocol (MCP). Built on top of [Stagehand](https://docs.stagehand.dev/), this integration provides AI-powered web automation using natural language commands. The hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint is served on Browserbase infrastructure. 
You can also run the MCP server locally with STDIO, but we recommend the hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint for most users. ## Key Features Control browsers using plain English commands like "click the login button" or "fill out the contact form" Navigate, click, and fill forms with ease Extract structured data from any website automatically Create, reuse, and close browser sessions with explicit MCP tools ## Core Benefits No need to learn complex selectors or automation syntax. Simply describe what you want to do in natural language. Get started in minutes with either hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) or local STDIO. Stagehand's AI understands web page context and can adapt to different layouts and designs. Navigate, click, type, scroll, and interact with any web element. Extract structured information from complex web pages automatically. Maintain authentication states and cookies across multiple interactions. Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) runs on Browserbase infrastructure for consistent performance. Handle multiple concurrent sessions and high-volume automation tasks. Stealth mode, proxy support, and advanced anti-detection capabilities. Detailed session recordings and debugging information. 
## Use Cases Track product prices, availability, and competitor information Gather data from multiple sources for analysis and reporting Collect articles, posts, and media from various websites Extract contact information and business data from directories Create comprehensive test suites for web applications Test functionality across different browser environments Simulate real user interactions and workflows Track page load times and user experience metrics Automatically fill and submit complex web forms Extract data and generate automated reports Schedule posts and monitor engagement across platforms Automate repetitive web-based business processes ## Getting Started Choose hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) (recommended) or local STDIO based on your needs. Set up your Browserbase API credentials in MCP configuration. Get API keys from the [Browserbase Dashboard](https://www.browserbase.com/overview). Begin using natural language commands to control browsers through your MCP client. Ready to get started? Check out the [Setup Guide](/v3/integrations/mcp/setup). 
## Further Reading Get started with installation and configuration Learn more about the MCP protocol Explore Browserbase features and capabilities ================================================ FILE: packages/docs/v3/integrations/mcp/setup.mdx ================================================ --- title: "Browserbase MCP Server Setup" sidebarTitle: "Setup" description: "Add the Browserbase MCP Server to your MCP client" --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Quick Installation One-click installation directly in Cursor You can also add Browserbase MCP to Claude Code with a single command: ```bash claude mcp add --transport http browserbase "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY" ``` We support both local STDIO and hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) (SHTTP). We recommend hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) for most users. ## Endpoint Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) endpoint (served on Browserbase infrastructure): ```text https://mcp.browserbase.com/mcp ``` ## Prerequisites Get your Browserbase API key from the [Browserbase Dashboard](https://www.browserbase.com/overview). Browserbase API Key settings Then copy your API Key directly from the input. ## Query Parameters (Hosted [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http)) ### Required for tool calls Browserbase API key. 
### Optional | Query Param | Type | Behavior | | ----------------- | -------------- | ------------------------------------------ | | `modelName` | string | Defaults to `google/gemini-2.5-flash-lite` | | `modelApiKey` | string | Required when `modelName` is non-default | | `keepAlive` | boolean string | `"true"` or `"false"` | | `proxies` | boolean string | `"true"` or `"false"` | | `advancedStealth` | boolean string | `"true"` or `"false"` | Boolean query values must be exact strings: `"true"` or `"false"`. ## Available Tools Navigate to any URL in the browser The URL to navigate to Perform an action on the web page using natural language The action to perform (e.g., "click the login button", "fill form field") Observe and find actionable elements on the page. Specific instruction for observation (e.g., "find the login button", "locate search form") Extract data from the current page. Optional extraction instruction. Create or reuse a Browserbase session and set it as active for the current MCP transport session. No input parameters required. Browserbase session ID. Close the active Browserbase session for the current MCP transport session. No input parameters required. ## Local Command-Line Flags Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). 
| Flag | Description | |------|-------------| | `--proxies` | Enable Browserbase proxies for the session | | `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | | `--keepAlive` | Enable Browserbase Keep Alive Session | | `--contextId <contextId>` | Specify a Browserbase Context ID to use | | `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | | `--port <port>` | Port to listen on for HTTP or [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http) transport | | `--host <host>` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | | `--browserWidth <width>` | Browser viewport width (default: 1024) | | `--browserHeight <height>` | Browser viewport height (default: 768) | | `--modelName <model>` | The model to use for Stagehand (default: google/gemini-2.5-flash-lite) | | `--modelApiKey <key>` | API key for the custom model provider (required when using custom models) | | `--experimental` | Enable experimental features (default: false) | ## Installation Methods Use your MCP client config: ```json { "mcpServers": { "browserbase": { "url": "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY" } } } ``` For custom models, include `modelName` and `modelApiKey`: ```json { "mcpServers": { "browserbase": { "url": "https://mcp.browserbase.com/mcp?browserbaseApiKey=YOUR_BROWSERBASE_API_KEY&modelName=openai/gpt-4.1&modelApiKey=YOUR_MODEL_API_KEY" } } } ``` The easiest way to get started locally is using our NPM package. If you would like to use a different model, you have to pass the model name and keys in the args. More info in the [Local Command-Line Flags](#local-command-line-flags) section. 
Go into your MCP Config JSON and add the Browserbase Server: ```json Claude Desktop { "mcpServers": { "browserbase": { "command": "npx", "args": ["@browserbasehq/mcp-server-browserbase"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` That's it! Reload your MCP client and you will be able to use Browserbase. For local development or customization, you can run the server locally. ```bash # Clone the Repo git clone https://github.com/browserbase/mcp-server-browserbase.git cd mcp-server-browserbase # Install the dependencies and build the project npm install && npm run build ``` You can run locally using either STDIO or [Streamable HTTP](https://modelcontextprotocol.io/specification/2025-03-26/basic/transports#streamable-http). Add the following to your MCP Config JSON file: ```json { "mcpServers": { "browserbase": { "command": "node", "args": ["/path/to/mcp-server-browserbase/cli.js"], "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` First, run the server: ```bash node cli.js --port 8931 ``` Then add this to your MCP Config JSON file: ```json { "mcpServers": { "browserbase": { "url": "http://localhost:8931/mcp", "env": { "BROWSERBASE_API_KEY": "your_api_key", "GEMINI_API_KEY": "your_gemini_api_key" } } } } ``` Reload your MCP client and you should be good to go! ## Verify Installation Restart/refresh your MCP client app and verify tools are available. Get started using our MCP Server by asking your MCP client to navigate to any page and see your Browserbase Browser in action on the [dashboard](https://www.browserbase.com/sessions). 
Try: "Navigate to example.com and extract the main heading" ## Further Reading Learn more about the MCP protocol Explore Browserbase features and capabilities Get help from our support team ================================================ FILE: packages/docs/v3/integrations/mcp/tools.mdx ================================================ --- title: "Browserbase MCP Server Tools" sidebarTitle: "Tools" description: "This guide covers the specialized tools available in the Browserbase MCP server for browser automation and interaction." --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview The Browserbase MCP server provides tools for browser automation and session management through a transport-scoped active session. ## Core Browser Automation Tools These are the primary tools for modern web automation using natural language commands. Navigate to any URL in the browser The URL to navigate to Perform an action on the web page using natural language The action to perform (e.g., "click the login button", "fill form field") Observe and find actionable elements on the page. Specific instruction for observation (e.g., "find the login button", "locate search form") Extract data from the current page. Optional extraction instruction. ## Session Management Create or reuse a Browserbase session and set it as active for the current MCP transport session. No input parameters required. Browserbase session ID. Close the active Browserbase session for the current MCP transport session. No input parameters required. 
## Further Reading Learn more about the MCP protocol Explore Stagehand's AI-powered browser automation Get help from our support team ================================================ FILE: packages/docs/v3/integrations/playwright.mdx ================================================ --- title: Playwright description: Use Stagehand with Playwright for browser automation --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview Stagehand v3 can work seamlessly with Playwright, allowing you to use Playwright's `Page` objects directly with Stagehand's AI-powered methods like `act()`, `extract()`, and `observe()`. ## Installation First, install both Stagehand and Playwright: ```bash npm install @browserbasehq/stagehand playwright-core ``` ## Quickstart ### Basic Setup Connect Playwright to Stagehand's browser instance using Chrome DevTools Protocol (CDP): ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import { chromium } from "playwright-core"; const stagehand = new Stagehand({ env: "BROWSERBASE", // or "LOCAL" model: "openai/gpt-5", }); await stagehand.init(); // Connect Playwright to Stagehand's browser const browser = await chromium.connectOverCDP({ wsEndpoint: stagehand.connectURL(), }); const pwContext = browser.contexts()[0]; const pwPage = pwContext.pages()[0]; ``` ### Using Playwright Pages with Stagehand Once connected, you can use Playwright's `Page` objects with Stagehand's AI-powered methods: ```typescript // Navigate using Playwright await pwPage.goto("https://example.com"); // Use Stagehand's AI methods with the Playwright page await stagehand.act("click the login button", { page: pwPage }); const data = await stagehand.extract( "extract the article title", z.object({ title: z.string() }), { page: pwPage } ); ``` ## Multi-Page Example Stagehand works great with multiple Playwright pages: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import { chromium } from "playwright-core"; import { z } from "zod"; // 
Initialize Stagehand const stagehand = new Stagehand({ env: "BROWSERBASE", model: "openai/gpt-5", }); await stagehand.init(); // Connect Playwright const browser = await chromium.connectOverCDP({ wsEndpoint: stagehand.connectURL(), }); const pwContext = browser.contexts()[0]; const pwPage1 = pwContext.pages()[0]; // Create a second page const pwPage2 = await pwContext.newPage(); // Navigate both pages await pwPage1.goto("https://docs.stagehand.dev/first-steps/introduction"); await pwPage2.goto("https://docs.stagehand.dev/configuration/observability"); // Extract data from both pages concurrently const [page1Data, page2Data] = await Promise.all([ stagehand.extract( "extract the names of the four stagehand primitives", z.array(z.string()), { page: pwPage1 } ), stagehand.extract( "extract the list of session dashboard features", z.array(z.string()), { page: pwPage2 } ), ]); console.log("Page 1 primitives:", page1Data); console.log("Page 2 features:", page2Data); ``` ## Complete Example Here's a full working example: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import { chromium } from "playwright-core"; import { z } from "zod"; async function main() { // Initialize Stagehand const stagehand = new Stagehand({ env: "BROWSERBASE", model: "openai/gpt-5", verbose: 1, }); await stagehand.init(); console.log("Stagehand initialized"); // Connect Playwright to Stagehand's browser const browser = await chromium.connectOverCDP({ wsEndpoint: stagehand.connectURL(), }); const pwContext = browser.contexts()[0]; const pwPage = pwContext.pages()[0]; // Navigate and interact await pwPage.goto("https://example.com"); // Use Stagehand's AI methods const actions = await stagehand.observe("find the main heading", { page: pwPage, }); console.log("Found actions:", actions); // Extract data const heading = await stagehand.extract( "extract the main heading text", z.object({ heading: z.string() }), { page: pwPage } ); console.log("Heading:", heading); // Cleanup await 
stagehand.close(); } main(); ``` ## Key Points - **Connect via CDP**: Use `chromium.connectOverCDP()` with `stagehand.connectURL()` as the WebSocket endpoint - **Pass the page**: Always pass the Playwright `page` object to Stagehand methods using the `{ page }` option - **Multi-page support**: Create multiple pages with `pwContext.newPage()` and pass them to Stagehand methods - **Concurrent operations**: Use `Promise.all()` to run multiple Stagehand operations in parallel across different pages ## Environment Variables When using Browserbase, set your credentials: ```bash BROWSERBASE_API_KEY=your_api_key BROWSERBASE_PROJECT_ID=your_project_id ``` For OpenAI (or other providers): ```bash OPENAI_API_KEY=your_api_key ``` ## Next Steps Automate entire workflows Execute actions on web pages Extract structured data from pages Observe and find elements on pages ================================================ FILE: packages/docs/v3/integrations/puppeteer.mdx ================================================ --- title: Puppeteer description: Use Stagehand with Puppeteer for browser automation --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview Stagehand v3 can work seamlessly with Puppeteer, allowing you to use Puppeteer's `Page` objects directly with Stagehand's AI-powered methods like `act()`, `extract()`, and `observe()`. 
## Installation First, install both Stagehand and Puppeteer: ```bash npm install @browserbasehq/stagehand puppeteer-core ``` ## Quickstart ### Basic Setup Connect Puppeteer to Stagehand's browser instance: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import puppeteer from "puppeteer-core"; const stagehand = new Stagehand({ env: "LOCAL", // or "BROWSERBASE" model: "openai/gpt-5", }); await stagehand.init(); // Connect Puppeteer to Stagehand's browser const browser = await puppeteer.connect({ browserWSEndpoint: stagehand.connectURL(), defaultViewport: null, }); const pages = await browser.pages(); const ppPage = pages[0]; ``` ### Using Puppeteer Pages with Stagehand Once connected, you can use Puppeteer's `Page` objects with Stagehand's AI-powered methods: ```typescript // Navigate using Puppeteer await ppPage.goto("https://example.com"); // Use Stagehand's AI methods with the Puppeteer page await stagehand.act("click the sign in button", { page: ppPage }); const data = await stagehand.extract( "extract the page title", z.object({ title: z.string() }), { page: ppPage } ); ``` ## Advanced: Multi-Page Usage Create and manage multiple Puppeteer pages with Stagehand: ```typescript import { Stagehand } from "@browserbasehq/stagehand"; import puppeteer from "puppeteer-core"; import { z } from "zod"; async function multiPageExample() { const stagehand = new Stagehand({ env: "BROWSERBASE", model: "openai/gpt-5", }); await stagehand.init(); // Connect Puppeteer const browser = await puppeteer.connect({ browserWSEndpoint: stagehand.connectURL(), defaultViewport: null, }); // Get the first page const pages = await browser.pages(); const ppPage1 = pages[0]; // Create a second page const ppPage2 = await browser.newPage(); // Navigate both pages await ppPage1.goto("https://example.com"); await ppPage2.goto("https://another-site.com"); // Use Stagehand on different pages await stagehand.act("click the button", { page: ppPage1 }); const data = await 
stagehand.extract( "extract the title", z.object({ title: z.string() }), { page: ppPage2 } ); console.log("Extracted from page 2:", data); await stagehand.close(); } ``` ## Observe + Act Pattern The recommended pattern for reliable automation: ```typescript // Step 1: Observe to find candidate actions const actions = await stagehand.observe( "find the submit button", { page: ppPage } ); // Step 2: Execute the first action if (actions.length > 0) { await stagehand.act(actions[0], { page: ppPage }); } ``` Executing the observed action immediately after `observe()` minimizes the window in which the DOM can change between observation and action execution. ## Key Points - **Connect via WebSocket**: Use `puppeteer.connect()` with `stagehand.connectURL()` as the `browserWSEndpoint` - **Pass the page**: Always pass the Puppeteer `page` object to Stagehand methods using the `{ page }` option - **Disable viewport**: Set `defaultViewport: null` to use Stagehand's viewport settings - **Multi-page support**: Create multiple pages with `browser.newPage()` and pass them to Stagehand methods ## Environment Variables When using Browserbase, set your credentials: ```bash BROWSERBASE_API_KEY=your_api_key BROWSERBASE_PROJECT_ID=your_project_id ``` For OpenAI (or other providers): ```bash OPENAI_API_KEY=your_api_key ``` ## Comparison: Stagehand Native vs Puppeteer | Feature | Stagehand Native | With Puppeteer | |---------|------------------|----------------| | **Setup** | Simple - use `stagehand.context.pages()` | Requires `puppeteer.connect()` | | **Page Access** | `stagehand.context.pages()[0]` | `await browser.pages()` | | **AI Methods** | `stagehand.act("click")` | `stagehand.act("click", { page: ppPage })` | | **Best For** | Pure Stagehand workflows | Existing Puppeteer codebases | ## Next Steps Automate entire workflows Execute actions on web pages Extract structured data from pages Observe and find elements on pages ================================================ FILE: packages/docs/v3/integrations/selenium.mdx 
================================================ --- title: Selenium description: Use Stagehand with Selenium to operate the same browser in tandem --- import { V3Banner } from '/snippets/v3-banner.mdx'; ## Overview Stagehand v3 can work alongside Selenium WebDriver, allowing both tools to operate on the same browser session simultaneously. This enables you to combine Stagehand's AI-powered automation with Selenium's precise element interactions. **Browserbase Only**: This integration requires Browserbase. It does not work with `env: "LOCAL"` because Selenium needs a remote WebDriver endpoint. ## Installation Install Stagehand, Selenium, and the Browserbase SDK: ```bash npm install @browserbasehq/stagehand selenium-webdriver @browserbasehq/sdk ``` ## Quickstart ### Create Shared Session Use the Browserbase SDK to create a session that both tools can connect to: ```typescript import http from "http"; import { Builder, Key } from "selenium-webdriver"; import Browserbase from "@browserbasehq/sdk"; import { Stagehand } from "@browserbasehq/stagehand"; const bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY, }); // Create shared session const session = await bb.sessions.create({ projectId: process.env.BROWSERBASE_PROJECT_ID, }); console.log("Session created:", session.id); ``` ### Connect Stagehand Initialize Stagehand with the session ID: ```typescript const stagehand = new Stagehand({ env: "BROWSERBASE", browserbaseSessionID: session.id, model: "openai/gpt-5", verbose: 2, }); await stagehand.init(); ``` ### Connect Selenium Use a custom HTTP agent with the session's signing key: ```typescript // Create custom HTTP agent with signing key const customHttpAgent = new http.Agent({}); (customHttpAgent as any).addRequest = (req: any, options: any) => { req.setHeader("x-bb-signing-key", session.signingKey); (http.Agent.prototype as any).addRequest.call(customHttpAgent, req, options); }; // Connect Selenium WebDriver const driver = new Builder() 
.forBrowser("chrome") .usingHttpAgent(customHttpAgent) .usingServer(session.seleniumRemoteUrl) .build(); ``` ### Use Both Tools Together Now both Stagehand and Selenium operate on the same browser: ```typescript // Navigate with Stagehand const page = stagehand.context.pages()[0]; await page.goto("https://www.google.com"); // Extract page content with Stagehand AI const pageContent = await stagehand.extract(); console.log("Page content:", pageContent); // Use Selenium for precise element interaction const searchBox = await driver.findElement({ name: "q" }); await searchBox.sendKeys("Browserbase automation"); await searchBox.sendKeys(Key.RETURN); // Wait for results await driver.sleep(2000); console.log("Search completed!"); ``` ## Key Points - **Shared Session**: Both tools connect to the same Browserbase session - **Signing Key**: Selenium requires the session's `signingKey` in HTTP headers - **Remote URL**: Use `session.seleniumRemoteUrl` for Selenium's server endpoint - **Concurrent Usage**: Both tools can operate on the browser simultaneously - **Cleanup**: Close both Stagehand (`await stagehand.close()`) and Selenium (`await driver.quit()`) ## Next Steps Automate entire workflows Execute actions on web pages Extract structured data from pages Observe and find elements on pages ================================================ FILE: packages/docs/v3/integrations/vercel/configuration.mdx ================================================ --- title: Use Stagehand in Next.js sidebarTitle: Configuration description: Next.js is a popular framework for developing web-based applications in production. It powers Stagehand apps like [Director](https://director.ai), [Brainrot](https://brainrot.run) and [Open Operator](https://operator.browserbase.com). --- import { V3Banner } from '/snippets/v3-banner.mdx'; Clone our [GitHub repo](https://github.com/browserbase/stagehand-nextjs-quickstart) to get started with Stagehand (v2) in Next.js. 
## Add Stagehand to an existing Next.js project If you'd like to start from scratch, you can run: ```bash npm create next-app@latest stagehand-nextjs --yes cd stagehand-nextjs ``` ```bash pnpm create next-app@latest stagehand-nextjs --yes cd stagehand-nextjs ``` ```bash yarn create next-app@latest stagehand-nextjs --yes cd stagehand-nextjs ``` If you'd like to add Stagehand to an existing Next.js project, you can do so by installing the dependencies: ```bash npm install @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ```bash pnpm add @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ```bash yarn add @browserbasehq/stagehand @browserbasehq/sdk playwright zod ``` ### Add environment variables Next, let's add the environment variables to a `.env` file. ```env BROWSERBASE_API_KEY=your-browserbase-api-key BROWSERBASE_PROJECT_ID=your-browserbase-project-id OPENAI_API_KEY=your-openai-api-key ``` ### Write a server action Next, let's define our `main` function as a server action in `app/stagehand/main.ts`. This file will have the following three functions: 1. **`main`: Run the main Stagehand script** 2. **`runStagehand`: Initialize and run the `main` function** 3. **`startBBSSession`: Start a Browserbase session** ```ts app/stagehand/main.ts // 🤘 Welcome to Stagehand! // This file is from the [Stagehand docs](https://docs.stagehand.dev/sections/examples/nextjs). 
"use server"; import { Stagehand } from "@browserbasehq/stagehand"; import { z } from "zod"; import { Browserbase } from "@browserbasehq/sdk"; /** * Run the main Stagehand script */ async function main(stagehand: Stagehand) { // You can use the `page` instance to write any Playwright code // For more info: https://playwright.dev/docs/pom const page = stagehand.context.activePage(); // In this example, we'll get the title of the Stagehand quickstart page await page?.goto("https://docs.stagehand.dev/"); await stagehand.act("click the quickstart link"); const { title } = await stagehand.extract( "extract the main heading of the page", z.object({ title: z.string(), }), ); return title; } /** * Initialize and run the main() function */ export async function runStagehand(sessionId?: string) { const stagehand = new Stagehand({ env: "BROWSERBASE", apiKey: process.env.BROWSERBASE_API_KEY, projectId: process.env.BROWSERBASE_PROJECT_ID, verbose: 1, logger: console.log, browserbaseSessionID: sessionId, disablePino: true, }); await stagehand.init(); const result = await main(stagehand); console.log(result); await stagehand.close(); } /** * Start a Browserbase session */ export async function startBBSSession() { const browserbase = new Browserbase(); const session = await browserbase.sessions.create({ projectId: process.env.BROWSERBASE_PROJECT_ID!, }); const debugUrl = await browserbase.sessions.debug(session.id); return { sessionId: session.id, debugUrl: debugUrl.debuggerFullscreenUrl, }; } ``` ### Create a client component Next, let's create a client component that will start a Browserbase session and run the `main` function with the server actions we just defined. We'll first create a Browserbase session and embed the session in an iframe before running the `main` function. 
```tsx app/components/stagehandEmbed.tsx "use client"; import { useCallback, useState } from "react"; import { runStagehand, startBBSSession } from "@/app/stagehand/main"; export function StagehandEmbed() { const [sessionId, setSessionId] = useState<string | null>(null); const [debugUrl, setDebugUrl] = useState<string | null>(null); const startSession = useCallback(async () => { const { sessionId, debugUrl } = await startBBSSession(); setSessionId(sessionId); setDebugUrl(debugUrl); await runStagehand(sessionId); }, []); return (
{!sessionId && } {sessionId && debugUrl && (