Repository: alibaba/page-agent
Branch: main
Commit: 80e96d0b9e1d
Files: 227
Total size: 756.6 KB
Directory structure:
gitextract_5g_s5zem/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.yml
│ │ ├── config.yml
│ │ └── feature_request.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── dependabot.yml
│ └── workflows/
│ ├── ci.yml
│ ├── deploy-demo.yml
│ └── release.yml
├── .gitignore
├── .husky/
│ ├── commit-msg
│ └── pre-commit
├── .vscode/
│ ├── extensions.json
│ └── settings.json
├── AGENTS.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── docs/
│ ├── CHANGELOG.md
│ ├── CODE_OF_CONDUCT.md
│ ├── README-zh.md
│ └── terms-and-privacy.md
├── eslint.config.js
├── package.json
├── packages/
│ ├── core/
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── PageAgentCore.ts
│ │ │ ├── env.d.ts
│ │ │ ├── prompts/
│ │ │ │ ├── .prettierignore
│ │ │ │ └── system_prompt.md
│ │ │ ├── tools/
│ │ │ │ └── index.ts
│ │ │ ├── types.ts
│ │ │ └── utils/
│ │ │ ├── autoFixer.ts
│ │ │ └── index.ts
│ │ ├── tsconfig.dts.json
│ │ ├── tsconfig.json
│ │ └── vite.config.js
│ ├── extension/
│ │ ├── .prettierignore
│ │ ├── PRIVACY.md
│ │ ├── components.json
│ │ ├── docs/
│ │ │ └── extension_api.md
│ │ ├── package.json
│ │ ├── public/
│ │ │ └── _locales/
│ │ │ ├── en/
│ │ │ │ └── messages.json
│ │ │ └── zh_CN/
│ │ │ └── messages.json
│ │ ├── src/
│ │ │ ├── agent/
│ │ │ │ ├── .prettierignore
│ │ │ │ ├── MultiPageAgent.ts
│ │ │ │ ├── RemotePageController.background.ts
│ │ │ │ ├── RemotePageController.content.ts
│ │ │ │ ├── RemotePageController.ts
│ │ │ │ ├── TabsController.background.ts
│ │ │ │ ├── TabsController.ts
│ │ │ │ ├── constants.ts
│ │ │ │ ├── system_prompt.md
│ │ │ │ ├── tabTools.ts
│ │ │ │ └── useAgent.ts
│ │ │ ├── assets/
│ │ │ │ └── index.css
│ │ │ ├── components/
│ │ │ │ ├── ConfigPanel.tsx
│ │ │ │ ├── ErrorBoundary.tsx
│ │ │ │ ├── HistoryDetail.tsx
│ │ │ │ ├── HistoryList.tsx
│ │ │ │ ├── cards.tsx
│ │ │ │ ├── misc.tsx
│ │ │ │ └── ui/
│ │ │ │ ├── button.tsx
│ │ │ │ ├── card.tsx
│ │ │ │ ├── field.tsx
│ │ │ │ ├── hover-card.tsx
│ │ │ │ ├── input-group.tsx
│ │ │ │ ├── input.tsx
│ │ │ │ ├── item.tsx
│ │ │ │ ├── label.tsx
│ │ │ │ ├── separator.tsx
│ │ │ │ ├── sonner.tsx
│ │ │ │ ├── spinner.tsx
│ │ │ │ ├── switch.tsx
│ │ │ │ ├── textarea.tsx
│ │ │ │ └── typing-animation.tsx
│ │ │ ├── entrypoints/
│ │ │ │ ├── background.ts
│ │ │ │ ├── content.ts
│ │ │ │ ├── hub/
│ │ │ │ │ ├── App.tsx
│ │ │ │ │ ├── hub-ws.ts
│ │ │ │ │ ├── index.html
│ │ │ │ │ └── main.tsx
│ │ │ │ ├── main-world.ts
│ │ │ │ └── sidepanel/
│ │ │ │ ├── App.tsx
│ │ │ │ ├── index.html
│ │ │ │ └── main.tsx
│ │ │ ├── lib/
│ │ │ │ ├── db.ts
│ │ │ │ ├── history-export.ts
│ │ │ │ └── utils.ts
│ │ │ └── types/
│ │ │ ├── assets.d.ts
│ │ │ ├── globals.d.ts
│ │ │ └── markdown.d.ts
│ │ ├── tsconfig.json
│ │ └── wxt.config.js
│ ├── llms/
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── OpenAIClient.ts
│ │ │ ├── constants.ts
│ │ │ ├── errors.ts
│ │ │ ├── index.ts
│ │ │ ├── types.ts
│ │ │ └── utils.ts
│ │ ├── tsconfig.dts.json
│ │ ├── tsconfig.json
│ │ └── vite.config.js
│ ├── mcp/
│ │ ├── README.md
│ │ ├── package.json
│ │ └── src/
│ │ ├── hub-bridge.js
│ │ ├── index.js
│ │ └── launcher.html
│ ├── page-agent/
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── PageAgent.ts
│ │ │ ├── demo.ts
│ │ │ └── env.d.ts
│ │ ├── tsconfig.dts.json
│ │ ├── tsconfig.json
│ │ ├── vite.config.js
│ │ └── vite.iife.config.js
│ ├── page-controller/
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── PageController.ts
│ │ │ ├── actions.ts
│ │ │ ├── dom/
│ │ │ │ ├── dom_tree/
│ │ │ │ │ ├── index.js
│ │ │ │ │ └── type.ts
│ │ │ │ ├── getPageInfo.ts
│ │ │ │ └── index.ts
│ │ │ ├── env.d.ts
│ │ │ ├── mask/
│ │ │ │ ├── SimulatorMask.module.css
│ │ │ │ ├── SimulatorMask.ts
│ │ │ │ ├── checkDarkMode.ts
│ │ │ │ └── cursor.module.css
│ │ │ ├── patches/
│ │ │ │ ├── antd.ts
│ │ │ │ └── react.ts
│ │ │ └── utils/
│ │ │ └── index.ts
│ │ ├── tsconfig.dts.json
│ │ ├── tsconfig.json
│ │ └── vite.config.js
│ ├── ui/
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── env.d.ts
│ │ │ ├── i18n/
│ │ │ │ ├── index.ts
│ │ │ │ └── locales.ts
│ │ │ ├── index.ts
│ │ │ ├── motion-css/
│ │ │ │ ├── createMotion.ts
│ │ │ │ ├── motion.module.css
│ │ │ │ └── readme
│ │ │ ├── panel/
│ │ │ │ ├── Panel.module.css
│ │ │ │ ├── Panel.ts
│ │ │ │ ├── cards.ts
│ │ │ │ └── types.ts
│ │ │ └── utils.ts
│ │ ├── tsconfig.dts.json
│ │ ├── tsconfig.json
│ │ └── vite.config.js
│ └── website/
│ ├── AGENTS.md
│ ├── components.json
│ ├── index.html
│ ├── package.json
│ ├── public/
│ │ └── robots.txt
│ ├── src/
│ │ ├── components/
│ │ │ ├── APIReference.tsx
│ │ │ ├── BetaNotice.tsx
│ │ │ ├── CodeEditor.tsx
│ │ │ ├── Footer.tsx
│ │ │ ├── Header.tsx
│ │ │ ├── Heading.tsx
│ │ │ ├── HighlightSyntax.module.css
│ │ │ ├── HighlightSyntax.tsx
│ │ │ ├── JSConsole.module.css
│ │ │ ├── JSConsole.tsx
│ │ │ ├── LanguageSwitcher.tsx
│ │ │ ├── ThemeSwitcher.tsx
│ │ │ └── ui/
│ │ │ ├── alert.tsx
│ │ │ ├── animated-gradient-text.tsx
│ │ │ ├── animated-shiny-text.tsx
│ │ │ ├── aurora-text.tsx
│ │ │ ├── badge.tsx
│ │ │ ├── bento-grid.tsx
│ │ │ ├── blur-fade.tsx
│ │ │ ├── button.tsx
│ │ │ ├── highlighter.tsx
│ │ │ ├── hyper-text.tsx
│ │ │ ├── kbd.tsx
│ │ │ ├── magic-card.tsx
│ │ │ ├── marquee.tsx
│ │ │ ├── neon-gradient-card.tsx
│ │ │ ├── particles.tsx
│ │ │ ├── separator.tsx
│ │ │ ├── sonner.tsx
│ │ │ ├── sparkles-text.tsx
│ │ │ ├── spinner.tsx
│ │ │ ├── switch.tsx
│ │ │ ├── text-animate.tsx
│ │ │ ├── tooltip.tsx
│ │ │ └── typing-animation.tsx
│ │ ├── constants.ts
│ │ ├── env.d.ts
│ │ ├── hooks/
│ │ │ └── useGitHubStars.ts
│ │ ├── i18n/
│ │ │ └── context.tsx
│ │ ├── index.css
│ │ ├── lib/
│ │ │ ├── useDocumentTitle.ts
│ │ │ └── utils.ts
│ │ ├── main.tsx
│ │ ├── pages/
│ │ │ ├── docs/
│ │ │ │ ├── Layout.tsx
│ │ │ │ ├── advanced/
│ │ │ │ │ ├── custom-ui/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── page-agent/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── page-agent-core/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── page-controller/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ └── security-permissions/
│ │ │ │ │ └── page.tsx
│ │ │ │ ├── features/
│ │ │ │ │ ├── chrome-extension/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── custom-instructions/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── custom-tools/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── data-masking/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ ├── models/
│ │ │ │ │ │ └── page.tsx
│ │ │ │ │ └── third-party-agent/
│ │ │ │ │ └── page.tsx
│ │ │ │ ├── index.tsx
│ │ │ │ └── introduction/
│ │ │ │ ├── limitations/
│ │ │ │ │ └── page.tsx
│ │ │ │ ├── overview/
│ │ │ │ │ └── page.tsx
│ │ │ │ ├── quick-start/
│ │ │ │ │ └── page.tsx
│ │ │ │ └── troubleshooting/
│ │ │ │ └── page.tsx
│ │ │ └── home/
│ │ │ ├── FeaturesSection.tsx
│ │ │ ├── HeroSection.tsx
│ │ │ ├── OneMoreThingSection.tsx
│ │ │ ├── ScenariosSection.tsx
│ │ │ └── index.tsx
│ │ └── router.tsx
│ ├── tailwind.config.js
│ ├── tsconfig.json
│ └── vite.config.js
├── scripts/
│ └── sync-version.js
├── tsconfig.base.json
└── tsconfig.json
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yml
================================================
name: Bug Report
description: Report a bug
title: '[Bug] '
labels: ['bug']
body:
- type: markdown
attributes:
value: |
Thanks for your interest in improving the project! Before submitting, please read our guidelines.
感谢您对改进项目的兴趣!提交前请阅读我们的指南。
- [Code of Conduct](https://github.com/alibaba/page-agent/blob/main/docs/CODE_OF_CONDUCT.md)
- [Contributing Guide](https://github.com/alibaba/page-agent/blob/main/CONTRIBUTING.md)
- type: textarea
id: description
attributes:
label: What happened?
placeholder: Describe the bug and expected behavior
validations:
required: true
- type: textarea
id: code
attributes:
label: Code
render: typescript
placeholder: Minimal reproduction code
validations:
required: false
- type: input
id: browser
attributes:
label: Browser
placeholder: 'Chrome 120, Firefox 119, etc.'
validations:
required: false
- type: input
id: version
attributes:
label: version
placeholder: '0.0.0'
validations:
required: false
- type: checkboxes
id: community
attributes:
label: Community Communication / 社区沟通
description: Confirm you will communicate respectfully and constructively / 确认将以礼貌、建设性的方式沟通
options:
- label: I will be polite and respectful. / 我会保持礼貌与尊重。
required: true
- label: I will share constructive, actionable suggestions. / 我会提供建设性、可行动的建议。
required: true
- label: I have read the Code of Conduct. / 我已阅读行为准则。
required: true
validations:
required: true
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: Questions & Ideas / 问题与想法(Discussions)
url: https://github.com/alibaba/page-agent/discussions
about: Use Discussions for Q&A and ideation. 使用 Discussions 进行问答与想法交流。
- name: Security Report / 安全问题报告
url: https://github.com/alibaba/page-agent/security/policy
about: Report security vulnerabilities responsibly. 通过安全页面报告漏洞。
- name: Contributing Guide / 贡献指南
url: https://github.com/alibaba/page-agent/blob/main/CONTRIBUTING.md
about: How to contribute code and ideas. 如何进行贡献与提交代码。
- name: Code of Conduct / 行为准则
url: https://github.com/alibaba/page-agent/blob/main/docs/CODE_OF_CONDUCT.md
about: Community expectations and standards. 社区行为期望与标准。
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.yml
================================================
name: Feature Request
description: Suggest a feature
title: '[Feature] '
labels: ['enhancement']
body:
- type: markdown
attributes:
value: |
Thanks for your interest in improving the project! Before submitting, please read our guidelines.
感谢您对改进项目的兴趣!提交前请阅读我们的指南。
- [Code of Conduct](https://github.com/alibaba/page-agent/blob/main/docs/CODE_OF_CONDUCT.md)
- [Contributing Guide](https://github.com/alibaba/page-agent/blob/main/CONTRIBUTING.md)
- type: textarea
id: description
attributes:
label: Feature Description / 功能描述
description: Describe the problem, solution, and any API changes. / 描述问题、解决方案以及 API 变更。
placeholder: |
**Problem**:
What problem does this solve?
**Solution**:
How should this work?
**Proposed API**:
```typescript
// code here
```
validations:
required: true
- type: checkboxes
id: community
attributes:
label: Community Communication / 社区沟通
description: Confirm you will communicate respectfully and constructively / 确认将以礼貌、建设性的方式沟通
options:
- label: I will be polite and respectful. / 我会保持礼貌与尊重。
required: true
- label: I will share constructive, actionable suggestions. / 我会提供建设性、可行动的建议。
required: true
- label: I have read the CODE_OF_CONDUCT.md and CONTRIBUTING.md. / 我已阅读行为准则和贡献指南。
required: true
validations:
required: true
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## What
Brief description of changes.
## Type
- [ ] Bug fix
- [ ] Feature / Improvement
- [ ] Refactor
- [ ] Documentation
- [ ] Website
- [ ] Demo / Testing
- [ ] Breaking change
## Testing
- [ ] Tested in modern browsers
- [ ] No console errors
- [ ] Types/doc added
Closes #(issue)
## Requirements / 要求
- [ ] I have read and follow the [Code of Conduct](../docs/CODE_OF_CONDUCT.md) and [Contributing Guide](../CONTRIBUTING.md). / 我已阅读并遵守行为准则和贡献指南。
- [ ] This PR is NOT generated by a bot or AI agent acting autonomously. I have authored or meaningfully reviewed every change. / 此 PR 不是由 bot 或 AI 自主生成的,我已亲自编写或充分审查了每一处变更。
================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
- package-ecosystem: 'npm'
directory: '/'
schedule:
interval: 'weekly'
groups:
# Production dependencies - minor and patch updates
production-dependencies:
dependency-type: 'production'
update-types:
- 'minor'
- 'patch'
# Development dependencies - minor and patch updates
development-dependencies:
dependency-type: 'development'
update-types:
- 'minor'
- 'patch'
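# Major updates are handled separately (not grouped; they require manual review)
# Security updates are not grouped either; Dependabot automatically prioritizes them as standalone PRs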
- package-ecosystem: 'github-actions'
directory: '/'
schedule:
interval: 'weekly'
groups:
github-actions:
patterns:
- '*'
================================================
FILE: .github/workflows/ci.yml
================================================
permissions:
contents: read
name: CI
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [24]
steps:
- uses: actions/checkout@v6
- name: Setup Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v6
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
# test on default version of npm
# - 9.6~10.8 on node@20
# - 11.3~11.6 on node@24
- name: Node and NPM version
run: node --version && npm --version
- name: Install dependencies
run: npm install
- name: Lint
run: npx eslint . && npx prettier --check **/*.ts
- name: Build
run: npm run build
================================================
FILE: .github/workflows/deploy-demo.yml
================================================
name: Deploy Demo
on:
push:
branches: [main]
jobs:
deploy:
runs-on: ubuntu-latest
permissions:
contents: read
pages: write
id-token: write
steps:
- uses: actions/checkout@v6
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 24
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Build demo
run: npm run build:website
- name: Setup Pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v4
with:
path: './packages/website/dist'
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4
================================================
FILE: .github/workflows/release.yml
================================================
name: Release
on:
push:
tags:
- 'v*'
permissions:
id-token: write # Required for OIDC
contents: read
jobs:
release:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 24
registry-url: 'https://registry.npmjs.org'
# Ensure npm 11.5.1 or later is installed
- name: Update npm
run: npm install -g npm@latest
- name: Install dependencies
run: npm ci
- name: Build
run: npm run build:libs
- name: Publish all public packages
run: |
VERSION=${GITHUB_REF#refs/tags/v}
if [[ "$VERSION" == *"-"* ]]; then
# Prerelease version (e.g., 0.3.0-beta.1) -> extract tag name before the dot
TAG=$(echo "$VERSION" | sed 's/.*-\([a-zA-Z]*\).*/\1/')
npm publish --workspaces --access public --tag "$TAG"
else
npm publish --workspaces --access public
fi
================================================
FILE: .gitignore
================================================
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
# /lib
dist-ssr
*.local
# Editor directories and files
# .vscode/*
# !.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
.qoder
# env files
.env
.env.*
# extension
.output
.wxt
# AI
.agent
.claude
.cursor
.gemini
CLAUDE.md
================================================
FILE: .husky/commit-msg
================================================
npx --no -- commitlint --edit $1
================================================
FILE: .husky/pre-commit
================================================
npx lint-staged --allow-empty
================================================
FILE: .vscode/extensions.json
================================================
{
"recommendations": ["dbaeumer.vscode-eslint", "esbenp.prettier-vscode"]
}
================================================
FILE: .vscode/settings.json
================================================
{
"cSpell.words": [
"contenteditable",
"deepseek",
"historychange",
"HITL",
"innerhtml",
"languagedetector",
"llms",
"magicui",
"npmmirror",
"onwarn",
"opensource",
"qwen",
"retryable",
"shadcn",
"sidepanel",
"statuschange",
"wouter"
],
"files.exclude": {
"packages/*/node_modules": true
},
"markdownlint.config": {
// "comment": "Relaxed rules",
"default": true,
"whitespace": false,
"line_length": false,
"ul-indent": false,
"no-inline-html": false,
"no-bare-urls": false,
"fenced-code-language": false,
"first-line-h1": false,
"block-spacing": false,
"blanks-around-lists": false,
"ol-prefix": false,
"no-duplicate-heading": false
}
}
================================================
FILE: AGENTS.md
================================================
# Instructions for Coding Assistants
## Project Overview
This is a **monorepo** with npm workspaces:
- **Page Agent** (`packages/page-agent/`) - Main entry with built-in UI Panel, published as `page-agent` on npm
- **Extension** (`packages/extension/`) - Browser extension (WXT + React) 🚧 WIP
- **Website** (`packages/website/`) - React docs and landing page. **When working on website, follow `packages/website/AGENTS.md`**
Internal packages:
- **Core** (`packages/core/`) - PageAgentCore without UI (npm: `@page-agent/core`)
- **LLMs** (`packages/llms/`) - LLM client with reflection-before-action mental model
- **Page Controller** (`packages/page-controller/`) - DOM operations and visual feedback (SimulatorMask), independent of LLM
- **UI** (`packages/ui/`) - Panel and i18n. Decoupled from PageAgent
## Development Commands
```bash
npm start # Start website dev server
npm run build # Build all packages
npm run build:libs # Build all libraries
npm run lint # ESLint with TypeScript strict rules
npm run zip -w @page-agent/ext # Zip the extension package
```
## Architecture
### Monorepo Structure
Simple monorepo solution: TypeScript references + Vite aliases. Update tsconfig and vite config when adding/removing packages.
```
packages/
├── core/ # npm: "@page-agent/core" ⭐ Core agent logic (headless)
├── page-agent/ # npm: "page-agent" entry class (with UI + controller + demo builds)
├── website/ # @page-agent/website (private)
├── llms/ # @page-agent/llms
├── extension/ # Browser extension (WXT + React)
├── page-controller/ # @page-agent/page-controller
└── ui/ # @page-agent/ui
```
`workspaces` in `package.json` must be in topological order.
### Module Boundaries
- **Page Agent**: Main entry with UI. Extends PageAgentCore and adds Panel. Imports from `@page-agent/core`, `@page-agent/ui`
- **Core**: PageAgentCore without UI. Imports from `@page-agent/llms`, `@page-agent/page-controller`
- **LLMs**: LLM client with MacroToolInput contract. No dependency on page-agent
- **UI**: Panel and i18n. Decoupled from PageAgent via PanelAgentAdapter interface
- **Page Controller**: DOM operations with optional visual feedback (SimulatorMask). No LLM dependency. Enable mask via `enableMask: true` config
### PageController ↔ PageAgent Communication
All communication is async and isolated:
```typescript
// PageAgent delegates DOM operations to PageController
await this.pageController.updateTree()
await this.pageController.clickElement(index)
await this.pageController.inputText(index, text)
await this.pageController.scroll({ down: true, numPages: 1 })
// PageController exposes state via async methods
const simplifiedHTML = await this.pageController.getSimplifiedHTML()
const pageInfo = await this.pageController.getPageInfo()
```
### DOM Pipeline
1. **DOM Extraction**: Live DOM → `FlatDomTree` via `page-controller/src/dom/dom_tree/`
2. **Dehydration**: DOM tree → simplified text for LLM
3. **LLM Processing**: AI returns action plans (page-agent)
4. **Indexed Operations**: PageAgent calls PageController by element index
## Key Files Reference
### Page Agent (`packages/page-agent/`)
| File | Description |
| ------------------ | -------------------------------------------- |
| `src/PageAgent.ts` | ⭐ Main class with UI, extends PageAgentCore |
| `src/demo.ts` | IIFE demo entry (auto-init with demo API) |
### Core (`packages/core/`)
| File | Description |
| ---------------------- | --------------------------------------- |
| `src/PageAgentCore.ts` | ⭐ Core agent class without UI |
| `src/tools/` | Tool definitions calling PageController |
| `src/config/` | Configuration types and constants |
| `src/prompts/` | System prompt templates |
### LLMs (`packages/llms/`)
| File | Description |
| --------------------- | ------------------------------------- |
| `src/index.ts` | ⭐ LLM class with retry logic |
| `src/types.ts` | MacroToolInput, AgentBrain, LLMConfig |
| `src/OpenAIClient.ts` | OpenAI-compatible client |
### Page Controller (`packages/page-controller/`)
| File | Description |
| --------------------------- | ---------------------------------------------------------- |
| `src/PageController.ts` | ⭐ Main controller class with optional mask support |
| `src/mask/SimulatorMask.ts` | Visual overlay blocking user interaction during automation |
| `src/actions.ts` | Element interactions (click, input, scroll) |
| `src/dom/dom_tree/index.js` | Core DOM extraction engine |
## Adding New Features
### New Agent Tool
1. Implement in `packages/core/src/tools/index.ts`
2. If tool needs DOM ops, add method to PageController first
3. Tool calls `this.pageController.methodName()` for DOM interactions
### New PageController Action
1. Add implementation in `packages/page-controller/src/actions.ts`
2. Expose via async method in `PageController.ts`
3. Export from `packages/page-controller/src/index.ts`
## Code Standards
- Explicit typing for exported/public APIs
- ESLint relaxes some unsafe rules for rapid iteration
- Every change you make should not only implement the desired functionality but also improve the quality of the codebase
- All code and comments must be in English.
- Do not try to hide errors or risks. They are valuable feedback for developers and users. Make them visible and actionable.
- Traceability and predictability are more important than success rate.
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to PageAgent
♥️ We welcome contributions from everyone.
## 🚀 Quick Start
### Development Setup
1. **Prerequisites**
- `macOS` / `Linux` / `WSL`
- `node.js >= 20` with `npm >= 10`
- An editor that supports `ts/eslint/prettier`
- Make sure `eslint`, `prettier` and `commitlint` work well. Un-linted code won't pass the CI.
2. **Setup**
```bash
npm i
npm start # Start demo and documentation site
npm run build # Build libs and website
```
### Project Structure
This is a **monorepo** with npm workspaces containing **4 main packages**:
- **Page Agent** (`packages/page-agent/`) - Main entry with built-in UI Panel, published as `page-agent` on npm
- **Core** (`packages/core/`) - Core agent logic without UI (npm: `@page-agent/core`)
- **Extension** (`packages/extension/`) - Chrome extension for multi-page tasks and browser-level automation
- **Website** (`packages/website/`) - React documentation and landing page. Also serves as the demo and test page for the core lib. Private package `@page-agent/website`.
> We use a simplified monorepo solution with `native npm-workspace + ts reference + vite alias`. No fancy tooling. Hoisting is required.
>
> - When developing, use aliases so that we don't have to pre-build.
> - When bundling, use external and disable the ts `paths` alias.
> - When bundling the `IIFE` and the `Website`, bundle everything together.
## 🤝 How to Contribute
### Reporting Issues
- Use the GitHub issue tracker to report bugs or request features
- Search existing issues before creating new ones
- Provide clear reproduction steps for bugs
- Include browser version and environment details
### Code Contributions
1. **Fork and Clone**
```bash
git clone https://github.com/your-username/page-agent.git
cd page-agent
```
2. **Create Feature Branch**
```bash
git checkout -b feat/your-feature-name
```
3. **Make Changes**
- Follow existing code style and patterns
- Add tests for new functionality
- Update documentation as needed
4. **Test Your Changes**
- Build and lint everything.
- Test in our demo website
- Test it on other websites if applicable
- `@TODO: test suite`
5. **Commit and Push**
```bash
git add .
git commit -m "feat: add awesome feature"
git push origin feat/your-feature-name
```
6. **Create Pull Request**
- Provide clear description of changes
- Link related issues
- Include screenshots for UI changes
## 📝 Code Style
### General Guidelines
- Use TypeScript for type safety
- Follow existing naming conventions
- Write meaningful commit messages
- Keep functions small and focused
- Add JSDoc for public APIs
### Vibe Coding with AI
> [Vibe coding](https://en.wikipedia.org/wiki/Vibe_coding)
- Vibe coding is **RECOMMENDED** when maintaining **the demo, the website, the UI and tests**.
- We have a [website/AGENTS.md](packages/website/AGENTS.md) for that.
- Vibe coding is **NOT** allowed for the core lib!!!
- NEVER try to vibe code the MV3 extension!!! It is HELL.
- Review anything AI wrote before making a commit. You are the author of anything you commit, NOT the AI.
If your AI assistant does not support [AGENTS.md](https://agents.md/), add an alias for it:
- claude-code (`CLAUDE.md`)
```markdown
@AGENTS.md
```
- antigravity (`.agent/rules/alias.md`)
```markdown
---
trigger: always_on
---
@../../AGENTS.md
```
## 🔧 Development Workflows
### Test With Your Own LLM API
- Create a `.env` file in the repo root with your LLM API config
```env
LLM_MODEL_NAME=gpt-5.2
LLM_API_KEY=your-api-key
LLM_BASE_URL=https://api.your-llm-provider.com/v1
```
- **Ollama example** (tested on 0.15 + qwen3:14b, RTX3090 24GB):
```env
LLM_BASE_URL="http://localhost:11434/v1"
LLM_API_KEY="NA"
LLM_MODEL_NAME="qwen3:14b"
```
> @see https://alibaba.github.io/page-agent/docs/features/models#ollama for configuration
- **Restart the dev server** to load new env vars
- If not provided, the demo will use the free testing proxy by default. By using it, you agree to its [terms](https://github.com/alibaba/page-agent/blob/main/docs/terms-and-privacy.md).
### Extension Development
```bash
# make sure you ran `npm run build:libs` first
# and every time you changed the core libs
npm run dev -w @page-agent/ext
npm run zip -w @page-agent/ext
```
- Update `packages/extension/docs/extension_api.md` for API integration details
### Testing on Other Websites
- Start and serve a local `iife` script
```bash
npm run dev:demo # Serving IIFE with auto rebuild at http://localhost:5174/page-agent.demo.js
```
- Add a new bookmark
```javascript
javascript:(function(){var s=document.createElement('script');s.src=`http://localhost:5174/page-agent.demo.js?t=${Math.random()}`;s.onload=()=>console.log(%27PageAgent ready!%27);document.head.appendChild(s);})();
```
- Click the bookmark on any page to load Page-Agent
> Warning: The API key (AK) in your local `.env` will be inlined in the IIFE script. Be very careful when you distribute the script.
### Adding Documentation
Ask an AI to help you add documentation to the `website/` package. Follow the existing style.
> Our AGENTS.md file and guardrails are designed for this purpose. But please be careful and review anything AI generated.
## 🚫 What We Don't Accept
- Breaking changes and large PRs without prior discussion
- Heavy dependencies to core libs
- Contributions without proper testing
- Code that doesn't follow project conventions
- Dependencies or code with licenses incompatible with MIT
- Bot or AI-generated pull requests without meaningful human involvement
## 📄 Legal
By contributing to this project, you agree that your contributions will be licensed under the MIT License.
> CLA is optional.
## 💬 Questions?
- Open a GitHub issue for technical questions
- Check existing documentation and issues first
- Be respectful and constructive in discussions
Thank you for helping make PageAgent better! 🎉
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2026 SimonLuvRamen
Copyright (c) 2026 Alibaba Group Holding Limited
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Page Agent
[License: MIT](https://opensource.org/licenses/MIT) [TypeScript](http://www.typescriptlang.org/) [Bundle Size](https://bundlephobia.com/package/page-agent) [npm](https://www.npmjs.com/package/page-agent) [GitHub](https://github.com/alibaba/page-agent)
The GUI Agent Living in Your Webpage. Control web interfaces with natural language.
🌐 **English** | [中文](./docs/README-zh.md)
🚀 Demo | 📖 Docs | 📢 HN Discussion | 𝕏 Follow on X
---
## ✨ Features
- **🎯 Easy integration**
- No need for `browser extension` / `python` / `headless browser`.
- Just in-page javascript. Everything happens in your web page.
- **📖 Text-based DOM manipulation**
- No screenshots. No multi-modal LLMs or special permissions needed.
- **🧠 Bring your own LLMs**
- **🎨 Pretty UI with human-in-the-loop**
- **🐙 Optional [chrome extension](https://alibaba.github.io/page-agent/docs/features/chrome-extension) for multi-page tasks.**
## 💡 Use Cases
- **SaaS AI Copilot** — Ship an AI copilot in your product in lines of code. No backend rewrite.
- **Smart Form Filling** — Turn 20-click workflows into one sentence. Perfect for ERP, CRM, and admin systems.
- **Accessibility** — Make any web app accessible through natural language. Voice commands, screen readers, zero barrier.
- **Multi-page Agent** — Extend your own agent's reach across browser tabs with the optional [chrome extension](https://alibaba.github.io/page-agent/docs/features/chrome-extension).
## 🚀 Quick Start
### One-line integration
Fastest way to try PageAgent with our free Demo LLM:
```html
<script src="https://cdn.jsdelivr.net/npm/page-agent@1.6.0/dist/iife/page-agent.demo.js"></script>
```
> **⚠️ For technical evaluation only.** This demo CDN uses our free [testing LLM API](https://alibaba.github.io/page-agent/docs/features/models#free-testing-api). By using it, you agree to its [terms](https://github.com/alibaba/page-agent/blob/main/docs/terms-and-privacy.md).
| Mirrors | URL |
| ------- | ---------------------------------------------------------------------------------- |
| Global | https://cdn.jsdelivr.net/npm/page-agent@1.6.0/dist/iife/page-agent.demo.js |
| China | https://registry.npmmirror.com/page-agent/1.6.0/files/dist/iife/page-agent.demo.js |
### NPM Installation
```bash
npm install page-agent
```
```javascript
import { PageAgent } from 'page-agent'
const agent = new PageAgent({
model: 'qwen3.5-plus',
baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
apiKey: 'YOUR_API_KEY',
language: 'en-US',
})
await agent.execute('Click the login button')
```
For more programmatic usage, see [📖 Documentation](https://alibaba.github.io/page-agent/docs/introduction/overview).
## 🤝 Contributing
We welcome contributions from the community! Follow our instructions in [CONTRIBUTING.md](CONTRIBUTING.md) for setup and guidelines.
Please read [Code of Conduct](docs/CODE_OF_CONDUCT.md) before contributing.
Contributions generated entirely by bots or agents without substantial human involvement will not be accepted, and bot accounts may be blocked.
## 👏 Acknowledgments
This project builds upon the excellent work of **[`browser-use`](https://github.com/browser-use/browser-use)**.
`PageAgent` is designed for **client-side web enhancement**, not server-side automation.
```
DOM processing components and prompt are derived from browser-use:
Browser Use
Copyright (c) 2024 Gregor Zunic
Licensed under the MIT License
We gratefully acknowledge the browser-use project and its contributors for their
excellent work on web automation and DOM interaction patterns that helped make
this project possible.
Third-party dependencies and their licenses can be found in the package.json
file and in the node_modules directory after installation.
```
## 📄 License
[MIT License](LICENSE)
---
**⭐ Star this repo if you find PageAgent helpful!**
================================================
FILE: SECURITY.md
================================================
# Security Policy
## Supported Versions
We provide security fixes on a best-effort basis for:
| Version | Supported |
| --------------------------------------------------------- | --------- |
| `main` | Yes |
| Latest npm release of `page-agent` and workspace packages | Yes |
| Older releases | No |
Please upgrade to the latest release before reporting an issue against an older build.
## Reporting a Vulnerability
Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests.
Use GitHub's private vulnerability reporting flow:
- Open https://github.com/alibaba/page-agent/security/policy
- Click `Report a vulnerability`
If private reporting is unavailable, open a minimal public issue only to request a private contact channel. Do not include exploit details.
## What to Include
- Affected package or feature
- Exact version, commit, or build
- Browser, OS, and runtime environment
- Reproduction steps or a proof of concept
- Expected impact
## Scope
We prioritize reports that show a real security boundary failure, such as:
- Unauthorized access to data, tokens, or extension capabilities
- Bypassing explicit safety constraints
- Sensitive data exposure caused by default behavior
The following usually do not qualify by themselves:
- Unsafe custom integrations that ignore documented safeguards
- Intentionally embedding secrets into client-side builds
- Reports against unsupported older versions
## Disclosure
Please avoid public disclosure until maintainers have had a reasonable chance to investigate and ship a fix.
================================================
FILE: docs/CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [1.6.0] - 2026-03-21
### Features
- **Beta MCP support** - New `@page-agent/mcp` package lets MCP clients such as Claude Desktop and Copilot control the browser through the Page Agent extension
- **Better iframe handling** - Same-origin iframe elements are handled more reliably during DOM extraction and actions
- **Extension history workflows** - Users can rerun past tasks, export history sessions as JSON, and approve MCP-triggered tasks before execution
### Improvements
- **Unified versioning across packages** - The extension now follows the root workspace version. Changelog entries are no longer split into a separate extension version section
- **Configurable `stepDelay`** - Agent pacing between steps is now configurable via `stepDelay`
- **Optional API key** - `apiKey` can now be omitted for compatible deployments that do not require one
- **Optional named tool choice** - Tool invocation can disable named tool choice for providers that behave better without it
- **Better rich-text input support** - Improved `contenteditable` handling with better event dispatching and `execCommand` fallback for more editors
- **More flexible DOM extraction** - `includeAttributes` now supports wildcards, `contenteditable` is included by default, and heuristically interactive elements expose more useful attributes
- **MiniMax model support** - Added MiniMax compatibility, with the default recommendation updated to `MiniMax-M2.7`
### Bug Fixes
- Fixed Safari issues when `requestIdleCallback` is unavailable
- Avoid throwing when `webgl2` initialization fails
- Improved OpenAI-compatible request patches for GPT-5.4 chat tools and MiniMax temperature/tool-call compatibility
- Fixed several UI polish issues in the extension and website, including cursor and layout regressions
## [1.5.1] - 2026-03-05
### Breaking Changes
- **`data-browser-use-ignore` → `data-page-agent-ignore`** - DOM ignore attribute renamed to match the project identity
- **Config types restructured** - `PageAgentConfig` split into `AgentConfig` + `PageAgentCoreConfig`; config definitions moved from `config/index.ts` to `types.ts`
- **Zod v3/v4 dual support** - Libraries now accept both `zod@^3.25` and `zod@^4.0` as peer dependencies
### Features
- **Experimental `llms.txt` support** - Agent can fetch and include a site's `llms.txt` in context. Enable via `experimentalLlmsTxt: true`
### Improvements
- Default `maxSteps` changed from 20 to 40 for better handling of complex tasks out of the box
- Added 400ms wait between agent steps for page reactions
- Increased click wait time (100ms → 200ms) for more reliable interactions
- Removed debug `console.log` statements from scroll actions
- Reset observations on new task start
- Improved logging across packages
### Extension v0.1.9
> PageAgent 1.5.1
- **Advanced config panel** - New collapsible section exposing Max Steps, System Instruction, and experimental `llms.txt` toggle
- Streamlined User Auth Token description
- Moved testing API notice below auth token section
---
## [1.4.0] - 2026-02-27
### Features
- Update Terms of Use and Privacy Policy
- **Robust tool-call validation** - Action inputs are now validated against tool schemas individually, producing clear error messages (e.g. `Invalid input for action "click_element_by_index"`) instead of unreadable union parse errors
- **Primitive action input coercion** - Small models that output `{"click_element_by_index": 2}` instead of `{"click_element_by_index": {"index": 2}}` are now auto-corrected using tool schemas
- **Qwen model updates** - Added `qwen3.5-plus` as the default free testing model; disabled `enable_thinking` for Qwen models to avoid incompatible responses
- **Updated default LLM endpoint** - Migrated demo and extension to a new testing endpoint with legacy endpoint auto-migration
### Improvements
- Unified zod imports (`* as z`) across all packages for consistency
- Better Zod error formatting with `z.prettifyError()` in LLM client
- Exported `InvokeError` and `InvokeErrorType` as values (not just types) from `@page-agent/llms`
- Exported `SupportedLanguage` type from `@page-agent/core`
### Extension v0.1.8
- **Language setting** - Added language selector (System / English / 中文) in config panel
- **UI makeover** - New empty state with breathing glow and typing animation; ai-motion glow overlay while running; refined focus styles
- **Testing endpoint notice** - Shows terms of use notice when using the free testing API
- **Legacy endpoint migration** - Auto-migrates old Supabase testing endpoint to new endpoint on startup
---
## [1.3.0] - 2026-02-13
### Breaking Changes
- **Lifecycle: `stop()` vs `dispose()`** - New `stop()` method to cancel the current task while keeping the agent reusable. `dispose()` is now terminal — a disposed agent cannot be reused. This affects both `PageAgentCore` and `PanelAgentAdapter`.
### Features
- **Panel action button** - The panel button now morphs between Stop (■) and Close (X) based on agent status
- **Error history** - Errors and max-step failures are now recorded in `history` as `AgentErrorEvent`, making post-task analysis more complete
### Bug Fixes
- **AbortError handling** - `AbortError` is no longer retried by the LLM client, and shows a clean "Task stopped" message instead of a raw error stack
---
## [1.2.0] - 2026-02-11
### Features
- **Observe Phase** - Agent now observes the page before each action, improving decision accuracy on dynamic pages
- **Better Abort Handling** - Improved `abortSignal` support for cleaner task cancellation
### Improvements
- Pruned system prompts for lower token usage and faster responses
- Improved error handling during agent steps with better error messages
- Zod tree-shaking for smaller bundle size
### Bug Fixes
- Fixed indentation lost in DOM extraction caused by `trimLines`
- Fixed `gpt-5-mini` temperature configuration
---
## [1.1.0] - 2026-02-02
### Features
- **Custom System Prompt** - New `systemPrompt` config option to customize or extend the default system prompt
- **Chrome Extension** - Extension with multi-tab control, main-world API with token auth, and tab lifecycle management
### Improvements
- Renamed `include_attributes` to `includeAttributes` in PageController config (camelCase consistency)
- Lazy-loaded mask module for faster initialization
- Better date formatting and error messages from LLM client
- Added `rawRequest` to step history for easier debugging
### Bug Fixes
- Fixed CSP errors by using local SVGs for cursor mask instead of inline styles
- Fixed `AbortError` being incorrectly retried and shown to users
- Fixed mask not working correctly when starting a new task after stopping a previous one
---
## [1.0.0] - 2026-01-19
### 🎉 First Stable Release
PageAgent is now ready for production use. The API is stable and breaking changes will follow semantic versioning.
### Features
#### Core
- **PageAgent** - Main entry class with built-in UI Panel
- **PageAgentCore** - Headless agent class for custom UI or programmatic use
- **DOM Analysis** - Text-based DOM extraction with high-intensity dehydration
- **LLM Support** - Works with OpenAI, Claude, DeepSeek, Qwen, and other OpenAI-compatible APIs
- **Tool System** - Built-in tools for click, input, scroll, select, and more
- **Custom Tools** - Extend agent capabilities with your own tools (experimental)
- **Lifecycle Hooks** - Hook into agent execution (experimental)
- **Instructions System** - System-level and page-level instructions to guide agent behavior
- **Data Masking** - Transform page content before sending to LLM
#### Page Controller
- **Element Interactions** - Click, input text, select options, scroll
- **Visual Mask** - Blocks user interaction during automation
- **DOM Tree Extraction** - Efficient page structure extraction for LLM consumption
#### UI
- **Interactive Panel** - Real-time task progress and agent thinking display
- **Ask User Tool** - Agent can ask users for clarification
- **i18n Support** - English and Chinese localization
### Packages
| Package | Description |
| ----------------------------- | ---------------------------------- |
| `page-agent` | Main entry with UI Panel |
| `@page-agent/core` | Core agent logic without UI |
| `@page-agent/llms` | LLM client with retry logic |
| `@page-agent/page-controller` | DOM operations and visual feedback |
| `@page-agent/ui` | Panel and i18n |
### Known Limitations
- Single-page application only (cannot navigate across pages)
- No visual recognition (relies on DOM structure)
- Limited interaction support (no hover, drag-drop, canvas operations)
- See [Limitations](https://alibaba.github.io/page-agent/docs/introduction/limitations) for details
### Acknowledgments
This project builds upon the excellent work of [browser-use](https://github.com/browser-use/browser-use). DOM processing components and prompts are adapted from browser-use (MIT License).
================================================
FILE: docs/CODE_OF_CONDUCT.md
================================================
# Alibaba Open Source Code of Conduct
[¶中文版](#我们的保证)
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at opensource@alibaba-inc.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
---
> Chinese Version
> 《阿里巴巴开源行为准则》
## 我们的保证
为了促进一个开放透明且友好的环境,我们作为贡献者和维护者保证:无论年龄、种族、民族、性别认同和表达(方式)、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向,参与者在我们项目和社区中都免于骚扰。
## 我们的标准
有助于创造正面环境的行为包括但不限于:
* 使用友好和包容性语言
* 尊重不同的观点和经历
* 耐心地接受建设性批评
* 关注对社区最有利的事情
* 友善对待其他社区成员
身为参与者不能接受的行为包括但不限于:
* 使用与性有关的言语或是图像,以及不受欢迎的性骚扰
* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论,人身攻击及政治攻击
* 公开或私下的骚扰
* 未经许可地发布他人的个人资料,例如住址或是电子地址
* 其他可以被合理地认定为不恰当或者违反职业操守的行为
## 我们的责任
项目维护者有责任为「可接受的行为」标准做出诠释,以及对已发生的不被接受的行为采取恰当且公平的纠正措施。
项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论 (comments)、提交 (commits)、代码、wiki 编辑、问题 (issues) 和其他贡献,以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。
## 使用范围
当一个人代表该项目或是其社区时,本行为标准适用于其项目平台和公共平台。
代表项目或是社区的情况,举例来说包括使用官方项目的电子邮件地址、通过官方的社交媒体账号发布内容,或在线上或线下活动中担任指定代表。
该项目的呈现方式可由其项目维护者进行进一步的定义及解释。
## 强制执行
可以通过 opensource@alibaba-inc.com 来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组对事件回报者有保密的义务。具体执行方针的进一步细节可能会单独公布。
没有切实地遵守或是执行本行为标准的项目维护人员,可能会因项目领导人或是其他成员的决定,暂时或是永久地取消其参与资格。
## 来源
本行为标准改编自[贡献者公约](https://www.contributor-covenant.org),版本 1.4
可在此查看[https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html)
================================================
FILE: docs/README-zh.md
================================================
# Page Agent
[License: MIT](https://opensource.org/licenses/MIT) [TypeScript](http://www.typescriptlang.org/) [Bundle Size](https://bundlephobia.com/package/page-agent) [npm](https://www.npmjs.com/package/page-agent) [GitHub](https://github.com/alibaba/page-agent)
纯 JS 实现的 GUI agent。使用自然语言操作你的 Web 应用。无须后端、客户端、浏览器插件。
🌐 [English](../README.md) | **中文**
🚀 Demo | 📖 Docs | 📢 HN Discussion | 𝕏 Follow on X
---
## ✨ Features
- **🎯 轻松集成**
- 无需 `浏览器插件` / `Python` / `无头浏览器`。
- 纯页面内 JavaScript,一切都在你的网页中完成。
- **📖 基于文本的 DOM 操作**
- 无需截图,无需多模态模型或特殊权限。
- **🧠 用你自己的 LLM**
- **🎨 精美 UI,支持人机协同**
- **🐙 可选的 [Chrome 扩展](https://alibaba.github.io/page-agent/docs/features/chrome-extension),支持跨页面任务。**
## 💡 应用场景
- **SaaS AI 副驾驶** — 几行代码为你的产品加上 AI 副驾驶,无需重写后端。
- **智能表单填写** — 把 20 次点击变成一句话。ERP、CRM、管理后台的最佳拍档。
- **无障碍增强** — 用自然语言让任何网页无障碍。语音指令、屏幕阅读器,零门槛。
- **跨页面 Agent** — 通过可选的 [Chrome 扩展](https://alibaba.github.io/page-agent/docs/features/chrome-extension),让你自己的 Agent 跨标签页工作。
## 🚀 快速开始
### 一行代码集成
通过我们免费的 Demo LLM 快速体验 PageAgent:
```html
<script src="https://cdn.jsdelivr.net/npm/page-agent@1.6.0/dist/iife/page-agent.demo.js"></script>
```
> **⚠️ 仅用于技术评估。** 该 Demo CDN 使用了免费的[测试 LLM API](https://alibaba.github.io/page-agent/docs/features/models#free-testing-api),使用即表示您同意其[条款](https://github.com/alibaba/page-agent/blob/main/docs/terms-and-privacy.md)。
| Mirrors | URL |
| ------- | ---------------------------------------------------------------------------------- |
| Global | https://cdn.jsdelivr.net/npm/page-agent@1.6.0/dist/iife/page-agent.demo.js |
| China | https://registry.npmmirror.com/page-agent/1.6.0/files/dist/iife/page-agent.demo.js |
### NPM 安装
```bash
npm install page-agent
```
```javascript
import { PageAgent } from 'page-agent'
const agent = new PageAgent({
model: 'qwen3.5-plus',
baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
apiKey: 'YOUR_API_KEY',
language: 'zh-CN',
})
await agent.execute('点击登录按钮')
```
更多编程用法,请参阅 [📖 文档](https://alibaba.github.io/page-agent/docs/introduction/overview)。
## 🤝 贡献
欢迎社区贡献!请参阅 [CONTRIBUTING.md](../CONTRIBUTING.md) 了解安装与贡献指南。请在贡献前阅读[行为准则](CODE_OF_CONDUCT.md)。
我们不接受未经实质性人类参与、完全由 Bot 或 Agent 自动生成的代码,机器人账号可能被禁止参与互动。
## 👏 致谢
本项目基于 **[`browser-use`](https://github.com/browser-use/browser-use)** 的优秀工作构建。
`PageAgent` 专为**客户端网页增强**设计,不是服务端自动化工具。
```
DOM processing components and prompt are derived from browser-use:
Browser Use
Copyright (c) 2024 Gregor Zunic
Licensed under the MIT License
We gratefully acknowledge the browser-use project and its contributors for their
excellent work on web automation and DOM interaction patterns that helped make
this project possible.
Third-party dependencies and their licenses can be found in the package.json
file and in the node_modules directory after installation.
```
## 📄 许可证
[MIT License](../LICENSE)
---
**⭐ 如果觉得 PageAgent 有用或有趣,请给项目点个星!**
================================================
FILE: docs/terms-and-privacy.md
================================================
# Terms of Use & Privacy
**Last updated:** March 2026
"We" in this document refers to the maintainers of the open-source Page Agent project (https://github.com/alibaba/page-agent). "The software" refers to Page Agent (the JavaScript library) and Page Agent Ext (the browser extension). This document covers the software itself and the testing API we provide — **not** any third-party product or service built with it.
---
## 1. Open Source Software Privacy
The software is a **client-side only** tool with a "Bring Your Own Key" (BYOK) architecture. The software itself does **not** include any backend service. The software does **not** collect or transmit any user data on its own, and we do **not** have access to your browsing activity, page content, or task instructions through the software.
All data transmission occurs **only** between your browser and the LLM provider you configure. You are in full control of which provider receives your data.
The project is open source under the [MIT License](https://github.com/alibaba/page-agent/blob/main/LICENSE) and can be audited at: https://github.com/alibaba/page-agent
---
## 2. Testing API and Demo Disclaimer & Terms of Use
To facilitate easy testing and technical evaluation, we provide a free testing LLM API. This API is used in the project homepage's live demo, the pre-built demo CDN bundles, and the browser extension's default configuration. Users may also use it independently for their own technical evaluation of the software.
This free testing API is provided **strictly for technical evaluation and R&D purposes only**. It must not be used in any production environment. By using this API, you agree to the following terms:
- **Permitted Use Only**: This API must be used solely for technical evaluation of the software. Any other use — including integration into other products or services, unlawful activities, violation of the underlying LLM provider's usage policies, or automated scraping at scale — is strictly prohibited.
- **No Sensitive Data**: You are strictly prohibited from inputting any Personally Identifiable Information (PII), confidential business data, financial/medical records, or using this agent on web pages containing such sensitive information.
- **Data Processing**: We do not store or log your prompts, webpage data (HTML), or any submitted content, nor do we use such data for model training. All data is processed in-transit and immediately discarded. We perform in-memory request validation to prevent abuse of the testing API, and temporarily process IP addresses for rate-limiting purposes. No data from these processes is retained. Data is processed through Alibaba Cloud infrastructure, which is subject to its own privacy policy.
- **Independent Infrastructure**: The software is completely frontend-based with a "Bring Your Own Key" (BYOK) architecture and **no built-in backend**. To facilitate easy testing, the maintainers have purchased public cloud services from Alibaba Cloud China ([aliyun.com](https://www.aliyun.com) Function Compute and BaiLian Qwen models). This project is not a product of, nor endorsed by, Alibaba Cloud.
- **No Guaranteed Availability**: This testing API may be rate-limited, degraded, or discontinued at any time without prior notice.
- **"AS IS" & Limitation of Liability**: This service is provided strictly on an "AS IS" and "AS AVAILABLE" basis, without any warranties. The maintainers bear no liability for any data loss, service interruption, or legal consequences arising from your use of this service.
- **Recommendation for Real Usage**: For secure and continuous usage, we strongly advise using the BYOK mode with your own legally compliant commercial LLM API keys, or connecting to local, offline models (e.g., Ollama).
**Note**: This free testing LLM API processes data via servers located in Mainland China. If you are located in a region with strict data localization laws (such as the EU/EEA), please do not use this API.
**Age Requirement**: The software and testing API are not intended for use by individuals under the age of 13 (or the minimum age of digital consent in your jurisdiction).
---
## 3. Browser Extension (Page Agent Ext)
### Data Processing
The extension performs DOM analysis and automation actions **locally in your browser**. Your browsing history, passwords, and form data are not accessed or collected by the extension developer.
Data is transmitted to external servers **only when you initiate an automation task**. When this occurs:
- Your task instructions (natural language commands)
- Simplified page structure (cleaned HTML) of all pages under the extension's control
are sent to the LLM API endpoint configured in **your settings**.
> **Note:** The HTML cleaning process simplifies page structure for AI readability but **does not guarantee removal of sensitive information** (e.g., visible text, form values, or personal data on the page). Please be mindful of the page content when initiating tasks.
**If you configure a third-party LLM provider** (e.g., OpenAI, Anthropic, or others), data is sent directly to that provider. Their privacy policies apply.
**If you use the testing API**, the terms in [Section 2](#2-testing-api-and-demo-disclaimer--terms-of-use) apply. By using the extension with the default testing API, you agree to those terms.
### Data Storage
- **Local storage only**: Your configuration (API endpoint, API key, model selection) is stored in your browser via `chrome.storage.local` (or equivalent browser storage APIs)
- **No cloud sync**: Configuration is not synced to any external server
- **No analytics**: The extension does not include any analytics or tracking code
### Your Control
- The extension is open source and can be audited by anyone
- You choose which LLM provider to use
- You may configure your own API endpoint at any time
- You can clear all stored data by removing the extension
---
## Changes
We may update these terms at our discretion.
## Contact
https://github.com/alibaba/page-agent/issues
================================================
FILE: eslint.config.js
================================================
import js from '@eslint/js'
import reactDom from 'eslint-plugin-react-dom'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import reactX from 'eslint-plugin-react-x'
import { defineConfig, globalIgnores } from 'eslint/config'
import globals from 'globals'
import tseslint from 'typescript-eslint'
/** Glob patterns that are excluded from linting in every package. */
const ignoredGlobs = [
	'**/dist',
	'**/node_modules',
	'packages/*/src/components/ui',
	'**/.wxt',
	'**/.output',
]

/**
 * Rules turned off for TypeScript sources. They are applied after the presets
 * listed in `extends`, so each entry here overrides the preset's setting.
 */
const disabledRules = [
	'@typescript-eslint/no-non-null-assertion',
	'@typescript-eslint/no-unsafe-assignment',
	'@typescript-eslint/no-unsafe-member-access',
	'@typescript-eslint/no-unsafe-call',
	'@typescript-eslint/no-explicit-any',
	'@typescript-eslint/no-empty-function',
	'@typescript-eslint/no-floating-promises',
	'@typescript-eslint/no-confusing-void-expression',
	'@typescript-eslint/no-unused-vars',
	'@typescript-eslint/no-inferrable-types',
	'@typescript-eslint/restrict-template-expressions',
	'@typescript-eslint/no-dynamic-delete',
	'@typescript-eslint/no-unnecessary-condition',
	'@typescript-eslint/prefer-nullish-coalescing',
	'@typescript-eslint/no-unnecessary-type-assertion',
	'@typescript-eslint/no-misused-promises',
	'@typescript-eslint/no-unsafe-argument',
	'@typescript-eslint/no-unsafe-return',
	'@typescript-eslint/restrict-plus-operands',
	'react-dom/no-missing-button-type',
	'react-x/no-nested-component-definitions',
	'@typescript-eslint/prefer-optional-chain',
	'@typescript-eslint/use-unknown-in-catch-callback-variable',
	'@typescript-eslint/no-unnecessary-type-parameters',
	'@typescript-eslint/require-await',
]

/** React Hooks plugin with its recommended rules; has no `files` filter. */
const reactHooksConfig = {
	plugins: {
		'react-hooks': reactHooks,
	},
	rules: reactHooks.configs.recommended.rules,
}

/** Type-aware linting for `.ts` and `.tsx` files. */
const typeScriptConfig = {
	files: ['**/*.{ts,tsx}'],
	extends: [
		js.configs.recommended,
		tseslint.configs.recommended,
		reactRefresh.configs.vite,
		// Type-checked presets: recommended, then strict, then stylistic.
		...tseslint.configs.recommendedTypeChecked,
		...tseslint.configs.strictTypeChecked,
		...tseslint.configs.stylisticTypeChecked,
		// React and React DOM lint rules.
		reactX.configs['recommended-typescript'],
		reactDom.configs.recommended,
	],
	languageOptions: {
		parserOptions: {
			// Uses the TypeScript project service instead of explicit `project` paths.
			projectService: true,
		},
		ecmaVersion: 2020,
		globals: globals.browser,
	},
	// Expand the list above into `{ 'rule-name': 'off', ... }`.
	rules: Object.fromEntries(disabledRules.map((ruleName) => [ruleName, 'off'])),
}

export default defineConfig([globalIgnores(ignoredGlobs), reactHooksConfig, typeScriptConfig])
================================================
FILE: package.json
================================================
{
"name": "root",
"private": true,
"version": "1.6.0",
"type": "module",
"workspaces": [
"packages/page-controller",
"packages/ui",
"packages/llms",
"packages/core",
"packages/page-agent",
"packages/mcp",
"packages/extension",
"packages/website"
],
"description": "AI-powered UI agent for web applications",
"author": "Simon",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/alibaba/page-agent.git"
},
"homepage": "https://alibaba.github.io/page-agent/",
"engines": {
"node": "^20.19.0 || ^22.13.0 || >=24"
},
"scripts": {
"start": "npm run dev --workspace=@page-agent/website",
"dev:ext": "npm run dev -w @page-agent/ext",
"dev:demo": "npm run dev:demo --workspace=page-agent",
"build": "npm run build:libs && npm run build:website",
"build:libs": "npm run build --workspaces --if-present",
"build:website": "npm run build:website --workspace=@page-agent/website",
"build:ext": "npm run build:libs && npm run zip -w @page-agent/ext",
"version": "node scripts/sync-version.js",
"lint": "eslint .",
"cleanup": "rm -rf packages/*/dist",
"prepare": "husky"
},
"devDependencies": {
"@commitlint/cli": "^20.5.0",
"@commitlint/config-conventional": "^20.5.0",
"@eslint/js": "^9.39.2",
"@microsoft/api-extractor": "^7.57.7",
"@tailwindcss/vite": "^4.2.1",
"@trivago/prettier-plugin-sort-imports": "^6.0.2",
"@types/node": "^25.5.0",
"@vitejs/plugin-react-swc": "^4.3.0",
"chalk": "^5.6.2",
"concurrently": "^9.2.1",
"dotenv": "^17.3.1",
"eslint": "^9.39.2",
"eslint-config-prettier": "^10.1.8",
"eslint-plugin-react-dom": "^2.13.0",
"eslint-plugin-react-hooks": "^7.0.1",
"eslint-plugin-react-refresh": "^0.5.2",
"eslint-plugin-react-x": "^2.13.0",
"globals": "^17.4.0",
"husky": "^9.1.7",
"lint-staged": "^16.4.0",
"prettier": "^3.8.0",
"typescript": "^5.9.3",
"typescript-eslint": "^8.57.1",
"unplugin-dts": "^1.0.0-beta.6",
"vite": "^7.3.1",
"vite-plugin-css-injected-by-js": "^4.0.1",
"vite-bundle-analyzer": "^1.3.6"
},
"overrides": {
"typescript": "^5.9.3"
},
"lint-staged": {
"*.{js,ts,cjs,cts,mjs,mts}": [
"npx prettier --write --ignore-unknown",
"npx eslint --quiet"
],
"*.{jsx,tsx}": [
"npx prettier --write --ignore-unknown",
"npx eslint --quiet"
],
"*.css": [
"npx prettier --write --ignore-unknown"
]
},
"commitlint": {
"extends": [
"@commitlint/config-conventional"
],
"rules": {
"subject-case": [
0,
"never"
]
}
},
"prettier": {
"singleQuote": true,
"semi": false,
"useTabs": true,
"printWidth": 100,
"trailingComma": "es5",
"plugins": [
"@trivago/prettier-plugin-sort-imports"
],
"importOrder": [
"",
"^(@/).*(?",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/alibaba/page-agent.git"
},
"homepage": "https://alibaba.github.io/page-agent/",
"scripts": {
"build": "vite build",
"dev:iife": "concurrently \"vite build --config vite.iife.config.js --watch\" \"npx serve dist/iife -p 5174\"",
"prepublishOnly": "node -e \"const fs=require('fs');['README.md','LICENSE'].forEach(f=>fs.copyFileSync('../../'+f,f))\"",
"postpublish": "node -e \"['README.md','LICENSE'].forEach(f=>{try{require('fs').unlinkSync(f)}catch{}})\""
},
"dependencies": {
"chalk": "^5.6.2",
"@page-agent/llms": "1.6.0",
"@page-agent/page-controller": "1.6.0"
},
"peerDependencies": {
"zod": "^3.25.0 || ^4.0.0"
},
"devDependencies": {
"zod": "^4.3.5"
}
}
================================================
FILE: packages/core/src/PageAgentCore.ts
================================================
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* Copyright (C) 2026 SimonLuvRamen
* All rights reserved.
*/
import { InvokeError, LLM, type Tool } from '@page-agent/llms'
import type { BrowserState, PageController } from '@page-agent/page-controller'
import chalk from 'chalk'
import * as z from 'zod/v4'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import type {
AgentActivity,
AgentConfig,
AgentReflection,
AgentStatus,
AgentStepEvent,
ExecutionResult,
HistoricalEvent,
MacroToolInput,
MacroToolResult,
} from './types'
import { assert, fetchLlmsTxt, normalizeResponse, uid, waitFor } from './utils'
export { tool, type PageAgentTool } from './tools'
export type * from './types'
/** Constructor configuration for PageAgentCore: the agent options plus the PageController instance the agent drives. */
export type PageAgentCoreConfig = AgentConfig & { pageController: PageController }
/**
* AI agent for browser automation.
*
* @remarks
* ## Re-act Agent Loop
* - step
* - observe (gather information about current environment and context)
* - think (LLM calling)
* - reflection (evaluate history, generate memory, short-term planning)
* - action (give the action to approach the next goal)
* - act (execute the action)
* - loop
*
* ## Event System
* - `statuschange` - Agent status transitions (idle → running → completed/error)
* - `historychange` - History events updated (persistent, part of agent memory)
* - `activity` - Real-time activity feedback (transient, for UI only)
* - `dispose` - Agent cleanup triggered
*
* ## Information Streams
* 1. **History Events** (`history` array)
* - Persistent event stream that forms agent's memory
* - Included in LLM context across steps
* - Types: steps, observations, user takeovers, llm errors
*
* 2. **Activity Events** (via `activity` event)
* - Transient UI feedback during task execution
* - NOT included in LLM context
* - Types: thinking, executing, executed, retrying, error
*/
export class PageAgentCore extends EventTarget {
readonly id = uid()
readonly config: PageAgentCoreConfig & { maxSteps: number }
readonly tools: typeof tools
/** PageController for DOM operations */
readonly pageController: PageController
task = ''
taskId = ''
/** History events */
history: HistoricalEvent[] = []
/** Whether this agent has been disposed */
disposed = false
/**
 * Callback for when agent needs user input (ask_user tool).
 * If not set, the ask_user tool is removed from the tool set when a task starts.
 * The callback is awaited and its resolved value is reported back to the model.
 * @param question - Question the model wants answered
 * @returns The user's answer
 * @example onAskUser: async (q) => window.prompt(q) || ''
 */
onAskUser?: (question: string) => Promise<string>
/** Current lifecycle status; only changed through #setStatus, which also emits `statuschange` */
#status: AgentStatus = 'idle'
/** LLM client built from the agent config; its `retry` and `error` events are mirrored into activity and history */
#llm: LLM
/** Cancels the in-flight task; replaced with a fresh controller at the start of every execute() call */
#abortController = new AbortController()
/** Observations queued by pushObservation(); flushed into `history` by #handleObservations() before each step */
#observations: string[] = []
/** internal states during a single task execution */
#states = {
/** Accumulated wait time in seconds */
totalWaitTime: 0,
/** For detecting navigation */
lastURL: '',
/** Browser state captured at the start of the current step; null until the first observation */
browserState: null as BrowserState | null,
}
/**
 * Create an agent bound to one page controller.
 *
 * Resolves the step limit (defaults to 40), creates the LLM client, copies the
 * built-in tool set, forwards LLM `retry` / `error` events into activity and
 * history, and finally applies tool customisation from the config.
 *
 * @param config - Agent options plus the page controller to drive
 */
constructor(config: PageAgentCoreConfig) {
	super()

	this.config = { ...config, maxSteps: config.maxSteps ?? 40 }
	this.#llm = new LLM(this.config)
	// Per-instance copy so customisation never touches the shared built-in map
	this.tools = new Map(tools)
	this.pageController = config.pageController

	// Mirror LLM retries into both the transient activity stream and the history panel
	this.#llm.addEventListener('retry', (event) => {
		const detail = (event as CustomEvent).detail
		const attempt = detail.attempt
		const maxAttempts = detail.maxAttempts

		this.#emitActivity({ type: 'retrying', attempt, maxAttempts })
		this.history.push({
			type: 'retry',
			message: `LLM retry attempt ${attempt} of ${maxAttempts}`,
			attempt,
			maxAttempts,
		})
		this.#emitHistoryChange()
	})

	// Mirror LLM errors the same way, except for aborts
	this.#llm.addEventListener('error', (event) => {
		const error = (event as CustomEvent).detail.error as Error | InvokeError
		const wasAborted = (error as any)?.rawError?.name === 'AbortError'
		if (wasAborted) return

		const message = String(error)
		this.#emitActivity({ type: 'error', message })
		this.history.push({
			type: 'error',
			message,
			rawResponse: (error as InvokeError).rawResponse,
		})
		this.#emitHistoryChange()
	})

	// Custom tools: `null` removes a tool, anything else adds or overrides it
	const customTools = this.config.customTools
	if (customTools) {
		Object.entries(customTools).forEach(([name, customTool]) => {
			if (customTool === null) {
				this.tools.delete(name)
			} else {
				this.tools.set(name, customTool)
			}
		})
	}

	// Script execution stays unavailable unless explicitly opted in
	const scriptToolEnabled = Boolean(this.config.experimentalScriptExecutionTool)
	if (!scriptToolEnabled) {
		this.tools.delete('execute_javascript')
	}
}
/** Get current agent status (read-only view of the private status field) */
get status(): AgentStatus {
return this.#status
}
/** Emit statuschange event. The event carries no payload. */
#emitStatusChange(): void {
this.dispatchEvent(new Event('statuschange'))
}
/** Emit historychange event. The event carries no payload. */
#emitHistoryChange(): void {
this.dispatchEvent(new Event('historychange'))
}
/**
* Emit activity event - for transient UI feedback.
* The activity object is delivered as the `detail` of a CustomEvent named `activity`.
* @param activity - Current agent activity
*/
#emitActivity(activity: AgentActivity): void {
this.dispatchEvent(new CustomEvent('activity', { detail: activity }))
}
/**
 * Move the agent to `status` and notify listeners.
 * Does nothing (and emits nothing) when the status is already `status`.
 */
#setStatus(status: AgentStatus): void {
	const unchanged = this.#status === status
	if (unchanged) return

	this.#status = status
	this.#emitStatusChange()
}
/**
* Queue an observation message for the history event stream.
* Once flushed it becomes part of the agent history and stays there across steps.
* The message is only buffered here; it is moved into `history` when the next step's
* observation phase runs.
* @experimental @internal
* @note history change will be emitted before next step starts
* @param content - Observation text to record
*/
pushObservation(content: string): void {
this.#observations.push(content)
}
/**
* Stop the current task. Agent remains reusable.
* Removes element highlights, hides the mask, and aborts the current abort signal.
* execute() installs a fresh AbortController, so a later task is unaffected.
*/
stop() {
this.pageController.cleanUpHighlights()
// hideMask() is not awaited here
this.pageController.hideMask()
this.#abortController.abort()
}
/**
 * Run a task to completion through the observe → think → act loop.
 *
 * Every iteration captures the browser state, flushes pending observations,
 * invokes the LLM with the single `AgentOutput` macro tool and records the
 * resulting step in `history`. The loop ends when the model calls `done`,
 * when a step throws (including an abort triggered by `stop()`), or once
 * `config.maxSteps` steps have been executed.
 *
 * @param task - Task description; must be non-empty
 * @returns The execution result. Failures inside the loop are reported as
 * `success: false` instead of being thrown.
 * @throws Error if the agent has been disposed or `task` is empty. Errors
 * thrown by `onBeforeTask` or `showMask()` are not caught here either.
 */
async execute(task: string): Promise<ExecutionResult> {
	if (this.disposed) throw new Error('PageAgent has been disposed. Create a new instance.')
	if (!task) throw new Error('Task is required')

	this.task = task
	this.taskId = uid()

	// Disable ask_user tool if onAskUser is not set.
	// Note: the removal is permanent for this instance's tool set.
	if (!this.onAskUser) {
		this.tools.delete('ask_user')
	}

	const { onBeforeStep, onAfterStep, onBeforeTask, onAfterTask } = this.config

	await onBeforeTask?.(this)

	// Show mask
	await this.pageController.showMask()

	// Cancel anything still tied to the previous controller, then start fresh
	this.#abortController.abort()
	this.#abortController = new AbortController()

	this.history = []
	this.#setStatus('running')
	this.#emitHistoryChange()
	this.#observations = []

	// Reset internal states
	this.#states = { totalWaitTime: 0, lastURL: '', browserState: null }

	/** Shared tail of every exit path: update status/UI, run onAfterTask, build the result */
	const finish = async (success: boolean, data: string): Promise<ExecutionResult> => {
		this.#onDone(success)
		const executionResult: ExecutionResult = { success, data, history: this.history }
		await onAfterTask?.(this, executionResult)
		return executionResult
	}

	let step = 0
	while (true) {
		try {
			console.group(`step: ${step}`)
			await onBeforeStep?.(this, step)

			// observe
			console.log(chalk.blue.bold('👀 Observing...'))
			this.#states.browserState = await this.pageController.getBrowserState()
			await this.#handleObservations(step)

			// assemble prompts
			const messages = [
				{ role: 'system' as const, content: this.#getSystemPrompt() },
				{ role: 'user' as const, content: await this.#assembleUserPrompt() },
			]
			const macroTool = { AgentOutput: this.#packMacroTool() }

			// invoke LLM
			console.log(chalk.blue.bold('🧠 Thinking...'))
			this.#emitActivity({ type: 'thinking' })
			const result = await this.#llm.invoke(messages, macroTool, this.#abortController.signal, {
				toolChoiceName: 'AgentOutput',
				normalizeResponse: (res) => normalizeResponse(res, this.tools),
			})

			// assemble history
			const macroResult = result.toolResult as MacroToolResult
			const input = macroResult.input
			const output = macroResult.output
			const reflection: Partial<AgentReflection> = {
				evaluation_previous_goal: input.evaluation_previous_goal,
				memory: input.memory,
				next_goal: input.next_goal,
			}
			// The macro tool input carries exactly one action keyed by tool name
			const actionName = Object.keys(input.action)[0]
			const action: AgentStepEvent['action'] = {
				name: actionName,
				input: input.action[actionName],
				output: output,
			}
			this.history.push({
				type: 'step',
				stepIndex: step,
				reflection,
				action,
				usage: result.usage,
				rawResponse: result.rawResponse,
				rawRequest: result.rawRequest,
			} as AgentStepEvent)
			this.#emitHistoryChange()

			await onAfterStep?.(this, this.history)
			console.groupEnd()

			// finish task if done
			if (actionName === 'done') {
				const success = action.input?.success ?? false
				const text = action.input?.text || 'no text provided'
				console.log(chalk.green.bold('Task completed'), success, text)
				// Awaited inside the try block so a failing onAfterTask is still routed to the catch below
				return await finish(success, text)
			}
		} catch (error: unknown) {
			console.groupEnd() // to prevent nested groups
			const isAbortError = (error as any)?.rawError?.name === 'AbortError'
			console.error('Task failed', error)
			const errorMessage = isAbortError ? 'Task stopped' : String(error)
			this.#emitActivity({ type: 'error', message: errorMessage })
			this.history.push({ type: 'error', message: errorMessage, rawResponse: error })
			this.#emitHistoryChange()
			return finish(false, errorMessage)
		}

		step++
		// `step` now equals the number of steps already executed. Stop once the budget
		// is used up, so that at most `maxSteps` steps ever run.
		if (step >= this.config.maxSteps) {
			const errorMessage = 'Step count exceeded maximum limit'
			this.history.push({ type: 'error', message: errorMessage })
			this.#emitHistoryChange()
			return finish(false, errorMessage)
		}

		await waitFor(this.config.stepDelay ?? 0.4)
	}
}
/**
 * Merge all tools into a single MacroTool with the following input:
 * - evaluation_previous_goal: string (optional)
 * - memory: string (optional)
 * - next_goal: string (optional)
 * - action: { toolName: toolInput }
 * where action must be selected from tools defined in this.tools
 *
 * Executing the macro tool logs the reflection fields, runs the selected tool
 * with the agent bound as `this`, emits `executing` / `executed` activities and
 * keeps the accumulated wait time up to date.
 *
 * @returns A tool whose result pairs the original input with the selected tool's output
 */
#packMacroTool(): Tool {
	const availableTools = this.tools

	// One `{ [toolName]: inputSchema }` object schema per tool, carrying the tool description
	const actionSchemas = Array.from(availableTools.entries()).map(([toolName, tool]) => {
		return z.object({ [toolName]: tool.inputSchema }).describe(tool.description)
	})

	const actionSchema = z.union(actionSchemas as unknown as [z.ZodType, z.ZodType, ...z.ZodType[]])

	const macroToolSchema = z.object({
		// thinking: z.string().optional(),
		evaluation_previous_goal: z.string().optional(),
		memory: z.string().optional(),
		next_goal: z.string().optional(),
		action: actionSchema,
	})

	return {
		description: 'You MUST call this tool every step!',
		inputSchema: macroToolSchema as z.ZodType,
		execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
			// abort
			// NOTE(review): this Error has the default name 'Error', while execute() detects aborts
			// via `rawError.name === 'AbortError'` — confirm the LLM layer maps it as intended.
			if (this.#abortController.signal.aborted) throw new Error('AbortError')

			console.log(chalk.blue.bold('MacroTool input'), input)

			// The action object holds a single entry keyed by the tool name
			const action = input.action
			const toolName = Object.keys(action)[0]
			const toolInput = action[toolName]

			// Build reflection text, only include non-empty fields
			const reflectionLines: string[] = []
			if (input.evaluation_previous_goal)
				reflectionLines.push(`✅: ${input.evaluation_previous_goal}`)
			if (input.memory) reflectionLines.push(`💾: ${input.memory}`)
			if (input.next_goal) reflectionLines.push(`🎯: ${input.next_goal}`)

			const reflectionText = reflectionLines.join('\n')
			if (reflectionText) {
				console.log(reflectionText)
			}

			// Find the corresponding tool
			const tool = availableTools.get(toolName)
			assert(tool, `Tool ${toolName} not found`)

			console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput)

			// Emit executing activity
			this.#emitActivity({ type: 'executing', tool: toolName, input: toolInput })

			const startTime = Date.now()

			// Execute tool, bind `this` to PageAgent
			const result = await tool.execute.bind(this)(toolInput)

			const duration = Date.now() - startTime
			console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)

			// Emit executed activity
			this.#emitActivity({
				type: 'executed',
				tool: toolName,
				input: toolInput,
				output: result,
				duration,
			})

			// counting wait time: consecutive waits accumulate, any other tool resets the counter
			if (toolName === 'wait') {
				this.#states.totalWaitTime += toolInput?.seconds || 0
			} else {
				this.#states.totalWaitTime = 0
			}

			// Return structured result
			return {
				input,
				output: result,
			}
		},
	}
}
/**
 * Build the system prompt sent to the LLM.
 * A configured `customSystemPrompt` is returned untouched; otherwise the bundled
 * prompt is used with its "Default working language" marker rewritten to match
 * the configured language (Chinese for `zh-CN`, English for everything else).
 */
#getSystemPrompt(): string {
	const { customSystemPrompt, language } = this.config
	if (customSystemPrompt) return customSystemPrompt

	let targetLanguage = 'English'
	if (language === 'zh-CN') {
		targetLanguage = '中文'
	}

	return SYSTEM_PROMPT.replace(
		/Default working language: \*\*.*?\*\*/,
		`Default working language: **${targetLanguage}**`
	)
}
/**
 * Collect the optional instruction text that is prepended to the user prompt.
 *
 * Sources, in output order:
 * 1. `config.instructions.system` (trimmed)
 * 2. `config.instructions.getPageInstructions(url)` for the current page URL (trimmed);
 *    a throwing callback is logged and ignored
 * 3. llms.txt content for the current URL, when `config.experimentalLlmsTxt` is enabled
 *
 * @returns The assembled instruction block, or an empty string when no source yields text
 */
async #getInstructions(): Promise<string> {
	const { instructions, experimentalLlmsTxt } = this.config

	const systemInstructions = instructions?.system?.trim()

	let pageInstructions: string | undefined
	const url = this.#states.browserState?.url || ''
	if (instructions?.getPageInstructions && url) {
		try {
			pageInstructions = instructions.getPageInstructions(url)?.trim()
		} catch (error) {
			// A broken user callback must not abort the step
			console.error(
				chalk.red('[PageAgent] Failed to execute getPageInstructions callback:'),
				error
			)
		}
	}

	const llmsTxt = experimentalLlmsTxt && url ? await fetchLlmsTxt(url) : undefined

	if (!systemInstructions && !pageInstructions && !llmsTxt) return ''

	// NOTE(review): the wrappers around each section are whitespace-only, so the three
	// sources are not labelled in the prompt — confirm this is intended.
	let result = '\n'
	if (systemInstructions) {
		result += `\n${systemInstructions}\n\n`
	}
	if (pageInstructions) {
		result += `\n${pageInstructions}\n\n`
	}
	if (llmsTxt) {
		result += `\n${llmsTxt}\n\n`
	}
	result += '\n\n'

	return result
}
/**
 * Generate system observations before each step and flush all queued
 * observations into `history`.
 *
 * Observations produced here:
 * - a warning once the accumulated wait time reaches 3 seconds
 * - a navigation notice when the URL differs from the last seen one
 *   (followed by a 0.5 second pause)
 * - step-budget warnings when `maxSteps - step` is exactly 5 or exactly 2
 *
 * @param step - 0-based index of the step about to run
 * @todo loop detection
 * @todo console error
 */
async #handleObservations(step: number): Promise<void> {
	// Accumulated wait time warning
	if (this.#states.totalWaitTime >= 3) {
		this.pushObservation(
			`You have waited ${this.#states.totalWaitTime} seconds accumulatively. ` +
				`DO NOT wait any longer unless you have a good reason.`
		)
	}

	// Detect URL change
	const currentURL = this.#states.browserState?.url || ''
	if (currentURL !== this.#states.lastURL) {
		this.pushObservation(`Page navigated to → ${currentURL}`)
		this.#states.lastURL = currentURL
		await waitFor(0.5) // wait for page to stabilize
	}

	// Remaining steps warning
	const remaining = this.config.maxSteps - step
	if (remaining === 5) {
		this.pushObservation(
			`⚠️ Only ${remaining} steps remaining. ` +
				`Consider wrapping up or calling done with partial results.`
		)
	} else if (remaining === 2) {
		this.pushObservation(
			`⚠️ Critical: Only ${remaining} steps left! You must finish the task or call done immediately.`
		)
	}

	// Push observations to history and emit (a single historychange for the whole batch)
	if (this.#observations.length > 0) {
		for (const content of this.#observations) {
			this.history.push({ type: 'observation', content })
			console.log(chalk.cyan('Observation:'), content)
		}
		this.#observations = []
		this.#emitHistoryChange()
	}
}
/**
 * Assemble the user message for the current step.
 *
 * Sections, in order: optional instructions, the task plus step counter and
 * current time, the event history (steps, observations, user takeovers), and
 * the current browser state (header, page content, footer).
 *
 * Reflection fields the model left out are rendered as empty text rather than
 * the literal string "undefined".
 *
 * @returns The complete user prompt
 */
async #assembleUserPrompt(): Promise<string> {
	// execute() stores the browser state before calling this method
	const browserState = this.#states.browserState!

	/** Optional reflection fields may be missing; never leak "undefined" into the prompt */
	const orEmpty = (value: string | undefined): string => value ?? ''

	let prompt = ''

	// Instructions (optional)
	prompt += await this.#getInstructions()

	// Agent state: the user request followed by step information
	const stepCount = this.history.filter((e) => e.type === 'step').length

	prompt += '\n'
	prompt += '\n'
	prompt += `${this.task}\n`
	prompt += '\n'
	prompt += '\n'
	prompt += `Step ${stepCount + 1} of ${this.config.maxSteps} max possible steps\n`
	prompt += `Current time: ${new Date().toLocaleString()}\n`
	prompt += '\n'
	prompt += '\n\n'

	// Agent history: one entry per step, plus observations and system messages
	prompt += '\n'

	// NOTE(review): stepIndex is counted but not interpolated into the output below —
	// presumably meant to label each step entry; confirm.
	let stepIndex = 0
	for (const event of this.history) {
		if (event.type === 'step') {
			stepIndex++
			prompt += `\n`
			prompt += `Evaluation of Previous Step: ${orEmpty(event.reflection.evaluation_previous_goal)}\n`
			prompt += `Memory: ${orEmpty(event.reflection.memory)}\n`
			prompt += `Next Goal: ${orEmpty(event.reflection.next_goal)}\n`
			prompt += `Action Results: ${event.action.output}\n`
			prompt += `\n`
		} else if (event.type === 'observation') {
			prompt += `${event.content}\n`
		} else if (event.type === 'user_takeover') {
			prompt += `User took over control and made changes to the page\n`
		} else if (event.type === 'error') {
			// Error events are mainly for panel rendering, not included in LLM context
			// to avoid polluting the agent's reasoning with transient errors
		}
	}

	prompt += '\n\n'

	// Browser state, with the page content optionally transformed by the user hook
	let pageContent = browserState.content
	if (this.config.transformPageContent) {
		pageContent = await this.config.transformPageContent(pageContent)
	}

	prompt += '\n'
	prompt += browserState.header + '\n'
	prompt += pageContent + '\n'
	prompt += browserState.footer + '\n\n'
	prompt += '\n\n'

	return prompt
}
/**
 * Wrap up a task: remove highlights, hide the mask, publish the final status
 * (`completed` or `error`) and abort the task's abort signal.
 * @param success - Whether the task finished successfully (defaults to true)
 */
#onDone(success = true) {
	let finalStatus: AgentStatus = 'error'
	if (success) {
		finalStatus = 'completed'
	}

	this.pageController.cleanUpHighlights()
	this.pageController.hideMask() // No await - fire and forget
	this.#setStatus(finalStatus)
	this.#abortController.abort()
}
/**
* Permanently shut down the agent.
* Marks the instance as disposed (execute() throws afterwards), disposes the page
* controller, aborts the current abort signal, then dispatches the `dispose` event
* and calls `config.onDispose` with the agent.
* The history is intentionally left intact.
* NOTE(review): there is no guard against repeated calls — every call repeats all of the above.
*/
dispose() {
console.log('Disposing PageAgent...')
this.disposed = true
this.pageController.dispose()
// this.history = []
this.#abortController.abort()
// Emit dispose event for UI cleanup
this.dispatchEvent(new Event('dispose'))
// The hook's return value is not awaited
this.config.onDispose?.(this)
}
}
================================================
FILE: packages/core/src/env.d.ts
================================================
///
// Ambient declaration so that `import text from './file.md?raw'` type-checks:
// the default export of any `*.md?raw` import is typed as a string.
declare module '*.md?raw' {
const content: string
export default content
}
================================================
FILE: packages/core/src/prompts/.prettierignore
================================================
system_prompt.md
================================================
FILE: packages/core/src/prompts/system_prompt.md
================================================
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in .
You excel at the following tasks:
1. Navigating complex websites and extracting precise information
2. Automating form submissions and interactive web actions
3. Gathering and saving information
4. Operate effectively in an agent loop
5. Efficiently performing diverse web tasks
- Default working language: **English**
- Use the language that user is using. Return in user's language.
At every step, your input will consist of:
1. : A chronological event stream including your previous actions and their results.
2. : Current and .
3. : Current URL, interactive elements indexed for actions, and visible page content.
Agent history will be given as a list of step information as follows:
:
Evaluation of Previous Step: Assessment of last action
Memory: Your memory of this step
Next Goal: Your goal for this step
Action Results: Your actions and their results
and system messages wrapped in tag.
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific - then carefully follow each step and don't skip or hallucinate steps.
- If the task is open ended you can plan yourself how to get it done.
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Interactive Elements: All interactive elements will be provided in format as [index]text where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Examples:
[33]
User form
\t*[35]
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
- Only use indexes that are explicitly provided.
- If the page changes after, for example, an input text action, analyze if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling actions if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- All scrollable elements are marked with the `data-scrollable` attribute, which includes the scrollable distance in every direction. You can scroll *the element* itself when some of its content overflows.
- If a captcha appears, tell user you can not solve captcha. Finish the task and ask user to solve it.
- If expected elements are missing, try scrolling, or navigating back.
- If the page is not fully loaded, use the `wait` action.
- Do not repeat one action for more than 3 times unless some conditions changed.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes a login prompt pops up accidentally even though some part of the page is still accessible, or you can get the information via web search.
- You can only handle single page app. Do not jump out of current page.
- Do not click on link if it will open in a new page (e.g., )
- It is ok to fail the task.
- User can be wrong. If the user's request is not achievable or is inappropriate, or you do not have enough information or tools to achieve it, tell the user to make a better request.
- Webpage can be broken. All webpages or apps have bugs. Some bug will make it hard for your job. It's encouraged to tell user the problem of current page. Your feedbacks (including failing) are valuable for user.
- Trying too hard can be harmful. Repeating some action back and forth or pushing for a complex procedure with little knowledge can cause unwanted results and harmful side-effects. User would rather you complete the task with a fail.
- If you do not have enough knowledge about the current webpage or task, you must ask the user for specific instructions and detailed steps.
You must call the `done` action in any of the following cases:
- When you have fully completed the USER REQUEST.
- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
- When you feel stuck or unable to solve user request. Or user request is not clear or contains inappropriate content.
- If it is ABSOLUTELY IMPOSSIBLE to continue.
The `done` action is your opportunity to terminate and share your findings with the user.
- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
- You can use the `text` field of the `done` action to communicate your findings and to provide a coherent reply to the user and fulfill the USER REQUEST.
- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
- If the user asks for a structured output, your `done` action's schema may be modified. Take this schema into account when solving the task!
Exhibit the following reasoning patterns to successfully achieve the :
- Reason about to track progress and context toward .
- Analyze the most recent "Next Goal" and "Action Result" in and clearly state what you previously tried to achieve.
- Analyze all relevant items in and to understand your state.
- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in . If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or ask user for help.
- Ask user for help if you have any difficulty. Keep user in the loop.
- If you see information relevant to , plan saving the information to memory.
- Always reason about the . Make sure to carefully analyze the specific steps and information required. E.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully if thats how the user requested it.
Here are examples of good output patterns. Use them as reference but never copy them directly.
"evaluation_previous_goal": "Successfully navigated to the product page and found the target information. Verdict: Success"
"evaluation_previous_goal": "Clicked the login button and user authentication form appeared. Verdict: Success"
"memory": "Found many pending reports that need to be analyzed in the main page. Successfully processed the first 2 reports on quarterly sales data and moving on to inventory analysis and customer feedback reports."
"next_goal": "Click on the 'Add to Cart' button to proceed with the purchase flow."
================================================
FILE: packages/core/src/tools/index.ts
================================================
/**
* Internal tools for PageAgent.
* @note Adapted from browser-use
*/
import * as z from 'zod/v4'
import type { PageAgentCore } from '../PageAgentCore'
import { waitFor } from '../utils'
/**
 * Internal tool definition that has access to PageAgent `this` context
 *
 * @typeParam TParams - Shape of the input handed to `execute`; defaults to `any`
 * so tools with different inputs can share one collection
 */
export interface PageAgentTool<TParams = any> {
	// name: string
	/** Description attached to the tool's action schema */
	description: string
	/** Zod schema describing the tool input */
	inputSchema: z.ZodType<TParams>
	/** Runs the tool with the agent bound as `this`; resolves to the text reported as the action result */
	execute: (this: PageAgentCore, args: TParams) => Promise<string>
}

/**
 * Identity helper for declaring a tool.
 * Returns `options` unchanged; it exists so the input type of `execute` is
 * checked against (and inferred from) `inputSchema`.
 *
 * @param options - The tool definition
 * @returns The same object that was passed in
 */
export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams> {
	return options
}
/**
* Internal tools for PageAgent, keyed by tool name.
* Each agent instance copies this map, so per-agent customisation does not change it.
* Note: Using any to allow different parameter types for each tool
* NOTE(review): without type arguments this is a Map of any to any — confirm whether
* key/value types were meant to be declared here.
*/
export const tools = new Map()
// `done`: terminal action. The agent's main loop reacts to the action name itself,
// so executing the tool only reports completion.
tools.set(
	'done',
	tool({
		description:
			'Complete task. Text is your final response to the user — keep it concise unless the user explicitly asks for detail.',
		inputSchema: z.object({
			text: z.string(),
			success: z.boolean().default(true),
		}),
		async execute(this: PageAgentCore) {
			// @note main loop will handle this one
			return 'Task completed'
		},
	})
)
// `wait`: pause for the requested number of seconds, minus the time that has already
// passed since the page controller's last update time.
tools.set(
	'wait',
	tool({
		description: 'Wait for x seconds. Can be used to wait until the page or data is fully loaded.',
		inputSchema: z.object({
			seconds: z.number().min(1).max(10).default(1),
		}),
		async execute(this: PageAgentCore, input) {
			// try to subtract LLM calling time from the actual wait time
			const lastUpdate = await this.pageController.getLastUpdateTime()
			const remainingSeconds = input.seconds - (Date.now() - lastUpdate) / 1000
			const actualWaitTime = Math.max(0, remainingSeconds)
			console.log(`actualWaitTime: ${actualWaitTime} seconds`)
			await waitFor(actualWaitTime)
			// The requested duration is reported, not the shortened one
			return `✅ Waited for ${input.seconds} seconds.`
		},
	})
)
// `ask_user`: forward a question to the agent's `onAskUser` callback and report the answer.
tools.set(
	'ask_user',
	tool({
		description:
			'Ask the user a question and wait for their answer. Use this if you need more information or clarification.',
		inputSchema: z.object({
			question: z.string(),
		}),
		async execute(this: PageAgentCore, input) {
			const hasCallback = Boolean(this.onAskUser)
			if (!hasCallback) {
				throw new Error('ask_user tool requires onAskUser callback to be set')
			}
			return `User answered: ${await this.onAskUser!(input.question)}`
		},
	})
)
// `click_element_by_index`: click the interactive element with the given index
// and return the page controller's result message.
tools.set(
	'click_element_by_index',
	tool({
		description: 'Click element by index',
		inputSchema: z.object({
			index: z.int().min(0),
		}),
		async execute(this: PageAgentCore, input) {
			const { message } = await this.pageController.clickElement(input.index)
			return message
		},
	})
)
// `input_text`: type text into the interactive element with the given index
// and return the page controller's result message.
tools.set(
	'input_text',
	tool({
		description: 'Click and type text into an interactive input element',
		inputSchema: z.object({
			index: z.int().min(0),
			text: z.string(),
		}),
		async execute(this: PageAgentCore, input) {
			const { message } = await this.pageController.inputText(input.index, input.text)
			return message
		},
	})
)
// `select_dropdown_option`: choose an option by its text on the element with the
// given index and return the page controller's result message.
tools.set(
	'select_dropdown_option',
	tool({
		description:
			'Select dropdown option for interactive element index by the text of the option you want to select',
		inputSchema: z.object({
			index: z.int().min(0),
			text: z.string(),
		}),
		async execute(this: PageAgentCore, input) {
			const { message } = await this.pageController.selectOption(input.index, input.text)
			return message
		},
	})
)
/**
 * `scroll`: vertical scrolling of the page or of the element at `index`.
 * The validated input is forwarded to the page controller unchanged, with
 * `num_pages` additionally exposed under the camelCase key `numPages`.
 * @note Reference from browser-use
 */
tools.set(
	'scroll',
	tool({
		description: 'Scroll the page vertically. Use index for scroll elements (dropdowns/custom UI).',
		inputSchema: z.object({
			down: z.boolean().default(true),
			num_pages: z.number().min(0).max(10).optional().default(0.1),
			pixels: z.number().int().min(0).optional(),
			index: z.number().int().min(0).optional(),
		}),
		async execute(this: PageAgentCore, input) {
			const scrollOptions = { ...input, numPages: input.num_pages }
			const { message } = await this.pageController.scroll(scrollOptions)
			return message
		},
	})
)
/**
 * `scroll_horizontally`: horizontal scrolling of the page or of the element at `index`.
 * The validated input is passed to the page controller as-is.
 * @todo Tables need a dedicated parser to extract structured data. This tool is useless.
 */
tools.set(
	'scroll_horizontally',
	tool({
		description:
			'Scroll the page horizontally, or within a specific element by index. Useful for wide tables.',
		inputSchema: z.object({
			right: z.boolean().default(true),
			pixels: z.number().int().min(0),
			index: z.number().int().min(0).optional(),
		}),
		async execute(this: PageAgentCore, input) {
			const { message } = await this.pageController.scrollHorizontally(input)
			return message
		},
	})
)
// `execute_javascript`: hand a script to the page controller for execution and
// return its result message. The agent removes this tool unless
// `experimentalScriptExecutionTool` is enabled in its config.
tools.set(
	'execute_javascript',
	tool({
		description:
			'Execute JavaScript code on the current page. Supports async/await syntax. Use with caution!',
		inputSchema: z.object({
			script: z.string(),
		}),
		async execute(this: PageAgentCore, input) {
			const { message } = await this.pageController.executeJavascript(input.script)
			return message
		},
	})
)
// @todo send_keys
// @todo upload_file
// @todo go_back
// @todo extract_structured_data
================================================
FILE: packages/core/src/types.ts
================================================
import type { LLMConfig } from '@page-agent/llms'
// @note circular dependency but okay
import type { PageAgentCore } from './PageAgentCore'
import type { PageAgentTool } from './tools'
/**
* Supported UI languages.
* Used by `AgentConfig.language`; the agent's default system prompt switches its
* working language to Chinese for 'zh-CN' and uses English otherwise.
*/
export type SupportedLanguage = 'en-US' | 'zh-CN'
/**
 * Options accepted by the agent, on top of the LLM connection settings
 * inherited from `LLMConfig`.
 */
export interface AgentConfig extends LLMConfig {
	/** Language of the default system prompt ('zh-CN' selects Chinese, anything else English) */
	language?: SupportedLanguage

	/**
	 * Maximum number of steps the agent can take per task.
	 * @default 40
	 */
	maxSteps?: number

	/**
	 * Custom tools to extend PageAgent capabilities
	 * @experimental
	 * @note You can also override or remove internal tools by using the same name.
	 * A `null` value removes the tool with that name.
	 * @see PageAgentTool
	 *
	 * @example
	 * // override internal tool
	 * import { z } from 'zod/v4'
	 * import { tool } from 'page-agent'
	 * const customTools = {
	 * 	ask_user: tool({
	 * 		description:
	 * 			'Ask the user or parent model a question and wait for their answer. Use this if you need more information or clarification.',
	 * 		inputSchema: z.object({
	 * 			question: z.string(),
	 * 		}),
	 * 		execute: async function (this: PageAgent, input) {
	 * 			const answer = await do_some_thing(input.question)
	 * 			return "✅ Received user answer: " + answer
	 * 		},
	 * 	})
	 * }
	 *
	 * @example
	 * // remove internal tool
	 * const customTools = {
	 * 	ask_user: null // never ask user questions
	 * }
	 */
	customTools?: Record<string, PageAgentTool | null>

	/**
	 * Instructions to guide the agent's behavior
	 */
	instructions?: {
		/**
		 * Global system-level instructions, applied to all tasks
		 */
		system?: string

		/**
		 * Dynamic page-level instructions callback
		 * Called before each step to get instructions for the current page
		 * @param url - Current page URL (window.location.href)
		 * @returns Instructions string, or undefined/null to skip
		 */
		getPageInstructions?: (url: string) => string | undefined | null
	}

	/**
	 * Lifecycle hooks for task execution.
	 * @experimental API may change in future versions.
	 *
	 * All hooks receive the agent instance as first parameter.
	 * The step and task hooks are awaited, so an async hook delays the loop.
	 */

	/**
	 * Called before each step execution.
	 * @experimental
	 * @param agent - The PageAgentCore instance
	 * @param stepCount - Current step number (0-indexed)
	 */
	onBeforeStep?: (agent: PageAgentCore, stepCount: number) => Promise<void> | void

	/**
	 * Called after each step execution.
	 * @experimental
	 * @param agent - The PageAgentCore instance
	 * @param history - Current history of events
	 */
	onAfterStep?: (agent: PageAgentCore, history: HistoricalEvent[]) => Promise<void> | void

	/**
	 * Called before task execution starts.
	 * @experimental
	 * @param agent - The PageAgentCore instance
	 */
	onBeforeTask?: (agent: PageAgentCore) => Promise<void> | void

	/**
	 * Called after task execution completes (success or failure).
	 * @experimental
	 * @param agent - The PageAgentCore instance
	 * @param result - The execution result
	 */
	onAfterTask?: (agent: PageAgentCore, result: ExecutionResult) => Promise<void> | void

	/**
	 * Called when the agent is disposed.
	 * @experimental
	 * @note This hook is invoked synchronously at the end of disposal; a returned
	 * promise is not awaited, so an async hook cannot delay or block disposal.
	 * @param agent - The PageAgentCore instance
	 * @param reason - Optional reason for disposal (PageAgentCore.dispose() does not pass one)
	 */
	onDispose?: (agent: PageAgentCore, reason?: string) => void

	// page behavior hooks

	/**
	 * @experimental
	 * Enable the experimental script execution tool that allows executing generated JavaScript code on the page.
	 * @note Can cause unpredictable side effects.
	 * @note May bypass some safe guards and data-masking mechanisms.
	 */
	experimentalScriptExecutionTool?: boolean

	/**
	 * @experimental
	 * Fetch /llms.txt from current site origin and include as context.
	 * Only fetched once per origin per task.
	 * @default false
	 */
	experimentalLlmsTxt?: boolean

	/**
	 * Transform page content before sending to LLM.
	 * Called after DOM extraction and simplification, before LLM invocation.
	 * Use cases: inspect extraction results, modify page info, mask sensitive data.
	 *
	 * @param content - Simplified page content that will be sent to LLM
	 * @returns Transformed content
	 *
	 * @example
	 * // Mask phone numbers
	 * transformPageContent: async (content) => {
	 * 	return content.replace(/1[3-9]\d{9}/g, '***********')
	 * }
	 */
	transformPageContent?: (content: string) => Promise<string> | string

	/**
	 * Completely override the default system prompt.
	 * @experimental Use with caution - incorrect prompts may break agent behavior.
	 */
	customSystemPrompt?: string

	/**
	 * Delay between steps in seconds.
	 * @default 0.4
	 */
	stepDelay?: number
}
/**
* Agent reflection state - the reflection-before-action model
*
* Every tool call must first reflect on:
* - evaluation_previous_goal: How well did the previous action achieve its goal?
* - memory: Key information to remember for future steps
* - next_goal: What should be accomplished in the next action?
*/
export interface AgentReflection {
/** How well the previous action achieved its goal. */
evaluation_previous_goal: string
/** Key information to remember for future steps. */
memory: string
/** What should be accomplished in the next action. */
next_goal: string
}
/**
* MacroTool input structure
*
* This is the core abstraction that enforces the "reflection-before-action" mental model.
* Before executing any action, the LLM must output its reasoning state.
*
* NOTE(review): `Partial` and `Record` appear below without type arguments, which
* TypeScript does not accept for these generic utility types. Presumably
* `Partial<AgentReflection>` and a string-keyed record are intended — confirm.
*/
export interface MacroToolInput extends Partial {
/** The action to execute. */
action: Record
}
/**
* MacroTool output structure
*/
export interface MacroToolResult {
/** The MacroTool input (reflection fields plus action). */
input: MacroToolInput
/** Textual output. */
output: string
}
/**
* A single agent step with reflection and action
*
* NOTE(review): `reflection` is typed as a bare `Partial` below; presumably
* `Partial<AgentReflection>` is intended — confirm.
*/
export interface AgentStepEvent {
/** Discriminant that identifies this history event as a step. */
type: 'step'
/** Index of this step. Presumably 0-based, like the `stepCount` passed to `onBeforeStep` — confirm. */
stepIndex: number
/** Reflection fields recorded for this step. */
reflection: Partial
/** The action of this step: tool name, its input, and its textual output. */
action: {
name: string
input: any
output: string
}
/** Token usage recorded for this step; the cached/reasoning counts are optional. */
usage: {
promptTokens: number
completionTokens: number
totalTokens: number
cachedTokens?: number
reasoningTokens?: number
}
/** Raw LLM response for debugging */
rawResponse?: unknown
/** Raw LLM request for debugging */
rawRequest?: unknown
}
/**
* Persistent observation event (stays in memory)
*/
export interface ObservationEvent {
/** Discriminant that identifies this history event as an observation. */
type: 'observation'
/** Text of the observation. */
content: string
}
/**
* User takeover event
*/
export interface UserTakeoverEvent {
/** Discriminant that identifies this history event as a user takeover; the event carries no other data. */
type: 'user_takeover'
}
/**
* Retry event - LLM call is being retried
*/
export interface RetryEvent {
/** Discriminant that identifies this history event as a retry. */
type: 'retry'
/** Message describing the retry. */
message: string
/** Number of the current attempt. NOTE(review): 0- or 1-based is not visible here — confirm. */
attempt: number
/** Maximum number of attempts. */
maxAttempts: number
}
/**
* Error event - fatal error from LLM or execution
*/
export interface AgentErrorEvent {
/** Discriminant that identifies this history event as an error. */
type: 'error'
/** Error message. */
message: string
/** Optional raw response attached to the error. */
rawResponse?: unknown
}
/**
* Union type for all history events
*/
// Discriminated union: every member carries a distinct literal `type`
// ('step' | 'observation' | 'user_takeover' | 'retry' | 'error'), so consumers
// can narrow with a `switch (event.type)`.
export type HistoricalEvent =
| AgentStepEvent
| ObservationEvent
| UserTakeoverEvent
| RetryEvent
| AgentErrorEvent
/**
* Agent execution status
*/
// One of four string literals; this is the value handed to `onStatusChange` callbacks.
export type AgentStatus = 'idle' | 'running' | 'completed' | 'error'
/**
* Agent activity - transient state for immediate UI feedback.
*
* Unlike historical events (which are persisted), activities are ephemeral
* and represent "what the agent is doing right now". UI components should
* listen to 'activity' events to show real-time feedback.
*
* Note: There is no 'idle' activity - absence of activity events means idle.
*/
export type AgentActivity =
// no payload
| { type: 'thinking' }
// a tool is about to run: tool name and its input
| { type: 'executing'; tool: string; input: unknown }
// a tool has run: adds its textual output and a duration (unit not visible here — presumably ms; confirm)
| { type: 'executed'; tool: string; input: unknown; output: string; duration: number }
// a retry is in progress: current attempt and the maximum number of attempts
| { type: 'retrying'; attempt: number; maxAttempts: number }
// an error occurred: carries the message
| { type: 'error'; message: string }
/**
 * Result of a task execution (this is the value passed to the `onAfterTask` hook).
 */
export interface ExecutionResult {
/** Whether the task succeeded. */
success: boolean
/** Result text of the task. */
data: string
/** History events recorded for the task. */
history: HistoricalEvent[]
}
================================================
FILE: packages/core/src/utils/autoFixer.ts
================================================
import { InvokeError, InvokeErrorType } from '@page-agent/llms'
import chalk from 'chalk'
import * as z from 'zod/v4'
import type { PageAgentTool } from '../tools'
// `console.log` pre-bound with a yellow "[autoFixer]" prefix, used to trace every applied fix-up.
const log = console.log.bind(console, chalk.yellow('[autoFixer]'))
/**
 * Normalize LLM response and fix common format issues.
 *
 * Handles:
 * - No tool_calls but JSON in message.content (fallback)
 * - Model returns action name as tool call instead of AgentOutput
 * - Arguments wrapped as double JSON string
 * - Nested function call format
 * - Missing action field (fallback to wait)
 * - Primitive action input for single-field tools (e.g. `{"click_element_by_index": 2}`)
 * - etc.
 *
 * @param response - Raw chat-completion style response; only `choices[0]` is inspected.
 * @param tools - Optional tool registry keyed by tool name. When given, the resolved
 *   action is validated (and coerced) against the matching tool's input schema.
 * @returns A shallow copy of `response` whose first choice carries exactly one
 *   `AgentOutput` tool call with the normalized, JSON-stringified arguments.
 * @throws Error when there is no choice or message, when neither a tool call nor
 *   JSON content is present, or when the resolved arguments are not a JSON object.
 * @throws InvokeError (from `validateAction`) for unknown actions or invalid action input.
 */
export function normalizeResponse(response: any, tools?: Map<string, PageAgentTool>): any {
	let resolvedArguments = null as any

	// `?.` on the response itself so a null/undefined response reports "No choices"
	// instead of a TypeError
	const choice = (response as { choices?: Choice[] } | null | undefined)?.choices?.[0]
	if (!choice) throw new Error('No choices in response')

	const message = choice.message
	if (!message) throw new Error('No message in choice')

	const toolCall = message.tool_calls?.[0]

	// fix level and location of arguments
	if (toolCall?.function?.arguments) {
		resolvedArguments = safeJsonParse(toolCall.function.arguments)

		// case: sometimes the model only returns the action level
		if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
			log(`#1: fixing tool_call`)
			resolvedArguments = { action: safeJsonParse(resolvedArguments) }
		}
	} else {
		// case: sometimes the model returns json in content instead of tool_calls
		if (message.content) {
			const content = message.content.trim()
			const jsonInContent = retrieveJsonFromString(content)
			if (jsonInContent) {
				resolvedArguments = safeJsonParse(jsonInContent)

				// case: sometimes the content json includes upper level wrapper
				if (resolvedArguments?.name === 'AgentOutput') {
					log(`#2: fixing tool_call`)
					resolvedArguments = safeJsonParse(resolvedArguments.arguments)
				}

				// case: sometimes even 2-levels of wrapping
				if (resolvedArguments?.type === 'function') {
					log(`#3: fixing tool_call`)
					resolvedArguments = safeJsonParse(resolvedArguments.function.arguments)
				}

				// case: and sometimes action level only
				// todo: needs better detection logic
				if (
					!resolvedArguments?.action &&
					!resolvedArguments?.evaluation_previous_goal &&
					!resolvedArguments?.memory &&
					!resolvedArguments?.next_goal &&
					!resolvedArguments?.thinking
				) {
					log(`#4: fixing tool_call`)
					resolvedArguments = { action: safeJsonParse(resolvedArguments) }
				}
			} else {
				throw new Error('No tool_call and the message content does not contain valid JSON')
			}
		} else {
			throw new Error('No tool_call nor message content is present')
		}
	}

	// fix double stringified arguments
	resolvedArguments = safeJsonParse(resolvedArguments)

	// The steps below read and write `resolvedArguments.action`. Reject anything that
	// is not a plain JSON object here: `null` would throw an opaque TypeError, a
	// primitive cannot take the fallback action, and an array would silently drop
	// the `action` property when stringified.
	if (
		typeof resolvedArguments !== 'object' ||
		resolvedArguments === null ||
		Array.isArray(resolvedArguments)
	) {
		throw new Error('Tool call arguments are not a JSON object')
	}

	if (resolvedArguments.action) {
		resolvedArguments.action = safeJsonParse(resolvedArguments.action)
	}

	// validate and fix action input using tool schemas
	if (resolvedArguments.action && tools) {
		resolvedArguments.action = validateAction(resolvedArguments.action, tools)
	}

	// fix incomplete formats
	if (!resolvedArguments.action) {
		log(`#5: fixing tool_call`)
		resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
	}

	// pack back to standard format
	return {
		...response,
		choices: [
			{
				...choice,
				message: {
					...message,
					tool_calls: [
						{
							...(toolCall || {}),
							function: {
								...(toolCall?.function || {}),
								name: 'AgentOutput',
								arguments: JSON.stringify(resolvedArguments),
							},
						},
					],
				},
			},
		],
	}
}
/**
* Validate action against tool schemas. Provides clear error messages
* instead of letting the union schema produce unreadable errors.
*
* Also coerces primitive inputs for single-field tools:
* e.g. `{"click_element_by_index": 2}` → `{"click_element_by_index": {"index": 2}}`
*
* Only the first key of `action` is treated as the tool name; the returned object
* contains that key alone, mapped to the schema-parsed input.
*
* @param action - Candidate action. Non-objects, `null`, and objects without keys are returned unchanged.
* @param tools - Tool registry looked up by tool name.
* @returns `{ [toolName]: parsedInput }` on success.
* @throws InvokeError with `INVALID_TOOL_ARGS` when the tool name is unknown or the input fails schema validation.
*
* NOTE(review): `Map` and `Record` appear below without type arguments; presumably
* `Map<string, PageAgentTool>` and a string-keyed record of zod types — confirm.
*/
function validateAction(action: any, tools: Map): any {
if (typeof action !== 'object' || action === null) return action
// the tool name is the first (normally the only) key of the action object
const toolName = Object.keys(action)[0]
if (!toolName) return action
const tool = tools.get(toolName)
if (!tool) {
const available = Array.from(tools.keys()).join(', ')
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
`Unknown action "${toolName}". Available: ${available}`
)
}
let value = action[toolName]
const schema = tool.inputSchema
// coerce primitive input for single-field tools
if (schema instanceof z.ZodObject && value !== null && typeof value !== 'object') {
// a field counts as required when its schema rejects `undefined`; the first such field receives the primitive
const requiredKey = Object.keys(schema.shape).find(
(k) => !(schema.shape as Record)[k].safeParse(undefined).success
)
if (requiredKey) {
log(`coercing primitive action input for "${toolName}"`)
value = { [requiredKey]: value }
}
}
const result = schema.safeParse(value)
if (!result.success) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
`Invalid input for action "${toolName}": ${z.prettifyError(result.error)}`
)
}
// return the parsed data so schema defaults/coercions are carried forward
return { [toolName]: result.data }
}
/**
 * Parse `input` as JSON when it is a string.
 *
 * @param input - Any value. Strings are trimmed and parsed; everything else passes through.
 * @returns The parsed JSON value, or `input` itself when it is not a string or is not valid JSON.
 */
function safeJsonParse(input: any): any {
	// non-strings are already "parsed"
	if (typeof input !== 'string') return input

	try {
		return JSON.parse(input.trim())
	} catch {
		// not JSON: hand the original string back untouched
		return input
	}
}
/**
 * Extract and parse a JSON object embedded in a string.
 * - The candidate is everything from the first `{` to the last `}` (greedy match).
 * - Returns the parsed value when that candidate is valid JSON.
 * - Returns `null` when there is no `{…}` span or the span does not parse.
 */
function retrieveJsonFromString(str: string): any {
	try {
		const match = /({[\s\S]*})/.exec(str)
		return match === null ? null : JSON.parse(match[0])
	} catch {
		return null
	}
}
/**
 * Shape of one entry in `response.choices`, as read by `normalizeResponse`.
 * Every field is optional; only `message.content` and the first `message.tool_calls`
 * entry are inspected there.
 */
interface Choice {
message?: {
role?: 'assistant'
/** Plain-text content; searched for embedded JSON when no tool call is present. */
content?: string
tool_calls?: {
id?: string
type?: 'function'
function?: {
name?: string
/** Tool-call arguments as a string (parsed with `safeJsonParse`). */
arguments?: string
}
}[]
}
index?: 0
finish_reason?: 'tool_calls'
}
================================================
FILE: packages/core/src/utils/index.ts
================================================
import chalk from 'chalk'
export * from './autoFixer'
/**
 * Pause for the given number of seconds.
 *
 * @param seconds - Delay in seconds (multiplied by 1000 for `setTimeout`).
 * @returns A promise that resolves with no value once the timer fires.
 */
export async function waitFor(seconds: number): Promise<void> {
	await new Promise<void>((resolve) => setTimeout(resolve, seconds * 1000))
}
//
/**
 * Shorten `text` to its first `maxLength` characters and append `'...'`.
 * Text that is not longer than `maxLength` is returned unchanged, so a
 * truncated result is `maxLength + 3` characters long.
 */
export function truncate(text: string, maxLength: number): string {
	return text.length > maxLength ? text.substring(0, maxLength) + '...' : text
}
//
/**
 * Generate a short random base-36 ID (at most 9 characters).
 *
 * @param existingIDs - Optional list of IDs already in use; when given, the result is
 *   regenerated until it is not in the list.
 * @returns A random ID not contained in `existingIDs`.
 * @throws Error after more than 1000 regeneration attempts.
 */
export function randomID(existingIDs?: string[]): string {
	const generate = (): string => Math.random().toString(36).substring(2, 11)

	let candidate = generate()
	if (!existingIDs) {
		return candidate
	}

	const MAX_TRY = 1000
	for (let attempt = 1; existingIDs.includes(candidate); attempt++) {
		candidate = generate()
		if (attempt > MAX_TRY) {
			throw new Error('randomID: too many tries')
		}
	}
	return candidate
}
//
// Registry of issued IDs, stored on `globalThis` under `__PAGE_AGENT_IDS__`.
// It is created only when absent, so an already-existing array is reused.
const _global = globalThis as any
if (!_global.__PAGE_AGENT_IDS__) {
_global.__PAGE_AGENT_IDS__ = []
}
// Array of IDs handed out so far; `uid()` appends to it.
const ids = _global.__PAGE_AGENT_IDS__
/**
 * Generate a random ID and record it in the global ID registry.
 * @note Unique within this window.
 * @returns An ID that was not in the registry before this call.
 */
export function uid() {
	const fresh = randomID(ids)
	ids.push(fresh)
	return fresh
}
/** Per-origin cache of `/llms.txt` lookups; a `null` value records "tried and not found". */
const llmsTxtCache = new Map<string, string | null>()

/**
 * Fetch /llms.txt for a URL's origin. Cached per origin, `null` = tried and not found.
 *
 * @param url - Any URL; only its origin is used.
 * @returns The file content (cut with `truncate(…, 1000)` when longer than 1000 chars),
 *   or `null` when the URL is invalid, has an opaque origin, or the fetch fails
 *   or returns a non-OK status.
 */
export async function fetchLlmsTxt(url: string): Promise<string | null> {
	let origin: string
	try {
		origin = new URL(url).origin
	} catch {
		return null // Invalid URL
	}

	// about:blank, data:, file:
	if (origin === 'null') return null

	// cached values are never `undefined`, so one lookup distinguishes hit from miss
	const cached = llmsTxtCache.get(origin)
	if (cached !== undefined) return cached

	const endpoint = `${origin}/llms.txt`
	let result: string | null = null
	try {
		console.log(chalk.gray(`[llms.txt] Fetching ${endpoint}`))
		// give up after 3 seconds so a slow site cannot stall the agent
		const res = await fetch(endpoint, { signal: AbortSignal.timeout(3000) })
		if (res.ok) {
			result = await res.text()
			console.log(chalk.green(`[llms.txt] Found (${result.length} chars)`))
			if (result.length > 1000) {
				console.log(chalk.yellow(`[llms.txt] Truncating to 1000 chars`))
				result = truncate(result, 1000)
			}
		} else {
			console.debug(chalk.gray(`[llms.txt] ${res.status} for ${endpoint}`))
		}
	} catch (e) {
		// best effort: network errors and timeouts are treated as "not found"
		console.debug(chalk.gray(`[llms.txt] not found for ${endpoint}`), e)
	}

	// failures are cached too, so each origin is fetched at most once per completed lookup
	llmsTxtCache.set(origin, result)
	return result
}
/**
 * Minimal assertion helper: throws when `condition` is falsy and narrows it otherwise.
 * @param condition - The value that must be truthy
 * @param message - Optional error message (defaults to 'Assertion failed')
 * @param silent - When truthy, skip the `console.error` log and only throw
 * @throws Error if condition is falsy
 */
export function assert(condition: unknown, message?: string, silent?: boolean): asserts condition {
	if (condition) return

	const errorMessage = message ?? 'Assertion failed'
	if (!silent) console.error(chalk.red(`❌ assert: ${errorMessage}`))
	throw new Error(errorMessage)
}
================================================
FILE: packages/core/tsconfig.dts.json
================================================
{
"extends": "./tsconfig.json",
"compilerOptions": {
// @workaround DTS bug
// dts do not work with monorepo path mapping
// disable path mapping for it
"paths": {}
}
}
================================================
FILE: packages/core/tsconfig.json
================================================
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo",
"noEmit": false,
"allowImportingTsExtensions": false,
"baseUrl": ".",
"outDir": "dist",
"paths": {
//
"@page-agent/llms": ["../llms/src/index.ts"],
"@page-agent/page-controller": ["../page-controller/src/PageController.ts"]
}
},
"include": ["**/*.ts"],
"exclude": ["dist", "node_modules"],
"references": [
//
{ "path": "../llms" },
{ "path": "../page-controller" }
]
}
================================================
FILE: packages/core/vite.config.js
================================================
// @ts-check
import { dirname, resolve } from 'path'
import dts from 'unplugin-dts/vite'
import { fileURLToPath } from 'url'
import { defineConfig } from 'vite'
import cssInjectedByJsPlugin from 'vite-plugin-css-injected-by-js'
const __dirname = dirname(fileURLToPath(import.meta.url))
// ES Module for NPM Package
// Library-mode build: a single entry (src/PageAgentCore.ts) emitted as an ES module into dist/esm.
export default defineConfig({
clearScreen: false,
plugins: [
// type declarations are generated with tsconfig.dts.json (the variant that clears `paths`)
dts({ tsconfigPath: './tsconfig.dts.json', bundleTypes: true }),
cssInjectedByJsPlugin({ relativeCSSInjection: true }),
],
publicDir: false,
esbuild: {
keepNames: true,
},
build: {
lib: {
entry: resolve(__dirname, 'src/PageAgentCore.ts'),
name: 'PageAgentCore',
fileName: 'page-agent-core',
formats: ['es'],
},
outDir: resolve(__dirname, 'dist', 'esm'),
rollupOptions: {
// modules listed here are left as imports instead of being bundled
external: [
'chalk',
'zod',
'zod/v4',
// all the internal packages
/^@page-agent\//,
],
},
minify: false,
sourcemap: true,
cssCodeSplit: true,
},
define: {
// replaced at build time with the literal string "production"
'process.env.NODE_ENV': '"production"',
},
})
================================================
FILE: packages/extension/.prettierignore
================================================
.wxt
src/components/ui
================================================
FILE: packages/extension/PRIVACY.md
================================================
# Privacy Policy for Page Agent Extension
This document has moved. Please see our full **[Terms of Use & Privacy](../../docs/terms-and-privacy.md)**.
Online: https://github.com/alibaba/page-agent/blob/main/docs/terms-and-privacy.md
================================================
FILE: packages/extension/components.json
================================================
{
"$schema": "https://ui.shadcn.com/schema.json",
"style": "new-york",
"rsc": false,
"tsx": true,
"tailwind": {
"config": "",
"css": "src/assets/index.css",
"baseColor": "neutral",
"cssVariables": true,
"prefix": ""
},
"iconLibrary": "lucide",
"aliases": {
"components": "@/components",
"utils": "@/lib/utils",
"ui": "@/components/ui",
"lib": "@/lib",
"hooks": "@/lib/hooks"
},
"registries": {
"@magicui": "https://magicui.design/r/{name}.json"
}
}
================================================
FILE: packages/extension/docs/extension_api.md
================================================
# Page Agent Extension API
Integrate the Page Agent extension into your web app and trigger multi-page browser tasks from page JavaScript.
## Installation
### 1. Install the browser extension
Primary channel:
- Chrome Web Store: https://chromewebstore.google.com/detail/page-agent-ext/akldabonmimlicnjlflnapfeklbfemhj
Latest updates are often published earlier on:
- GitHub Releases: https://github.com/alibaba/page-agent/releases
### 2. Install type definitions (recommended)
```bash
npm install @page-agent/core --save-dev
```
### 3. Authorization (Token)
The token allows your page JS to call the extension API (`window.PAGE_AGENT_EXT`) and execute multi-page tasks.
Why token-based access is required:
- The extension has broad browser permissions (page access, navigation, multi-tab control).
- If abused, it can harm user privacy and security.
- Users must explicitly provide the token only to applications they trust.
Setup:
1. Open the extension side panel and copy your auth token.
2. Set the token in your page:
```typescript
localStorage.setItem('PageAgentExtUserAuthToken', 'your-token')
```
## Quick Start
```typescript
import type {
AgentActivity,
AgentStatus,
ExecutionResult,
HistoricalEvent,
} from '@page-agent/core'
// Wait for extension injection (up to 1 second)
async function waitForExtension(timeout = 1000): Promise<boolean> {
const start = Date.now()
while (Date.now() - start < timeout) {
if (window.PAGE_AGENT_EXT) return true
await new Promise((r) => setTimeout(r, 100))
}
return false
}
// Usage
if (await waitForExtension()) {
const result = await window.PAGE_AGENT_EXT!.execute('Click the login button', {
baseURL: 'https://api.openai.com/v1',
apiKey: 'your-api-key',
model: 'gpt-5.2',
onStatusChange: (status) => console.log('Status:', status),
onActivity: (activity) => console.log('Activity:', activity),
})
console.log('Result:', result)
}
```
## Global API
After token match, the extension injects APIs into `window`.
### `window.PAGE_AGENT_EXT_VERSION`
Extension version string (for capability checks before using the main API).
### `window.PAGE_AGENT_EXT`
Main namespace object.
#### `PAGE_AGENT_EXT.execute(task, config)`
Execute one agent task.
Parameters:
| Name | Type | Required | Description |
| ---- | ---- | -------- | ----------- |
| `task` | `string` | Yes | Task description |
| `config` | `ExecuteConfig` | Yes | LLM settings, options, and callbacks |
Returns: `Promise<ExecutionResult>`
#### `PAGE_AGENT_EXT.stop()`
Stop the current task.
## Types
Install `@page-agent/core` for complete types:
```typescript
import type {
AgentActivity,
AgentStatus,
ExecutionResult,
HistoricalEvent,
} from '@page-agent/core'
export interface ExecuteConfig {
baseURL: string
model: string
apiKey?: string
// Include the initial tab where page JS starts. Default: true.
includeInitialTab?: boolean
onStatusChange?: (status: AgentStatus) => void
onActivity?: (activity: AgentActivity) => void
onHistoryUpdate?: (history: HistoricalEvent[]) => void
}
export type Execute = (task: string, config: ExecuteConfig) => Promise<ExecutionResult>
```
`AgentStatus`
```typescript
type AgentStatus = 'idle' | 'running' | 'completed' | 'error'
```
`AgentActivity`
```typescript
type AgentActivity =
| { type: 'thinking' }
| { type: 'executing'; tool: string; input: unknown }
| { type: 'executed'; tool: string; input: unknown; output: string; duration: number }
| { type: 'retrying'; attempt: number; maxAttempts: number }
| { type: 'error'; message: string }
```
`HistoricalEvent`
```typescript
type HistoricalEvent =
| { type: 'step'; stepIndex: number; reflection: AgentReflection; action: Action }
| { type: 'observation'; content: string }
| { type: 'user_takeover' }
| { type: 'retry'; message: string; attempt: number; maxAttempts: number }
| { type: 'error'; message: string; rawResponse?: unknown }
```
`ExecutionResult`
```typescript
interface ExecutionResult {
success: boolean
data: string
history: HistoricalEvent[]
}
```
## Usage Examples
### Basic Execution
```typescript
const result = await window.PAGE_AGENT_EXT!.execute(
'Fill in the email field with test@example.com and click Submit',
{
baseURL: 'https://api.openai.com/v1',
apiKey: process.env.OPENAI_API_KEY!,
model: 'gpt-5.2',
includeInitialTab: false, // Optional: exclude current tab
onStatusChange: (status) => console.log(status),
onActivity: (activity) => console.log(activity),
}
)
```
### Stop the Current Task
```typescript
window.PAGE_AGENT_EXT!.stop()
```
## Window Type Declaration
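Add the following declaration to your project so that TypeScript knows about the `window` globals (it reuses the types exported by `@page-agent/core`):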
```typescript
import type {
AgentActivity,
AgentStatus,
ExecutionResult,
HistoricalEvent,
} from '@page-agent/core'
interface ExecuteConfig {
baseURL: string
model: string
apiKey?: string
includeInitialTab?: boolean
onStatusChange?: (status: AgentStatus) => void
onActivity?: (activity: AgentActivity) => void
onHistoryUpdate?: (history: HistoricalEvent[]) => void
}
type Execute = (task: string, config: ExecuteConfig) => Promise<ExecutionResult>
declare global {
interface Window {
PAGE_AGENT_EXT_VERSION?: string
PAGE_AGENT_EXT?: {
version: string
execute: Execute
stop: () => void
}
}
}
```
================================================
FILE: packages/extension/package.json
================================================
{
"name": "@page-agent/ext",
"private": true,
"version": "1.6.0",
"type": "module",
"scripts": {
"dev": "wxt",
"build:ext": "wxt build",
"zip": "wxt zip",
"postinstall": "wxt prepare"
},
"devDependencies": {
"@radix-ui/react-hover-card": "^1.1.15",
"@radix-ui/react-icons": "^1.3.2",
"@radix-ui/react-label": "^2.1.8",
"@radix-ui/react-separator": "^1.1.8",
"@radix-ui/react-slot": "^1.2.4",
"@radix-ui/react-switch": "^1.2.6",
"@types/chrome": "^0.1.37",
"@types/react": "^19.2.14",
"@types/react-dom": "^19.2.1",
"@wxt-dev/module-react": "^1.2.2",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"idb": "^8.0.3",
"lucide-react": "^0.577.0",
"motion": "^12.37.0",
"next-themes": "^0.4.6",
"react": "^19.2.4",
"react-dom": "^19.2.4",
"rough-notation": "^0.5.1",
"simple-icons": "^16.12.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.5.0",
"tailwindcss": "^4.1.14",
"tw-animate-css": "^1.4.0",
"wxt": "^0.20.19"
},
"dependencies": {
"@page-agent/core": "1.6.0",
"@page-agent/llms": "1.6.0",
"@page-agent/page-controller": "1.6.0",
"@page-agent/ui": "1.6.0",
"ai-motion": "^0.4.8",
"chalk": "^5.6.2"
},
"peerDependencies": {
"zod": "^3.25.0 || ^4.0.0"
}
}
================================================
FILE: packages/extension/public/_locales/en/messages.json
================================================
{
"extName": {
"message": "Page Agent Ext"
},
"extDescription": {
"message": "AI-powered browser automation assistant. Control web pages with natural language."
},
"extActionTitle": {
"message": "Open Page Agent"
}
}
================================================
FILE: packages/extension/public/_locales/zh_CN/messages.json
================================================
{
"extName": {
"message": "Page Agent Ext"
},
"extDescription": {
"message": "AI 驱动的浏览器自动化助手,用自然语言控制网页。"
},
"extActionTitle": {
"message": "打开 Page Agent"
}
}
================================================
FILE: packages/extension/src/agent/.prettierignore
================================================
system_prompt.md
================================================
FILE: packages/extension/src/agent/MultiPageAgent.ts
================================================
import { type AgentConfig, PageAgentCore } from '@page-agent/core'
import { RemotePageController } from './RemotePageController'
import { TabsController } from './TabsController'
import SYSTEM_PROMPT from './system_prompt.md?raw'
import { createTabTools } from './tabTools'
/**
 * Pick the working language from the browser's locale settings.
 * Reads `navigator.language`, then the first entry of `navigator.languages`,
 * and falls back to 'en-US'. Any locale starting with "zh" maps to 'zh-CN'.
 */
function detectLanguage(): 'en-US' | 'zh-CN' {
	const browserLocale = navigator.language || navigator.languages?.[0] || 'en-US'
	if (browserLocale.startsWith('zh')) return 'zh-CN'
	return 'en-US'
}
/**
 * MultiPageAgent
 * - use with extension
 * - can be used from a side panel or a content script
 *
 * Wires a `TabsController` and a `RemotePageController` into `PageAgentCore`, installs
 * the tab tools as `customTools`, and overrides the system prompt with the extension
 * prompt (working language taken from `config.language` or the browser locale).
 *
 * @note `customTools`, `customSystemPrompt`, `pageController` and the lifecycle hooks
 *   passed in `config` are overridden by the values set here.
 */
export class MultiPageAgent extends PageAgentCore {
	/**
	 * @param config - Agent configuration. `includeInitialTab` (default `true`) is
	 *   forwarded to `TabsController.init` when a task starts.
	 */
	constructor(config: AgentConfig & { includeInitialTab?: boolean }) {
		// multi page controller
		const tabsController = new TabsController()
		const pageController = new RemotePageController(tabsController)
		const customTools = createTabTools(tabsController)

		// system prompt - auto-detect language if not specified
		const language = config.language ?? detectLanguage()
		const targetLanguage = language === 'zh-CN' ? '中文' : 'English'
		const systemPrompt = SYSTEM_PROMPT.replace(
			/Default working language: \*\*.*?\*\*/,
			`Default working language: **${targetLanguage}**`
		)

		// include initial tab for controlling
		const includeInitialTab = config.includeInitialTab ?? true

		/**
		 * When the agent is in side-panel and user closed the side-panel.
		 * There is no chance for isAgentRunning to be set false.
		 * (unload event doesn't work well in side panel.)
		 * (I'm trying not to use long-lived connection because the lifecycle of a sw is hard to predict.)
		 * This heartbeat mechanism acts as a backup.
		 */
		let heartBeatInterval: null | number = null

		/** Stop the heartbeat timer if one is running. Safe to call repeatedly. */
		const stopHeartbeat = (): void => {
			if (heartBeatInterval !== null) {
				window.clearInterval(heartBeatInterval)
				heartBeatInterval = null
			}
		}

		super({
			...config,

			pageController: pageController as any,
			customTools: customTools,
			customSystemPrompt: systemPrompt,

			onBeforeTask: async (agent) => {
				await tabsController.init(agent.task, includeInitialTab)

				// never leave an earlier timer running if a task starts again
				stopHeartbeat()
				heartBeatInterval = window.setInterval(() => {
					chrome.storage.local.set({
						agentHeartbeat: Date.now(),
					})
				}, 1_000)

				await chrome.storage.local.set({
					isAgentRunning: true,
				})
			},

			onAfterTask: async () => {
				stopHeartbeat()

				await chrome.storage.local.set({
					isAgentRunning: false,
				})
			},

			onBeforeStep: async () => {
				const tabId = tabsController.currentTabId
				if (!tabId) return

				// make sure the current tab is loaded before the step starts
				await tabsController.waitUntilTabLoaded(tabId)
			},

			onDispose: () => {
				stopHeartbeat()

				// not awaited: this hook is synchronous
				chrome.storage.local.set({
					isAgentRunning: false,
				})

				tabsController.dispose()
			},
		})
	}
}
================================================
FILE: packages/extension/src/agent/RemotePageController.background.ts
================================================
/**
 * background logics for RemotePageController
 * - redirect messages from RemotePageController(Agent, extension pages) to ContentScript
 *
 * @param message - A `PAGE_CONTROL` message. `get_my_tab_id` is answered here; every
 *   other action is forwarded to the content script in tab `targetTabId`.
 * @param sender - Sender of the message; its tab id answers `get_my_tab_id`.
 * @param sendResponse - Callback used to reply to the sender.
 * @returns `true` when the reply will be sent asynchronously, `undefined` when it
 *   has already been sent synchronously.
 */
export function handlePageControlMessage(
	message: { type: 'PAGE_CONTROL'; action: string; payload: any; targetTabId: number },
	sender: chrome.runtime.MessageSender,
	sendResponse: (response: unknown) => void
): true | undefined {
	const PREFIX = '[RemotePageController.background]'

	function debug(...messages: any[]) {
		console.debug(`\x1b[90m${PREFIX}\x1b[0m`, ...messages)
	}

	const { action, payload, targetTabId } = message

	if (action === 'get_my_tab_id') {
		debug('get_my_tab_id', sender.tab?.id)
		sendResponse({ tabId: sender.tab?.id || null })
		return
	}

	// `chrome.tabs.sendMessage` throws synchronously on a non-numeric tab id, which
	// would skip the `.catch` below and leave the sender without a reply.
	if (typeof targetTabId !== 'number') {
		sendResponse({
			success: false,
			error: `Invalid targetTabId for PAGE_CONTROL action "${action}": ${String(targetTabId)}`,
		})
		return
	}

	// proxy to content script
	chrome.tabs
		.sendMessage(targetTabId, {
			type: 'PAGE_CONTROL',
			action,
			payload,
		})
		.then((result) => {
			sendResponse(result)
		})
		.catch((error) => {
			console.error(PREFIX, error)
			sendResponse({
				success: false,
				error: error instanceof Error ? error.message : String(error),
			})
		})

	return true // async response
}
================================================
FILE: packages/extension/src/agent/RemotePageController.content.ts
================================================
/**
* content script for RemotePageController
*/
import { PageController } from '@page-agent/page-controller'
/**
 * Set up the content-script side of the remote page controller:
 * - asks the background for this tab's id,
 * - polls extension storage every 500 ms to show/hide the mask and to dispose the
 *   local `PageController` once the agent is no longer running,
 * - executes `PAGE_CONTROL` actions on a lazily created `PageController` and replies
 *   with the result.
 */
export function initPageController() {
	let pageController: PageController | null = null

	const myTabIdPromise = chrome.runtime
		.sendMessage({ type: 'PAGE_CONTROL', action: 'get_my_tab_id' })
		.then((response) => {
			return (response as { tabId: number | null }).tabId
		})
		.catch((error) => {
			console.error('[RemotePageController.ContentScript]: Failed to get my tab id', error)
			return null
		})

	/** Return the page controller, creating it on first use. */
	function getPC(): PageController {
		if (!pageController) {
			pageController = new PageController({ enableMask: false, viewportExpansion: 400 })
		}
		return pageController
	}

	window.setInterval(async () => {
		// one read returns a consistent snapshot of all three keys
		const { agentHeartbeat, isAgentRunning, currentTabId } = await chrome.storage.local.get([
			'agentHeartbeat',
			'isAgentRunning',
			'currentTabId',
		])

		const now = Date.now()
		// the agent counts as alive only if its heartbeat is less than 2 seconds old
		const agentInTouch = typeof agentHeartbeat === 'number' && now - agentHeartbeat < 2_000

		const shouldShowMask = isAgentRunning && agentInTouch && currentTabId === (await myTabIdPromise)

		if (shouldShowMask) {
			const pc = getPC()
			pc.initMask()
			await pc.showMask()
		} else {
			// only touch the controller if it already exists; do not create one just to hide the mask
			if (pageController) {
				pageController.hideMask()
				pageController.cleanUpHighlights()
			}
		}

		if (!isAgentRunning && agentInTouch) {
			if (pageController) {
				pageController.dispose()
				pageController = null
			}
		}
	}, 500)

	chrome.runtime.onMessage.addListener((message, sender, sendResponse): true | undefined => {
		if (message.type !== 'PAGE_CONTROL') {
			// not addressed to this handler: return without responding
			return
		}

		const { action, payload } = message
		const methodName = getMethodName(action)

		const pc = getPC() as any

		switch (action) {
			case 'get_last_update_time':
			case 'get_browser_state':
			case 'update_tree':
			case 'clean_up_highlights':
			case 'click_element':
			case 'input_text':
			case 'select_option':
			case 'scroll':
			case 'scroll_horizontally':
			case 'execute_javascript':
				// The async wrapper still calls the method synchronously, but it also turns
				// synchronous throws and non-promise return values into a normal reply
				// instead of an exception that leaves the sender waiting.
				void (async () => {
					try {
						sendResponse(await pc[methodName](...(payload || [])))
					} catch (error) {
						sendResponse({
							success: false,
							error: error instanceof Error ? error.message : String(error),
						})
					}
				})()
				break

			default:
				sendResponse({
					success: false,
					error: `Unknown PAGE_CONTROL action: ${action}`,
				})
		}

		return true
	})
}
/** Maps each `PAGE_CONTROL` action name to the `PageController` method that implements it. */
const ACTION_METHOD_NAMES: ReadonlyMap<string, string> = new Map([
	['get_last_update_time', 'getLastUpdateTime'],
	['get_browser_state', 'getBrowserState'],
	['update_tree', 'updateTree'],
	['clean_up_highlights', 'cleanUpHighlights'],
	// DOM actions
	['click_element', 'clickElement'],
	['input_text', 'inputText'],
	['select_option', 'selectOption'],
	['scroll', 'scroll'],
	['scroll_horizontally', 'scrollHorizontally'],
	['execute_javascript', 'executeJavascript'],
])

/**
 * Translate a snake_case action name into the matching `PageController` method name.
 * Unknown actions are returned unchanged.
 */
function getMethodName(action: string): string {
	return ACTION_METHOD_NAMES.get(action) ?? action
}
================================================
FILE: packages/extension/src/agent/RemotePageController.ts
================================================
import type { BrowserState } from '@page-agent/page-controller'
import type { TabsController } from './TabsController'
/** Tag placed in front of every log line emitted by this module. */
const PREFIX = '[RemotePageController]'

/** Write a debug line whose prefix is wrapped in ANSI grey (`\x1b[90m … \x1b[0m`). */
function debug(...messages: any[]) {
	const greyTag = `\x1b[90m${PREFIX}\x1b[0m`
	console.debug(greyTag, ...messages)
}
/**
 * Send a `PAGE_CONTROL` message through `chrome.runtime.sendMessage`.
 *
 * @param message - The message, including the action name and the target tab id.
 * @returns The response, or `null` when sending fails (the error is logged). Both a
 *   rejected promise and a synchronous throw from `chrome.runtime.sendMessage` are
 *   reported this way, so callers must be prepared for `null`.
 */
async function sendMessage(message: {
	type: 'PAGE_CONTROL'
	action: string
	targetTabId: number
	payload?: any
}): Promise<any> {
	try {
		return await chrome.runtime.sendMessage(message)
	} catch (error) {
		console.error(PREFIX, message.action, error)
		return null
	}
}
/**
 * Agent side page controller.
 * - live in the agent env (extension page or content script)
 * - communicates with remote PageController via sw
 *
 * Every operation is sent as a `PAGE_CONTROL` message addressed to the tab that
 * `tabsController.currentTabId` points at. Tabs whose URL fails
 * `isContentScriptAllowed` are never messaged.
 */
export class RemotePageController {
	tabsController: TabsController

	constructor(tabsController: TabsController) {
		this.tabsController = tabsController
	}

	/** ID of the tab currently being controlled, or `null` when there is none. */
	get currentTabId(): number | null {
		return this.tabsController.currentTabId
	}

	/** URL of the current tab, or `''` when there is no current tab or no URL is known. */
	private async getCurrentUrl(): Promise<string> {
		if (!this.currentTabId) return ''
		// getTabInfo may yield nothing when the background request failed.
		const info = await this.tabsController.getTabInfo(this.currentTabId)
		return info?.url || ''
	}

	/** Title of the current tab, or `''` when there is no current tab or no title is known. */
	private async getCurrentTitle(): Promise<string> {
		if (!this.currentTabId) return ''
		const info = await this.tabsController.getTabInfo(this.currentTabId)
		return info?.title || ''
	}

	/**
	 * Ask the current tab for its last update time.
	 * @throws Error when there is no current tab.
	 */
	async getLastUpdateTime(): Promise<number> {
		if (!this.currentTabId) throw new Error('tabsController not initialized.')

		return sendMessage({
			type: 'PAGE_CONTROL',
			action: 'get_last_update_time',
			targetTabId: this.currentTabId,
		})
	}

	/**
	 * Collect the browser state of the current tab.
	 *
	 * A placeholder state (URL and title only) is used when there is no current
	 * tab, when the page cannot run content scripts, or when the page did not
	 * answer. The tab summary from `tabsController.summarizeTabs()` is always
	 * prepended to `header`.
	 */
	async getBrowserState(): Promise<BrowserState> {
		debug('getBrowserState', this.currentTabId)

		const currentUrl = await this.getCurrentUrl()
		const currentTitle = await this.getCurrentTitle()

		let browserState: BrowserState | null = null
		if (this.currentTabId && isContentScriptAllowed(currentUrl)) {
			browserState = await sendMessage({
				type: 'PAGE_CONTROL',
				action: 'get_browser_state',
				targetTabId: this.currentTabId,
			})
		}

		// sendMessage resolves to null on failure; fall back to the placeholder
		// rather than crashing on `browserState.header` below.
		if (!browserState) {
			browserState = {
				url: currentUrl,
				title: currentTitle,
				header: '',
				content: '(empty page. either current page is not readable or not loaded yet.)',
				footer: '',
			}
		}

		const sum = await this.tabsController.summarizeTabs()
		browserState.header = sum + '\n\n' + (browserState.header || '')

		debug('getBrowserState: success', this.currentTabId, browserState)
		return browserState
	}

	/** Send `update_tree` to the current tab; no-op when the tab cannot be scripted. */
	async updateTree(): Promise<void> {
		if (!this.currentTabId || !isContentScriptAllowed(await this.getCurrentUrl())) {
			return
		}

		await sendMessage({
			type: 'PAGE_CONTROL',
			action: 'update_tree',
			targetTabId: this.currentTabId,
		})
	}

	/** Send `clean_up_highlights` to the current tab; no-op when the tab cannot be scripted. */
	async cleanUpHighlights(): Promise<void> {
		if (!this.currentTabId || !isContentScriptAllowed(await this.getCurrentUrl())) {
			return
		}

		await sendMessage({
			type: 'PAGE_CONTROL',
			action: 'clean_up_highlights',
			targetTabId: this.currentTabId,
		})
	}

	async clickElement(...args: any[]): Promise<DomActionReturn> {
		const res = await this.remoteCallDomAction('click_element', args)
		// @note may cause page navigation, wait for 1 second to ensure the page loading started
		await new Promise((resolve) => setTimeout(resolve, 1000))
		return res
	}

	async inputText(...args: any[]): Promise<DomActionReturn> {
		return this.remoteCallDomAction('input_text', args)
	}

	async selectOption(...args: any[]): Promise<DomActionReturn> {
		return this.remoteCallDomAction('select_option', args)
	}

	async scroll(...args: any[]): Promise<DomActionReturn> {
		return this.remoteCallDomAction('scroll', args)
	}

	async scrollHorizontally(...args: any[]): Promise<DomActionReturn> {
		return this.remoteCallDomAction('scroll_horizontally', args)
	}

	async executeJavascript(...args: any[]): Promise<DomActionReturn> {
		return this.remoteCallDomAction('execute_javascript', args)
	}

	/** @note Managed by content script via storage polling. */
	async showMask(): Promise<void> {}
	/** @note Managed by content script via storage polling. */
	async hideMask(): Promise<void> {}
	/** @note Managed by content script via storage polling. */
	dispose(): void {}

	/**
	 * Forward a DOM action to the current tab.
	 *
	 * @param action Wire name of the action (e.g. `click_element`).
	 * @param payload Positional arguments of the action, sent as-is.
	 * @returns The page's answer, or a `success: false` result when there is no
	 * current tab, the page cannot be scripted, or no answer came back.
	 */
	private async remoteCallDomAction(action: string, payload: any[]): Promise<DomActionReturn> {
		const tabId = this.currentTabId
		if (!tabId) {
			return { success: false, message: 'RemotePageController not initialized.' }
		}

		if (!isContentScriptAllowed(await this.getCurrentUrl())) {
			return {
				success: false,
				message:
					'Operation not allowed on this page. Use open_new_tab to navigate to a web page first.',
			}
		}

		const result = await sendMessage({
			type: 'PAGE_CONTROL',
			action: action,
			targetTabId: tabId,
			payload,
		})

		// sendMessage swallows transport errors and yields null; report that as a
		// failed action so callers always receive a DomActionReturn.
		if (!result) {
			return {
				success: false,
				message: `Action ${action} failed: no response from tab ${tabId}. The page may still be loading or was closed.`,
			}
		}
		return result
	}
}
/** Result object produced by `RemotePageController.remoteCallDomAction` for a DOM action. */
interface DomActionReturn {
/** `false` when the action was refused or failed. */
success: boolean
/** Human-readable outcome or failure reason. */
message: string
}
/**
 * URL prefixes on which content scripts cannot be used: browser-internal
 * schemes, extension pages, local files, and the Chrome Web Store (which
 * Chrome shields from extensions even though it is served over https).
 */
const RESTRICTED_URL_PATTERNS: readonly RegExp[] = [
	/^chrome:\/\//i,
	/^chrome-extension:\/\//i,
	/^about:/i,
	/^edge:\/\//i,
	/^brave:\/\//i,
	/^opera:\/\//i,
	/^vivaldi:\/\//i,
	/^file:\/\//i,
	/^view-source:/i,
	/^devtools:\/\//i,
	/^https?:\/\/chromewebstore\.google\.com(?:[/?#:]|$)/i,
	/^https?:\/\/chrome\.google\.com\/webstore(?:[/?#]|$)/i,
]

/**
 * Check if a URL can run content scripts.
 *
 * @param url URL of the tab; `undefined` or `''` counts as not allowed.
 * @returns `false` when the URL is missing or matches a restricted prefix, `true` otherwise.
 */
export function isContentScriptAllowed(url: string | undefined): boolean {
	if (!url) return false
	return !RESTRICTED_URL_PATTERNS.some((pattern) => pattern.test(url))
}
================================================
FILE: packages/extension/src/agent/TabsController.background.ts
================================================
/**
* background logics for TabsController
*/
import type { TabAction } from './TabsController'
/** Tag that identifies this module in console output. */
const PREFIX = '[TabsController.background]'

/**
 * Write a debug-level console line. The module tag is wrapped in the ANSI
 * sequences `\x1b[90m` … `\x1b[0m` and passed as the first argument; the
 * given values follow unchanged.
 */
function debug(...messages: any[]) {
	const tag = '\x1b[90m' + PREFIX + '\x1b[0m'
	console.debug(tag, ...messages)
}
/**
 * Handle one `TAB_CONTROL` message by running the matching `chrome.tabs` /
 * `chrome.tabGroups` call and answering through `sendResponse`.
 *
 * Failures of the underlying call are answered with `{ error: string }`.
 *
 * @param message Request envelope; `payload` carries the action's arguments.
 * @param sender Sender of the message (not used).
 * @param sendResponse Callback that delivers the answer to the sender.
 * @returns `true` when the answer is delivered asynchronously; `undefined` for
 * an unknown action, which is answered immediately with an error.
 */
export function handleTabControlMessage(
	message: { type: 'TAB_CONTROL'; action: TabAction; payload: any },
	sender: chrome.runtime.MessageSender,
	sendResponse: (response: unknown) => void
): true | undefined {
	const { action, payload } = message

	// Shared rejection path: answer with the failure reason as `{ error }`.
	const replyWithError = (error: unknown): void => {
		const reason = error instanceof Error ? error.message : String(error)
		sendResponse({ error: reason })
	}

	if (action === 'get_active_tab') {
		debug('get_active_tab')
		chrome.tabs
			.query({ active: true, currentWindow: true })
			.then((tabs) => {
				let tabId: number | null = null
				if (tabs.length > 0) {
					tabId = tabs[0].id || null
				}
				debug('get_active_tab: success', tabId)
				sendResponse({ success: true, tabId })
			})
			.catch(replyWithError)
		return true // async response
	}

	if (action === 'get_tab_info') {
		debug('get_tab_info', payload)
		chrome.tabs
			.get(payload.tabId)
			.then((tab) => {
				debug('get_tab_info: success', tab)
				// The tab object itself is the answer (no `success` wrapper).
				sendResponse(tab)
			})
			.catch(replyWithError)
		return true // async response
	}

	if (action === 'open_new_tab') {
		debug('open_new_tab', payload)
		chrome.tabs
			.create({ url: payload.url, active: false })
			.then((newTab) => {
				debug('open_new_tab: success', newTab)
				sendResponse({ success: true, tabId: newTab.id })
			})
			.catch(replyWithError)
		return true // async response
	}

	if (action === 'create_tab_group') {
		debug('create_tab_group', payload)
		chrome.tabs
			.group({ tabIds: payload.tabIds })
			.then((groupId) => {
				debug('create_tab_group: success', groupId)
				sendResponse({ success: true, groupId })
			})
			.catch((error) => {
				console.error(PREFIX, 'Failed to create tab group', error)
				replyWithError(error)
			})
		return true // async response
	}

	if (action === 'update_tab_group') {
		debug('update_tab_group', payload)
		chrome.tabGroups
			.update(payload.groupId, payload.properties)
			.then(() => {
				sendResponse({ success: true })
			})
			.catch(replyWithError)
		return true // async response
	}

	if (action === 'add_tab_to_group') {
		debug('add_tab_to_group', payload)
		chrome.tabs
			.group({ tabIds: payload.tabId, groupId: payload.groupId })
			.then(() => {
				sendResponse({ success: true })
			})
			.catch(replyWithError)
		return true // async response
	}

	if (action === 'close_tab') {
		debug('close_tab', payload)
		chrome.tabs
			.remove(payload.tabId)
			.then(() => {
				sendResponse({ success: true })
			})
			.catch(replyWithError)
		return true // async response
	}

	// Anything else is answered synchronously.
	sendResponse({ error: `Unknown action: ${action}` })
	return
}
/**
 * Subscribe to `chrome.tabs` created / removed / updated events and rebroadcast
 * each one as a `TAB_CHANGE` runtime message with action `created`, `removed`
 * or `updated`.
 */
export function setupTabChangeEvents() {
	console.log('[TabsController.background] setupTabChangeEvents')

	// Fire-and-forget broadcast; a rejected send is only logged at debug level.
	const broadcast = (eventName: string, action: string, payload: unknown): void => {
		chrome.runtime.sendMessage({ type: 'TAB_CHANGE', action, payload }).catch((error) => {
			debug(`${eventName} error:`, error)
		})
	}

	chrome.tabs.onCreated.addListener((tab) => {
		debug('onCreated', tab)
		broadcast('onCreated', 'created', { tab })
	})

	chrome.tabs.onRemoved.addListener((tabId, removeInfo) => {
		debug('onRemoved', tabId, removeInfo)
		broadcast('onRemoved', 'removed', { tabId, removeInfo })
	})

	chrome.tabs.onUpdated.addListener((tabId, changeInfo, tab) => {
		debug('onUpdated', tabId, changeInfo)
		broadcast('onUpdated', 'updated', { tabId, changeInfo, tab })
	})
}
================================================
FILE: packages/extension/src/agent/TabsController.ts
================================================
import { isContentScriptAllowed } from './RemotePageController'
/** Tag that identifies this module in console output. */
const PREFIX = '[TabsController]'

/**
 * Write a debug-level console line. The module tag is wrapped in the ANSI
 * sequences `\x1b[90m` … `\x1b[0m` and passed as the first argument; the
 * given values follow unchanged.
 */
function debug(...messages: any[]) {
	const tag = '\x1b[90m' + PREFIX + '\x1b[0m'
	console.debug(tag, ...messages)
}
/**
 * Send a `TAB_CONTROL` request through `chrome.runtime.sendMessage`.
 *
 * Best-effort: when the send rejects, the error is logged together with the
 * action name and the promise resolves to `null` instead of rejecting, so
 * callers must be prepared for a `null` result.
 *
 * @returns The response of the message handler, or `null` when sending failed.
 */
function sendMessage(message: {
	type: 'TAB_CONTROL'
	action: TabAction
	payload?: any
}): Promise<any> {
	return chrome.runtime.sendMessage(message).catch((error: unknown) => {
		console.error(PREFIX, message.action, error)
		return null
	})
}
/**
 * Controller for managing browser tabs.
 * - live in the agent env (extension page or content script)
 * - no chrome apis. call sw for tab operations
 *
 * Keeps a list of the tabs under its control, which of them is current, and
 * the tab group they were put in. The list is kept in sync through
 * `TAB_CHANGE` runtime messages. Dispatches a `dispose` event from `dispose()`.
 */
export class TabsController extends EventTarget {
	currentTabId: number | null = null

	private tabs: TabMeta[] = []
	private initialTabId: number | null = null
	private tabGroupId: number | null = null
	private task: string = ''
	/** Removes the `TAB_CHANGE` listener added by the latest `init()`; `null` when none is active. */
	private detachTabChangeListener: (() => void) | null = null

	/**
	 * Reset all state and start tracking tabs for `task`.
	 *
	 * @param task Task text, used in the tab group title.
	 * @param includeInitialTab When true, the active tab becomes the current tab,
	 * provided its URL passes `isContentScriptAllowed`.
	 * @throws Error when the active tab cannot be determined or the tab group cannot be created.
	 */
	async init(task: string, includeInitialTab: boolean = true) {
		debug('init', task, includeInitialTab)

		// A previous init() may have left a listener behind; drop it so that
		// TAB_CHANGE messages are not handled more than once.
		this.detachTabChangeListener?.()
		this.detachTabChangeListener = null

		this.task = task
		this.tabs = []
		this.currentTabId = null
		this.tabGroupId = null
		this.initialTabId = null

		const result = await sendMessage({
			type: 'TAB_CONTROL',
			action: 'get_active_tab',
		})

		// sendMessage resolves to null when the request itself failed.
		const initialTabId: number | null = result?.tabId ?? null
		this.initialTabId = initialTabId
		if (!initialTabId) {
			throw new Error('Failed to get initial tab ID')
		}

		if (includeInitialTab) {
			const info = await sendMessage({
				type: 'TAB_CONTROL',
				action: 'get_tab_info',
				payload: { tabId: initialTabId },
			})

			if (isContentScriptAllowed(info?.url)) {
				this.currentTabId = initialTabId
				this.tabs.push({
					id: initialTabId,
					isInitial: true,
					url: info.url,
					title: info.title,
					status: info.status,
				})
				await this.createTabGroup([initialTabId])
			}
		}

		await this.updateCurrentTabId(this.currentTabId)

		const tabChangeHandler = (message: any): void => {
			if (message?.type !== 'TAB_CHANGE') {
				return
			}

			// Nothing awaits this handler, so failed follow-up work is logged here
			// instead of surfacing as an unhandled promise rejection.
			const logFailure = (error: unknown): void => {
				console.error(PREFIX, 'TAB_CHANGE', message.action, error)
			}

			if (message.action === 'created') {
				const tab = message.payload.tab as chrome.tabs.Tab
				if (tab.groupId === this.tabGroupId && tab.id != null) {
					// Tab created in our controlled group
					if (!this.tabs.find((t) => t.id === tab.id)) {
						this.tabs.push({ id: tab.id, isInitial: false })
					}
					this.switchToTab(tab.id).catch(logFailure)
				}
			} else if (message.action === 'removed') {
				const { tabId } = message.payload as { tabId: number }
				const targetTab = this.tabs.find((t) => t.id === tabId)
				if (targetTab) {
					this.tabs = this.tabs.filter((t) => t.id !== tabId)
					if (this.currentTabId === tabId) {
						// Fall back to the last remaining tab, if any.
						const newCurrentTab = this.tabs[this.tabs.length - 1] || null
						if (newCurrentTab) {
							this.switchToTab(newCurrentTab.id).catch(logFailure)
						} else {
							this.updateCurrentTabId(null).catch(logFailure)
						}
					}
				}
			} else if (message.action === 'updated') {
				const { tabId, tab } = message.payload as { tabId: number; tab: chrome.tabs.Tab }
				const targetTab = this.tabs.find((t) => t.id === tabId)
				if (targetTab) {
					targetTab.url = tab.url
					targetTab.title = tab.title
					targetTab.status = tab.status
				}
			}
		}

		chrome.runtime.onMessage.addListener(tabChangeHandler)
		this.detachTabChangeListener = () => {
			chrome.runtime.onMessage.removeListener(tabChangeHandler)
		}
	}

	/**
	 * Open `url` in a new tab, make it current, add it to the tab group and wait
	 * for it to load.
	 * @throws Error when the tab could not be opened.
	 */
	async openNewTab(url: string): Promise<string> {
		debug('openNewTab', url)

		const result = await sendMessage({
			type: 'TAB_CONTROL',
			action: 'open_new_tab',
			payload: { url },
		})

		if (!result?.success) {
			throw new Error(`Failed to open new tab: ${result?.error ?? 'no response'}`)
		}

		const tabId = result.tabId as number

		this.tabs.push({
			id: tabId,
			isInitial: false,
		})

		await this.switchToTab(tabId)

		if (!this.tabGroupId) {
			await this.createTabGroup([tabId])
		} else {
			await sendMessage({
				type: 'TAB_CONTROL',
				action: 'add_tab_to_group',
				payload: { tabId, groupId: this.tabGroupId },
			})
		}

		await this.waitUntilTabLoaded(tabId)

		return `✅ Opened new tab ID ${tabId} with URL ${url}`
	}

	/**
	 * Make a tracked tab the current tab.
	 * @throws Error when `tabId` is not in the tab list.
	 */
	async switchToTab(tabId: number): Promise<string> {
		debug('switchToTab', tabId)

		const targetTab = this.tabs.find((t) => t.id === tabId)
		if (!targetTab) {
			throw new Error(`Tab ID ${tabId} not found in tab list.`)
		}

		await this.updateCurrentTabId(tabId)

		return `✅ Switched to tab ID ${tabId}.`
	}

	/**
	 * Close a tracked tab. When it was the current tab, the last remaining tab
	 * in the list (if any) becomes current.
	 * @throws Error when the tab is unknown, is the initial tab, or closing failed.
	 */
	async closeTab(tabId: number): Promise<string> {
		debug('closeTab', tabId)

		const targetTab = this.tabs.find((t) => t.id === tabId)
		if (!targetTab) {
			throw new Error(`Tab ID ${tabId} not found in tab list.`)
		}
		if (targetTab.isInitial) {
			throw new Error(`Cannot close the initial tab ID ${tabId}.`)
		}

		const result = await sendMessage({
			type: 'TAB_CONTROL',
			action: 'close_tab',
			payload: { tabId },
		})

		if (!result?.success) {
			throw new Error(`Failed to close tab ID ${tabId}: ${result?.error ?? 'no response'}`)
		}

		this.tabs = this.tabs.filter((t) => t.id !== tabId)
		if (this.currentTabId === tabId) {
			const newCurrentTab = this.tabs[this.tabs.length - 1] || null
			if (newCurrentTab) {
				await this.switchToTab(newCurrentTab.id)
			} else {
				await this.updateCurrentTabId(null)
			}
		}

		return `✅ Closed tab ID ${tabId}.`
	}

	/** Group `tabIds`, remember the group ID and give the group a title and a random colour. */
	private async createTabGroup(tabIds: number[]) {
		const result = await sendMessage({
			type: 'TAB_CONTROL',
			action: 'create_tab_group',
			payload: { tabIds },
		})

		if (!result?.success) {
			throw new Error(`Failed to create tab group: ${result?.error}`)
		}

		this.tabGroupId = result.groupId as number

		await sendMessage({
			type: 'TAB_CONTROL',
			action: 'update_tab_group',
			payload: {
				groupId: this.tabGroupId,
				properties: {
					title: `PageAgent(${this.task})`,
					color: randomColor(),
					collapsed: false,
				},
			},
		})
	}

	/** Set the current tab and mirror it to `chrome.storage.local` under `currentTabId`. */
	async updateCurrentTabId(tabId: number | null) {
		debug('updateCurrentTabId', tabId)
		this.currentTabId = tabId
		await chrome.storage.local.set({ currentTabId: tabId })
	}

	/**
	 * Title and URL of a tab. Served from the tracked tab list when both values
	 * are cached there, otherwise requested via `get_tab_info`.
	 *
	 * @returns Always an object; `title` / `url` are `''` when the request
	 * failed or the answer did not carry them.
	 */
	async getTabInfo(tabId: number): Promise<{ title: string; url: string }> {
		// use cached tab info if available
		const tabMeta = this.tabs.find((t) => t.id === tabId)
		if (tabMeta && tabMeta.url && tabMeta.title) {
			return { title: tabMeta.title, url: tabMeta.url }
		}

		// otherwise, pull the latest tab info from the background script
		debug('getTabInfo: pulling from background script', tabId)
		const result = await sendMessage({
			type: 'TAB_CONTROL',
			action: 'get_tab_info',
			payload: { tabId },
		})

		if (tabMeta) {
			tabMeta.url = result?.url
			tabMeta.title = result?.title
		}

		// `result` is null when the request failed and may be `{ error }` when the
		// lookup was rejected; normalise so callers can always destructure.
		return { ...result, title: result?.title ?? '', url: result?.url ?? '' }
	}

	/** Render the tracked tabs as a markdown table, marking the current tab with ✅. */
	async summarizeTabs(): Promise<string> {
		const summaries = [`| Tab ID | URL | Title | Current |`, `|-----|-----|-----|-----|`]
		for (const tab of this.tabs) {
			const { title, url } = await this.getTabInfo(tab.id)
			summaries.push(
				`| ${tab.id} | ${url} | ${title} | ${this.currentTabId === tab.id ? '✅' : ''} |`
			)
		}
		if (!this.tabs.length) {
			summaries.push('\nNo tabs available. Open a tab if needed.')
		}
		return summaries.join('\n')
	}

	/**
	 * Wait until a tracked tab reports status `complete`, for at most 4 seconds.
	 * Returns normally when the time runs out.
	 * @throws Error when the tab is unknown or its status is `unloaded`.
	 */
	async waitUntilTabLoaded(tabId: number): Promise<void> {
		const tab = this.tabs.find((t) => t.id === tabId)
		if (!tab) throw new Error(`Tab ID ${tabId} not found in tab list.`)

		if (tab.status === 'unloaded') throw new Error(`Tab ID ${tabId} is unloaded.`)
		if (tab.status === 'complete') return

		debug('waitUntilTabLoaded', tabId)
		// `tab.status` is updated by the TAB_CHANGE handler registered in init().
		await waitUntil(() => tab.status === 'complete', 4_000)
	}

	/** Stop listening for `TAB_CHANGE` messages and dispatch a `dispose` event. */
	dispose() {
		this.detachTabChangeListener?.()
		this.detachTabChangeListener = null
		this.dispatchEvent(new Event('dispose'))
	}
}
/**
 * Action names carried by `TAB_CONTROL` messages.
 *
 * NOTE(review): `get_tab_title` has no case in `handleTabControlMessage`
 * (TabsController.background.ts) and is answered there as an unknown action —
 * confirm whether it is still needed.
 */
export type TabAction =
| 'get_active_tab'
| 'get_tab_info'
| 'open_new_tab'
| 'create_tab_group'
| 'update_tab_group'
| 'add_tab_to_group'
| 'close_tab'
| 'get_tab_title'
/** Bookkeeping entry for one tab tracked by `TabsController`. */
interface TabMeta {
/** Browser tab ID. */
id: number
/** True for the tab that was active when `init()` ran; `closeTab` refuses to close it. */
isInitial: boolean
/** Last known URL; refreshed from `TAB_CHANGE` `updated` messages and `getTabInfo`. */
url?: string
/** Last known title; refreshed from `TAB_CHANGE` `updated` messages and `getTabInfo`. */
title?: string
/** Last known load status; `waitUntilTabLoaded` waits for `complete`. */
status?: 'loading' | 'unloaded' | 'complete'
}
/** Colour names that `randomColor` chooses from when a tab group is created. */
const TAB_GROUP_COLORS = ['blue', 'red', 'yellow', 'green', 'pink', 'purple', 'cyan'] as const

/** Union of the colour names listed in `TAB_GROUP_COLORS`. */
type TabGroupColor = (typeof TAB_GROUP_COLORS)[number]

/** Pick one entry of `TAB_GROUP_COLORS` using `Math.random()`. */
function randomColor(): TabGroupColor {
	const slot = Math.floor(TAB_GROUP_COLORS.length * Math.random())
	return TAB_GROUP_COLORS[slot]
}
/**
 * Wait until a condition becomes true, re-checking it every 100 ms.
 *
 * @param check Condition to evaluate; may return a boolean or a promise of one.
 * @param timeoutMS Timeout in milliseconds, default 1 minute.
 * @param error Message of the Error to reject with on timeout. If not provided,
 * a timeout resolves with `false` instead.
 * @returns `true` once the condition holds; `false` on timeout when `error` is not given.
 * @throws Error carrying `error` on timeout (when provided), or whatever `check` throws.
 */
export async function waitUntil(
	check: () => boolean | Promise<boolean>,
	timeoutMS = 60_000,
	error?: string
): Promise<boolean> {
	const POLL_INTERVAL_MS = 100

	if (await check()) return true

	return new Promise<boolean>((resolve, reject) => {
		const start = Date.now()

		const poll = async () => {
			try {
				if (await check()) return resolve(true)
			} catch (checkError) {
				// Without this, a throwing check would leave the returned promise
				// pending forever (and raise an unhandled rejection).
				return reject(checkError)
			}

			if (Date.now() - start > timeoutMS) {
				if (error) {
					return reject(new Error(error))
				} else {
					return resolve(false)
				}
			}

			setTimeout(poll, POLL_INTERVAL_MS)
		}

		setTimeout(poll, POLL_INTERVAL_MS)
	})
}
================================================
FILE: packages/extension/src/agent/constants.ts
================================================
import type { LLMConfig } from '@page-agent/llms'
// Demo LLM for testing
/** Model name used by `DEMO_CONFIG`. */
export const DEMO_MODEL = 'qwen3.5-plus'
/** Base URL used by `DEMO_CONFIG`; also the target that legacy endpoints are migrated to. */
export const DEMO_BASE_URL = 'https://page-ag-testing-ohftxirgbn.cn-shanghai.fcapp.run'
// export const DEMO_API_KEY = 'NA'
/** Default LLM configuration; `migrateLegacyEndpoint` returns a copy of it for legacy endpoints. */
export const DEMO_CONFIG: LLMConfig = {
baseURL: DEMO_BASE_URL,
model: DEMO_MODEL,
// apiKey: DEMO_API_KEY,
}
/**
 * Legacy testing endpoints that should be auto-migrated to DEMO_BASE_URL.
 * Entries are compared after trailing slashes are stripped from the candidate URL,
 * so they are listed without a trailing slash.
 */
export const LEGACY_TESTING_ENDPOINTS = [
'https://hwcxiuzfylggtcktqgij.supabase.co/functions/v1/llm-testing-proxy',
]
/**
 * Tell whether `url` is the demo endpoint or one of the legacy testing endpoints.
 * Trailing slashes on `url` are ignored for the comparison.
 */
export function isTestingEndpoint(url: string): boolean {
	const trimmed = url.replace(/\/+$/, '')
	if (trimmed === DEMO_BASE_URL) {
		return true
	}
	return LEGACY_TESTING_ENDPOINTS.includes(trimmed)
}
/**
 * Replace a configuration that points at a legacy testing endpoint with a
 * fresh copy of `DEMO_CONFIG`.
 *
 * @returns A new object when migration happened; otherwise the very same
 * `config` object, so callers can detect a migration by identity.
 */
export function migrateLegacyEndpoint(config: LLMConfig): LLMConfig {
	const trimmed = config.baseURL.replace(/\/+$/, '')
	const pointsAtLegacy = LEGACY_TESTING_ENDPOINTS.includes(trimmed)
	return pointsAtLegacy ? { ...DEMO_CONFIG } : config
}
================================================
FILE: packages/extension/src/agent/system_prompt.md
================================================
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
You excel at following tasks:
1. Navigating complex websites and extracting precise information
2. Automating form submissions and interactive web actions
3. Gathering and saving information
4. Operate effectively in an agent loop
5. Efficiently performing diverse web tasks
- Default working language: **English**
- Use the language that user is using. Return in user's language.
At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request> and <step_info>.
3. <browser_state>: Tabs, Current Tab, Current URL, interactive elements indexed for actions, and visible page content.
Agent history will be given as a list of step information as follows:
<step_{step_number}>:
Evaluation of Previous Step: Assessment of last action
Memory: Your memory of this step
Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{step_number}>
and system messages wrapped in <sys> tag.
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific, carefully follow each step and don't skip or hallucinate steps.
- If the task is open ended you can plan yourself how to get it done.
1. Browser State will be given as:
Open Tabs: Open tabs with their ids.
Current Tab: The tab you are currently viewing.
Current URL: URL of the page you are currently viewing.
Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Examples:
[33]<div>User form</div>
\t*[35]<button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
- Only use indexes that are explicitly provided.
- If the page changes after, for example, an input text action, analyze if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling actions if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- All scrollable elements are marked with the `data-scrollable` attribute, which includes the scrollable distance in every direction. You can scroll *the element* itself when some of its content overflows.
- If a captcha appears, tell user you can not solve captcha. Finish the task and ask user to solve it.
- If expected elements are missing, try scrolling, or navigating back.
- If the page is not fully loaded, use the `wait` action.
- Do not repeat one action for more than 3 times unless some conditions changed.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they always have the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck, e.g. with logins or captchas in open-ended tasks, you can re-evaluate the task and try alternative ways: sometimes a login prompt pops up accidentally even though some part of the page is still accessible, or you can get the information via web search.
You must call the `done` action in any of the following cases:
- When you have fully completed the USER REQUEST.
- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
- When you feel stuck or unable to solve user request. Or user request is not clear or contains inappropriate content.
- When it is ABSOLUTELY IMPOSSIBLE to continue.
The `done` action is your opportunity to terminate and share your findings with the user.
- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
- You can use the `text` field of the `done` action to communicate your findings and to provide a coherent reply to the user and fulfill the USER REQUEST.
- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
- If the user asks for a structured output, your `done` action's schema may be modified. Take this schema into account when solving the task!
Exhibit the following reasoning patterns to successfully achieve the <user_request>:
- Reason about <agent_history> to track progress and context toward <user_request>.
- Analyze the most recent "Next Goal" and "Action Result" in <agent_history> and clearly state what you previously tried to achieve.
- Analyze all relevant items in <agent_history> and <browser_state> to understand your state.
- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in <agent_history>. If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches, e.g. scrolling for more context, or ask the user for help.
- Ask the user for help if you have any difficulty. Keep the user in the loop.
- If you see information relevant to <user_request>, plan saving the information to memory.
- Always reason about the <user_request>. Make sure to carefully analyze the specific steps and information required, e.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully about whether that's how the user requested it.
Here are examples of good output patterns. Use them as reference but never copy them directly.
"evaluation_previous_goal": "Successfully navigated to the product page and found the target information. Verdict: Success"
"evaluation_previous_goal": "Clicked the login button and user authentication form appeared. Verdict: Success"
"memory": "Found many pending reports that need to be analyzed in the main page. Successfully processed the first 2 reports on quarterly sales data and moving on to inventory analysis and customer feedback reports."
"next_goal": "Click on the 'Add to Cart' button to proceed with the purchase flow."
================================================
FILE: packages/extension/src/agent/tabTools.ts
================================================
/**
* Tab control tools for browser extension
*
* These tools allow the agent to manage multiple browser tabs:
* - open_new_tab: Open a new tab and set it as current
* - switch_to_tab: Switch to an existing tab
* - close_tab: Close a tab (the initial tab cannot be closed)
*/
import * as z from 'zod/v4'
import type { TabsController } from './TabsController'
/** Tool definition compatible with PageAgentCore customTools */
interface TabTool {
	/** Text describing what the tool does and how to use it. */
	description: string
	/** zod schema describing the tool's input object. */
	inputSchema: z.ZodType
	/** Runs the tool and resolves to a result message. */
	execute: (input: unknown) => Promise<string>
}
/**
 * Create tab control tools bound to a TabsController instance.
 * These tools are injected into PageAgentCore via customTools config.
 *
 * Each tool resolves to the controller's success message, or to a
 * `❌ Failed: …` string when the controller throws; the tools themselves
 * never reject.
 *
 * @param tabsController Controller that performs the tab operations.
 * @returns Tools keyed by name: `open_new_tab`, `switch_to_tab`, `close_tab`.
 */
export function createTabTools(tabsController: TabsController): Record<string, TabTool> {
	// Run a tab operation and turn a thrown error into a failure message.
	const attempt = async (operation: () => Promise<string>): Promise<string> => {
		try {
			return await operation()
		} catch (error) {
			return `❌ Failed: ${error instanceof Error ? error.message : String(error)}`
		}
	}

	return {
		open_new_tab: {
			description:
				'Open a new browser tab with the specified URL. The new tab becomes the current tab for all subsequent page operations.',
			inputSchema: z.object({
				url: z.string().describe('The URL to open in the new tab'),
			}),
			execute: async (input: unknown) => {
				const { url } = input as { url: string }
				return attempt(() => tabsController.openNewTab(url))
			},
		},

		switch_to_tab: {
			description:
				'Switch to an existing tab by its ID. After switching, all page operations will target the new current tab. You can only switch to tabs in the tab list shown in browser state.',
			inputSchema: z.object({
				tab_id: z.number().int().describe('The tab ID to switch to'),
			}),
			execute: async (input: unknown) => {
				const { tab_id } = input as { tab_id: number }
				return attempt(() => tabsController.switchToTab(tab_id))
			},
		},

		close_tab: {
			// The schema has no "switch target" field, so the description must not
			// promise one; the controller picks the next current tab itself.
			description:
				'Close a tab by its ID. Cannot close the initial tab. If the closed tab was the current tab, the last remaining tab in the tab list becomes the current tab.',
			inputSchema: z.object({
				tab_id: z.number().int().describe('The tab ID to close'),
			}),
			execute: async (input: unknown) => {
				const { tab_id } = input as { tab_id: number }
				return attempt(() => tabsController.closeTab(tab_id))
			},
		},
	}
}
================================================
FILE: packages/extension/src/agent/useAgent.ts
================================================
/**
* React hook for using AgentController
*/
import type {
AgentActivity,
AgentStatus,
ExecutionResult,
HistoricalEvent,
SupportedLanguage,
} from '@page-agent/core'
import type { LLMConfig } from '@page-agent/llms'
import { useCallback, useEffect, useRef, useState } from 'react'
import { MultiPageAgent } from './MultiPageAgent'
import { DEMO_CONFIG, migrateLegacyEndpoint } from './constants'
/** Language preference: undefined means follow system */
export type LanguagePreference = SupportedLanguage | undefined
/**
 * Optional agent settings, persisted under the `advancedConfig` key of
 * `chrome.storage.local` by `useAgent().configure`.
 */
export interface AdvancedConfig {
/** Forwarded unchanged to the `MultiPageAgent` constructor. */
maxSteps?: number
/** Passed to `MultiPageAgent` as `instructions.system` when non-empty. */
systemInstruction?: string
/** Forwarded unchanged to the `MultiPageAgent` constructor. */
experimentalLlmsTxt?: boolean
/** Forwarded unchanged to the `MultiPageAgent` constructor. */
disableNamedToolChoice?: boolean
}
/** Full extension configuration: LLM settings, advanced settings and language preference. */
export interface ExtConfig extends LLMConfig, AdvancedConfig {
/** Stored under the `language` storage key; the key is removed when unset. */
language?: LanguagePreference
}
/** Values and actions exposed by the `useAgent` hook. */
export interface UseAgentResult {
	/** Status of the current agent, updated on its `statuschange` events. */
	status: AgentStatus
	/** Snapshot of the agent's history, updated on its `historychange` events. */
	history: HistoricalEvent[]
	/** Latest `activity` event detail; reset to `null` when the agent is idle, completed or errored. */
	activity: AgentActivity | null
	/** Task text passed to the most recent `execute` call. */
	currentTask: string
	/** Active configuration, or `null` until it has been loaded from storage. */
	config: ExtConfig | null
	/** Run a task on the current agent; rejects when no agent exists yet. */
	execute: (task: string) => Promise<ExecutionResult>
	/** Ask the current agent to stop. */
	stop: () => void
	/** Persist a new configuration and rebuild the agent with it. */
	configure: (config: ExtConfig) => Promise<void>
}
/**
 * React hook that owns a `MultiPageAgent` and mirrors its state into React.
 *
 * - Loads the configuration from `chrome.storage.local` on mount, migrating
 *   legacy testing endpoints and seeding `DEMO_CONFIG` when nothing is stored.
 * - Builds a new agent whenever the configuration changes and disposes the
 *   previous one.
 * - Exposes `execute`, `stop` and `configure` with stable identities.
 */
export function useAgent(): UseAgentResult {
	const agentRef = useRef<MultiPageAgent | null>(null)
	const [status, setStatus] = useState<AgentStatus>('idle')
	const [history, setHistory] = useState<HistoricalEvent[]>([])
	const [activity, setActivity] = useState<AgentActivity | null>(null)
	const [currentTask, setCurrentTask] = useState('')
	const [config, setConfig] = useState<ExtConfig | null>(null)

	// Load the persisted configuration once.
	useEffect(() => {
		let cancelled = false

		// Storage writes here are fire-and-forget; log failures instead of
		// leaving them as unhandled rejections.
		const persist = (items: Record<string, unknown>): void => {
			chrome.storage.local.set(items).catch((error: unknown) => {
				console.error('[useAgent] failed to persist config:', error)
			})
		}

		chrome.storage.local
			.get(['llmConfig', 'language', 'advancedConfig'])
			.then((result) => {
				let llmConfig = (result.llmConfig as LLMConfig) ?? DEMO_CONFIG
				const language = (result.language as SupportedLanguage) || undefined
				const advancedConfig = (result.advancedConfig as AdvancedConfig) ?? {}

				// Auto-migrate legacy testing endpoints
				// (migrateLegacyEndpoint returns the same object when nothing changed)
				const migrated = migrateLegacyEndpoint(llmConfig)
				if (migrated !== llmConfig) {
					llmConfig = migrated
					persist({ llmConfig: migrated })
				} else if (!result.llmConfig) {
					persist({ llmConfig: DEMO_CONFIG })
				}

				// Skip the state update when the component is already gone.
				if (!cancelled) {
					setConfig({ ...llmConfig, ...advancedConfig, language })
				}
			})
			.catch((error: unknown) => {
				console.error('[useAgent] failed to load config:', error)
			})

		return () => {
			cancelled = true
		}
	}, [])

	// (Re)create the agent for the current configuration.
	useEffect(() => {
		if (!config) return

		const { systemInstruction, ...agentConfig } = config
		const agent = new MultiPageAgent({
			...agentConfig,
			instructions: systemInstruction ? { system: systemInstruction } : undefined,
		})
		agentRef.current = agent

		const handleStatusChange = () => {
			const newStatus = agent.status as AgentStatus
			setStatus(newStatus)
			if (newStatus === 'idle' || newStatus === 'completed' || newStatus === 'error') {
				setActivity(null)
			}
		}

		const handleHistoryChange = () => {
			// Copy so React sees a new array reference.
			setHistory([...agent.history])
		}

		const handleActivity = (e: Event) => {
			setActivity((e as CustomEvent<AgentActivity>).detail)
		}

		agent.addEventListener('statuschange', handleStatusChange)
		agent.addEventListener('historychange', handleHistoryChange)
		agent.addEventListener('activity', handleActivity)

		return () => {
			agent.removeEventListener('statuschange', handleStatusChange)
			agent.removeEventListener('historychange', handleHistoryChange)
			agent.removeEventListener('activity', handleActivity)
			agent.dispose()
			// Do not leave a disposed agent reachable through the ref.
			if (agentRef.current === agent) {
				agentRef.current = null
			}
		}
	}, [config])

	const execute = useCallback(async (task: string) => {
		const agent = agentRef.current
		console.log('🚀 [useAgent] start executing task:', task)
		if (!agent) throw new Error('Agent not initialized')

		setCurrentTask(task)
		setHistory([])
		return agent.execute(task)
	}, [])

	const stop = useCallback(() => {
		agentRef.current?.stop()
	}, [])

	const configure = useCallback(
		async ({
			language,
			maxSteps,
			systemInstruction,
			experimentalLlmsTxt,
			disableNamedToolChoice,
			...llmConfig
		}: ExtConfig) => {
			await chrome.storage.local.set({ llmConfig })

			// An unset language means "follow system": remove the key rather than store undefined.
			if (language) {
				await chrome.storage.local.set({ language })
			} else {
				await chrome.storage.local.remove('language')
			}

			const advancedConfig: AdvancedConfig = {
				maxSteps,
				systemInstruction,
				experimentalLlmsTxt,
				disableNamedToolChoice,
			}
			await chrome.storage.local.set({ advancedConfig })

			// Triggers the agent effect above, which rebuilds the agent.
			setConfig({ ...llmConfig, ...advancedConfig, language })
		},
		[]
	)

	return {
		status,
		history,
		activity,
		currentTask,
		config,
		execute,
		stop,
		configure,
	}
}
================================================
FILE: packages/extension/src/assets/index.css
================================================
@import 'tailwindcss';
@import 'tw-animate-css';
/* Registers a `dark` variant whose selector matches elements nested inside an element carrying the `dark` class. */
@custom-variant dark (&:is(.dark *));
/*
 * Default design tokens, declared as CSS custom properties on :root.
 * Colors use oklch(lightness chroma hue); these are the values in effect
 * unless the `.dark` rule below overrides them.
 */
:root {
/* Surface / text pairs: each `--x` has a matching `--x-foreground`. */
--background: oklch(1 0 0);
--foreground: oklch(0.145 0 0);
--card: oklch(1 0 0);
--card-foreground: oklch(0.145 0 0);
--popover: oklch(1 0 0);
--popover-foreground: oklch(0.145 0 0);
--primary: oklch(0.205 0 0);
--primary-foreground: oklch(0.985 0 0);
--secondary: oklch(0.97 0 0);
--secondary-foreground: oklch(0.205 0 0);
--muted: oklch(0.97 0 0);
--muted-foreground: oklch(0.556 0 0);
--accent: oklch(0.97 0 0);
--accent-foreground: oklch(0.205 0 0);
/* NOTE(review): --destructive-foreground has the same value as --destructive here; confirm that is intended. */
--destructive: oklch(0.577 0.245 27.325);
--destructive-foreground: oklch(0.577 0.245 27.325);
/* Borders, form inputs and focus rings. */
--border: oklch(0.922 0 0);
--input: oklch(0.922 0 0);
--ring: oklch(0.708 0 0);
/* Chart palette. */
--chart-1: oklch(0.646 0.222 41.116);
--chart-2: oklch(0.6 0.118 184.704);
--chart-3: oklch(0.398 0.07 227.392);
--chart-4: oklch(0.828 0.189 84.429);
--chart-5: oklch(0.769 0.188 70.08);
/* Base corner radius; the --radius-* theme values are derived from it. */
--radius: 0.625rem;
/* Sidebar-specific tokens. */
--sidebar: oklch(0.985 0 0);
--sidebar-foreground: oklch(0.145 0 0);
--sidebar-primary: oklch(0.205 0 0);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.97 0 0);
--sidebar-accent-foreground: oklch(0.205 0 0);
--sidebar-border: oklch(0.922 0 0);
--sidebar-ring: oklch(0.708 0 0);
}
/*
 * Dark-theme overrides: redefines the color tokens for any element with the
 * `dark` class (and, through inheritance, its descendants).
 * `--radius` is not redefined here, so the :root value stays in effect.
 */
.dark {
/* Surface / text pairs. */
--background: oklch(0.19 0 0);
--foreground: oklch(0.985 0 0);
--card: oklch(0.145 0 0);
--card-foreground: oklch(0.985 0 0);
--popover: oklch(0.145 0 0);
--popover-foreground: oklch(0.985 0 0);
--primary: oklch(0.985 0 0);
--primary-foreground: oklch(0.205 0 0);
--secondary: oklch(0.269 0 0);
--secondary-foreground: oklch(0.985 0 0);
--muted: oklch(0.269 0 0);
--muted-foreground: oklch(0.708 0 0);
--accent: oklch(0.269 0 0);
--accent-foreground: oklch(0.985 0 0);
--destructive: oklch(0.396 0.141 25.723);
--destructive-foreground: oklch(0.637 0.237 25.331);
/* Borders, form inputs and focus rings. */
--border: oklch(0.269 0 0);
--input: oklch(0.269 0 0);
--ring: oklch(0.439 0 0);
/* Chart palette. */
--chart-1: oklch(0.488 0.243 264.376);
--chart-2: oklch(0.696 0.17 162.48);
--chart-3: oklch(0.769 0.188 70.08);
--chart-4: oklch(0.627 0.265 303.9);
--chart-5: oklch(0.645 0.246 16.439);
/* Sidebar-specific tokens. */
--sidebar: oklch(0.205 0 0);
--sidebar-foreground: oklch(0.985 0 0);
--sidebar-primary: oklch(0.488 0.243 264.376);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.269 0 0);
--sidebar-accent-foreground: oklch(0.985 0 0);
--sidebar-border: oklch(0.269 0 0);
--sidebar-ring: oklch(0.439 0 0);
}
/*
 * Tailwind theme mapping: exposes the custom properties defined on :root / .dark
 * under Tailwind's theme variable names (--color-*, --radius-*, --animate-*).
 * Every value is a var() reference, so the active theme decides the final color.
 * NOTE(review): `inline` is presumably used because all values reference other
 * variables; confirm against the Tailwind theme documentation.
 */
@theme inline {
/* Color tokens, mapped one-to-one. */
--color-background: var(--background);
--color-foreground: var(--foreground);
--color-card: var(--card);
--color-card-foreground: var(--card-foreground);
--color-popover: var(--popover);
--color-popover-foreground: var(--popover-foreground);
--color-primary: var(--primary);
--color-primary-foreground: var(--primary-foreground);
--color-secondary: var(--secondary);
--color-secondary-foreground: var(--secondary-foreground);
--color-muted: var(--muted);
--color-muted-foreground: var(--muted-foreground);
--color-accent: var(--accent);
--color-accent-foreground: var(--accent-foreground);
--color-destructive: var(--destructive);
--color-destructive-foreground: var(--destructive-foreground);
--color-border: var(--border);
--color-input: var(--input);
--color-ring: var(--ring);
--color-chart-1: var(--chart-1);
--color-chart-2: var(--chart-2);
--color-chart-3: var(--chart-3);
--color-chart-4: var(--chart-4);
--color-chart-5: var(--chart-5);
/* Radius scale derived from --radius: sm = -4px, md = -2px, lg = base, xl = +4px. */
--radius-sm: calc(var(--radius) - 4px);
--radius-md: calc(var(--radius) - 2px);
--radius-lg: var(--radius);
--radius-xl: calc(var(--radius) + 4px);
/* Sidebar color tokens. */
--color-sidebar: var(--sidebar);
--color-sidebar-foreground: var(--sidebar-foreground);
--color-sidebar-primary: var(--sidebar-primary);
--color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
--color-sidebar-accent: var(--sidebar-accent);
--color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
--color-sidebar-border: var(--sidebar-border);
--color-sidebar-ring: var(--sidebar-ring);
/* Blinking cursor: 1.2s cycle repeated forever, with step-end so opacity jumps instead of fading. */
--animate-blink-cursor: blink-cursor 1.2s step-end infinite;
/* Fully visible for the first half of the cycle, fully transparent for the second half. */
@keyframes blink-cursor {
0%,
49% {
opacity: 1;
}
50%,
100% {
opacity: 0;
}
}
}
/*
 * Glow pulse, phase A: starts and ends at opacity 0.45 / scale 1 and reaches
 * opacity 0 / scale 1.1 at the midpoint. The exact inverse of `glow-b`.
 */
@keyframes glow-a {
0%,
100% {
opacity: 0.45;
transform: scale(1);
}
50% {
opacity: 0;
transform: scale(1.1);
}
}
/*
 * Glow pulse, phase B: starts and ends at opacity 0 / scale 1.1 and reaches
 * opacity 0.45 / scale 1 at the midpoint. The exact inverse of `glow-a`.
 */
@keyframes glow-b {
0%,
100% {
opacity: 0;
transform: scale(1.1);
}
50% {
opacity: 0.45;
transform: scale(1);
}
}
/* Base-layer defaults built from the theme tokens defined in this file. */
@layer base {
/* Every element: border color from the `border` token, outline color from the `ring` token with the /50 opacity modifier. */
* {
@apply border-border outline-ring/50;
}
/* Page background and text color follow the `background` / `foreground` tokens. */
body {
@apply bg-background text-foreground;
}
}
================================================
FILE: packages/extension/src/components/ConfigPanel.tsx
================================================
import {
Copy,
CornerUpLeft,
ExternalLink,
Eye,
EyeOff,
FoldVertical,
HatGlasses,
Home,
Loader2,
Scale,
UnfoldVertical,
} from 'lucide-react'
import { useEffect, useState } from 'react'
import { siGithub } from 'simple-icons'
import { DEMO_BASE_URL, DEMO_MODEL, isTestingEndpoint } from '@/agent/constants'
import type { ExtConfig, LanguagePreference } from '@/agent/useAgent'
import { Button } from '@/components/ui/button'
import { Input } from '@/components/ui/input'
import { Switch } from '@/components/ui/switch'
/** Props accepted by the `ConfigPanel` component. */
interface ConfigPanelProps {
  /** Saved configuration used to seed the form; `null` is accepted and the panel falls back to its defaults. */
  config: ExtConfig | null
  /**
   * Persists the edited configuration. The panel awaits the returned promise
   * and ignores its resolved value, hence `Promise<void>` (a bare `Promise`
   * is not a valid type: the generic requires a type argument).
   */
  onSave: (config: ExtConfig) => Promise<void>
  /** Presumably invoked to dismiss the panel; its call site is not part of the props definition. */
  onClose: () => void
}
export function ConfigPanel({ config, onSave, onClose }: ConfigPanelProps) {
const [baseURL, setBaseURL] = useState(config?.baseURL || DEMO_BASE_URL)
const [model, setModel] = useState(config?.model || DEMO_MODEL)
const [apiKey, setApiKey] = useState(config?.apiKey)
const [language, setLanguage] = useState(config?.language)
const [maxSteps, setMaxSteps] = useState(config?.maxSteps)
const [systemInstruction, setSystemInstruction] = useState(config?.systemInstruction ?? '')
const [experimentalLlmsTxt, setExperimentalLlmsTxt] = useState(
config?.experimentalLlmsTxt ?? false
)
const [disableNamedToolChoice, setDisableNamedToolChoice] = useState(
config?.disableNamedToolChoice ?? false
)
const [advancedOpen, setAdvancedOpen] = useState(false)
const [saving, setSaving] = useState(false)
const [userAuthToken, setUserAuthToken] = useState('')
const [copied, setCopied] = useState(false)
const [showToken, setShowToken] = useState(false)
const [showApiKey, setShowApiKey] = useState(false)
// Re-seed every form field whenever the `config` prop changes, using the same
// fallbacks as the initial `useState` values above.
useEffect(() => {
// `||` (not `??`): an empty baseURL / model also falls back to the demo defaults.
setBaseURL(config?.baseURL || DEMO_BASE_URL)
setModel(config?.model || DEMO_MODEL)
// These three are passed through as-is and may be undefined.
setApiKey(config?.apiKey)
setLanguage(config?.language)
setMaxSteps(config?.maxSteps)
// Missing advanced options default to an empty instruction and disabled switches.
setSystemInstruction(config?.systemInstruction ?? '')
setExperimentalLlmsTxt(config?.experimentalLlmsTxt ?? false)
setDisableNamedToolChoice(config?.disableNamedToolChoice ?? false)
}, [config])
// Poll chrome.storage.local for the user auth token: once immediately, then
// every second, until a non-empty string is found or the component unmounts.
useEffect(() => {
  // Typed from `setInterval` itself rather than `NodeJS.Timeout`, so the
  // annotation holds regardless of whether DOM or Node timer typings apply.
  let interval: ReturnType<typeof setInterval> | null = null

  /** Reads the token from storage; on success stores it in state and stops polling. */
  const fetchToken = async () => {
    const result = await chrome.storage.local.get('PageAgentExtUserAuthToken')
    const token = result.PageAgentExtUserAuthToken
    if (typeof token === 'string' && token) {
      setUserAuthToken(token)
      if (interval) {
        clearInterval(interval)
        interval = null
      }
    }
  }

  fetchToken()
  interval = setInterval(fetchToken, 1000)

  // Stop polling on unmount if the token was never found.
  return () => {
    if (interval) clearInterval(interval)
  }
}, [])
/**
 * Copies the user auth token to the clipboard and raises the `copied` flag,
 * which is lowered again after two seconds. Does nothing while no token is loaded.
 */
const handleCopyToken = async () => {
  if (!userAuthToken) return

  await navigator.clipboard.writeText(userAuthToken)
  setCopied(true)
  setTimeout(() => {
    setCopied(false)
  }, 2000)
}
/**
 * Submits the current form values through `onSave`.
 * The `saving` flag is raised for the duration of the call and always lowered
 * afterwards, including when `onSave` rejects (the rejection still propagates).
 */
const handleSave = async () => {
  const formValues = {
    apiKey,
    baseURL,
    model,
    language,
    // Falsy values (0 / empty string) are sent as `undefined`.
    maxSteps: maxSteps || undefined,
    systemInstruction: systemInstruction || undefined,
    experimentalLlmsTxt,
    disableNamedToolChoice,
  }

  setSaving(true)
  try {
    await onSave(formValues)
  } finally {
    setSaving(false)
  }
}
return (
Settings
{/* User Auth Token Section */}
Give a website the ability to call this extension.