[
  {
    "path": ".eslintrc.json",
    "content": "{\n  \"root\": true,\n  \"parser\": \"@typescript-eslint/parser\",\n  \"parserOptions\": {\n    \"ecmaVersion\": \"latest\",\n    \"sourceType\": \"module\",\n    \"project\": true\n  },\n  \"plugins\": [\"@typescript-eslint\"],\n  \"extends\": [\n    \"eslint:recommended\",\n    \"plugin:@typescript-eslint/recommended\"\n  ],\n  \"env\": {\n    \"node\": true,\n    \"es2022\": true\n  },\n  \"rules\": {\n    \"@typescript-eslint/no-explicit-any\": \"warn\",\n    \"@typescript-eslint/no-unused-vars\": [\"error\", { \"argsIgnorePattern\": \"^_\" }],\n    \"@typescript-eslint/explicit-function-return-type\": \"off\",\n    \"@typescript-eslint/explicit-module-boundary-types\": \"off\",\n    \"@typescript-eslint/no-non-null-assertion\": \"warn\",\n    \"no-console\": [\"warn\", { \"allow\": [\"warn\", \"error\"] }]\n  },\n  \"ignorePatterns\": [\"dist/\", \"node_modules/\", \"*.js\", \"*.config.ts\"]\n}\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: CI\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n\njobs:\n  test:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n\n      - uses: actions/setup-node@v4\n        with:\n          node-version: \"22\"\n          cache: \"npm\"\n\n      - run: npm ci\n\n      - name: Typecheck\n        run: npx tsc --noEmit\n\n      - name: Lint\n        run: npm run lint\n\n      - name: Format check\n        run: npm run format:check\n\n      - name: Test\n        run: npm test\n\n      - name: Build\n        run: npm run build\n"
  },
  {
    "path": ".github/workflows/publish.yml",
    "content": "name: Publish to npm\n\non:\n  release:\n    types: [published]\n\njobs:\n  publish:\n    runs-on: ubuntu-latest\n    permissions:\n      contents: read\n    steps:\n      - uses: actions/checkout@v4\n\n      - uses: actions/setup-node@v4\n        with:\n          node-version: \"22\"\n          registry-url: \"https://registry.npmjs.org\"\n\n      - run: npm ci\n\n      - name: Verify version matches tag\n        run: |\n          TAG_VERSION=\"${GITHUB_REF_NAME#v}\"\n          PKG_VERSION=$(node -p \"require('./package.json').version\")\n          if [ \"$TAG_VERSION\" != \"$PKG_VERSION\" ]; then\n            echo \"Error: Tag $TAG_VERSION does not match package.json $PKG_VERSION\"\n            exit 1\n          fi\n          echo \"Version verified: $PKG_VERSION\"\n\n      - name: Build\n        run: npm run build\n\n      - name: Publish\n        run: npm publish --access public\n        env:\n          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Dependencies\nnode_modules/\n\n# Build output\ndist/\n\n# Environment files\n.env\n.env.local\n.env.*.local\n\n# Logs\n*.log\nnpm-debug.log*\nyarn-debug.log*\nyarn-error.log*\n\n# OS files\n.DS_Store\nThumbs.db\n\n# IDE\n.idea/\n.vscode/\n*.swp\n*.swo\n\n# Coverage\ncoverage/\n.nyc_output/\n\n# Package manager locks\n# Note: package-lock.json is tracked for reproducible builds\nyarn.lock\n\n# Bun\nbun.lockb\n\n# Temporary files\ntmp/\ntemp/\n*.tmp\n\n# Hero/Ulixee session data\n.ulixee/\n\n# Claude Code context\nCLAUDE.md\n\n# Deployment configs (contain sensitive data)\ndeploy/\n"
  },
  {
    "path": ".leasotrc",
    "content": "{\n  \"tags\": [\"TODO\", \"FIXME\", \"HACK\", \"XXX\", \"BUG\", \"OPTIMIZE\", \"REVIEW\"],\n  \"ignore\": [\"node_modules/**\", \"dist/**\"]\n}\n"
  },
  {
    "path": ".nvmrc",
    "content": "v22.12.0\n"
  },
  {
    "path": ".prettierrc",
    "content": "{\n  \"semi\": true,\n  \"singleQuote\": false,\n  \"tabWidth\": 2,\n  \"trailingComma\": \"es5\",\n  \"printWidth\": 100,\n  \"useTabs\": false,\n  \"bracketSpacing\": true,\n  \"arrowParens\": \"always\",\n  \"endOfLine\": \"lf\"\n}\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\nmessage: \"If you use Reader in your research or project, please cite it.\"\ntitle: \"Reader: Open-source, production-grade web scraping engine built for LLMs\"\ntype: software\nauthors:\n  - family-names: Kaul\n    given-names: Nihal\nlicense: Apache-2.0\nurl: \"https://github.com/vakra-dev/reader\"\nrepository-code: \"https://github.com/vakra-dev/reader\"\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a welcoming experience for everyone, regardless of background or\nidentity.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment:\n\n- Using welcoming and inclusive language\n- Being respectful of differing viewpoints and experiences\n- Gracefully accepting constructive criticism\n- Focusing on what is best for the community\n- Showing empathy towards other community members\n\nExamples of unacceptable behavior:\n\n- Trolling, insulting or derogatory comments, and personal attacks\n- Public or private harassment\n- Publishing others' private information without explicit permission\n- Other conduct which could reasonably be considered inappropriate in a professional setting\n\n## Enforcement Responsibilities\n\nProject maintainers are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate or harmful.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\n\n## Enforcement\n\nInstances of unacceptable behavior may be reported to the project maintainers at\n**nihal.codes@gmail.com**. All complaints will be reviewed and investigated\npromptly and fairly.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact:** Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence:** A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the behavior\nwas inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact:** A violation through a single incident or series of actions.\n\n**Consequence:** A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or permanent\nban.\n\n### 3. Temporary Ban\n\n**Community Impact:** A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence:** A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact:** Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior, harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence:** A permanent ban from any sort of public interaction within the\ncommunity.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),\nversion 2.1, available at\n[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to Reader\n\nThank you for your interest in contributing to Reader! This document provides guidelines and instructions for contributing.\n\n## Development Setup\n\n### Prerequisites\n\n- **Node.js** >= 18 (v22 recommended)\n- **npm** for package management\n- **Git**\n\n> **Note:** Always run scripts with Node.js (`npx tsx` or `node`) as Hero has ESM compatibility issues with other runtimes.\n\n### Getting Started\n\n1. **Fork the repository** on GitHub\n\n2. **Clone your fork:**\n\n   ```bash\n   git clone https://github.com/YOUR_USERNAME/reader.git\n   cd reader\n   ```\n\n3. **Install dependencies:**\n\n   ```bash\n   npm install\n   ```\n\n4. **Verify setup:**\n\n   ```bash\n   npm run typecheck\n   npm run build\n   ```\n\n5. **Test the CLI:**\n   ```bash\n   npx tsx src/cli/index.ts scrape https://example.com\n   ```\n\n## Project Structure\n\n```\nsrc/\n├── index.ts              # Public API exports\n├── client.ts             # ReaderClient - main API entry point\n├── scraper.ts            # Scraper class - main scraping logic\n├── crawler.ts            # Crawler class - link discovery\n├── types.ts              # TypeScript types for scraping\n├── crawl-types.ts        # TypeScript types for crawling\n│\n├── browser/\n│   ├── pool.ts           # BrowserPool - manages Hero instances\n│   ├── hero-config.ts    # Hero configuration\n│   └── types.ts          # Pool types\n│\n├── cloudflare/\n│   ├── detector.ts       # Challenge detection\n│   ├── handler.ts        # Challenge resolution\n│   └── types.ts          # Cloudflare types\n│\n├── formatters/\n│   ├── markdown.ts       # Markdown formatter\n│   ├── html.ts           # HTML formatter\n│   ├── json.ts           # JSON formatter\n│   ├── text.ts           # Text formatter\n│   └── index.ts          # Re-exports\n│\n├── utils/\n│   ├── content-cleaner.ts    # HTML content cleaning\n│   ├── metadata-extractor.ts # Metadata extraction\n│   ├── url-helpers.ts        # URL utilities\n│   ├── rate-limiter.ts       # Rate limiting\n│   └── logger.ts             # Logging\n│\n├── proxy/\n│   └── config.ts         # Proxy configuration\n│\n├── daemon/\n│   ├── index.ts          # Module exports\n│   ├── server.ts         # DaemonServer - HTTP server with browser pool\n│   └── client.ts         # DaemonClient - connects CLI to daemon\n│\n└── cli/\n    └── index.ts          # CLI implementation\n```\n\n## Development Workflow\n\n### Running the CLI\n\n```bash\n# Run CLI directly\nnpx tsx src/cli/index.ts scrape https://example.com\n\n# With verbose output\nnpx tsx src/cli/index.ts scrape https://example.com -v\n\n# Show browser window\nnpx tsx src/cli/index.ts scrape https://example.com --show-chrome\n```\n\n### Daemon Mode\n\n```bash\n# Start daemon with browser pool\nnpx tsx src/cli/index.ts start --pool-size 5\n\n# Check daemon status\nnpx tsx src/cli/index.ts status\n\n# Run commands (auto-connects to daemon)\nnpx tsx src/cli/index.ts scrape https://example.com\n\n# Force standalone mode (bypass daemon)\nnpx tsx src/cli/index.ts scrape https://example.com --standalone\n\n# Stop daemon\nnpx tsx src/cli/index.ts stop\n```\n\n### Code Quality\n\nRun these commands before submitting a PR:\n\n```bash\n# Type checking\nnpm run typecheck\n\n# Linting\nnpm run lint\n\n# Auto-fix lint issues\nnpm run lint:fix\n\n# Format code\nnpm run format\n\n# Check formatting\nnpm run format:check\n\n# Build\nnpm run build\n```\n\n### Finding TODOs\n\nTrack outstanding work:\n\n```bash\nnpm run todo\n```\n\n## Making Changes\n\n### Branch Naming\n\n- `feature/description` - New features\n- `fix/description` - Bug fixes\n- `docs/description` - Documentation updates\n- `refactor/description` - Code refactoring\n\n### Commit Messages\n\nWrite clear, concise commit messages:\n\n```\ntype: short description\n\nLonger description if needed.\n```\n\nTypes: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`\n\nExamples:\n\n```\nfeat: add support for custom user agents\nfix: resolve timeout issue with Cloudflare challenges\ndocs: update proxy configuration guide\nrefactor: simplify browser pool recycling logic\n```\n\n### Pull Request Process\n\n1. Create a new branch from `main`\n2. Make your changes\n3. Run all checks:\n   ```bash\n   npm run lint\n   npm run format:check\n   npm run typecheck\n   npm run build\n   ```\n4. Push your branch and create a PR\n5. Fill out the PR template\n6. Wait for review\n\n## Common Tasks\n\n### Adding a New Output Format\n\n1. Create `src/formatters/newformat.ts`:\n\n   ```typescript\n   export function formatToNewFormat(\n     pages: Page[],\n     baseUrl: string,\n     scrapedAt: string,\n     duration: number,\n     metadata?: WebsiteMetadata\n   ): string {\n     // Implementation\n   }\n   ```\n\n2. Export from `src/formatters/index.ts`\n\n3. Add to format type in `src/types.ts`\n\n4. Call formatter in `src/scraper.ts`\n\n5. Update CLI validation in `src/cli/index.ts`\n\n### Adding a New ScrapeOption\n\n1. Add to `ScrapeOptions` interface in `src/types.ts`\n2. Add default in `DEFAULT_OPTIONS`\n3. Use in `Scraper` class via `this.options.newOption`\n4. Add CLI flag in `src/cli/index.ts` if applicable\n5. Update documentation\n\n### Modifying Cloudflare Detection\n\n1. Detection patterns: `src/cloudflare/detector.ts`\n2. Resolution logic: `src/cloudflare/handler.ts`\n3. Test with known Cloudflare-protected sites\n\n### Adjusting Browser Pool\n\n1. Default config: `src/browser/types.ts`\n2. Pool logic: `src/browser/pool.ts`\n\n## Testing\n\nCurrently testing is done manually. When adding new features:\n\n1. **Test basic functionality:**\n\n   ```bash\n   npx tsx src/cli/index.ts scrape https://example.com\n   ```\n\n2. **Test Cloudflare-protected sites:**\n\n   ```bash\n   npx tsx src/cli/index.ts scrape https://cloudflare-protected-site.com -v\n   ```\n\n3. **Test different output formats:**\n\n   ```bash\n   npx tsx src/cli/index.ts scrape https://example.com -f markdown,html,json,text\n   ```\n\n4. **Test crawling:**\n\n   ```bash\n   npx tsx src/cli/index.ts crawl https://example.com -d 2 -m 10\n   ```\n\n5. **Test batch scraping:**\n\n   ```bash\n   npx tsx src/cli/index.ts scrape url1 url2 url3 -c 3 -v\n   ```\n\n6. **Test daemon mode:**\n\n   ```bash\n   # Start daemon\n   npx tsx src/cli/index.ts start --pool-size 3\n\n   # Test scraping via daemon\n   npx tsx src/cli/index.ts scrape https://example.com\n\n   # Check status\n   npx tsx src/cli/index.ts status\n\n   # Stop daemon\n   npx tsx src/cli/index.ts stop\n   ```\n\n## Running Examples\n\nThe `examples/` folder contains working examples:\n\n```bash\ncd examples\nnpm install\n\n# Basic examples\nnpx tsx basic/basic-scrape.ts\nnpx tsx basic/batch-scrape.ts\nnpx tsx basic/crawl-website.ts\n\n# AI integration examples (requires API keys)\nexport OPENAI_API_KEY=\"sk-...\"\nnpx tsx ai-tools/openai-summary.ts https://example.com\n\n# Production server\nnpx tsx production/express-server/src/index.ts\n```\n\n## Code Style\n\n- Use TypeScript for all new code\n- Follow existing patterns in the codebase\n- Use async/await instead of callbacks\n- Prefer explicit types over `any`\n- Use meaningful variable and function names\n- Add JSDoc comments for public APIs\n\n## Documentation\n\nWhen making changes:\n\n1. Update relevant markdown files in `docs/`\n2. Update README.md if adding new features\n3. Add JSDoc comments to new public functions\n4. Update CLAUDE.md for AI context if architecture changes\n\n### Documentation Files\n\n| File                      | Purpose                         |\n| ------------------------- | ------------------------------- |\n| `README.md`               | Main documentation, quick start |\n| `CONTRIBUTING.md`         | This file                       |\n| `docs/getting-started.md` | Detailed setup guide            |\n| `docs/api-reference.md`   | Complete API docs               |\n| `docs/architecture.md`    | System design                   |\n| `docs/troubleshooting.md` | Common issues                   |\n| `docs/guides/`            | Feature guides                  |\n| `docs/deployment/`        | Deployment guides               |\n\n## Reporting Issues\n\nWhen reporting bugs, please include:\n\n- Operating system and version\n- Node.js version (`node --version`)\n- Reader version\n- Steps to reproduce\n- Expected vs actual behavior\n- Error messages and stack traces\n- Verbose output (`-v` flag)\n\n## Code of Conduct\n\n- Be respectful and inclusive\n- Focus on constructive feedback\n- Help others learn and grow\n- Follow project guidelines\n\n## License\n\nBy contributing, you agree that your contributions will be licensed under the Apache 2.0 License.\n\n## Disclaimer\n\nBy using Reader, you agree to the following:\n\n- You are solely responsible for respecting websites' policies when scraping and crawling\n- You will adhere to applicable privacy policies and terms of use before initiating scraping activities\n- Reader respects robots.txt directives by default, but ultimate compliance is your responsibility\n\n## Questions?\n\n- Check the [documentation](https://docs.reader.dev)\n- Search [GitHub Issues](https://github.com/vakra-dev/reader/issues)\n- Ask in [Discord](https://discord.gg/6tjkq7J5WV)\n- Open a new issue or discussion\n\nThank you for contributing!\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to the Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   Copyright (c) 2026 vakra-dev\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "README.md",
    "content": "<p align=\"center\">\n  <img src=\"docs/assets/logo.png\" alt=\"Reader Logo\" width=\"200\" />\n</p>\n\n<h1 align=\"center\">Reader</h1>\n\n<p align=\"center\">\n  <strong>Open source web infrastructure for AI.</strong>\n</p>\n\n<p align=\"center\">\n  Access the web without the complexity.\n</p>\n\n<p align=\"center\">\n  <a href=\"https://opensource.org/licenses/Apache-2.0\"><img src=\"https://img.shields.io/badge/License-Apache_2.0-blue.svg\" alt=\"License: Apache 2.0\"></a>\n  <a href=\"https://www.npmjs.com/package/@vakra-dev/reader\"><img src=\"https://img.shields.io/npm/v/@vakra-dev/reader.svg\" alt=\"npm version\"></a>\n  <a href=\"https://github.com/vakra-dev/reader/stargazers\"><img src=\"https://img.shields.io/github/stars/vakra-dev/reader.svg?style=social\" alt=\"GitHub stars\"></a>\n</p>\n\n<p align=\"center\">\n  <a href=\"https://docs.reader.dev\">Docs</a> · <a href=\"https://docs.reader.dev/home/examples\">Examples</a> · <a href=\"https://discord.gg/6tjkq7J5WV\">Discord</a>\n</p>\n\n<p align=\"center\">\n  <img src=\"./docs/assets/demo.gif\" alt=\"Reader demo - scrape any URL to clean markdown\" width=\"700\" />\n</p>\n\n## The Problem\n\nBuilding agents that need web access is frustrating. You piece together Puppeteer, add stealth plugins, fight Cloudflare, manage proxies and it still breaks in production.\n\nBecause production grade web scraping isn't about rendering a page and converting HTML to markdown. It's about everything underneath:\n\n| Layer                    | What it actually takes                                              |\n| ------------------------ | ------------------------------------------------------------------- |\n| **Browser architecture** | Managing browser instances at scale, not one-off scripts            |\n| **Anti-bot bypass**      | Cloudflare, Turnstile, JS challenges, they all block naive scrapers |\n| **TLS fingerprinting**   | Real browsers have fingerprints. Puppeteer doesn't. Sites know.     |\n| **Proxy infrastructure** | Datacenter vs residential, rotation strategies, sticky sessions     |\n| **Resource management**  | Browser pooling, memory limits, graceful recycling                  |\n| **Reliability**          | Rate limiting, retries, timeouts, caching, graceful degradation     |\n\nI built **Reader**, a production-grade web scraping engine on top of [Ulixee Hero](https://ulixee.org/), a headless browser designed for exactly this.\n\n## The Solution\n\nThree primitives. That's it.\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nconst reader = new ReaderClient();\n\n// 1. Scrape URLs → clean markdown\nconst result = await reader.scrape({ urls: [\"https://example.com\"] });\nconsole.log(result.data[0].markdown);\n\n// 2. Crawl a site → discover + scrape pages\nconst pages = await reader.crawl({\n  url: \"https://example.com\",\n  depth: 2,\n  scrape: true,\n});\nconsole.log(`Found ${pages.urls.length} pages`);\n\n// 3. Browser session → full Playwright/Puppeteer control with stealth\nconst session = await reader.browser();\nconst browser = await chromium.connectOverCDP(session.wsEndpoint);\nconst page = browser.contexts()[0].pages()[0];\nawait page.goto(\"https://example.com\");\nconsole.log(await page.title());\nawait session.close();\n```\n\nAll the hard stuff (browser pooling, anti-bot bypass, proxy rotation, retries) happens under the hood. You get clean markdown. Your agents get the web. And when you need full browser control, `browser()` gives you a stealthed Chrome that Playwright or Puppeteer can drive.\n\n> [!TIP]\n> If Reader is useful to you, a [star on GitHub](https://github.com/vakra-dev/reader) helps others discover the project.\n\n## Features\n\n- **Browser Sessions** - Launch stealthed Chrome, connect Playwright/Puppeteer via CDP\n- **Anti-Bot Bypass** - TLS fingerprinting, navigator spoofing, WebRTC masking, `webdriver=false`\n- **Clean Output** - Markdown and HTML with automatic main content extraction\n- **Smart Content Cleaning** - Removes nav, headers, footers, popups, cookie banners\n- **CLI & API** - Use from command line or programmatically\n- **Browser Pool** - Auto-recycling, health monitoring, tiered proxy pools\n- **Concurrent Scraping** - Parallel URL processing with progress tracking\n- **Website Crawling** - BFS link discovery with depth/page limits\n- **Tiered Proxies** - Datacenter and residential pools with auto-escalation and health tracking\n\n## Installation\n\n```bash\nnpm install @vakra-dev/reader\n```\n\n**Requirements:** Node.js >= 18\n\n> **Apple Silicon (M1/M2/M3):** Hero's bundled Chrome binary isn't available for arm64. Point to your system Chrome:\n>\n> ```bash\n> export CHROME_139_BIN=\"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome\"\n> ```\n\n## Quick Start\n\n### Cloud (Fastest)\n\nGet an API key at [app.reader.dev](https://app.reader.dev) and start scraping immediately:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader-js\";\n\nconst reader = new ReaderClient({ apiKey: process.env.READER_API_KEY });\n\nconst result = await reader.read({ url: \"https://example.com\" });\nif (result.kind === \"scrape\") {\n  console.log(result.data.markdown);\n}\n```\n\n```bash\nnpm install @vakra-dev/reader-js\n```\n\nSee the [cloud docs](https://docs.reader.dev) for the full API reference.\n\n### Self-Hosted\n\nInstall the reader engine and run scraping on your own infrastructure:\n\n### Basic Scrape\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  formats: [\"markdown\", \"html\"],\n});\n\nconsole.log(result.data[0].markdown);\nconsole.log(result.data[0].html);\n\nawait reader.close();\n```\n\n### Batch Scraping with Concurrency\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\"https://example.com\", \"https://example.org\", \"https://example.net\"],\n  formats: [\"markdown\"],\n  batchConcurrency: 3,\n  onProgress: (progress) => {\n    console.log(`${progress.completed}/${progress.total}: ${progress.currentUrl}`);\n  },\n});\n\nconsole.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);\n\nawait reader.close();\n```\n\n### Crawling\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.crawl({\n  url: \"https://example.com\",\n  depth: 2,\n  maxPages: 20,\n  scrape: true,\n});\n\nconsole.log(`Discovered ${result.urls.length} URLs`);\nconsole.log(`Scraped ${result.scraped?.batchMetadata.successfulUrls} pages`);\n\nawait reader.close();\n```\n\n### Browser Session\n\nLaunch a stealthed Chrome and control it with Playwright or Puppeteer. The browser has anti-bot stealth active (`webdriver=false`, navigator spoofing, WebRTC masking). Your existing scripts just work.\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nconst reader = new ReaderClient();\n\n// Create a browser session - returns a CDP WebSocket URL\nconst session = await reader.browser();\n\n// Connect Playwright (one-line change from a local script)\nconst browser = await chromium.connectOverCDP(session.wsEndpoint);\nconst context = await browser.newContext();\nconst page = await context.newPage();\n\n// Use Playwright normally - full stealth active\nawait page.goto(\"https://news.ycombinator.com/\");\nconsole.log(await page.title());\n\nawait browser.close();\nawait session.close();\nawait reader.close();\n```\n\nAlso works with Puppeteer:\n\n```typescript\nimport { connect } from \"puppeteer-core\";\n\nconst browser = await connect({ browserWSEndpoint: session.wsEndpoint });\n```\n\n### With Proxy\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  formats: [\"markdown\"],\n  proxy: {\n    type: \"residential\",\n    host: \"proxy.example.com\",\n    port: 8080,\n    username: \"username\",\n    password: \"password\",\n    country: \"us\",\n  },\n});\n\nawait reader.close();\n```\n\n### With Tiered Proxy Pools\n\nConfigure datacenter (fast, cheap) and residential (anti-bot) proxy tiers. Reader auto-escalates from datacenter to residential when sites block:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({\n  proxyPools: {\n    datacenter: [\n      { url: \"http://user:pass@dc-proxy1:8080\" },\n      { url: \"http://user:pass@dc-proxy2:8080\" },\n    ],\n    residential: [{ url: \"http://user:pass@res-proxy1:8080\" }],\n  },\n});\n\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxyTier: \"auto\", // datacenter first, escalate to residential on block\n});\n\nawait reader.close();\n```\n\nOr via environment variables:\n\n```bash\nPROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080\nPROXY_RESIDENTIAL=http://user:pass@res1:8080\n```\n\n### With Browser Pool Configuration\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({\n  browserPool: {\n    size: 5, // 5 browser instances\n    retireAfterPages: 50, // Recycle after 50 pages\n    retireAfterMinutes: 15, // Recycle after 15 minutes\n  },\n  verbose: true,\n});\n\nconst result = await reader.scrape({\n  urls: manyUrls,\n  batchConcurrency: 5,\n});\n\nawait reader.close();\n```\n\n## CLI Reference\n\n### Daemon Mode\n\nFor multiple requests, start a daemon to keep browser pool warm:\n\n```bash\n# Start daemon with browser pool\nnpx reader start --direct-pool-size 5\n\n# All subsequent commands auto-connect to daemon\nnpx reader scrape https://example.com\nnpx reader crawl https://example.com -d 2\n\n# Check daemon status\nnpx reader status\n\n# Stop daemon\nnpx reader stop\n\n# Force standalone mode (bypass daemon)\nnpx reader scrape https://example.com --standalone\n```\n\n### `reader scrape <urls...>`\n\nScrape one or more URLs.\n\n```bash\n# Scrape a single URL\nnpx reader scrape https://example.com\n\n# Scrape with multiple formats\nnpx reader scrape https://example.com -f markdown,html\n\n# Scrape multiple URLs concurrently\nnpx reader scrape https://example.com https://example.org -c 2\n\n# Save to file\nnpx reader scrape https://example.com -o output.md\n```\n\n| Option                   | Type   | Default      | Description                                             |\n| ------------------------ | ------ | ------------ | ------------------------------------------------------- |\n| `-f, --format <formats>` | string | `\"markdown\"` | Output formats (comma-separated: markdown,html)         |\n| `-o, --output <file>`    | string | stdout       | Output file path                                        |\n| `-c, --concurrency <n>`  | number | `1`          | Parallel requests                                       |\n| `-t, --timeout <ms>`     | number | `30000`      | Request timeout in milliseconds                         |\n| `--batch-timeout <ms>`   | number | `300000`     | Total timeout for entire batch operation                |\n| `--proxy <url>`          | string | -            | Proxy URL (e.g., http://user:pass@host:port)            |\n| `--user-agent <string>`  | string | -            | Custom user agent string                                |\n| `--show-chrome`          | flag   | -            | Show browser window for debugging                       |\n| `--no-main-content`      | flag   | -            | Disable main content extraction (include full page)     |\n| `--include-tags <sel>`   | string | -            | CSS selectors for elements to include (comma-separated) |\n| `--exclude-tags <sel>`   | string | -            | CSS selectors for elements to exclude (comma-separated) |\n| `-v, --verbose`          | flag   | -            | Enable verbose logging                                  |\n\n### `reader crawl <url>`\n\nCrawl a website to discover pages.\n\n```bash\n# Crawl with default settings\nnpx reader crawl https://example.com\n\n# Crawl deeper with more pages\nnpx reader crawl https://example.com -d 3 -m 50\n\n# Crawl and scrape content\nnpx reader crawl https://example.com -d 2 --scrape\n\n# Filter URLs with patterns\nnpx reader crawl https://example.com --include \"blog/*\" --exclude \"admin/*\"\n```\n\n| Option                   | Type   | Default      | Description                                     |\n| ------------------------ | ------ | ------------ | ----------------------------------------------- |\n| `-d, --depth <n>`        | number | `1`          | Maximum crawl depth                             |\n| `-m, --max-pages <n>`    | number | `20`         | Maximum pages to discover                       |\n| `-s, --scrape`           | flag   | -            | Also scrape content of discovered pages         |\n| `-f, --format <formats>` | string | `\"markdown\"` | Output formats when scraping (comma-separated)  |\n| `-o, --output <file>`    | string | stdout       | Output file path                                |\n| `--delay <ms>`           | number | `1000`       | Delay between requests in milliseconds          |\n| `-t, --timeout <ms>`     | number | -            | Total timeout for crawl operation               |\n| `--include <patterns>`   | string | -            | URL patterns to include (comma-separated regex) |\n| `--exclude <patterns>`   | string | -            | URL patterns to exclude (comma-separated regex) |\n| `--proxy <url>`          | string | -            | Proxy URL (e.g., http://user:pass@host:port)    |\n| `--user-agent <string>`  | string | -            | Custom user agent string                        |\n| `--show-chrome`          | flag   | -            | Show browser window for debugging               |\n| `-v, --verbose`          | flag   | -            | Enable verbose logging                          |\n\n### `reader browser`\n\nLaunch a browser session with a CDP WebSocket endpoint.\n\n```bash\n# Create a session (prints wsEndpoint, blocks until Ctrl+C)\nnpx reader browser create\n\n# Create with options\nnpx reader browser create --timeout 60000 --show-chrome\n\n# List active sessions (daemon mode)\nnpx reader browser list\n\n# Stop a session\nnpx reader browser stop <sessionId>\n```\n\n| Option               | Type   | Default  | Description                      |\n| -------------------- | ------ | -------- | -------------------------------- |\n| `--proxy <url>`      | string | -        | Proxy URL                        |\n| `-t, --timeout <ms>` | number | `300000` | Session lifetime in milliseconds |\n| `--show-chrome`      | flag   | -        | Show browser window              |\n| `--standalone`       | flag   | -        | Force standalone mode            |\n| `-v, --verbose`      | flag   | -        | Enable verbose logging           |\n\n## API Reference\n\n### `ReaderClient`\n\nThe recommended way to use Reader. Manages HeroCore lifecycle automatically.\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({ verbose: true });\n\n// Scrape\nconst result = await reader.scrape({ urls: [\"https://example.com\"] });\n\n// Crawl\nconst crawlResult = await reader.crawl({ url: \"https://example.com\", depth: 2 });\n\n// Browser session\nconst session = await reader.browser();\n// → session.wsEndpoint for Playwright/Puppeteer\n\n// Close when done (optional - auto-closes on exit)\nawait reader.close();\n```\n\n#### Constructor Options\n\n| Option          | Type                | Default         | Description                                      |\n| --------------- | ------------------- | --------------- | ------------------------------------------------ |\n| `verbose`       | `boolean`           | `false`         | Enable verbose logging                           |\n| `showChrome`    | `boolean`           | `false`         | Show browser window for debugging                |\n| `browserPool`   | `BrowserPoolConfig` | `undefined`     | Browser pool configuration (size, recycling)     |\n| `proxyPools`    | `ProxyPoolConfig`   | `undefined`     | Tiered proxy pools (datacenter + residential)    |\n| `proxies`       | `ProxyConfig[]`     | `undefined`     | Array of proxies for rotation (legacy)           |\n| `proxyRotation` | `string`            | `\"round-robin\"` | Rotation strategy: `\"round-robin\"` or `\"random\"` |\n\n#### BrowserPoolConfig\n\n| Option               | Type     | Default | Description                         |\n| -------------------- | -------- | ------- | ----------------------------------- |\n| `size`               | `number` | `2`     | Number of browser instances in pool |\n| `retireAfterPages`   | `number` | `100`   | Recycle browser after N page loads  |\n| `retireAfterMinutes` | `number` | `30`    | Recycle browser after N minutes     |\n| `maxQueueSize`       | `number` | `100`   | Max pending requests in queue       |\n\n#### Methods\n\n| Method              | Description                                        |\n| ------------------- | -------------------------------------------------- |\n| `scrape(options)`   | Scrape one or more URLs                            |\n| `crawl(options)`    | Crawl a website to discover pages                  |\n| `browser(options?)` | Launch a stealthed browser session (CDP WebSocket) |\n| `start()`           | Pre-initialize HeroCore (optional)                 |\n| `isReady()`         | Check if client is initialized                     |\n| `close()`           | Close client and release resources                 |\n\n### `scrape(options): Promise<ScrapeResult>`\n\nScrape one or more URLs. Can be used directly or via `ReaderClient`.\n\n| Option             | Type                          | Required | Default        | Description                                                     |\n| ------------------ | ----------------------------- | -------- | -------------- | --------------------------------------------------------------- |\n| `urls`             | `string[]`                    | Yes      | -              | Array of URLs to scrape                                         |\n| `formats`          | `Array<\"markdown\" \\| \"html\">` | No       | `[\"markdown\"]` | Output formats                                                  |\n| `onlyMainContent`  | `boolean`                     | No       | `true`         | Extract only main content (removes nav/header/footer)           |\n| `includeTags`      | `string[]`                    | No       | `[]`           | CSS selectors for elements to keep                              |\n| `excludeTags`      | `string[]`                    | No       | `[]`           | CSS selectors for elements to remove                            |\n| `waitForSelector`  | `string`                      | No       | -              | CSS selector to wait for before page is loaded                  |\n| `timeoutMs`        | `number`                      | No       | `30000`        | Request timeout in milliseconds                                 |\n| `batchConcurrency` | `number`                      | No       | `1`            | Number of URLs to process in parallel                           |\n| `batchTimeoutMs`   | `number`                      | No       | `300000`       | Total timeout for entire batch operation                        |\n| `proxy`            | `ProxyConfig`                 | No       | -              | Proxy configuration object                                      |\n| `proxyTier`        | `ProxyTier`                   | No       | -              | Proxy tier: `\"datacenter\"`, `\"residential\"`, `\"auto\"`           |\n| `onProgress`       | `function`                    | No       | -              | Progress callback: `({ completed, total, currentUrl }) => void` |\n| `verbose`          | `boolean`                     | No       | `false`        | Enable verbose logging                                          |\n| `showChrome`       | `boolean`                     | No       | `false`        | Show Chrome window for debugging                                |\n\n**Returns:** `Promise<ScrapeResult>`\n\n```typescript\ninterface ScrapeResult {\n  data: WebsiteScrapeResult[];\n  batchMetadata: BatchMetadata;\n}\n\ninterface WebsiteScrapeResult {\n  markdown?: string;\n  html?: string;\n  metadata: {\n    baseUrl: string;\n    finalUrl?: string; // Present if URL redirected\n    totalPages: number;\n    scrapedAt: string;\n    duration: number;\n    website: WebsiteMetadata;\n  };\n}\n\ninterface BatchMetadata {\n  totalUrls: number;\n  successfulUrls: number;\n  failedUrls: number;\n  scrapedAt: string;\n  totalDuration: number;\n  errors?: Array<{ url: string; error: string }>;\n}\n```\n\n### `crawl(options): Promise<CrawlResult>`\n\nCrawl a website to discover pages.\n\n| Option              | Type                          | Required | Default        | Description                                     |\n| ------------------- | ----------------------------- | -------- | -------------- | ----------------------------------------------- |\n| `url`               | `string`                      | Yes      | -              | Single seed URL to start crawling from          |\n| `depth`             | `number`                      | No       | `1`            | Maximum depth to crawl                          |\n| `maxPages`          | `number`                      | No       | `20`           | Maximum pages to discover                       |\n| `scrape`            | `boolean`                     | No       | `false`        | Also scrape full content of discovered pages    |\n| `delayMs`           | `number`                      | No       | `1000`         | Delay between requests in milliseconds          |\n| `timeoutMs`         | `number`                      | No       | -              | Total timeout for entire crawl operation        |\n| `includePatterns`   | `string[]`                    | No       | -              | URL patterns to include (regex strings)         |\n| `excludePatterns`   | `string[]`                    | No       | -              | URL patterns to exclude (regex strings)         |\n| `formats`           | `Array<\"markdown\" \\| \"html\">` | No       | `[\"markdown\"]` | Output formats for scraped content              |\n| `scrapeConcurrency` | `number`                      | No       | `2`            | Number of URLs to scrape in parallel            |\n| `proxy`             | `ProxyConfig`                 | No       | -              | Proxy configuration object                      |\n| `userAgent`         | `string`                      | No       | -              | Custom user agent string                        |\n| `verbose`           | `boolean`                     | No       | `false`        | Enable verbose logging                          |\n| `showChrome`        | `boolean`                     | No       | `false`        | Show Chrome window for debugging                |\n| `connectionToCore`  | `any`                         | No       | -              | Connection to shared Hero Core (for production) |\n\n**Returns:** `Promise<CrawlResult>`\n\n```typescript\ninterface CrawlResult {\n  urls: CrawlUrl[];\n  scraped?: ScrapeResult;\n  metadata: CrawlMetadata;\n}\n\ninterface CrawlUrl {\n  url: string;\n  title: string;\n  description: string | null;\n}\n\ninterface CrawlMetadata {\n  totalUrls: number;\n  maxDepth: number;\n  totalDuration: number;\n  seedUrl: string;\n}\n```\n\n### `browser(options?): Promise<BrowserSession>`\n\nLaunch a stealthed Chrome and return a CDP WebSocket URL for Playwright/Puppeteer.\n\n| Option       | Type          | Required | Default  | Description                                           |\n| ------------ | ------------- | -------- | -------- | ----------------------------------------------------- |\n| `proxy`      | `ProxyConfig` | No       | -        | Proxy configuration                                   |\n| `proxyTier`  | `ProxyTier`   | No       | -        | Proxy tier: `\"datacenter\"`, `\"residential\"`, `\"auto\"` |\n| `showChrome` | `boolean`     | No       | `false`  | Show browser window                                   |\n| `timeoutMs`  | `number`      | No       | `300000` | Session lifetime (auto-closes after)                  |\n| `verbose`    | `boolean`     | No       | `false`  | Enable verbose logging                                |\n\n**Returns:** `Promise<BrowserSession>`\n\n```typescript\ninterface BrowserSession {\n  sessionId: string; // Unique session identifier\n  wsEndpoint: string; // CDP WebSocket URL for Playwright/Puppeteer\n  createdAt: string; // ISO timestamp\n  close(): Promise<void>; // Close session and release resources\n}\n```\n\n**Stealth features active on all sessions:**\n\n- `navigator.webdriver = false` (via `--disable-blink-features=AutomationControlled`)\n- Proxy routing through authenticated proxy forwarder (if configured)\n- Isolated user profile per session (no cookie/state leaks)\n\n### ProxyConfig\n\n| Option     | Type                            | Required | Default | Description                                             |\n| ---------- | ------------------------------- | -------- | ------- | ------------------------------------------------------- |\n| `url`      | `string`                        | No       | -       | Full proxy URL (takes precedence over other fields)     |\n| `type`     | `\"datacenter\" \\| \"residential\"` | No       | -       | Proxy type                                              |\n| `host`     | `string`                        | No       | -       | Proxy host                                              |\n| `port`     | `number`                        | No       | -       | Proxy port                                              |\n| `username` | `string`                        | No       | -       | Proxy username                                          |\n| `password` | `string`                        | No       | -       | Proxy password                                          |\n| `country`  | `string`                        | No       | -       | Country code for residential proxies (e.g., 'us', 'uk') |\n\n## Daemon Mode (Production)\n\nFor production servers, start the daemon once and all scrape/crawl/browser requests share the warm browser pool:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\n// Create once at startup\nconst reader = new ReaderClient({\n  proxyPools: {\n    datacenter: [{ url: \"http://user:pass@dc-proxy:8080\" }],\n    residential: [{ url: \"http://user:pass@res-proxy:8080\" }],\n  },\n});\n\n// Reuse for all requests\nconst result = await reader.scrape({ urls: [\"https://example.com\"] });\n\n// Graceful shutdown\nprocess.on(\"SIGTERM\", () => reader.close());\n```\n\n## How It Works\n\n### Anti-Bot Bypass\n\nReader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced anti-detection:\n\n1. **TLS Fingerprinting** - Emulates real Chrome browser fingerprints via MITM proxy\n2. **Navigator Spoofing** - `webdriver=false`, device memory, hardware concurrency\n3. **DNS over TLS** - Uses Cloudflare DNS (1.1.1.1) to mimic Chrome behavior\n4. **WebRTC IP Masking** - Prevents IP leaks through WebRTC connections\n5. **WebGL/Canvas Fingerprinting** - Randomized rendering signatures\n\n### Browser Pool\n\n- **Tiered Proxy Pools** - Separate datacenter and residential pools with auto-escalation\n- **Auto-Recycling** - Browsers recycled after 100 requests or 30 minutes\n- **Health Tracking** - Auto-benches failed proxies for 5 minutes, revives on recovery\n- **Per-Proxy Concurrency** - Limits concurrent requests per proxy URL (default: 2)\n\n### HTML to Markdown: supermarkdown\n\nReader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines.\n\n**Why we built it:**\n\nWhen you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown.\n\n**What supermarkdown offers:**\n\n| Feature              | Benefit                                              |\n| -------------------- | ---------------------------------------------------- |\n| **Written in Rust**  | Native performance with Node.js bindings via napi-rs |\n| **Full GFM support** | Tables, task lists, strikethrough, autolinks         |\n| **LLM-optimized**    | Clean output designed for AI consumption             |\n| **Battle-tested**    | Handles malformed HTML from real web pages           |\n| **CSS selectors**    | Include/exclude elements during conversion           |\n\nsupermarkdown is open source and available as both a Rust crate and npm package:\n\n```bash\n# npm\nnpm install @vakra-dev/supermarkdown\n\n# Rust\ncargo add supermarkdown\n```\n\nCheck out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation.\n\n## Server Deployment\n\nReader uses a real Chromium browser under the hood. On headless Linux servers (VPS, EC2, etc.), you need to install Chrome's system dependencies:\n\n```bash\n# Debian/Ubuntu\nsudo apt-get install -y libnspr4 libnss3 libatk1.0-0 libatk-bridge2.0-0 \\\n  libcups2 libxcb1 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \\\n  libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 libasound2\n```\n\nThis is the same requirement that Puppeteer and Playwright have on headless Linux. macOS, Windows, and Linux desktops already have these libraries.\n\nFor Docker and production deployment guides, see the [deployment documentation](https://docs.reader.dev/documentation/guides/deployment).\n\n## Documentation\n\nFull documentation is available at **[docs.reader.dev](https://docs.reader.dev)**, including guides for scraping, crawling, proxy configuration, browser pool management, and deployment.\n\n### Examples\n\n| Example                                                                    | Description                                    |\n| -------------------------------------------------------------------------- | ---------------------------------------------- |\n| [Basic Scraping](examples/basic/basic-scrape.ts)                           | Simple single-URL scraping                     |\n| [Batch Scraping](examples/basic/batch-scrape.ts)                           | Concurrent multi-URL scraping                  |\n| [Crawl Website](examples/basic/crawl-website.ts)                           | Crawl and discover pages                       |\n| [Browser Session (Playwright)](examples/basic/browser-session.ts)          | Navigate, extract data, screenshot             |\n| [Browser Session (Actions)](examples/basic/browser-session-actions.ts)     | Click, type, search, wait for elements         |\n| [Browser Session (Puppeteer)](examples/basic/browser-session-puppeteer.ts) | Puppeteer via `connect({ browserWSEndpoint })` |\n| [Browser Session (Raw CDP)](examples/basic/browser-session-selenium.ts)    | Direct CDP WebSocket commands                  |\n| [Browser Pool Config](examples/basic/browser-pool-config.ts)               | Configure browser pool for high throughput     |\n| [Proxy Pool](examples/basic/proxy-pool.ts)                                 | Proxy rotation with multiple proxies           |\n| [Cloudflare Bypass](examples/basic/cloudflare-bypass.ts)                   | Scrape Cloudflare-protected sites              |\n| [All Formats](examples/basic/all-formats.ts)                               | Output in markdown and html                    |\n| [AI Tools](examples/ai-tools/)                                             | OpenAI, Anthropic, LangChain integrations      |\n\n## Development\n\n```bash\n# Install dependencies\nnpm install\n\n# Run linting\nnpm run lint\n\n# Format code\nnpm run format\n\n# Type check\nnpm run typecheck\n\n# Find TODOs\nnpm run todo\n```\n\n## Contributing\n\nContributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.\n\n## License\n\n[Apache 2.0](LICENSE) - See LICENSE for details.\n\n## Citation\n\nIf you use Reader in your research or project, please cite it:\n\n```bibtex\n@software{reader.dev,\n  author = {Kaul, Nihal},\n  title = {Reader: Open-source, production-grade web scraping engine built for LLMs},\n  year = {2026},\n  publisher = {GitHub},\n  url = {https://github.com/vakra-dev/reader}\n}\n```\n\n## Support\n\n- [GitHub Issues](https://github.com/vakra-dev/reader/issues)\n- [Documentation](https://docs.reader.dev)\n- [Discord](https://discord.gg/6tjkq7J5WV)\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security Policy\n\n## Supported Versions\n\n| Version | Supported |\n| ------- | --------- |\n| Latest  | Yes       |\n\nWe only provide security fixes for the latest release.\n\n## Reporting a Vulnerability\n\nIf you discover a security vulnerability in Reader, please report it responsibly.\n\n**Do not open a public GitHub issue for security vulnerabilities.**\n\nInstead, email **nihal.codes@gmail.com** with:\n\n- A description of the vulnerability\n- Steps to reproduce the issue\n- The potential impact\n- Any suggested fixes (optional)\n\n## What to Expect\n\n- **Acknowledgment** within 48 hours of your report\n- **Status update** within 7 days with an assessment and timeline\n- **Credit** in the release notes (unless you prefer to remain anonymous)\n\n## Scope\n\nThe following are in scope:\n\n- The `@vakra-dev/reader` npm package\n- The Reader CLI tool\n- The Reader Cloud API (`cloud.reader.dev`)\n\nThe following are out of scope:\n\n- Vulnerabilities in upstream dependencies (report these to the respective projects)\n- Issues related to websites blocking scraping (this is expected behavior, not a vulnerability)\n\n## Responsible Use\n\nReader is a web scraping tool. Users are responsible for complying with applicable laws and website terms of service. The project maintainers are not responsible for how the tool is used."
  },
  {
    "path": "docs/api-reference.md",
    "content": "# API Reference\n\nComplete API documentation for Reader.\n\n## ReaderClient (Recommended)\n\nThe recommended way to use Reader. Manages HeroCore lifecycle automatically, reuses connections efficiently, and auto-closes on process exit.\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({ verbose: true });\n\n// Scrape URLs\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  formats: [\"markdown\"],\n});\n\n// Crawl a website\nconst crawlResult = await reader.crawl({\n  url: \"https://example.com\",\n  depth: 2,\n});\n\n// Launch a stealthed browser session\nconst session = await reader.browser();\n// → session.wsEndpoint for Playwright/Puppeteer\n\n// Close when done (optional - auto-closes on exit)\nawait reader.close();\n```\n\n### Constructor\n\n```typescript\nnew ReaderClient(options?: ReaderClientOptions)\n```\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `verbose` | `boolean` | `false` | Enable verbose logging |\n| `showChrome` | `boolean` | `false` | Show browser window for debugging |\n| `browserPool` | `BrowserPoolConfig` | - | Browser pool configuration |\n| `proxyPools` | `ProxyPoolConfig` | - | Tiered proxy pools (datacenter + residential) |\n| `proxies` | `ProxyConfig[]` | - | List of proxies to rotate through (legacy) |\n| `proxyRotation` | `\"round-robin\" \\| \"random\"` | `\"round-robin\"` | Proxy rotation strategy |\n\n#### ProxyPoolConfig\n\n```typescript\ninterface ProxyPoolConfig {\n  datacenter?: ProxyConfig[];   // Fast, cheap - works for most sites\n  residential?: ProxyConfig[];  // Slower, anti-bot sites (Amazon, LinkedIn)\n}\n```\n\n#### BrowserPoolConfig\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `size` | `number` | `2` | Number of browser instances |\n| `retireAfterPages` | `number` | `100` | Retire browser after N page loads |\n| `retireAfterMinutes` | `number` | `30` | Retire browser after N minutes |\n| `maxQueueSize` | `number` | `100` | Maximum pending requests in queue |\n\n### Methods\n\n#### start()\n\nPre-initialize HeroCore. Called automatically on first scrape/crawl.\n\n```typescript\nawait reader.start(): Promise<void>\n```\n\n#### scrape(options)\n\nScrape one or more URLs.\n\n```typescript\nconst result = await reader.scrape(options): Promise<ScrapeResult>\n```\n\nSee [ScrapeOptions](#scrapeoptions) for available options.\n\n#### crawl(options)\n\nCrawl a website to discover pages.\n\n```typescript\nconst result = await reader.crawl(options): Promise<CrawlResult>\n```\n\nSee [CrawlOptions](#crawloptions) for available options.\n\n#### browser(options?)\n\nLaunch a stealthed browser session and return a CDP WebSocket URL for Playwright/Puppeteer.\n\n```typescript\nconst session = await reader.browser(options?): Promise<BrowserSession>\n```\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `proxy` | `ProxyConfig` | - | Proxy configuration |\n| `proxyTier` | `ProxyTier` | - | Proxy tier: `\"datacenter\"`, `\"residential\"`, `\"auto\"` |\n| `showChrome` | `boolean` | `false` | Show browser window |\n| `timeoutMs` | `number` | `300000` | Session lifetime (auto-closes after) |\n| `verbose` | `boolean` | `false` | Enable verbose logging |\n\nReturns:\n\n```typescript\ninterface BrowserSession {\n  sessionId: string;       // Unique session identifier\n  wsEndpoint: string;      // CDP WebSocket URL\n  createdAt: string;       // ISO timestamp\n  close(): Promise<void>;  // Close session and release resources\n}\n```\n\nSee the [Browser Sessions guide](guides/browser-sessions.md) for full examples.\n\n#### isReady()\n\nCheck if the client is initialized and ready.\n\n```typescript\nreader.isReady(): boolean\n```\n\n#### close()\n\nClose the client and release resources.\n\n```typescript\nawait reader.close(): Promise<void>\n```\n\n---\n\n## Direct Functions (Advanced)\n\nFor advanced use cases where you need custom HeroCore management, you can use the direct functions. Note that without `connectionToCore`, each call spawns a new HeroCore instance which is less efficient.\n\n### scrape(options)\n\nScrape one or more URLs and return content in specified formats.\n\n```typescript\nimport { scrape } from \"@vakra-dev/reader\";\n\nconst result = await scrape({\n  urls: [\"https://example.com\"],\n  formats: [\"markdown\"],\n});\n```\n\n#### Parameters\n\n| Name | Type | Required | Default | Description |\n|------|------|----------|---------|-------------|\n| `urls` | `string[]` | Yes | - | Array of URLs to scrape |\n| `formats` | `FormatType[]` | No | `[\"markdown\"]` | Output formats |\n| `onlyMainContent` | `boolean` | No | `true` | Extract only main content |\n| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |\n| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |\n| `userAgent` | `string` | No | - | Custom user agent string |\n| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |\n| `batchConcurrency` | `number` | No | `1` | URLs to process in parallel |\n| `batchTimeoutMs` | `number` | No | `300000` | Total batch timeout |\n| `onProgress` | `ProgressCallback` | No | - | Progress callback function |\n| `proxy` | `ProxyConfig` | No | - | Proxy configuration |\n| `proxyTier` | `ProxyTier` | No | - | Proxy tier: `\"datacenter\"`, `\"residential\"`, `\"auto\"` |\n| `waitForSelector` | `string` | No | - | CSS selector to wait for |\n| `verbose` | `boolean` | No | `false` | Enable verbose logging |\n| `showChrome` | `boolean` | No | `false` | Show browser window |\n| `connectionToCore` | `any` | No | - | Shared Hero Core connection |\n\n#### Returns\n\n`Promise<ScrapeResult>`\n\n```typescript\ninterface ScrapeResult {\n  data: WebsiteScrapeResult[];\n  batchMetadata: BatchMetadata;\n}\n```\n\n#### Example\n\n```typescript\n// Using ReaderClient (recommended)\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://example.com\", \"https://example.org\"],\n  formats: [\"markdown\", \"html\"],\n  batchConcurrency: 2,\n  onProgress: ({ completed, total, currentUrl }) => {\n    console.log(`[${completed}/${total}] ${currentUrl}`);\n  },\n});\n\nfor (const site of result.data) {\n  console.log(\"URL:\", site.metadata.baseUrl);\n  console.log(\"Markdown:\", site.markdown?.substring(0, 200));\n}\n\nawait reader.close();\n```\n\n---\n\n### crawl(options)\n\nCrawl a website to discover pages, optionally scraping their content.\n\n```typescript\n// Using ReaderClient (recommended)\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\nconst result = await reader.crawl({\n  url: \"https://example.com\",\n  depth: 2,\n  maxPages: 20,\n  scrape: true,\n});\nawait reader.close();\n```\n\n#### Parameters\n\n| Name | Type | Required | Default | Description |\n|------|------|----------|---------|-------------|\n| `url` | `string` | Yes | - | Seed URL to start crawling |\n| `depth` | `number` | No | `1` | Maximum crawl depth |\n| `maxPages` | `number` | No | `20` | Maximum pages to discover |\n| `scrape` | `boolean` | No | `false` | Also scrape discovered pages |\n| `delayMs` | `number` | No | `1000` | Delay between requests |\n| `timeoutMs` | `number` | No | - | Total crawl timeout |\n| `includePatterns` | `string[]` | No | - | URL patterns to include |\n| `excludePatterns` | `string[]` | No | - | URL patterns to exclude |\n| `formats` | `FormatType[]` | No | `[\"markdown\", \"html\"]` | Output formats when scraping |\n| `scrapeConcurrency` | `number` | No | `2` | Scraping parallelism |\n| `proxy` | `ProxyConfig` | No | - | Proxy configuration |\n| `userAgent` | `string` | No | - | Custom user agent |\n| `verbose` | `boolean` | No | `false` | Enable verbose logging |\n| `showChrome` | `boolean` | No | `false` | Show browser window |\n| `connectionToCore` | `any` | No | - | Shared Hero Core connection |\n\n#### Returns\n\n`Promise<CrawlResult>`\n\n```typescript\ninterface CrawlResult {\n  urls: CrawlUrl[];\n  scraped?: ScrapeResult;\n  metadata: CrawlMetadata;\n}\n```\n\n#### Example\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.crawl({\n  url: \"https://docs.example.com\",\n  depth: 3,\n  maxPages: 50,\n  includePatterns: [\"docs/*\"],\n  excludePatterns: [\"docs/archive/*\"],\n  scrape: true,\n});\n\nconsole.log(`Discovered ${result.urls.length} pages`);\nresult.urls.forEach((page) => {\n  console.log(`- ${page.title}: ${page.url}`);\n});\n\nif (result.scraped) {\n  console.log(`Scraped ${result.scraped.batchMetadata.successfulUrls} pages`);\n}\n\nawait reader.close();\n```\n\n---\n\n## Type Definitions\n\n### ScrapeOptions\n\n```typescript\ninterface ScrapeOptions {\n  urls: string[];\n  formats?: Array<\"markdown\" | \"html\">;\n  onlyMainContent?: boolean;\n  includeTags?: string[];\n  excludeTags?: string[];\n  userAgent?: string;\n  timeoutMs?: number;\n  batchConcurrency?: number;\n  batchTimeoutMs?: number;\n  onProgress?: (progress: ProgressInfo) => void;\n  proxy?: ProxyConfig;\n  proxyTier?: \"datacenter\" | \"residential\" | \"auto\";\n  waitForSelector?: string;\n  verbose?: boolean;\n  showChrome?: boolean;\n  connectionToCore?: any;\n}\n```\n\n### CrawlOptions\n\n```typescript\ninterface CrawlOptions {\n  url: string;\n  depth?: number;\n  maxPages?: number;\n  scrape?: boolean;\n  delayMs?: number;\n  timeoutMs?: number;\n  includePatterns?: string[];\n  excludePatterns?: string[];\n  formats?: Array<\"markdown\" | \"html\">;\n  scrapeConcurrency?: number;\n  proxy?: ProxyConfig;\n  userAgent?: string;\n  verbose?: boolean;\n  showChrome?: boolean;\n  connectionToCore?: any;\n}\n```\n\n### ProxyConfig\n\n```typescript\ninterface ProxyConfig {\n  url?: string;\n  type?: \"datacenter\" | \"residential\";\n  host?: string;\n  port?: number;\n  username?: string;\n  password?: string;\n  country?: string;\n}\n```\n\n### ScrapeResult\n\n```typescript\ninterface ScrapeResult {\n  data: WebsiteScrapeResult[];\n  batchMetadata: BatchMetadata;\n}\n```\n\n### WebsiteScrapeResult\n\n```typescript\ninterface WebsiteScrapeResult {\n  markdown?: string;\n  html?: string;\n  metadata: {\n    baseUrl: string;\n    finalUrl?: string;  // Present if URL redirected\n    totalPages: number;\n    scrapedAt: string;\n    duration: number;\n    website: WebsiteMetadata;\n    proxy?: ProxyMetadata;  // Included when proxy pooling is used\n  };\n}\n```\n\n### ProxyMetadata\n\n```typescript\ninterface ProxyMetadata {\n  host: string;\n  port: number;\n  country?: string;  // If geo-targeting was used\n}\n```\n\n### BatchMetadata\n\n```typescript\ninterface BatchMetadata {\n  totalUrls: number;\n  successfulUrls: number;\n  failedUrls: number;\n  scrapedAt: string;\n  totalDuration: number;\n  errors?: Array<{ url: string; error: string }>;\n}\n```\n\n### CrawlResult\n\n```typescript\ninterface CrawlResult {\n  urls: CrawlUrl[];\n  scraped?: ScrapeResult;\n  metadata: CrawlMetadata;\n}\n```\n\n### CrawlUrl\n\n```typescript\ninterface CrawlUrl {\n  url: string;\n  title: string;\n  description: string | null;\n}\n```\n\n### CrawlMetadata\n\n```typescript\ninterface CrawlMetadata {\n  totalUrls: number;\n  maxDepth: number;\n  totalDuration: number;\n  seedUrl: string;\n}\n```\n\n### WebsiteMetadata\n\n```typescript\ninterface WebsiteMetadata {\n  title: string | null;\n  description: string | null;\n  author: string | null;\n  language: string | null;\n  charset: string | null;\n  favicon: string | null;\n  image: string | null;\n  canonical: string | null;\n  keywords: string[] | null;\n  robots: string | null;\n  themeColor: string | null;\n  openGraph: {\n    title: string | null;\n    description: string | null;\n    type: string | null;\n    url: string | null;\n    image: string | null;\n    siteName: string | null;\n    locale: string | null;\n  } | null;\n  twitter: {\n    card: string | null;\n    site: string | null;\n    creator: string | null;\n    title: string | null;\n    description: string | null;\n    image: string | null;\n  } | null;\n}\n```\n\n### ProgressInfo\n\n```typescript\ninterface ProgressInfo {\n  completed: number;\n  total: number;\n  currentUrl: string;\n}\n```\n\n---\n\n## Classes\n\n### BrowserPool\n\nManages a pool of Hero browser instances for efficient scraping.\n\n```typescript\nimport { BrowserPool } from \"@vakra-dev/reader\";\n\nconst pool = new BrowserPool({ size: 5 });\nawait pool.initialize();\n\nconst result = await pool.withBrowser(async (hero) => {\n  await hero.goto(\"https://example.com\");\n  return await hero.document.title;\n});\n\nawait pool.shutdown();\n```\n\n#### Constructor\n\n```typescript\nnew BrowserPool(config?: PoolConfig)\n```\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `size` | `number` | `2` | Number of browser instances |\n| `retireAfterPages` | `number` | `100` | Recycle after N pages |\n| `retireAfterMinutes` | `number` | `30` | Recycle after N minutes |\n| `maxQueueSize` | `number` | `100` | Maximum pending requests |\n| `healthCheckIntervalMs` | `number` | `300000` | Health check interval |\n\n#### Methods\n\n##### initialize()\n\nInitialize the browser pool.\n\n```typescript\nawait pool.initialize(): Promise<void>\n```\n\n##### withBrowser(fn)\n\nExecute a function with an acquired browser, automatically releasing it after.\n\n```typescript\nawait pool.withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T>\n```\n\n##### acquire()\n\nManually acquire a browser instance. Must be paired with `release()`.\n\n```typescript\nconst hero = await pool.acquire(): Promise<Hero>\n```\n\n##### release(hero)\n\nRelease a browser instance back to the pool.\n\n```typescript\nawait pool.release(hero: Hero): Promise<void>\n```\n\n##### healthCheck()\n\nCheck the health of all pool instances.\n\n```typescript\nconst health = await pool.healthCheck(): Promise<HealthCheckResult>\n```\n\n##### getStats()\n\nGet current pool statistics.\n\n```typescript\nconst stats = pool.getStats(): PoolStats\n```\n\n##### shutdown()\n\nShutdown all browser instances.\n\n```typescript\nawait pool.shutdown(): Promise<void>\n```\n\n---\n\n## Formatter Functions\n\n### formatToMarkdown(pages, baseUrl, scrapedAt, duration, metadata?)\n\nConvert scraped pages to Markdown format.\n\n```typescript\nimport { formatToMarkdown } from \"@vakra-dev/reader\";\n\nconst markdown = formatToMarkdown(\n  pages,\n  \"https://example.com\",\n  new Date().toISOString(),\n  1500,\n  metadata\n);\n```\n\n---\n\n### formatToHTML(pages, baseUrl, scrapedAt, duration, metadata?)\n\nConvert scraped pages to a complete HTML document.\n\n```typescript\nimport { formatToHTML } from \"@vakra-dev/reader\";\n\nconst html = formatToHTML(\n  pages,\n  \"https://example.com\",\n  new Date().toISOString(),\n  1500,\n  metadata\n);\n```\n\n\n---\n\n## Utility Functions\n\n### cleanContent(html)\n\nRemove navigation, ads, scripts, and other non-content elements from HTML.\n\n```typescript\nimport { cleanContent } from \"@vakra-dev/reader\";\n\nconst cleanHtml = cleanContent(rawHtml);\n```\n\n---\n\n### extractMetadata(html)\n\nExtract metadata from HTML including Open Graph and Twitter cards.\n\n```typescript\nimport { extractMetadata } from \"@vakra-dev/reader\";\n\nconst metadata = extractMetadata(html);\nconsole.log(metadata.title);\nconsole.log(metadata.openGraph?.image);\n```\n\n---\n\n## Default Values\n\n```typescript\nconst DEFAULT_OPTIONS = {\n  formats: [\"markdown\"],\n  onlyMainContent: true,\n  timeoutMs: 30000,\n  batchConcurrency: 1,\n  batchTimeoutMs: 300000,\n  verbose: false,\n  showChrome: false,\n};\n\nconst DEFAULT_CRAWL_OPTIONS = {\n  depth: 1,\n  maxPages: 20,\n  scrape: false,\n  delayMs: 1000,\n  formats: [\"markdown\", \"html\"],\n  scrapeConcurrency: 2,\n  verbose: false,\n  showChrome: false,\n};\n\nconst DEFAULT_POOL_CONFIG = {\n  size: 2,\n  retireAfterPages: 100,\n  retireAfterMinutes: 30,\n  maxQueueSize: 100,\n  healthCheckIntervalMs: 300000,\n};\n```\n\n---\n\n## See Also\n\n- [Getting Started](getting-started.md) - Quick start guide\n- [Architecture](architecture.md) - System design\n- [Browser Pool Guide](guides/browser-pool.md) - Pool management\n- [Cloudflare Bypass Guide](guides/cloudflare-bypass.md) - Challenge handling\n"
  },
  {
    "path": "docs/architecture.md",
    "content": "# Architecture\n\nThis document describes the internal architecture of Reader, helping contributors understand how the system works.\n\n## High-Level Overview\n\n```\n┌─────────────────────────────────────────────────────────────────┐\n│                        Public API                                │\n│              scrape() / crawl() / browser()                      │\n└──────────┬─────────────────┬────────────────┬───────────────────┘\n           │                 │                │\n     ┌─────▼─────┐    ┌─────▼─────┐    ┌─────▼──────────┐\n     │  Scraper  │    │  Crawler  │    │ BrowserSession │\n     │  Class    │    │  Class    │    │ (CDP WebSocket)│\n     └─────┬─────┘    └─────┬─────┘    └─────┬──────────┘\n           │                │                │\n           └────────┬───────┘                │ own HeroCore\n                    │                        │\n          ┌─────────▼─────────┐    ┌─────────▼─────────┐\n          │ TieredBrowserPool │    │  Dedicated Chrome  │\n          │ (shared, pooled)  │    │  (per-session)     │\n          └─────────┬─────────┘    └───────────────────┘\n                    │\n    ┌───────────────┼───────────────┐\n    │               │               │\n┌───▼──────────┐ ┌──▼──────────┐ ┌──▼────────────┐\n│  Hero Config │ │  Orchestrator│ │  Formatters   │\n│ (TLS, DNS, etc.) │ │   Detection     │ │ (MD, HTML, etc) │\n└──────────────────┘ └─────────────────┘ └─────────────────┘\n```\n\n## Directory Structure\n\n```\nsrc/\n├── index.ts              # Public API exports\n├── scraper.ts            # Scraper class - main scraping logic\n├── crawler.ts            # Crawler class - link discovery + scraping\n├── types.ts              # ScrapeOptions, ScrapeResult, etc.\n├── crawl-types.ts        # CrawlOptions, CrawlResult, etc.\n│\n├── browser/\n│   ├── pool.ts           # BrowserPool - manages Hero instances\n│   ├── hero-config.ts    # Hero configuration (TLS, DNS, viewport)\n│   └── types.ts          # IBrowserPool, PoolConfig, PoolStats\n│\n├── cloudflare/\n│   ├── detector.ts       # detectChallenge() - DOM/text matching\n│   ├── handler.ts        # waitForChallengeResolution() - polling\n│   └── types.ts          # ChallengeDetection, ResolutionResult\n│\n├── formatters/\n│   ├── markdown.ts       # formatToMarkdown() - uses supermarkdown\n│   ├── html.ts           # formatToHTML() - full HTML document\n│   ├── postprocess.ts    # Post-processing utilities\n│   └── index.ts          # Re-exports all formatters\n│\n├── utils/\n│   ├── content-cleaner.ts    # cleanContent() - removes nav, ads\n│   ├── metadata-extractor.ts # extractMetadata() - OG tags, etc.\n│   ├── url-helpers.ts        # URL validation, normalization\n│   ├── rate-limiter.ts       # Simple delay-based rate limiting\n│   └── logger.ts             # Pino logger with pretty print\n│\n├── proxy/\n│   └── config.ts         # createProxyUrl(), parseProxyUrl()\n│\n└── cli/\n    └── index.ts          # CLI using Commander.js\n```\n\n## Core Components\n\n### Scraper\n\nThe `Scraper` class (`src/scraper.ts`) handles URL scraping:\n\n```typescript\nclass Scraper {\n  constructor(options: ScrapeOptions) { ... }\n\n  async scrape(): Promise<ScrapeResult> {\n    // 1. Initialize browser pool\n    // 2. Process URLs with concurrency control (p-limit)\n    // 3. For each URL: fetch, detect challenges, extract content\n    // 4. Format to requested output formats\n    // 5. Aggregate results and metadata\n  }\n\n  private async scrapeSingleUrl(url: string): Promise<WebsiteScrapeResult> {\n    // 1. Acquire browser from pool\n    // 2. Navigate to URL\n    // 3. Detect Cloudflare challenge\n    // 4. Wait for resolution if needed\n    // 5. Extract HTML and metadata\n    // 6. Clean content\n    // 7. Format to outputs\n    // 8. Release browser to pool\n  }\n}\n```\n\n**Key design decisions:**\n\n- Uses `p-limit` for concurrency control\n- Each URL gets its own browser instance from the pool\n- Cloudflare detection runs before content extraction\n- All formatters run in parallel for each URL\n\n### Crawler\n\nThe `Crawler` class (`src/crawler.ts`) discovers links:\n\n```typescript\nclass Crawler {\n  async crawl(): Promise<CrawlResult> {\n    // BFS (Breadth-First Search) algorithm\n    // 1. Start with seed URL at depth 0\n    // 2. Fetch page, extract links\n    // 3. Filter links (same domain, patterns)\n    // 4. Add to queue with depth + 1\n    // 5. Repeat until maxPages or maxDepth\n    // 6. Optionally scrape discovered URLs\n  }\n}\n```\n\n**Key design decisions:**\n\n- BFS ensures shallow pages are discovered first\n- Respects `maxPages` and `depth` limits\n- Optional scraping reuses the Scraper class\n- Delay between requests for rate limiting\n\n### Browser Pool\n\nThe `BrowserPool` class (`src/browser/pool.ts`) manages Hero instances:\n\n```typescript\nclass BrowserPool {\n  private instances: HeroInstance[];\n  private available: HeroInstance[];\n  private queue: PendingRequest[];\n\n  async initialize(): Promise<void> { ... }\n  async acquire(): Promise<Hero> { ... }\n  async release(hero: Hero): Promise<void> { ... }\n\n  async withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T> {\n    const hero = await this.acquire();\n    try {\n      return await fn(hero);\n    } finally {\n      await this.release(hero);\n    }\n  }\n}\n```\n\n**Pool lifecycle:**\n\n1. **Initialize** - Create `size` Hero instances\n2. **Acquire** - Get available instance or queue the request\n3. **Use** - Execute scraping logic\n4. **Release** - Return to pool or recycle if stale\n5. **Recycle** - Close old instance, create new one\n6. **Shutdown** - Close all instances\n\n**Recycling triggers:**\n\n- After N pages (default: 100)\n- After N minutes (default: 30)\n- On health check failure\n\n### Cloudflare Detection\n\nDetection happens in two phases:\n\n**1. Challenge Detection** (`src/cloudflare/detector.ts`):\n\n```typescript\nasync function detectChallenge(hero: Hero): Promise<ChallengeDetection> {\n  // Check DOM for challenge elements\n  const signals = [];\n\n  // CSS selectors that indicate challenges\n  if (await hero.document.querySelector(\"#challenge-form\")) {\n    signals.push({ type: \"dom\", selector: \"#challenge-form\" });\n  }\n\n  // Text patterns that indicate challenges\n  const bodyText = await hero.document.body.textContent;\n  if (bodyText.includes(\"checking your browser\")) {\n    signals.push({ type: \"text\", pattern: \"checking your browser\" });\n  }\n\n  return {\n    isChallenge: signals.length > 0,\n    type: determineType(signals),\n    signals,\n  };\n}\n```\n\n**2. Challenge Resolution** (`src/cloudflare/handler.ts`):\n\n```typescript\nasync function waitForChallengeResolution(\n  hero: Hero,\n  options: ResolutionOptions\n): Promise<ResolutionResult> {\n  const startTime = Date.now();\n\n  while (Date.now() - startTime < options.maxWaitMs) {\n    // Check if URL changed (redirect after challenge)\n    if ((await hero.url) !== options.initialUrl) {\n      return { resolved: true, method: \"redirect\" };\n    }\n\n    // Check if challenge elements disappeared\n    const detection = await detectChallenge(hero);\n    if (!detection.isChallenge) {\n      return { resolved: true, method: \"element_removal\" };\n    }\n\n    await sleep(options.pollIntervalMs);\n  }\n\n  return { resolved: false };\n}\n```\n\n### Formatters\n\nEach formatter transforms scraped pages into a specific format:\n\n| Formatter | Input | Output |\n|-----------|-------|--------|\n| `formatToMarkdown` | Pages, metadata | Markdown document with frontmatter |\n| `formatToHTML` | Pages, metadata | Complete HTML document with CSS |\n\n**Markdown formatter** uses [supermarkdown](https://github.com/vakra-dev/supermarkdown) - a high-performance Rust-based HTML-to-Markdown converter with full GFM support.\n\n## Data Flow\n\n### Scrape Request Flow\n\n```\nscrape({ urls: [\"https://example.com\"], formats: [\"markdown\"] })\n  │\n  ├─► Scraper.scrape()\n  │     │\n  │     ├─► BrowserPool.initialize(size=concurrency)\n  │     │\n  │     ├─► For each URL (controlled by p-limit):\n  │     │     │\n  │     │     ├─► pool.withBrowser(async hero => {\n  │     │     │     │\n  │     │     │     ├─► hero.goto(url)\n  │     │     │     │\n  │     │     │     ├─► detectChallenge(hero)\n  │     │     │     │     └─► Returns { isChallenge, type, signals }\n  │     │     │     │\n  │     │     │     ├─► if (isChallenge):\n  │     │     │     │     └─► waitForChallengeResolution(hero)\n  │     │     │     │\n  │     │     │     ├─► Extract title, HTML\n  │     │     │     │\n  │     │     │     ├─► cleanContent(html)\n  │     │     │     │     └─► Remove nav, ads, scripts\n  │     │     │     │\n  │     │     │     ├─► extractMetadata(html)\n  │     │     │     │     └─► OG tags, Twitter cards, etc.\n  │     │     │     │\n  │     │     │     └─► Format to requested formats\n  │     │     │   })\n  │     │     │\n  │     │     └─► Add to results array\n  │     │\n  │     ├─► pool.shutdown()\n  │     │\n  │     └─► Return ScrapeResult { data[], batchMetadata }\n  │\n  └─► Result returned to caller\n```\n\n### Crawl Request Flow\n\n```\ncrawl({ url: \"https://example.com\", depth: 2, scrape: true })\n  │\n  ├─► Crawler.crawl()\n  │     │\n  │     ├─► Initialize queue with seed URL at depth 0\n  │     │\n  │     ├─► BFS loop (while queue not empty && pages < maxPages):\n  │     │     │\n  │     │     ├─► Dequeue next URL\n  │     │     │\n  │     │     ├─► Fetch page with Hero\n  │     │     │\n  │     │     ├─► Extract links via regex\n  │     │     │\n  │     │     ├─► Filter links:\n  │     │     │     ├─► Same domain only\n  │     │     │     ├─► Match includePatterns\n  │     │     │     └─► Exclude excludePatterns\n  │     │     │\n  │     │     ├─► Add new links to queue with depth + 1\n  │     │     │\n  │     │     ├─► Rate limit (delay between requests)\n  │     │     │\n  │     │     └─► Add to discovered URLs\n  │     │\n  │     ├─► If scrape=true:\n  │     │     └─► scrape({ urls: discoveredUrls })\n  │     │\n  │     └─► Return CrawlResult { urls[], scraped?, metadata }\n  │\n  └─► Result returned to caller\n```\n\n## Design Decisions\n\n### Why Hero?\n\n[Ulixee Hero](https://ulixee.org/) was chosen for:\n\n1. **Stealth** - Advanced TLS fingerprinting and anti-detection\n2. **Speed** - Optimized for headless automation\n3. **API** - Clean async/await interface\n4. **Stability** - Production-tested at scale\n\n### Pool vs Per-Request Browsers\n\nWe use a pool because:\n\n- Browser startup is slow (~2-3 seconds)\n- Memory overhead per browser is high\n- Connection reuse improves performance\n\nTrade-off: Stale browsers can accumulate state, so we recycle them periodically.\n\n### Cloudflare Detection Strategy\n\nMulti-signal approach because:\n\n- No single indicator is 100% reliable\n- Cloudflare changes their challenge pages\n- Different challenge types have different signatures\n\nDetection signals include:\n- DOM elements (`#challenge-form`, `.cf-browser-verification`)\n- Text patterns (\"checking your browser\", \"ray id\")\n- URL patterns (`/cdn-cgi/challenge-platform/`)\n- HTTP status codes\n\n### Content Cleaning\n\nWe clean HTML before formatting because:\n\n- Navigation, ads, scripts bloat output\n- LLMs perform better with focused content\n- Reduces token usage\n\nCleaning removes:\n- `<script>`, `<style>` tags\n- Navigation elements\n- Footer/sidebar content\n- Ad containers\n- Hidden elements\n\n## Extension Points\n\n### Adding a New Formatter\n\n1. Create `src/formatters/newformat.ts`:\n   ```typescript\n   export function formatToNewFormat(\n     pages: Page[],\n     baseUrl: string,\n     scrapedAt: string,\n     duration: number,\n     metadata?: WebsiteMetadata\n   ): string {\n     // Your formatting logic\n   }\n   ```\n\n2. Export from `src/formatters/index.ts`\n\n3. Add to format type in `src/types.ts`:\n   ```typescript\n   formats?: Array<\"markdown\" | \"html\" | \"newformat\">\n   ```\n\n4. Call formatter in `src/scraper.ts`\n\n### Adding a New ScrapeOption\n\n1. Add to `ScrapeOptions` in `src/types.ts`\n2. Add default in `DEFAULT_OPTIONS`\n3. Use in `Scraper` class via `this.options.newOption`\n4. Add CLI flag in `src/cli/index.ts` if needed\n\n### Modifying Cloudflare Detection\n\n- Detection patterns: `src/cloudflare/detector.ts`\n- Resolution logic: `src/cloudflare/handler.ts`\n\n## Testing\n\n```bash\ncd reader && npx vitest run\n```\n\n415 unit tests across 26 test files covering scraping, crawling, browser sessions, formatters, content cleaning, proxy pools, and error handling.\n\n## Related Guides\n\n- [Browser Pool](guides/browser-pool.md) - Deep dive into pool management\n- [Cloudflare Bypass](guides/cloudflare-bypass.md) - Understanding antibot bypass\n- [Production Server](deployment/production-server.md) - Shared Hero Core pattern\n"
  },
  {
    "path": "docs/assets/.gitkeep",
    "content": ""
  },
  {
    "path": "docs/assets/demo.tape",
    "content": "# VHS tape file for Reader demo GIF\n# Run: vhs docs/assets/demo.tape\n\nOutput docs/assets/demo.gif\n\nSet FontSize 16\nSet Width 900\nSet Height 500\nSet Theme \"Catppuccin Mocha\"\nSet Padding 20\n\n# Scrape a URL and extract the markdown\nType \"npx reader scrape https://reader.dev | jq -r '.data[0].markdown' | head -n 12\"\nSleep 500ms\nEnter\nSleep 3s\n\n# Let output display\nSleep 3s\n"
  },
  {
    "path": "docs/deployment/docker.md",
    "content": "# Docker Deployment Guide\n\nDeploy Reader in Docker containers.\n\n## Quick Start\n\n### Basic Dockerfile\n\n```dockerfile\n# Dockerfile\nFROM node:22-slim\n\n# Install Chrome dependencies\nRUN apt-get update && apt-get install -y \\\n    chromium \\\n    fonts-liberation \\\n    libasound2 \\\n    libatk-bridge2.0-0 \\\n    libatk1.0-0 \\\n    libcups2 \\\n    libdbus-1-3 \\\n    libdrm2 \\\n    libgbm1 \\\n    libgtk-3-0 \\\n    libnspr4 \\\n    libnss3 \\\n    libxcomposite1 \\\n    libxdamage1 \\\n    libxrandr2 \\\n    xdg-utils \\\n    --no-install-recommends \\\n    && rm -rf /var/lib/apt/lists/*\n\n# Set Chrome path for Hero\nENV CHROME_PATH=/usr/bin/chromium\n\nWORKDIR /app\n\n# Copy package files\nCOPY package*.json ./\n\n# Install dependencies\nRUN npm ci --only=production\n\n# Copy application\nCOPY . .\n\n# Build if TypeScript\nRUN npm run build 2>/dev/null || true\n\nEXPOSE 3000\n\nCMD [\"node\", \"dist/server.js\"]\n```\n\n### Build and Run\n\n```bash\n# Build image\ndocker build -t reader .\n\n# Run container\ndocker run -p 3000:3000 reader\n```\n\n## Docker Compose\n\n### Basic Setup\n\n```yaml\n# docker-compose.yml\nversion: \"3.8\"\n\nservices:\n  reader:\n    build: .\n    ports:\n      - \"3000:3000\"\n    environment:\n      - NODE_ENV=production\n      - LOG_LEVEL=info\n    restart: unless-stopped\n    deploy:\n      resources:\n        limits:\n          memory: 2G\n```\n\n### With Redis (for job queues)\n\n```yaml\n# docker-compose.yml\nversion: \"3.8\"\n\nservices:\n  api:\n    build:\n      context: .\n      dockerfile: Dockerfile.api\n    ports:\n      - \"3000:3000\"\n    environment:\n      - NODE_ENV=production\n      - REDIS_HOST=redis\n      - REDIS_PORT=6379\n    depends_on:\n      - redis\n    restart: unless-stopped\n\n  worker:\n    build:\n      context: .\n      dockerfile: Dockerfile.worker\n    environment:\n      - NODE_ENV=production\n      - REDIS_HOST=redis\n      - REDIS_PORT=6379\n    depends_on:\n      - redis\n    deploy:\n      replicas: 3\n      resources:\n        limits:\n          memory: 2G\n    restart: unless-stopped\n\n  redis:\n    image: redis:7-alpine\n    volumes:\n      - redis-data:/data\n    restart: unless-stopped\n\nvolumes:\n  redis-data:\n```\n\n### Start Services\n\n```bash\n# Start all services\ndocker-compose up -d\n\n# Scale workers\ndocker-compose up -d --scale worker=5\n\n# View logs\ndocker-compose logs -f worker\n\n# Stop services\ndocker-compose down\n```\n\n## Optimized Dockerfile\n\n### Multi-stage Build\n\n```dockerfile\n# Dockerfile\n# Build stage\nFROM node:22-slim AS builder\n\nWORKDIR /app\nCOPY package*.json ./\nRUN npm ci\nCOPY . .\nRUN npm run build\n\n# Production stage\nFROM node:22-slim\n\n# Install Chrome dependencies\nRUN apt-get update && apt-get install -y \\\n    chromium \\\n    fonts-liberation \\\n    libasound2 \\\n    libatk-bridge2.0-0 \\\n    libatk1.0-0 \\\n    libcups2 \\\n    libdbus-1-3 \\\n    libdrm2 \\\n    libgbm1 \\\n    libgtk-3-0 \\\n    libnspr4 \\\n    libnss3 \\\n    libxcomposite1 \\\n    libxdamage1 \\\n    libxrandr2 \\\n    xdg-utils \\\n    --no-install-recommends \\\n    && rm -rf /var/lib/apt/lists/*\n\nENV CHROME_PATH=/usr/bin/chromium\nENV NODE_ENV=production\n\nWORKDIR /app\n\n# Copy only production dependencies\nCOPY package*.json ./\nRUN npm ci --only=production\n\n# Copy built application\nCOPY --from=builder /app/dist ./dist\n\n# Non-root user for security\nRUN groupadd -r app && useradd -r -g app app\nUSER app\n\nEXPOSE 3000\n\nCMD [\"node\", \"dist/server.js\"]\n```\n\n## Configuration\n\n### Environment Variables\n\n```yaml\n# docker-compose.yml\nservices:\n  reader:\n    environment:\n      - NODE_ENV=production\n      - PORT=3000\n      - LOG_LEVEL=info\n      - CHROME_PATH=/usr/bin/chromium\n      - MAX_CONCURRENT_REQUESTS=10\n      - REQUEST_TIMEOUT_MS=60000\n```\n\n### Resource Limits\n\n```yaml\nservices:\n  reader:\n    deploy:\n      resources:\n        limits:\n          cpus: \"2\"\n          memory: 4G\n        reservations:\n          cpus: \"1\"\n          memory: 2G\n```\n\n### Health Checks\n\n```yaml\nservices:\n  reader:\n    healthcheck:\n      test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:3000/health\"]\n      interval: 30s\n      timeout: 10s\n      retries: 3\n      start_period: 40s\n```\n\n## Chrome Configuration\n\n### Sandbox Mode\n\nChrome requires special configuration in Docker:\n\n```dockerfile\n# Add to Dockerfile\nENV CHROME_FLAGS=\"--no-sandbox --disable-setuid-sandbox\"\n```\n\nOr configure in Hero:\n\n```typescript\n// In your application\nconst pool = new BrowserPool({\n  heroOptions: {\n    noChromeSandbox: true,\n  },\n});\n```\n\n### Shared Memory\n\nChrome needs sufficient shared memory:\n\n```yaml\nservices:\n  reader:\n    shm_size: \"2gb\"\n```\n\nOr mount tmpfs:\n\n```yaml\nservices:\n  reader:\n    volumes:\n      - /dev/shm:/dev/shm\n```\n\n## Production Considerations\n\n### Logging\n\n```yaml\nservices:\n  reader:\n    logging:\n      driver: \"json-file\"\n      options:\n        max-size: \"10m\"\n        max-file: \"3\"\n```\n\n### Networking\n\n```yaml\nservices:\n  reader:\n    networks:\n      - internal\n      - external\n\nnetworks:\n  internal:\n    internal: true\n  external:\n```\n\n### Secrets\n\n```yaml\nservices:\n  reader:\n    secrets:\n      - proxy_credentials\n\nsecrets:\n  proxy_credentials:\n    file: ./secrets/proxy.txt\n```\n\n### Volumes for Data\n\n```yaml\nservices:\n  reader:\n    volumes:\n      - ./data:/app/data\n      - ./logs:/app/logs\n```\n\n## Scaling\n\n### Docker Swarm\n\n```yaml\n# docker-stack.yml\nversion: \"3.8\"\n\nservices:\n  reader:\n    image: reader:latest\n    deploy:\n      replicas: 5\n      update_config:\n        parallelism: 2\n        delay: 10s\n      restart_policy:\n        condition: on-failure\n    networks:\n      - traefik\n\nnetworks:\n  traefik:\n    external: true\n```\n\nDeploy:\n\n```bash\ndocker stack deploy -c docker-stack.yml reader\n```\n\n### Kubernetes\n\n```yaml\n# deployment.yaml\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n  name: reader\nspec:\n  replicas: 3\n  selector:\n    matchLabels:\n      app: reader\n  template:\n    metadata:\n      labels:\n        app: reader\n    spec:\n      containers:\n        - name: reader\n          image: reader:latest\n          ports:\n            - containerPort: 3000\n          resources:\n            limits:\n              memory: \"2Gi\"\n              cpu: \"1\"\n          env:\n            - name: NODE_ENV\n              value: \"production\"\n---\napiVersion: v1\nkind: Service\nmetadata:\n  name: reader\nspec:\n  selector:\n    app: reader\n  ports:\n    - port: 80\n      targetPort: 3000\n```\n\n## Troubleshooting\n\n### Chrome Won't Start\n\n```bash\n# Check Chrome installation\ndocker exec -it container_name chromium --version\n\n# Test Chrome manually\ndocker exec -it container_name chromium --headless --no-sandbox --dump-dom https://example.com\n```\n\n### Memory Issues\n\n```yaml\n# Increase limits\nservices:\n  reader:\n    deploy:\n      resources:\n        limits:\n          memory: 4G\n    shm_size: \"2gb\"\n```\n\n### Network Issues\n\n```bash\n# Debug networking\ndocker exec -it container_name curl https://example.com\n\n# Check DNS\ndocker exec -it container_name nslookup example.com\n```\n\n## Complete Example\n\nSee [examples/deployment/docker/](../../examples/deployment/docker/) for a complete Docker setup.\n\n## Related Guides\n\n- [Production Server](production-server.md) - Server setup\n- [Job Queues](job-queues.md) - Async processing\n"
  },
  {
    "path": "docs/deployment/job-queues.md",
    "content": "# Job Queues Guide\n\nUse job queues for async scraping at scale with BullMQ.\n\n## Overview\n\nFor high-volume scraping, use a job queue to:\n- Process requests asynchronously\n- Handle retries automatically\n- Scale workers independently\n- Monitor job progress\n- Avoid overwhelming target sites\n\n## Architecture\n\n```\n┌─────────────┐     ┌─────────────┐     ┌─────────────┐\n│   API       │────▶│   Redis     │────▶│   Workers   │\n│   Server    │     │   Queue     │     │   (N)       │\n└─────────────┘     └─────────────┘     └─────────────┘\n       │                                       │\n       │         ┌─────────────┐              │\n       └────────▶│   Results   │◀─────────────┘\n                 │   Store     │\n                 └─────────────┘\n```\n\n## Setup\n\n### Installation\n\n```bash\nnpm install bullmq ioredis @vakra-dev/reader\n```\n\n### Basic Queue Setup\n\n```typescript\n// queue.ts\nimport { Queue, Worker, Job } from \"bullmq\";\nimport { scrape } from \"@vakra-dev/reader\";\n\nconst connection = {\n  host: process.env.REDIS_HOST || \"localhost\",\n  port: parseInt(process.env.REDIS_PORT || \"6379\"),\n};\n\n// Create queue\nexport const scrapeQueue = new Queue(\"scrape\", { connection });\n\n// Job data interface\ninterface ScrapeJobData {\n  urls: string[];\n  formats: (\"markdown\" | \"html\")[];\n  callbackUrl?: string;\n}\n\n// Add job to queue\nexport async function enqueueScrape(data: ScrapeJobData) {\n  const job = await scrapeQueue.add(\"scrape\", data, {\n    attempts: 3,\n    backoff: {\n      type: \"exponential\",\n      delay: 5000,\n    },\n  });\n\n  return job.id;\n}\n```\n\n### Worker Process\n\n```typescript\n// worker.ts\nimport { Worker, Job } from \"bullmq\";\nimport HeroCore from \"@ulixee/hero-core\";\nimport { TransportBridge } from \"@ulixee/net\";\nimport { ConnectionToHeroCore } from \"@ulixee/hero\";\nimport { scrape } from \"@vakra-dev/reader\";\n\nconst connection = {\n  host: process.env.REDIS_HOST || \"localhost\",\n  port: parseInt(process.env.REDIS_PORT || \"6379\"),\n};\n\n// Shared Hero Core\nlet heroCore: HeroCore;\n\nasync function createConnection() {\n  const bridge = new TransportBridge();\n  heroCore.addConnection(bridge.transportToClient);\n  return new ConnectionToHeroCore(bridge.transportToCore);\n}\n\n// Process jobs\nconst worker = new Worker(\n  \"scrape\",\n  async (job: Job) => {\n    const { urls, formats } = job.data;\n\n    console.log(`Processing job ${job.id}: ${urls.length} URLs`);\n\n    const result = await scrape({\n      urls,\n      formats,\n      connectionToCore: await createConnection(),\n      onProgress: async ({ completed, total }) => {\n        await job.updateProgress((completed / total) * 100);\n      },\n    });\n\n    // Callback if provided\n    if (job.data.callbackUrl) {\n      await fetch(job.data.callbackUrl, {\n        method: \"POST\",\n        headers: { \"Content-Type\": \"application/json\" },\n        body: JSON.stringify(result),\n      });\n    }\n\n    return result;\n  },\n  {\n    connection,\n    concurrency: 5,\n  }\n);\n\n// Event handlers\nworker.on(\"completed\", (job) => {\n  console.log(`Job ${job.id} completed`);\n});\n\nworker.on(\"failed\", (job, err) => {\n  console.error(`Job ${job?.id} failed:`, err.message);\n});\n\n// Start worker\nasync function start() {\n  heroCore = new HeroCore();\n  await heroCore.start();\n  console.log(\"Worker started, waiting for jobs...\");\n}\n\n// Graceful shutdown\nasync function shutdown() {\n  console.log(\"Shutting down worker...\");\n  await worker.close();\n  if (heroCore) await heroCore.close();\n  process.exit(0);\n}\n\nprocess.on(\"SIGTERM\", shutdown);\nprocess.on(\"SIGINT\", shutdown);\n\nstart().catch(console.error);\n```\n\n### API Server\n\n```typescript\n// api.ts\nimport express from \"express\";\nimport { scrapeQueue, enqueueScrape } from \"./queue\";\n\nconst app = express();\napp.use(express.json());\n\n// Enqueue scrape job\napp.post(\"/scrape\", async (req, res) => {\n  const { urls, formats, callbackUrl } = req.body;\n\n  const jobId = await enqueueScrape({ urls, formats, callbackUrl });\n\n  res.json({ jobId, status: \"queued\" });\n});\n\n// Get job status\napp.get(\"/job/:id\", async (req, res) => {\n  const job = await scrapeQueue.getJob(req.params.id);\n\n  if (!job) {\n    return res.status(404).json({ error: \"Job not found\" });\n  }\n\n  const state = await job.getState();\n  const progress = job.progress;\n\n  res.json({\n    id: job.id,\n    state,\n    progress,\n    data: job.data,\n    result: job.returnvalue,\n    failedReason: job.failedReason,\n  });\n});\n\n// Get job result\napp.get(\"/job/:id/result\", async (req, res) => {\n  const job = await scrapeQueue.getJob(req.params.id);\n\n  if (!job) {\n    return res.status(404).json({ error: \"Job not found\" });\n  }\n\n  const state = await job.getState();\n\n  if (state !== \"completed\") {\n    return res.status(202).json({ status: state, progress: job.progress });\n  }\n\n  res.json(job.returnvalue);\n});\n\napp.listen(3000, () => {\n  console.log(\"API server running on port 3000\");\n});\n```\n\n## Job Options\n\n### Retry Configuration\n\n```typescript\nawait scrapeQueue.add(\"scrape\", data, {\n  attempts: 5,\n  backoff: {\n    type: \"exponential\",\n    delay: 5000,  // 5s, 10s, 20s, 40s, 80s\n  },\n});\n```\n\n### Priority\n\n```typescript\n// High priority (lower number = higher priority)\nawait scrapeQueue.add(\"scrape\", urgentData, { priority: 1 });\n\n// Normal priority\nawait scrapeQueue.add(\"scrape\", normalData, { priority: 5 });\n\n// Low priority\nawait scrapeQueue.add(\"scrape\", bulkData, { priority: 10 });\n```\n\n### Delayed Jobs\n\n```typescript\n// Process after 5 minutes\nawait scrapeQueue.add(\"scrape\", data, {\n  delay: 5 * 60 * 1000,\n});\n```\n\n### Rate Limiting\n\n```typescript\n// Max 10 jobs per minute\nconst worker = new Worker(\"scrape\", processor, {\n  limiter: {\n    max: 10,\n    duration: 60000,\n  },\n});\n```\n\n## Scaling Workers\n\n### Multiple Workers\n\nRun multiple worker processes:\n\n```bash\n# Terminal 1\nWORKER_ID=1 npx tsx worker.ts\n\n# Terminal 2\nWORKER_ID=2 npx tsx worker.ts\n\n# Terminal 3\nWORKER_ID=3 npx tsx worker.ts\n```\n\n### Worker Concurrency\n\n```typescript\nconst worker = new Worker(\"scrape\", processor, {\n  connection,\n  concurrency: 5,  // Process 5 jobs simultaneously\n});\n```\n\n### Auto-Scaling\n\n```typescript\n// Scale based on queue depth\nasync function checkScale() {\n  const waiting = await scrapeQueue.getWaitingCount();\n  const active = await scrapeQueue.getActiveCount();\n\n  console.log(`Queue: ${waiting} waiting, ${active} active`);\n\n  if (waiting > 100) {\n    // Signal to scale up\n    await notifyScaleUp();\n  }\n}\n\nsetInterval(checkScale, 30000);\n```\n\n## Monitoring\n\n### Queue Dashboard (Bull Board)\n\n```typescript\nimport { createBullBoard } from \"@bull-board/api\";\nimport { BullMQAdapter } from \"@bull-board/api/bullMQAdapter\";\nimport { ExpressAdapter } from \"@bull-board/express\";\n\nconst serverAdapter = new ExpressAdapter();\nserverAdapter.setBasePath(\"/admin/queues\");\n\ncreateBullBoard({\n  queues: [new BullMQAdapter(scrapeQueue)],\n  serverAdapter,\n});\n\napp.use(\"/admin/queues\", serverAdapter.getRouter());\n```\n\n### Metrics\n\n```typescript\n// Queue stats\nasync function getQueueStats() {\n  return {\n    waiting: await scrapeQueue.getWaitingCount(),\n    active: await scrapeQueue.getActiveCount(),\n    completed: await scrapeQueue.getCompletedCount(),\n    failed: await scrapeQueue.getFailedCount(),\n    delayed: await scrapeQueue.getDelayedCount(),\n  };\n}\n\napp.get(\"/stats\", async (req, res) => {\n  res.json(await getQueueStats());\n});\n```\n\n### Events\n\n```typescript\n// Listen to queue events\nscrapeQueue.on(\"completed\", (job) => {\n  metrics.increment(\"jobs.completed\");\n  metrics.timing(\"jobs.duration\", job.processedOn - job.timestamp);\n});\n\nscrapeQueue.on(\"failed\", (job, err) => {\n  metrics.increment(\"jobs.failed\");\n  alerting.notify(`Job ${job.id} failed: ${err.message}`);\n});\n```\n\n## Error Handling\n\n### Retry Strategy\n\n```typescript\nconst worker = new Worker(\n  \"scrape\",\n  async (job) => {\n    try {\n      return await scrape(job.data);\n    } catch (error) {\n      // Don't retry on certain errors\n      if (error.message.includes(\"Invalid URL\")) {\n        throw new Error(`Permanent failure: ${error.message}`);\n      }\n      // Retry on transient errors\n      throw error;\n    }\n  },\n  {\n    connection,\n    settings: {\n      backoffStrategy: (attemptsMade) => {\n        // Custom backoff: 5s, 30s, 2m, 10m\n        const delays = [5000, 30000, 120000, 600000];\n        return delays[Math.min(attemptsMade - 1, delays.length - 1)];\n      },\n    },\n  }\n);\n```\n\n### Dead Letter Queue\n\n```typescript\n// Move failed jobs to DLQ after all retries\nawait scrapeQueue.add(\"scrape\", data, {\n  attempts: 3,\n  removeOnFail: {\n    age: 24 * 3600,  // Keep for 24 hours\n  },\n});\n\n// Process DLQ manually\nconst failedJobs = await scrapeQueue.getFailed();\nfor (const job of failedJobs) {\n  console.log(`Failed job ${job.id}: ${job.failedReason}`);\n  // Optionally retry\n  await job.retry();\n}\n```\n\n## Complete Example\n\n```typescript\n// complete-example.ts\nimport { Queue, Worker, Job } from \"bullmq\";\nimport express from \"express\";\nimport HeroCore from \"@ulixee/hero-core\";\nimport { scrape, ScrapeResult } from \"@vakra-dev/reader\";\n\nconst app = express();\napp.use(express.json());\n\n// Redis connection\nconst connection = { host: \"localhost\", port: 6379 };\n\n// Queue\nconst scrapeQueue = new Queue(\"scrape\", { connection });\n\n// Shared Hero Core\nlet heroCore: HeroCore;\n\n// Worker\nconst worker = new Worker<any, ScrapeResult>(\n  \"scrape\",\n  async (job: Job) => {\n    const result = await scrape({\n      ...job.data,\n      connectionToCore: await createConnection(),\n    });\n    return result;\n  },\n  { connection, concurrency: 3 }\n);\n\n// API endpoints\napp.post(\"/scrape/async\", async (req, res) => {\n  const job = await scrapeQueue.add(\"scrape\", req.body);\n  res.json({ jobId: job.id });\n});\n\napp.get(\"/scrape/:jobId\", async (req, res) => {\n  const job = await scrapeQueue.getJob(req.params.jobId);\n  if (!job) return res.status(404).json({ error: \"Not found\" });\n\n  const state = await job.getState();\n  res.json({\n    state,\n    progress: job.progress,\n    result: state === \"completed\" ? job.returnvalue : null,\n  });\n});\n\n// Start\nasync function start() {\n  heroCore = new HeroCore();\n  await heroCore.start();\n\n  app.listen(3000, () => console.log(\"Server running\"));\n}\n\nstart();\n```\n\n## Related Guides\n\n- [Production Server](production-server.md) - Basic server setup\n- [Docker](docker.md) - Containerized deployment\n- [Browser Pool](../guides/browser-pool.md) - Managing browsers\n"
  },
  {
    "path": "docs/deployment/production-server.md",
    "content": "# Production Server Guide\n\nDeploy Reader as a production-ready API server.\n\n## Overview\n\nFor production servers, use a **shared Hero Core** pattern instead of spawning individual Chrome processes per request. This dramatically reduces resource usage and improves performance.\n\n## Architecture\n\n```\n┌─────────────────────────────────────────────────┐\n│                Express Server                    │\n├─────────────────────────────────────────────────┤\n│              Shared Hero Core                    │\n│         (Single Chrome Process)                  │\n├─────────────────────────────────────────────────┤\n│   Browser 1  │  Browser 2  │  Browser 3  │ ...  │\n│   (Tab)      │  (Tab)      │  (Tab)      │      │\n└─────────────────────────────────────────────────┘\n```\n\n**Benefits:**\n- Single Chrome process instead of one per request\n- Lower memory footprint\n- Faster browser creation\n- Better resource utilization\n\n## Basic Setup\n\n### Installation\n\n```bash\nnpm install @vakra-dev/reader express\nnpm install @ulixee/hero-core @ulixee/net  # For shared Core\n```\n\n### Server Code\n\n```typescript\n// server.ts\nimport express from \"express\";\nimport HeroCore from \"@ulixee/hero-core\";\nimport { TransportBridge } from \"@ulixee/net\";\nimport { ConnectionToHeroCore } from \"@ulixee/hero\";\nimport { scrape, crawl } from \"@vakra-dev/reader\";\n\nconst app = express();\napp.use(express.json());\n\n// Shared Hero Core - initialized once\nlet heroCore: HeroCore;\n\nasync function createConnection() {\n  const bridge = new TransportBridge();\n  heroCore.addConnection(bridge.transportToClient);\n  return new ConnectionToHeroCore(bridge.transportToCore);\n}\n\n// Scrape endpoint\napp.post(\"/scrape\", async (req, res) => {\n  const { urls, formats = [\"markdown\"] } = req.body;\n\n  try {\n    const result = await scrape({\n      urls,\n      formats,\n      connectionToCore: await createConnection(),\n    });\n\n    res.json(result);\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n// Crawl endpoint\napp.post(\"/crawl\", async (req, res) => {\n  const { url, depth = 2, maxPages = 20, scrape: doScrape = false } = req.body;\n\n  try {\n    const result = await crawl({\n      url,\n      depth,\n      maxPages,\n      scrape: doScrape,\n      connectionToCore: await createConnection(),\n    });\n\n    res.json(result);\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n// Health check\napp.get(\"/health\", (req, res) => {\n  res.json({ status: \"ok\", heroCore: heroCore ? \"running\" : \"stopped\" });\n});\n\n// Start server\nasync function start() {\n  // Initialize shared Hero Core\n  heroCore = new HeroCore();\n  await heroCore.start();\n  console.log(\"Hero Core started\");\n\n  const PORT = process.env.PORT || 3000;\n  app.listen(PORT, () => {\n    console.log(`Server running on port ${PORT}`);\n  });\n}\n\n// Graceful shutdown\nasync function shutdown() {\n  console.log(\"Shutting down...\");\n  if (heroCore) {\n    await heroCore.close();\n  }\n  process.exit(0);\n}\n\nprocess.on(\"SIGTERM\", shutdown);\nprocess.on(\"SIGINT\", shutdown);\n\nstart().catch(console.error);\n```\n\n### Run the Server\n\n```bash\nnpx tsx server.ts\n```\n\n### Test Endpoints\n\n```bash\n# Scrape\ncurl -X POST http://localhost:3000/scrape \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"urls\": [\"https://example.com\"], \"formats\": [\"markdown\"]}'\n\n# Crawl\ncurl -X POST http://localhost:3000/crawl \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"url\": \"https://example.com\", \"depth\": 2, \"scrape\": true}'\n```\n\n## Production Configuration\n\n### Environment Variables\n\n```bash\n# .env\nPORT=3000\nNODE_ENV=production\nLOG_LEVEL=info\nMAX_CONCURRENT_REQUESTS=10\nREQUEST_TIMEOUT_MS=60000\n```\n\n### Request Limits\n\n```typescript\nimport rateLimit from \"express-rate-limit\";\n\n// Rate limiting\nconst limiter = rateLimit({\n  windowMs: 60 * 1000,  // 1 minute\n  max: 100,             // 100 requests per minute\n});\n\napp.use(limiter);\n\n// Request timeout\napp.use((req, res, next) => {\n  res.setTimeout(60000, () => {\n    res.status(408).json({ error: \"Request timeout\" });\n  });\n  next();\n});\n```\n\n### Request Validation\n\n```typescript\nimport { z } from \"zod\";\n\nconst scrapeSchema = z.object({\n  urls: z.array(z.string().url()).min(1).max(100),\n  formats: z.array(z.enum([\"markdown\", \"html\"])).optional(),\n  batchConcurrency: z.number().min(1).max(10).optional(),\n});\n\napp.post(\"/scrape\", async (req, res) => {\n  const parsed = scrapeSchema.safeParse(req.body);\n\n  if (!parsed.success) {\n    return res.status(400).json({ error: parsed.error.issues });\n  }\n\n  // ... handle request\n});\n```\n\n## Concurrency Control\n\n### Request Queue\n\n```typescript\nimport PQueue from \"p-queue\";\n\nconst requestQueue = new PQueue({\n  concurrency: parseInt(process.env.MAX_CONCURRENT_REQUESTS || \"10\"),\n});\n\napp.post(\"/scrape\", async (req, res) => {\n  try {\n    const result = await requestQueue.add(() =>\n      scrape({\n        urls: req.body.urls,\n        formats: req.body.formats,\n        connectionToCore: await createConnection(),\n      })\n    );\n\n    res.json(result);\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n```\n\n### Timeout Handling\n\n```typescript\nasync function scrapeWithTimeout(options: ScrapeOptions, timeoutMs: number) {\n  const controller = new AbortController();\n  const timeout = setTimeout(() => controller.abort(), timeoutMs);\n\n  try {\n    return await scrape({\n      ...options,\n      connectionToCore: await createConnection(),\n    });\n  } finally {\n    clearTimeout(timeout);\n  }\n}\n```\n\n## Monitoring\n\n### Health Checks\n\n```typescript\nlet activeRequests = 0;\nlet totalRequests = 0;\nlet failedRequests = 0;\n\napp.use((req, res, next) => {\n  activeRequests++;\n  totalRequests++;\n\n  res.on(\"finish\", () => {\n    activeRequests--;\n    if (res.statusCode >= 500) failedRequests++;\n  });\n\n  next();\n});\n\napp.get(\"/health\", (req, res) => {\n  res.json({\n    status: \"ok\",\n    heroCore: heroCore ? \"running\" : \"stopped\",\n    stats: {\n      activeRequests,\n      totalRequests,\n      failedRequests,\n      queueSize: requestQueue.size,\n      queuePending: requestQueue.pending,\n    },\n  });\n});\n```\n\n### Logging\n\n```typescript\nimport pino from \"pino\";\nimport pinoHttp from \"pino-http\";\n\nconst logger = pino({\n  level: process.env.LOG_LEVEL || \"info\",\n});\n\napp.use(pinoHttp({ logger }));\n\n// Log scrape requests\napp.post(\"/scrape\", async (req, res) => {\n  const startTime = Date.now();\n\n  try {\n    const result = await scrape({ ... });\n\n    logger.info({\n      type: \"scrape\",\n      urls: req.body.urls.length,\n      duration: Date.now() - startTime,\n      successful: result.batchMetadata.successfulUrls,\n    });\n\n    res.json(result);\n  } catch (error) {\n    logger.error({ type: \"scrape_error\", error: error.message });\n    res.status(500).json({ error: error.message });\n  }\n});\n```\n\n## Scaling\n\n### Horizontal Scaling\n\nRun multiple server instances behind a load balancer:\n\n```bash\n# Start multiple instances\nPORT=3001 npx tsx server.ts &\nPORT=3002 npx tsx server.ts &\nPORT=3003 npx tsx server.ts &\n```\n\n### PM2 Cluster Mode\n\n```javascript\n// ecosystem.config.js\nmodule.exports = {\n  apps: [{\n    name: \"reader\",\n    script: \"server.ts\",\n    interpreter: \"npx\",\n    interpreter_args: \"tsx\",\n    instances: \"max\",\n    exec_mode: \"cluster\",\n    env: {\n      NODE_ENV: \"production\",\n      PORT: 3000,\n    },\n  }],\n};\n```\n\n```bash\npm2 start ecosystem.config.js\n```\n\n### Memory Limits\n\n```javascript\n// ecosystem.config.js\nmodule.exports = {\n  apps: [{\n    name: \"reader\",\n    script: \"server.ts\",\n    max_memory_restart: \"2G\",\n    node_args: \"--max-old-space-size=2048\",\n  }],\n};\n```\n\n## Complete Example\n\nSee [examples/production/express-server/](../../examples/production/express-server/) for a complete production server implementation.\n\n## Related Guides\n\n- [Docker Deployment](docker.md) - Containerized deployment\n- [Job Queues](job-queues.md) - Async job processing\n- [Browser Pool](../guides/browser-pool.md) - Pool management\n"
  },
  {
    "path": "docs/getting-started.md",
    "content": "# Getting Started\n\nThis guide walks you through setting up Reader, verifying your installation, and running your first scrape.\n\n## Prerequisites\n\n- **Node.js >= 18** (v22 recommended)\n- **npm** package manager\n\n> **Note:** The Hero browser runtime requires Node.js. Always run your scripts with `node` or `npx tsx`.\n\n## Installation\n\n### From npm\n\n```bash\nnpm install @vakra-dev/reader\n```\n\n### From source\n\n```bash\ngit clone https://github.com/vakra-dev/reader.git\ncd reader\nnpm install\nnpm run build\n```\n\n## Verify Installation\n\n### Test the CLI\n\n```bash\nnpx reader scrape https://example.com\n```\n\nYou should see markdown output of the example.com page.\n\n### Test the API\n\nCreate a file `test-scrape.ts`:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  const reader = new ReaderClient();\n\n  const result = await reader.scrape({\n    urls: [\"https://example.com\"],\n    formats: [\"markdown\"],\n  });\n\n  console.log(\"Success:\", result.batchMetadata.successfulUrls === 1);\n  console.log(\"Content length:\", result.data[0].markdown?.length);\n\n  await reader.close();\n}\n\nmain().catch(console.error);\n```\n\nRun it:\n\n```bash\nnpx tsx test-scrape.ts\n```\n\n## Your First Scrape\n\n### Single URL\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\"https://news.ycombinator.com\"],\n  formats: [\"markdown\"],\n});\n\n// Access the markdown content\nconsole.log(result.data[0].markdown);\n\n// Access metadata\nconsole.log(\"Title:\", result.data[0].metadata.website.title);\nconsole.log(\"Duration:\", result.data[0].metadata.duration, \"ms\");\n\nawait reader.close();\n```\n\n### Multiple URLs\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\n    \"https://example.com\",\n    \"https://example.org\",\n    \"https://example.net\",\n  ],\n  formats: [\"markdown\"],\n  batchConcurrency: 3,\n  onProgress: ({ completed, total, currentUrl }) => {\n    console.log(`[${completed}/${total}] Scraping: ${currentUrl}`);\n  },\n});\n\nconsole.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);\nconsole.log(`Failed: ${result.batchMetadata.failedUrls}`);\n\nawait reader.close();\n```\n\n### Crawl a Website\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient();\n\nconst result = await reader.crawl({\n  url: \"https://example.com\",\n  depth: 2,\n  maxPages: 10,\n  scrape: true,\n});\n\nconsole.log(`Discovered ${result.urls.length} URLs:`);\nresult.urls.forEach((page) => {\n  console.log(`  - ${page.title}: ${page.url}`);\n});\n\nif (result.scraped) {\n  console.log(`\\nScraped ${result.scraped.batchMetadata.successfulUrls} pages`);\n}\n\nawait reader.close();\n```\n\n### Browser Session\n\nLaunch a stealthed Chrome and drive it with Playwright or Puppeteer:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nconst reader = new ReaderClient();\n\nconst session = await reader.browser();\nconst browser = await chromium.connectOverCDP(session.wsEndpoint);\nconst context = await browser.newContext();\nconst page = await context.newPage();\n\nawait page.goto(\"https://news.ycombinator.com\");\nconsole.log(\"Title:\", await page.title());\n\n// Full Playwright API - click, type, screenshot, evaluate\nconst stories = await page.evaluate(() =>\n  Array.from(document.querySelectorAll(\".athing\")).slice(0, 5).map((r) =>\n    r.querySelector(\".titleline > a\")?.textContent\n  )\n);\nconsole.log(\"Top stories:\", stories);\n\nawait browser.close();\nawait session.close();\nawait reader.close();\n```\n\nInstall Playwright: `npm install playwright-core`\n\nFor more examples, see the [Browser Sessions guide](guides/browser-sessions.md).\n\n## Understanding the Output\n\n### ScrapeResult Structure\n\n```typescript\ninterface ScrapeResult {\n  // Array of scraped websites (one per URL)\n  data: WebsiteScrapeResult[];\n\n  // Metadata about the batch operation\n  batchMetadata: {\n    totalUrls: number;\n    successfulUrls: number;\n    failedUrls: number;\n    scrapedAt: string;      // ISO timestamp\n    totalDuration: number;  // milliseconds\n    errors?: Array<{ url: string; error: string }>;\n  };\n}\n\ninterface WebsiteScrapeResult {\n  // Content in requested formats\n  markdown?: string;\n  html?: string;\n\n  // Metadata about this specific scrape\n  metadata: {\n    baseUrl: string;\n    finalUrl?: string;  // Present if URL redirected\n    totalPages: number;\n    scrapedAt: string;\n    duration: number;\n    website: WebsiteMetadata;  // Title, description, OG tags, etc.\n  };\n}\n```\n\n### CrawlResult Structure\n\n```typescript\ninterface CrawlResult {\n  // Discovered URLs with basic info\n  urls: Array<{\n    url: string;\n    title: string;\n    description: string | null;\n  }>;\n\n  // Full scrape results (only when scrape: true)\n  scraped?: ScrapeResult;\n\n  // Crawl operation metadata\n  metadata: {\n    totalUrls: number;\n    maxDepth: number;\n    totalDuration: number;\n    seedUrl: string;\n  };\n}\n```\n\n## CLI Quick Reference\n\n### Daemon Mode (Recommended for Multiple Requests)\n\n```bash\n# Start daemon (once, in a separate terminal or background)\nnpx reader start --pool-size 5\n\n# Scrape (auto-detects and uses daemon if running)\nnpx reader scrape https://example.com\n\n# Crawl (auto-detects and uses daemon if running)\nnpx reader crawl https://example.com -d 2\n\n# Check daemon status\nnpx reader status\n\n# Stop daemon\nnpx reader stop\n\n# Force standalone mode (bypass daemon)\nnpx reader scrape https://example.com --standalone\n```\n\n### Scraping\n\n```bash\n# Scrape a URL to markdown\nnpx reader scrape https://example.com\n\n# Scrape with multiple formats\nnpx reader scrape https://example.com -f markdown,html\n\n# Scrape multiple URLs concurrently\nnpx reader scrape url1 url2 url3 -c 3\n\n# Save output to file\nnpx reader scrape https://example.com -o output.md\n\n# Enable verbose logging\nnpx reader scrape https://example.com -v\n\n# Show browser window (debugging)\nnpx reader scrape https://example.com --show-chrome\n```\n\n### Crawling\n\n```bash\n# Crawl a website\nnpx reader crawl https://example.com -d 2 -m 20\n\n# Crawl and scrape content\nnpx reader crawl https://example.com -d 2 --scrape\n```\n\n## Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `LOG_LEVEL` | Logging level: `debug`, `info`, `warn`, `error` (default: `info`) |\n| `NODE_ENV` | Set to `development` for pretty-printed logs |\n\n## Common Issues\n\n### \"Chrome/Chromium not found\"\n\nHero automatically downloads Chrome on first run. If this fails:\n\n```bash\n# Manually install Chrome dependencies (Ubuntu/Debian)\nsudo apt-get install -y chromium-browser\n\n# Or use the system Chrome\nexport CHROME_PATH=/usr/bin/chromium-browser\n```\n\n### \"ECONNREFUSED\" errors\n\nThis usually means the target site is blocking requests. Try:\n\n1. Use a proxy: `--proxy http://user:pass@host:port`\n2. Add delays between requests: `--delay 2000`\n3. Use verbose mode to see what's happening: `-v`\n\n### ESM/CommonJS issues\n\nReader is ESM-only. Make sure your `package.json` has:\n\n```json\n{\n  \"type\": \"module\"\n}\n```\n\nOr use the `.mjs` extension for your files.\n\n## Next Steps\n\nBased on your use case, explore these guides:\n\n| Use Case | Guide |\n|----------|-------|\n| Understanding Cloudflare bypass | [Cloudflare Bypass](guides/cloudflare-bypass.md) |\n| Setting up proxies | [Proxy Configuration](guides/proxy-configuration.md) |\n| Production server deployment | [Production Server](deployment/production-server.md) |\n| High-volume scraping | [Browser Pool](guides/browser-pool.md) |\n| Docker deployment | [Docker](deployment/docker.md) |\n\n## Need Help?\n\n- Check the [Troubleshooting Guide](troubleshooting.md)\n- Browse [Examples](../examples/)\n- Open an issue on [GitHub](https://github.com/vakra-dev/reader/issues)\n"
  },
  {
    "path": "docs/guides/browser-pool.md",
    "content": "# Browser Pool Guide\n\nThis guide covers browser pool management for production-grade scraping.\n\n## When to Use BrowserPool vs ReaderClient\n\n| Use Case | Recommended |\n|----------|-------------|\n| Simple scraping/crawling | `ReaderClient` |\n| Scripts and CLI tools | `ReaderClient` |\n| Custom browser control | `BrowserPool` |\n| Express/production servers | `BrowserPool` or Shared Hero Core |\n| Low-level page interaction | `BrowserPool` |\n\nFor most use cases, **ReaderClient is recommended** as it manages the HeroCore lifecycle automatically. Use `BrowserPool` when you need direct access to Hero browser instances for custom logic.\n\n## Overview\n\nBrowser instances are expensive:\n- ~2-3 seconds to start\n- ~200-500MB memory each\n- Can accumulate state over time\n\nThe `BrowserPool` class manages a pool of reusable browser instances, handling lifecycle, recycling, and health monitoring.\n\n## Basic Usage\n\n### Using ReaderClient (Recommended)\n\nThe simplest way to configure browser pool settings:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({\n  browserPool: {\n    size: 5,                   // Number of browser instances\n    retireAfterPages: 50,      // Recycle after N pages\n    retireAfterMinutes: 15,    // Recycle after N minutes\n    maxQueueSize: 100,         // Max pending requests\n  },\n});\n\n// All scrape/crawl operations use the configured pool\nconst result = await reader.scrape({\n  urls: [\"https://example.com\", \"https://example.org\"],\n  batchConcurrency: 3,\n});\n\nawait reader.close();\n```\n\n### Using BrowserPool Directly (Advanced)\n\nFor custom browser control:\n\n```typescript\nimport { BrowserPool } from \"@vakra-dev/reader\";\n\nconst pool = new BrowserPool({ size: 5 });\nawait pool.initialize();\n\n// Use withBrowser for automatic acquire/release\nconst title = await pool.withBrowser(async (hero) => {\n  await hero.goto(\"https://example.com\");\n  return await hero.document.title;\n});\n\nawait pool.shutdown();\n```\n\n## Configuration\n\n```typescript\nconst pool = new BrowserPool({\n  size: 5,                    // Number of browser instances\n  retireAfterPages: 100,      // Recycle after N pages\n  retireAfterMinutes: 30,     // Recycle after N minutes\n  maxQueueSize: 100,          // Max pending requests\n  healthCheckIntervalMs: 300000, // Health check interval (5 min)\n});\n```\n\n### Configuration Options\n\n| Option | Default | Description |\n|--------|---------|-------------|\n| `size` | `2` | Number of browser instances in the pool |\n| `retireAfterPages` | `100` | Recycle browser after this many pages |\n| `retireAfterMinutes` | `30` | Recycle browser after this many minutes |\n| `maxQueueSize` | `100` | Maximum requests that can wait for a browser |\n| `healthCheckIntervalMs` | `300000` | Interval between health checks (5 minutes) |\n\n## Pool Lifecycle\n\n### Initialization\n\n```typescript\nconst pool = new BrowserPool({ size: 5 });\nawait pool.initialize();\n```\n\nThis:\n1. Creates `size` Hero instances\n2. Starts background health checking\n3. Makes pool ready for requests\n\n### Acquire and Release\n\n**Recommended: Use `withBrowser`**\n\n```typescript\nconst result = await pool.withBrowser(async (hero) => {\n  await hero.goto(\"https://example.com\");\n  const title = await hero.document.title;\n  return title;\n});\n```\n\nBenefits:\n- Automatic acquire/release\n- Exception-safe (always releases on error)\n- Clean, readable code\n\n**Manual acquire/release (advanced)**\n\n```typescript\nconst hero = await pool.acquire();\ntry {\n  await hero.goto(\"https://example.com\");\n  // ... do work\n} finally {\n  await pool.release(hero);\n}\n```\n\n### Recycling\n\nBrowsers are automatically recycled when:\n\n1. **Page limit reached** - After `retireAfterPages` navigations\n2. **Time limit reached** - After `retireAfterMinutes`\n3. **Health check failure** - If browser becomes unresponsive\n\nRecycling closes the old browser and creates a fresh one.\n\n### Shutdown\n\n```typescript\nawait pool.shutdown();\n```\n\nThis:\n1. Stops health checking\n2. Closes all browser instances\n3. Clears the queue\n\n## Monitoring\n\n### Get Pool Stats\n\n```typescript\nconst stats = pool.getStats();\nconsole.log(stats);\n// {\n//   total: 5,\n//   available: 3,\n//   inUse: 2,\n//   queueSize: 0,\n//   totalAcquired: 150,\n//   totalRecycled: 3\n// }\n```\n\n### Health Check\n\n```typescript\nconst health = await pool.healthCheck();\nconsole.log(health);\n// {\n//   healthy: true,\n//   instances: [\n//     { id: 0, healthy: true, pages: 45, ageMinutes: 12 },\n//     { id: 1, healthy: true, pages: 38, ageMinutes: 10 },\n//     ...\n//   ]\n// }\n```\n\n## Production Patterns\n\n### Shared Pool for Express Server\n\n```typescript\nimport express from \"express\";\nimport { BrowserPool } from \"@vakra-dev/reader\";\n\nconst app = express();\nconst pool = new BrowserPool({ size: 10 });\n\n// Initialize on startup\npool.initialize().then(() => {\n  console.log(\"Browser pool ready\");\n});\n\napp.get(\"/scrape\", async (req, res) => {\n  const url = req.query.url as string;\n\n  try {\n    const result = await pool.withBrowser(async (hero) => {\n      await hero.goto(url);\n      return await hero.document.body.innerHTML;\n    });\n\n    res.json({ html: result });\n  } catch (error) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n// Graceful shutdown\nprocess.on(\"SIGTERM\", async () => {\n  await pool.shutdown();\n  process.exit(0);\n});\n\napp.listen(3000);\n```\n\n### Queue Management\n\nWhen all browsers are busy, requests queue up:\n\n```typescript\nconst pool = new BrowserPool({\n  size: 5,\n  maxQueueSize: 100,  // Max 100 waiting requests\n});\n\n// If queue is full, acquire() throws an error\ntry {\n  const hero = await pool.acquire();\n} catch (error) {\n  if (error.message.includes(\"queue full\")) {\n    // Handle backpressure\n    console.log(\"Too many pending requests\");\n  }\n}\n```\n\n### Scaling Guidelines\n\n| Concurrent Users | Pool Size | Memory (approx) |\n|------------------|-----------|-----------------|\n| 1-5 | 2-3 | 1-1.5 GB |\n| 5-20 | 5-10 | 2.5-5 GB |\n| 20-50 | 10-20 | 5-10 GB |\n| 50+ | Consider distributed pools | 10+ GB |\n\n## Shared Hero Core Pattern\n\nFor production servers, use a shared Hero Core instead of individual cores per browser:\n\n```typescript\nimport HeroCore from \"@ulixee/hero-core\";\nimport { TransportBridge } from \"@ulixee/net\";\nimport { ConnectionToHeroCore } from \"@ulixee/hero\";\n\n// Initialize once at startup\nconst heroCore = new HeroCore();\nawait heroCore.start();\n\n// Create connection for each scrape\nfunction createConnection() {\n  const bridge = new TransportBridge();\n  heroCore.addConnection(bridge.transportToClient);\n  return new ConnectionToHeroCore(bridge.transportToCore);\n}\n\n// Use with scrape\nconst result = await scrape({\n  urls: [\"https://example.com\"],\n  connectionToCore: createConnection(),\n});\n\n// Shutdown on exit\nawait heroCore.close();\n```\n\n**Why use shared Core?**\n\n- Single Chrome process manages all browsers\n- Lower memory overhead\n- Better resource utilization\n- Faster browser creation\n\nSee [Production Server Guide](../deployment/production-server.md) for complete examples.\n\n## Memory Management\n\n### Reduce Memory Usage\n\n```typescript\nconst pool = new BrowserPool({\n  size: 3,                   // Fewer browsers\n  retireAfterPages: 50,      // Recycle more often\n  retireAfterMinutes: 15,    // Shorter lifetime\n});\n```\n\n### Monitor Memory\n\n```typescript\nimport { memoryUsage } from \"process\";\n\nsetInterval(() => {\n  const usage = memoryUsage();\n  console.log(`Memory: ${Math.round(usage.heapUsed / 1024 / 1024)} MB`);\n\n  const stats = pool.getStats();\n  console.log(`Pool: ${stats.inUse}/${stats.total} in use`);\n}, 30000);\n```\n\n### Force Garbage Collection\n\nBetween large batch operations:\n\n```typescript\nconst reader = new ReaderClient();\n\n// Process batch\nawait reader.scrape({ urls: batch1 });\n\n// Allow GC before next batch\nawait new Promise(r => setTimeout(r, 1000));\n\n// Process next batch\nawait reader.scrape({ urls: batch2 });\n\nawait reader.close();\n```\n\n## Error Handling\n\n### Browser Crashes\n\nIf a browser crashes, the pool automatically:\n1. Removes it from the pool\n2. Creates a replacement\n3. Continues serving requests\n\n### Timeout Handling\n\n```typescript\nconst result = await pool.withBrowser(async (hero) => {\n  // Set navigation timeout\n  await hero.goto(url, { timeoutMs: 30000 });\n\n  // ... rest of logic\n}, { timeoutMs: 60000 }); // Overall operation timeout\n```\n\n### Retry Logic\n\n```typescript\nasync function scrapeWithRetry(url: string, maxRetries = 3) {\n  for (let attempt = 1; attempt <= maxRetries; attempt++) {\n    try {\n      return await pool.withBrowser(async (hero) => {\n        await hero.goto(url);\n        return await hero.document.body.innerHTML;\n      });\n    } catch (error) {\n      if (attempt === maxRetries) throw error;\n      console.log(`Attempt ${attempt} failed, retrying...`);\n      await new Promise(r => setTimeout(r, 1000 * attempt));\n    }\n  }\n}\n```\n\n## Best Practices\n\n1. **Always use `withBrowser`** - Ensures proper acquire/release\n2. **Size pool appropriately** - Balance memory vs throughput\n3. **Enable recycling** - Prevents memory leaks from long-running browsers\n4. **Monitor stats** - Track pool utilization\n5. **Handle shutdown gracefully** - Close pool on process exit\n6. **Use shared Hero Core** - For production servers\n\n## Related Guides\n\n- [Production Server](../deployment/production-server.md) - Shared Hero Core setup\n- [Cloudflare Bypass](cloudflare-bypass.md) - Challenge handling\n- [Troubleshooting](../troubleshooting.md) - Common issues\n"
  },
  {
    "path": "docs/guides/browser-sessions.md",
    "content": "# Browser Sessions\n\nBrowser sessions launch a stealthed Chrome and return a CDP (Chrome DevTools Protocol) WebSocket URL. You connect Playwright, Puppeteer, or any CDP client and get full browser automation with anti-bot stealth active.\n\n## When to Use Browser Sessions\n\n| Use case | Primitive |\n|----------|-----------|\n| Extract content from a URL → markdown | `scrape()` |\n| Discover pages on a site | `crawl()` |\n| Click buttons, fill forms, navigate multi-page flows | `browser()` |\n| Scrape pages behind login/auth | `browser()` |\n| Take screenshots, generate PDFs | `browser()` |\n| Run existing Playwright/Puppeteer scripts with stealth | `browser()` |\n\n## Quick Start\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nconst reader = new ReaderClient();\n\n// Create a session\nconst session = await reader.browser();\n\n// Connect Playwright - one-line change from local scripts\nconst browser = await chromium.connectOverCDP(session.wsEndpoint);\nconst context = await browser.newContext();\nconst page = await context.newPage();\n\n// Use Playwright normally\nawait page.goto(\"https://example.com\");\nconsole.log(await page.title());\n\n// Cleanup\nawait browser.close();\nawait session.close();\nawait reader.close();\n```\n\n## Stealth Features\n\nEvery browser session has these anti-bot features active automatically:\n\n| Feature | What it does |\n|---------|-------------|\n| `navigator.webdriver = false` | Hides the automation flag that most bot detectors check first |\n| Navigator spoofing | Realistic `deviceMemory`, `hardwareConcurrency`, `platform` values |\n| WebGL/Canvas fingerprinting | Randomized rendering signatures |\n| WebRTC IP masking | Prevents real IP leaks through WebRTC connections |\n| Chrome plugin array | Simulates real Chrome extension presence |\n| Permission API behavior | Matches real Chrome permission responses |\n\nThese are injected at the browser level via `Page.addScriptToEvaluateOnNewDocument` and apply to all pages, including pages created by Playwright/Puppeteer.\n\n## Connecting with Playwright\n\n```typescript\nimport { chromium } from \"playwright-core\";\n\nconst session = await reader.browser();\nconst browser = await chromium.connectOverCDP(session.wsEndpoint);\nconst context = await browser.newContext();\nconst page = await context.newPage();\n\n// Full Playwright API available\nawait page.goto(\"https://example.com\");\nawait page.click(\"#login-button\");\nawait page.fill(\"#email\", \"user@example.com\");\nawait page.screenshot({ path: \"screenshot.png\" });\nawait page.pdf({ path: \"page.pdf\" });\n\nconst cookies = await context.cookies();\n```\n\nInstall: `npm install playwright-core`\n\n## Connecting with Puppeteer\n\n```typescript\nimport { connect } from \"puppeteer-core\";\n\nconst session = await reader.browser();\nconst browser = await connect({\n  browserWSEndpoint: session.wsEndpoint,\n  defaultViewport: null,\n});\n\nconst page = await browser.newPage();\nawait page.goto(\"https://example.com\");\nconsole.log(await page.title());\n```\n\nInstall: `npm install puppeteer-core`\n\n## Connecting with Raw CDP\n\nFor any language or tool that speaks the Chrome DevTools Protocol:\n\n```typescript\nimport WebSocket from \"ws\";\n\nconst session = await reader.browser();\nconst ws = new WebSocket(session.wsEndpoint);\n\n// Create a page target\nconst target = await sendCDP(ws, \"Target.createTarget\", { url: \"about:blank\" });\n\n// Attach and navigate\nconst attached = await sendCDP(ws, \"Target.attachToTarget\", {\n  targetId: target.targetId,\n  flatten: true,\n});\n\nawait sendPageCDP(ws, attached.sessionId, \"Page.navigate\", {\n  url: \"https://example.com\",\n});\n```\n\n## Session Lifecycle\n\n```\nreader.browser()\n  │\n  ├── Launches Chrome with stealth (Hero emulation scripts)\n  ├── Extracts CDP WebSocket URL\n  ├── Starts auto-close timeout (default: 5 minutes)\n  │\n  ▼\nsession.wsEndpoint\n  │\n  ├── Connect Playwright/Puppeteer\n  ├── Navigate, interact, extract\n  │\n  ▼\nsession.close()  OR  timeout expires\n  │\n  └── Chrome process terminated, resources released\n```\n\n### Timeout\n\nSessions auto-close after `timeoutMs` (default: 300,000ms = 5 minutes). Set a longer timeout for extended automation:\n\n```typescript\nconst session = await reader.browser({\n  timeoutMs: 600_000, // 10 minutes\n});\n```\n\n### Cleanup\n\nAlways close sessions when done to release Chrome processes:\n\n```typescript\ntry {\n  const session = await reader.browser();\n  // ... use session ...\n} finally {\n  await session.close();\n}\n```\n\n## CLI Usage\n\n```bash\n# Create a session (prints wsEndpoint JSON, blocks until Ctrl+C)\nnpx reader browser create\n\n# Create with options\nnpx reader browser create --timeout 60000 --show-chrome\n\n# List active sessions (daemon mode)\nnpx reader browser list\n\n# Stop a session\nnpx reader browser stop <sessionId>\n```\n\n## Options\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `proxy` | `ProxyConfig` | - | Proxy to route browser traffic through |\n| `proxyTier` | `ProxyTier` | - | Use a proxy from the configured pool tier |\n| `showChrome` | `boolean` | `false` | Show the browser window |\n| `timeoutMs` | `number` | `300000` | Session lifetime (auto-closes after) |\n| `verbose` | `boolean` | `false` | Enable verbose logging |\n\n## Notes\n\n- Each session launches its own Chrome process (~300MB memory)\n- Sessions are isolated from the scrape/crawl browser pool\n- MITM proxy (TLS fingerprinting) is disabled for sessions. Emulation scripts provide the stealth layer\n- Selenium/chromedriver is not supported (requires exclusive Chrome access). Use Playwright, Puppeteer, or raw CDP instead.\n"
  },
  {
    "path": "docs/guides/cloudflare-bypass.md",
    "content": "# Cloudflare Bypass Guide\n\nThis guide explains how Reader bypasses Cloudflare and other bot detection systems.\n\n## Overview\n\nMany websites use Cloudflare to protect against bots. Reader uses [Ulixee Hero](https://ulixee.org/) which employs multiple techniques to appear as a legitimate browser.\n\n## How It Works\n\n### 1. TLS Fingerprinting\n\nEvery browser has a unique TLS (HTTPS) fingerprint based on:\n- Supported cipher suites\n- TLS extensions order\n- ALPN protocols\n\nHero emulates Chrome's exact TLS fingerprint, making connections indistinguishable from a real browser.\n\n### 2. DNS over TLS\n\nChrome uses DNS over HTTPS/TLS to Cloudflare's 1.1.1.1 servers. Hero replicates this behavior, which Cloudflare can detect and uses as a trust signal.\n\n### 3. WebRTC IP Masking\n\nWebRTC can leak your real IP even behind a proxy. Hero masks WebRTC to prevent IP detection that could reveal automation.\n\n### 4. JavaScript Environment\n\nHero creates a complete browser environment:\n- Navigator properties match real Chrome\n- WebGL fingerprints are realistic\n- Canvas fingerprints are consistent\n- Plugin arrays match real installations\n\n## Challenge Types\n\nReader detects and handles these challenge types:\n\n| Challenge | Detection | Bypass Method |\n|-----------|-----------|---------------|\n| **JS Challenge** | \"Checking your browser\" text | Wait for auto-resolution |\n| **Turnstile** | Turnstile widget in DOM | Wait for user interaction simulation |\n| **Under Attack Mode** | Interstitial page | Extended wait with polling |\n| **CAPTCHA** | hCaptcha/reCAPTCHA widget | Cannot bypass (requires human) |\n| **WAF Block** | 403/1020 error codes | Cannot bypass (IP blocked) |\n\n## How Detection Works\n\nChallenge detection and resolution is handled automatically by the engine. You don't need to call any detection functions manually - Reader detects and resolves challenges during every scrape.\n\n### Detection Signals\n\nThe detector looks for multiple signals:\n\n**DOM Signals:**\n- `#challenge-form` - Main challenge container\n- `.cf-browser-verification` - Verification widget\n- `#turnstile-wrapper` - Turnstile CAPTCHA\n- `#cf-hcaptcha-container` - hCaptcha container\n\n**Text Signals:**\n- \"Checking your browser\"\n- \"Please wait...\"\n- \"DDoS protection by Cloudflare\"\n- \"Ray ID:\"\n\n**URL Signals:**\n- `/cdn-cgi/challenge-platform/`\n- `__cf_chl_` parameters\n\n## Resolution\n\nThe engine automatically resolves challenges using two methods:\n\n1. **Redirect Detection** - URL changes after challenge is solved\n2. **Element Removal** - Challenge DOM elements disappear\n\nResolution runs automatically during every scrape with a 45-second timeout.\n\n## Improving Success Rate\n\n### Use Residential Proxies\n\nCloudflare trusts residential IPs more than datacenter IPs:\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://protected-site.com\"],\n  proxy: {\n    type: \"residential\",\n    host: \"proxy.example.com\",\n    port: 8080,\n    username: \"username\",\n    password: \"password\",\n    country: \"us\",\n  },\n});\nawait reader.close();\n```\n\n### Add Delays\n\nRate limiting makes your traffic look more human:\n\n```typescript\nconst reader = new ReaderClient();\n\n// For crawling\nconst result = await reader.crawl({\n  url: \"https://protected-site.com\",\n  delayMs: 3000,  // 3 seconds between requests\n});\n\n// For batch scraping, lower concurrency\nconst batchResult = await reader.scrape({\n  urls: manyUrls,\n  batchConcurrency: 1,  // One at a time\n});\n\nawait reader.close();\n```\n\n### Rotate User Agents\n\nSome sites track user agent patterns:\n\n```typescript\nconst userAgents = [\n  \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...\",\n  \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36...\",\n];\n\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],\n});\nawait reader.close();\n```\n\n### Increase Timeout\n\nChallenges can take 30+ seconds to resolve:\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://protected-site.com\"],\n  timeoutMs: 60000,  // 60 seconds\n});\nawait reader.close();\n```\n\n## What Can't Be Bypassed\n\n### CAPTCHAs\n\nCAPTCHAs require human interaction. Reader cannot solve:\n- hCaptcha\n- reCAPTCHA\n- Cloudflare Turnstile (interactive mode)\n\nFor these, consider:\n- CAPTCHA solving services (2Captcha, Anti-Captcha)\n- Manual solving workflows\n- Alternative data sources\n\n### IP Bans\n\nIf your IP is blocked by Cloudflare's WAF:\n- You'll see 403 or 1020 errors\n- No amount of browser emulation helps\n- Solution: Use different IPs (proxies)\n\n### Rate Limits\n\nExcessive requests trigger blocks:\n- Implement delays between requests\n- Use multiple proxies\n- Reduce concurrency\n\n## Debugging Challenges\n\n### Visual Debugging\n\nSee exactly what's happening:\n\n```typescript\nconst reader = new ReaderClient({ showChrome: true, verbose: true });\nconst result = await reader.scrape({\n  urls: [\"https://protected-site.com\"],\n});\nawait reader.close();\n```\n\n### Verbose Mode\n\nEnable verbose logging to see challenge detection and resolution in action:\n\n```typescript\nconst reader = new ReaderClient({ verbose: true });\nconst result = await reader.scrape({\n  urls: [\"https://protected-site.com\"],\n});\nawait reader.close();\n```\n\n## Best Practices\n\n1. **Start with verbose mode** to understand what's happening\n2. **Use residential proxies** for heavily protected sites\n3. **Implement delays** to avoid triggering rate limits\n4. **Handle failures gracefully** - not every request will succeed\n5. **Rotate IPs** for large-scale scraping\n6. **Respect robots.txt** when possible\n7. **Cache results** to minimize repeat requests\n\n## Example: Scraping a Cloudflare-Protected Site\n\nChallenge handling is automatic. Just scrape normally:\n\n```typescript\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nconst reader = new ReaderClient({\n  proxyPools: {\n    datacenter: [{ url: \"http://user:pass@dc-proxy:8080\" }],\n    residential: [{ url: \"http://user:pass@res-proxy:8080\" }],\n  },\n});\n\n// Reader auto-detects Cloudflare and escalates to residential proxy if needed\nconst result = await reader.scrape({\n  urls: [\"https://cloudflare-protected-site.com\"],\n  proxyTier: \"auto\",\n});\n\nconsole.log(result.data[0].markdown);\nawait reader.close();\n```\n\n## Related Guides\n\n- [Proxy Configuration](proxy-configuration.md) - Setting up proxies\n- [Browser Pool](browser-pool.md) - Managing browser instances\n- [Troubleshooting](../troubleshooting.md) - Common issues\n"
  },
  {
    "path": "docs/guides/output-formats.md",
    "content": "# Output Formats\n\nReader supports two output formats: **Markdown** and **HTML**.\n\n| Format | Best For | What You Get |\n|--------|----------|-------------|\n| **markdown** | LLM consumption, RAG pipelines | Clean markdown with headings, lists, links |\n| **html** | Rendering, further processing | Cleaned HTML with semantic structure |\n\n## Specifying Formats\n\n```typescript\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  formats: [\"markdown\", \"html\"],\n});\n\nconsole.log(result.data[0].markdown);\nconsole.log(result.data[0].html);\n```\n\n### CLI\n\n```bash\nnpx reader scrape https://example.com -f markdown,html\n```\n\nDefault format is `[\"markdown\"]` if not specified.\n\n## Markdown Output\n\nMarkdown is the recommended format for LLM consumption. Reader uses [supermarkdown](https://github.com/vakra-dev/supermarkdown), a Rust-based HTML to markdown converter built specifically for web scraping and LLM pipelines.\n\nFeatures:\n- Full GitHub Flavored Markdown (GFM) support\n- Tables, task lists, strikethrough, autolinks\n- Handles malformed HTML from real web pages\n- LLM-optimized output (clean, no artifacts)\n\n## HTML Output\n\nHTML output is the cleaned, semantic HTML after content extraction. It includes:\n- Main content only (nav/header/footer removed when `onlyMainContent: true`)\n- Scripts, styles, and hidden elements removed\n- Base64 images stripped\n- URLs resolved to absolute paths\n\n## Content Cleaning\n\nBoth formats benefit from the content cleaning pipeline:\n\n```typescript\n// Extract only main content (default)\nawait reader.scrape({ urls, onlyMainContent: true });\n\n// Include specific elements only\nawait reader.scrape({ urls, includeTags: [\".article-body\"] });\n\n// Exclude specific elements\nawait reader.scrape({ urls, excludeTags: [\".comments\", \".sidebar\"] });\n\n// Full page (no cleaning)\nawait reader.scrape({ urls, onlyMainContent: false });\n```\n\n## Metadata\n\nEvery scrape result includes metadata regardless of format:\n\n```typescript\nresult.data[0].metadata.website.title       // Page title\nresult.data[0].metadata.website.description // Meta description\nresult.data[0].metadata.website.language    // Language\nresult.data[0].metadata.baseUrl             // Original URL\nresult.data[0].metadata.finalUrl            // URL after redirects (if different)\nresult.data[0].metadata.statusCode          // HTTP status\nresult.data[0].metadata.duration            // Scrape duration (ms)\n```\n"
  },
  {
    "path": "docs/guides/proxy-configuration.md",
    "content": "# Proxy Configuration Guide\n\nThis guide covers proxy setup for Reader.\n\n## Overview\n\nProxies help with:\n- Bypassing IP-based blocks\n- Accessing geo-restricted content\n- Distributing requests across multiple IPs\n- Avoiding rate limits\n\n## Quick Start\n\n### Using Proxy URL\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxy: {\n    url: \"http://username:password@proxy.example.com:8080\",\n  },\n});\nawait reader.close();\n```\n\n### Using Structured Config\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxy: {\n    type: \"residential\",\n    host: \"proxy.example.com\",\n    port: 8080,\n    username: \"username\",\n    password: \"password\",\n    country: \"us\",\n  },\n});\nawait reader.close();\n```\n\n### CLI Usage\n\n```bash\nnpx reader scrape https://example.com --proxy http://user:pass@host:port\n```\n\n## Proxy Types\n\n### Datacenter Proxies\n\n- **Pros:** Fast, cheap, reliable\n- **Cons:** Easily detected, often blocked\n- **Best for:** Sites without bot protection\n\n```typescript\nproxy: {\n  type: \"datacenter\",\n  host: \"proxy.example.com\",\n  port: 8080,\n  username: \"username\",\n  password: \"password\",\n}\n```\n\n### Residential Proxies\n\n- **Pros:** Real IPs, hard to detect, trusted by Cloudflare\n- **Cons:** Slower, more expensive, limited bandwidth\n- **Best for:** Cloudflare-protected sites, sensitive scraping\n\n```typescript\nproxy: {\n  type: \"residential\",\n  host: \"proxy.example.com\",\n  port: 8080,\n  username: \"username\",\n  password: \"password\",\n  country: \"us\",\n}\n```\n\n### Mobile Proxies\n\n- **Pros:** Highest trust level, shared by many users\n- **Cons:** Most expensive, limited availability\n- **Best for:** Most aggressive anti-bot systems\n\n## Configuration Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| `url` | `string` | Full proxy URL (takes precedence) |\n| `type` | `\"datacenter\" \\| \"residential\"` | Proxy type |\n| `host` | `string` | Proxy server hostname |\n| `port` | `number` | Proxy server port |\n| `username` | `string` | Authentication username |\n| `password` | `string` | Authentication password |\n| `country` | `string` | Country code (e.g., \"us\", \"uk\", \"de\") |\n\n## Provider Examples\n\n### IPRoyal\n\n```typescript\nproxy: {\n  type: \"residential\",\n  host: \"geo.iproyal.com\",\n  port: 12321,\n  username: \"customer-username\",\n  password: \"password\",\n  country: \"us\",\n}\n```\n\n### Bright Data (Luminati)\n\n```typescript\nproxy: {\n  type: \"residential\",\n  host: \"brd.superproxy.io\",\n  port: 22225,\n  username: \"customer-zone-residential\",\n  password: \"password\",\n  country: \"us\",\n}\n```\n\n### Oxylabs\n\n```typescript\nproxy: {\n  type: \"residential\",\n  host: \"pr.oxylabs.io\",\n  port: 7777,\n  username: \"customer-username\",\n  password: \"password\",\n  country: \"us\",\n}\n```\n\n### SmartProxy\n\n```typescript\nproxy: {\n  type: \"residential\",\n  host: \"gate.smartproxy.com\",\n  port: 7000,\n  username: \"user\",\n  password: \"pass\",\n  country: \"us\",\n}\n```\n\n## Proxy Pooling\n\nReader supports built-in proxy pooling with automatic rotation:\n\n```typescript\nconst reader = new ReaderClient({\n  // Configure multiple proxies\n  proxies: [\n    { host: \"proxy1.example.com\", port: 8080, username: \"user\", password: \"pass\" },\n    { host: \"proxy2.example.com\", port: 8080, username: \"user\", password: \"pass\" },\n    { host: \"proxy3.example.com\", port: 8080, username: \"user\", password: \"pass\", country: \"us\" },\n  ],\n  // Rotation strategy: \"round-robin\" (default) or \"random\"\n  proxyRotation: \"round-robin\",\n});\n\n// Each request automatically uses the next proxy in rotation\nconst result = await reader.scrape({\n  urls: [\"https://example1.com\", \"https://example2.com\", \"https://example3.com\"],\n});\n\n// Check which proxy handled each request\nresult.data.forEach((site) => {\n  console.log(`${site.metadata.baseUrl} -> ${site.metadata.proxy?.host}:${site.metadata.proxy?.port}`);\n});\n\nawait reader.close();\n```\n\n### Proxy Metadata in Response\n\nWhen using proxy pooling, each result includes metadata about which proxy was used:\n\n```typescript\ninterface ProxyMetadata {\n  host: string;    // Proxy host that handled the request\n  port: number;    // Proxy port\n  country?: string; // Country code if geo-targeting was used\n}\n```\n\n## Tiered Proxy Pools (Recommended)\n\nInstead of a flat proxy list, configure separate datacenter and residential pools. Reader auto-escalates from datacenter to residential when a site blocks:\n\n```typescript\nconst reader = new ReaderClient({\n  proxyPools: {\n    datacenter: [\n      { url: \"http://user:pass@dc-proxy1:8080\" },\n      { url: \"http://user:pass@dc-proxy2:8080\" },\n    ],\n    residential: [\n      { url: \"http://user:pass@res-proxy1:8080\" },\n    ],\n  },\n});\n\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxyTier: \"auto\", // datacenter first, escalate to residential on block\n});\n```\n\n### Proxy Tiers\n\n| Tier | When used | Credits |\n|------|-----------|---------|\n| `\"datacenter\"` | Fast, most sites | 1 per scrape |\n| `\"residential\"` | Anti-bot sites (Amazon, LinkedIn) | 3 per scrape |\n| `\"auto\"` | Starts datacenter, escalates on block | 1 or 3 |\n\n### Environment Variables\n\nConfigure proxy pools via environment variables (useful for daemons):\n\n```bash\nPROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080\nPROXY_RESIDENTIAL=http://user:pass@res1:8080\n```\n\n### Health Tracking\n\nReader monitors proxy health automatically:\n- **Circuit breaker:** After 10 consecutive failures, a proxy is benched for 5 minutes\n- **Auto-recovery:** Benched proxies are automatically revived after the cooldown\n- **Only proxy faults count:** Bot blocks (403, captcha) don't count against the proxy. Those are the site's behavior, not the proxy's\n\n### Per-Proxy Concurrency\n\nEach proxy URL has a concurrency limit (default: 2 simultaneous requests). This prevents overwhelming a single proxy IP, which can trigger rate limits.\n\n## Rotation Strategies\n\n### Per-Request Rotation\n\nMost residential proxy providers rotate IPs automatically:\n\n```typescript\nconst reader = new ReaderClient();\n\n// Each request gets a different IP\nfor (const url of urls) {\n  await reader.scrape({\n    urls: [url],\n    proxy: proxyConfig,\n  });\n}\n\nawait reader.close();\n```\n\n### Sticky Sessions\n\nKeep the same IP for multiple requests:\n\n```typescript\n// Some providers support session IDs\nproxy: {\n  host: \"proxy.example.com\",\n  port: 8080,\n  username: \"user-session-abc123\",  // Session in username\n  password: \"pass\",\n}\n```\n\n### Manual Rotation\n\nRotate through a list of proxies:\n\n```typescript\nconst proxies = [\n  { host: \"proxy1.example.com\", port: 8080 },\n  { host: \"proxy2.example.com\", port: 8080 },\n  { host: \"proxy3.example.com\", port: 8080 },\n];\n\nlet proxyIndex = 0;\nconst reader = new ReaderClient();\n\nasync function scrapeWithRotation(url: string) {\n  const proxy = proxies[proxyIndex % proxies.length];\n  proxyIndex++;\n\n  return await reader.scrape({\n    urls: [url],\n    proxy: {\n      ...proxy,\n      username: \"username\",\n      password: \"password\",\n    },\n  });\n}\n\n// Don't forget to close when done\n// await reader.close();\n```\n\n## Geo-Targeting\n\nTarget specific countries for localized content:\n\n```typescript\nconst reader = new ReaderClient();\n\n// US content\nconst usResult = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxy: { ...baseProxy, country: \"us\" },\n});\n\n// UK content\nconst ukResult = await reader.scrape({\n  urls: [\"https://example.com\"],\n  proxy: { ...baseProxy, country: \"uk\" },\n});\n\nawait reader.close();\n```\n\nCommon country codes:\n- `us` - United States\n- `uk` or `gb` - United Kingdom\n- `de` - Germany\n- `fr` - France\n- `jp` - Japan\n- `au` - Australia\n\n## Error Handling\n\n### Proxy Failures\n\n```typescript\nconst reader = new ReaderClient();\n\nasync function scrapeWithFallback(url: string) {\n  const proxies = [residentialProxy, datacenterProxy, null];\n\n  for (const proxy of proxies) {\n    try {\n      return await reader.scrape({\n        urls: [url],\n        proxy,\n        timeoutMs: 30000,\n      });\n    } catch (error) {\n      console.log(`Proxy failed: ${proxy?.host || \"direct\"}`);\n      continue;\n    }\n  }\n\n  throw new Error(\"All proxies failed\");\n}\n\n// Don't forget to close when done\n// await reader.close();\n```\n\n### Connection Errors\n\nCommon proxy errors and solutions:\n\n| Error | Cause | Solution |\n|-------|-------|----------|\n| `ECONNREFUSED` | Proxy server down | Try different proxy |\n| `407 Proxy Auth Required` | Wrong credentials | Check username/password |\n| `403 Forbidden` | Proxy blocked by site | Use residential proxy |\n| `Timeout` | Slow proxy | Increase timeout |\n\n## Testing Proxies\n\n### Verify Proxy Works\n\n```typescript\nconst reader = new ReaderClient();\n\nasync function testProxy(proxy: ProxyConfig): Promise<boolean> {\n  try {\n    const result = await reader.scrape({\n      urls: [\"https://httpbin.org/ip\"],\n      formats: [\"markdown\"],\n      proxy,\n      timeoutMs: 10000,\n    });\n\n    console.log(\"Proxy IP:\", result.data[0].markdown);\n    return true;\n  } catch (error) {\n    console.log(\"Proxy failed:\", error.message);\n    return false;\n  }\n}\n\nawait reader.close();\n```\n\n### Check Geo-Location\n\n```typescript\nconst reader = new ReaderClient();\n\nconst result = await reader.scrape({\n  urls: [\"https://ipinfo.io/json\"],\n  formats: [\"markdown\"],\n  proxy: { ...proxyConfig, country: \"uk\" },\n});\n\nconsole.log(result.data[0].markdown);  // Contains the IP info\n\nawait reader.close();\n```\n\n## Best Practices\n\n1. **Start with datacenter proxies** - Cheaper, see if you need more\n2. **Upgrade to residential** - When blocked or for Cloudflare sites\n3. **Use geo-targeting** - Match target site's expected users\n4. **Implement rotation** - Spread requests across IPs\n5. **Handle failures gracefully** - Have fallback proxies\n6. **Monitor bandwidth** - Residential proxies charge by GB\n7. **Test before deploying** - Verify proxies work with target site\n\n## Cost Considerations\n\n| Proxy Type | Typical Cost | Best For |\n|------------|--------------|----------|\n| Datacenter | $0.50-2/GB | Unprotected sites |\n| Residential | $3-15/GB | Cloudflare, sensitive sites |\n| Mobile | $20-50/GB | Highest security sites |\n\n## Related Guides\n\n- [Cloudflare Bypass](cloudflare-bypass.md) - Works best with residential proxies\n- [Browser Pool](browser-pool.md) - Managing browser instances\n- [Troubleshooting](../troubleshooting.md) - Common proxy issues\n"
  },
  {
    "path": "docs/troubleshooting.md",
    "content": "# Troubleshooting\n\nThis guide covers common issues and their solutions when using Reader.\n\n## Quick Diagnostics\n\nBefore diving into specific issues, try these debugging steps:\n\n```bash\n# Enable verbose logging\nnpx reader scrape https://example.com -v\n\n# Show the browser window to see what's happening\nnpx reader scrape https://example.com --show-chrome\n\n# Check Node.js version (should be >= 18)\nnode --version\n```\n\n## Common Errors\n\n### Chrome/Chromium Not Found\n\n**Error:**\n```\nError: Could not find Chrome installation\n```\n\n**Cause:** Hero needs Chrome/Chromium to run. It tries to download it automatically on first run.\n\n**Solutions:**\n\n1. **Let Hero download Chrome:**\n   ```bash\n   # Clear any cached downloads and try again\n   rm -rf ~/.cache/ulixee\n   npx reader scrape https://example.com\n   ```\n\n2. **Install Chrome manually (Ubuntu/Debian):**\n   ```bash\n   sudo apt-get update\n   sudo apt-get install -y chromium-browser\n   ```\n\n3. **Install Chrome manually (macOS):**\n   ```bash\n   brew install --cask chromium\n   ```\n\n4. **Point to existing Chrome:**\n   ```bash\n   export CHROME_PATH=/usr/bin/chromium-browser\n   # or on macOS\n   export CHROME_PATH=\"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome\"\n   ```\n\n### Connection Refused (ECONNREFUSED)\n\n**Error:**\n```\nError: connect ECONNREFUSED 127.0.0.1:9222\n```\n\n**Cause:** Hero couldn't start or connect to Chrome.\n\n**Solutions:**\n\n1. **Check if Chrome is running:**\n   ```bash\n   ps aux | grep chrome\n   # Kill any zombie processes\n   pkill -f chrome\n   ```\n\n2. **Check for port conflicts:**\n   ```bash\n   lsof -i :9222\n   ```\n\n3. **Try with a fresh browser instance:**\n   ```typescript\n   const reader = new ReaderClient({ showChrome: true });\n   const result = await reader.scrape({\n     urls: [\"https://example.com\"],\n   });\n   await reader.close();\n   ```\n\n### Request Timeout\n\n**Error:**\n```\nError: Navigation timeout of 30000 ms exceeded\n```\n\n**Cause:** The page took too long to load, or Cloudflare challenge took too long to resolve.\n\n**Solutions:**\n\n1. **Increase timeout:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.scrape({\n     urls: [\"https://example.com\"],\n     timeoutMs: 60000,  // 60 seconds\n   });\n   await reader.close();\n   ```\n\n2. **For batch operations, increase batch timeout:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.scrape({\n     urls: [...manyUrls],\n     batchTimeoutMs: 600000,  // 10 minutes total\n   });\n   await reader.close();\n   ```\n\n3. **Check if the site is accessible:**\n   ```bash\n   curl -I https://example.com\n   ```\n\n### Cloudflare Block (403/1020)\n\n**Error:**\n```\nError: Access denied (Error code 1020)\n```\n\n**Cause:** Cloudflare detected automated access and blocked the request.\n\n**Solutions:**\n\n1. **Use a proxy:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.scrape({\n     urls: [\"https://example.com\"],\n     proxy: {\n       type: \"residential\",\n       host: \"proxy.example.com\",\n       port: 8080,\n       username: \"username\",\n       password: \"password\",\n     },\n   });\n   await reader.close();\n   ```\n\n2. **Add delays between requests:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.crawl({\n     url: \"https://example.com\",\n     delayMs: 3000,  // 3 seconds between requests\n   });\n   await reader.close();\n   ```\n\n3. **Try a different user agent:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.scrape({\n     urls: [\"https://example.com\"],\n     userAgent: \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\n   });\n   await reader.close();\n   ```\n\n4. **Enable verbose mode to see challenge detection:**\n   ```typescript\n   const reader = new ReaderClient({ verbose: true, showChrome: true });\n   const result = await reader.scrape({\n     urls: [\"https://example.com\"],\n   });\n   await reader.close();\n   ```\n\n### Memory Issues\n\n**Error:**\n```\nFATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory\n```\n\n**Cause:** Too many browser instances or large pages consuming memory.\n\n**Solutions:**\n\n1. **Reduce concurrency:**\n   ```typescript\n   const reader = new ReaderClient();\n   const result = await reader.scrape({\n     urls: [...manyUrls],\n     batchConcurrency: 2,  // Lower concurrency\n   });\n   await reader.close();\n   ```\n\n2. **Increase Node.js memory:**\n   ```bash\n   NODE_OPTIONS=\"--max-old-space-size=4096\" npx reader scrape ...\n   ```\n\n3. **Use browser pool recycling (happens automatically, but you can tune it):**\n   ```typescript\n   import { BrowserPool } from \"@vakra-dev/reader\";\n\n   const pool = new BrowserPool({\n     size: 2,\n     retireAfterPages: 50,  // Recycle browsers more frequently\n   });\n   ```\n\n### ESM/CommonJS Issues\n\n**Error:**\n```\nSyntaxError: Cannot use import statement outside a module\n```\n\n**Cause:** Reader is ESM-only, but your project is using CommonJS.\n\n**Solutions:**\n\n1. **Add to package.json:**\n   ```json\n   {\n     \"type\": \"module\"\n   }\n   ```\n\n2. **Or use .mjs extension:**\n   ```bash\n   mv script.js script.mjs\n   node script.mjs\n   ```\n\n3. **Or use dynamic import in CommonJS:**\n   ```javascript\n   // script.cjs\n   async function main() {\n     const { scrape } = await import(\"@vakra-dev/reader\");\n     // ...\n   }\n   main();\n   ```\n\n### \"Bun runtime not supported\"\n\n**Error:**\n```\nError: Hero doesn't work with Bun runtime\n```\n\n**Cause:** Hero requires Node.js runtime and is not compatible with Bun.\n\n**Solution:** Use Node.js to run your scripts:\n\n```bash\n# Use npx tsx\nnpx tsx script.ts\n\n# or node with loader\nnode --loader tsx script.ts\n```\n\n## Debugging Tips\n\n### Enable Verbose Logging\n\n```typescript\nconst reader = new ReaderClient({ verbose: true });\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n});\nawait reader.close();\n```\n\nThis shows:\n- Cloudflare challenge detection\n- Page navigation events\n- Timing information\n- Error details\n\n### Show Browser Window\n\n```typescript\nconst reader = new ReaderClient({ showChrome: true });\nconst result = await reader.scrape({\n  urls: [\"https://example.com\"],\n});\nawait reader.close();\n```\n\nThis opens a visible Chrome window so you can see:\n- What the page looks like\n- Cloudflare challenges appearing\n- JavaScript errors in DevTools\n\n### Enable Verbose Logging\n\nChallenge detection and resolution happens automatically. Enable verbose logging to see what's happening:\n\n### Log Progress\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: manyUrls,\n  batchConcurrency: 3,\n  onProgress: ({ completed, total, currentUrl }) => {\n    console.log(`[${completed}/${total}] ${currentUrl}`);\n  },\n});\nawait reader.close();\n```\n\n## Performance Issues\n\n### Slow Scraping\n\n1. **Increase concurrency (if resources allow):**\n   ```typescript\n   batchConcurrency: 5  // Default is 1\n   ```\n\n2. **Use browser pool for repeated scrapes:**\n   ```typescript\n   import { BrowserPool } from \"@vakra-dev/reader\";\n\n   const pool = new BrowserPool({ size: 5 });\n   await pool.initialize();\n\n   // Reuse pool for multiple operations\n   for (const url of urls) {\n     await pool.withBrowser(async (hero) => {\n       await hero.goto(url);\n       // ...\n     });\n   }\n\n   await pool.shutdown();\n   ```\n\n3. **Use shared Hero Core for production:**\n   See [Production Server Guide](deployment/production-server.md)\n\n### High Memory Usage\n\n1. **Reduce pool size:**\n   ```typescript\n   const pool = new BrowserPool({ size: 2 });\n   ```\n\n2. **Enable more aggressive recycling:**\n   ```typescript\n   const pool = new BrowserPool({\n     size: 3,\n     retireAfterPages: 30,      // Default: 100\n     retireAfterMinutes: 15,    // Default: 30\n   });\n   ```\n\n3. **Process URLs in smaller batches:**\n   ```typescript\n   const reader = new ReaderClient();\n   const batchSize = 10;\n   for (let i = 0; i < urls.length; i += batchSize) {\n     const batch = urls.slice(i, i + batchSize);\n     await reader.scrape({ urls: batch, batchConcurrency: 3 });\n     // Allow garbage collection between batches\n     await new Promise(r => setTimeout(r, 1000));\n   }\n   await reader.close();\n   ```\n\n## Site-Specific Issues\n\n### JavaScript-Heavy Sites\n\nSome sites require waiting for JavaScript to render:\n\n```typescript\nconst reader = new ReaderClient();\nconst result = await reader.scrape({\n  urls: [\"https://spa-site.com\"],\n  waitForSelector: \".main-content\",  // Wait for this element\n  timeoutMs: 60000,\n});\nawait reader.close();\n```\n\n### Sites with Infinite Scroll\n\nCrawling may not discover all content. Consider:\n\n1. Limiting depth and using specific URL patterns\n2. Using the API directly with custom scroll logic\n\n### Login-Protected Content\n\nReader doesn't handle authentication directly. Options:\n\n1. Use cookies from an authenticated session\n2. Build custom authentication logic using the Browser Pool\n3. Use a headless browser automation tool for login, then Reader for scraping\n\n## Getting More Help\n\n1. **Check the logs** with `-v` flag\n2. **Search existing issues** on [GitHub](https://github.com/vakra-dev/reader/issues)\n3. **Open a new issue** with:\n   - Node.js version\n   - Reader version\n   - Operating system\n   - Error message and stack trace\n   - Minimal reproduction steps\n\n## Related Guides\n\n- [Getting Started](getting-started.md)\n- [Cloudflare Bypass](guides/cloudflare-bypass.md)\n- [Browser Pool](guides/browser-pool.md)\n- [Proxy Configuration](guides/proxy-configuration.md)\n"
  },
  {
    "path": "ecosystem.config.cjs",
    "content": "/**\n * PM2 ecosystem config for reader daemon.\n *\n * Two separate instances on different ports, each with its own proxy pool.\n * NOT cluster mode: Hero browser pool is stateful (proxy-bound browsers).\n *\n * Proxy sets are split via READER_PROXIES env var in each instance's .env file.\n * Example:\n *   Instance 1 (.env.1): READER_PROXIES=dc1,dc2,dc3,dc4,dc5,res1,res2\n *   Instance 2 (.env.2): READER_PROXIES=dc6,dc7,dc8,dc9,dc10,res3,res4\n */\nmodule.exports = {\n  apps: [\n    {\n      name: \"reader-daemon-1\",\n      script: \"dist/cli/index.js\",\n      args: \"start --port 6003\",\n      node_args: \"--env-file=.env.1\",\n      instances: 1,\n      autorestart: true,\n      max_memory_restart: \"2G\",\n      env: {\n        NODE_ENV: \"production\",\n      },\n    },\n    {\n      name: \"reader-daemon-2\",\n      script: \"dist/cli/index.js\",\n      args: \"start --port 6004\",\n      node_args: \"--env-file=.env.2\",\n      instances: 1,\n      autorestart: true,\n      max_memory_restart: \"2G\",\n      env: {\n        NODE_ENV: \"production\",\n      },\n    },\n  ],\n};\n"
  },
  {
    "path": "examples/.gitignore",
    "content": "# Dependencies\nnode_modules/\nbun.lockb\n\n# Build outputs\ndist/\n*.js\n*.d.ts\n*.map\n\n# Environment\n.env\n.env.local\n.env.*.local\n\n# Logs\n*.log\nnpm-debug.log*\n\n# OS\n.DS_Store\n\n# IDE\n.idea/\n.vscode/\n*.swp\n*.swo\n"
  },
  {
    "path": "examples/.nvmrc",
    "content": "v22.12.0\n"
  },
  {
    "path": "examples/README.md",
    "content": "# Reader Examples\n\nExamples demonstrating various uses of Reader.\n\n## Structure\n\n```\nexamples/\n├── basic/                    # Basic usage examples\n│   ├── basic-scrape.ts       # Single URL scraping\n│   ├── batch-scrape.ts       # Concurrent multi-URL scraping\n│   ├── large-batch-scrape.ts # Large-scale batch scraping (1000+ URLs)\n│   ├── browser-pool-config.ts # Browser pool configuration\n│   ├── proxy-pool.ts         # Proxy rotation with multiple proxies\n│   ├── cloudflare-bypass.ts  # Cloudflare-protected site scraping\n│   ├── crawl-website.ts      # Website crawling\n│   ├── all-formats.ts        # All output formats\n│   └── with-proxy.ts         # Single proxy configuration\n│\n├── ai-tools/                 # AI framework integrations\n│   ├── openai-summary.ts     # OpenAI summarization\n│   ├── anthropic-summary.ts  # Anthropic summarization\n│   ├── vercel-ai-stream.ts   # Vercel AI SDK streaming\n│   ├── langchain-loader.ts   # LangChain document loader\n│   ├── llamaindex-loader.ts  # LlamaIndex document loader\n│   ├── pinecone-ingest.ts    # Pinecone vector store\n│   └── qdrant-ingest.ts      # Qdrant vector store\n│\n├── production/               # Production-ready setups\n│   └── express-server/       # REST API server\n│\n└── deployment/               # Cloud deployment guides\n    ├── docker/               # Docker + docker-compose\n    ├── aws-lambda/           # AWS Lambda (container)\n    └── vercel-functions/     # Vercel serverless\n```\n\n## Quick Start\n\n1. Install dependencies from the examples folder:\n\n```bash\ncd examples\nnpm install\n```\n\n2. Start Ulixee Cloud (in a separate terminal):\n\n```bash\nnpx @ulixee/cloud start\n```\n\n3. Run any example using tsx:\n\n```bash\n# Basic examples\nnpx tsx basic/basic-scrape.ts\nnpx tsx basic/batch-scrape.ts\nnpx tsx basic/large-batch-scrape.ts  # Large-scale (1000+ URLs)\nnpx tsx basic/browser-pool-config.ts\nnpx tsx basic/proxy-pool.ts\nnpx tsx basic/cloudflare-bypass.ts\nnpx tsx basic/crawl-website.ts\n\n# AI tools examples (requires API keys)\nexport OPENAI_API_KEY=\"sk-...\"\nnpx tsx ai-tools/openai-summary.ts https://example.com\n\nexport ANTHROPIC_API_KEY=\"sk-...\"\nnpx tsx ai-tools/anthropic-summary.ts https://example.com\n\n# Production server\nnpx tsx production/express-server/src/index.ts\n```\n\n### Deploy with Docker\n\n```bash\ncd examples/deployment/docker\ndocker-compose up -d\n```\n\n## Requirements\n\n- **Node.js** >= 18\n- For LLM examples: API keys for OpenAI/Anthropic\n- For deployment: Docker, cloud CLI tools\n\n## Contributing\n\nHave an example to share? Open a PR!\n"
  },
  {
    "path": "examples/ai-tools/README.md",
    "content": "# AI Tools Examples\n\nExamples showing how to integrate Reader with AI frameworks, LLMs, and vector stores.\n\n## Prerequisites\n\nStart Ulixee Cloud in a separate terminal:\n\n```bash\nnpx @ulixee/cloud start\n```\n\n## Examples\n\n### LLM Summarization\n\nScrape webpages and summarize with LLMs.\n\n| Example | Description | API Key Required |\n|---------|-------------|------------------|\n| [openai-summary.ts](./openai-summary.ts) | Summarize with GPT | `OPENAI_API_KEY` |\n| [anthropic-summary.ts](./anthropic-summary.ts) | Summarize with Claude | `ANTHROPIC_API_KEY` |\n| [vercel-ai-stream.ts](./vercel-ai-stream.ts) | Streaming summary with Vercel AI SDK | `OPENAI_API_KEY` |\n\n```bash\nexport OPENAI_API_KEY=\"sk-...\"\nnpx tsx ai-tools/openai-summary.ts https://example.com\n\nexport ANTHROPIC_API_KEY=\"sk-ant-...\"\nnpx tsx ai-tools/anthropic-summary.ts https://example.com\n```\n\n### RAG Frameworks\n\nLoad scraped content into RAG frameworks for retrieval-augmented generation.\n\n| Example | Description |\n|---------|-------------|\n| [langchain-loader.ts](./langchain-loader.ts) | Custom LangChain document loader |\n| [llamaindex-loader.ts](./llamaindex-loader.ts) | LlamaIndex document loader |\n\n```bash\nnpx tsx ai-tools/langchain-loader.ts\nnpx tsx ai-tools/llamaindex-loader.ts\n```\n\n### Vector Stores\n\nScrape and ingest content directly into vector databases for semantic search.\n\n| Example | Description | API Keys Required |\n|---------|-------------|-------------------|\n| [pinecone-ingest.ts](./pinecone-ingest.ts) | Ingest into Pinecone | `PINECONE_API_KEY`, `OPENAI_API_KEY` |\n| [qdrant-ingest.ts](./qdrant-ingest.ts) | Ingest into Qdrant | `OPENAI_API_KEY`, optionally `QDRANT_URL` |\n\n```bash\n# Pinecone\nexport PINECONE_API_KEY=\"...\"\nexport OPENAI_API_KEY=\"sk-...\"\nnpx tsx ai-tools/pinecone-ingest.ts\n\n# Qdrant (local)\ndocker run -p 6333:6333 qdrant/qdrant\nexport OPENAI_API_KEY=\"sk-...\"\nnpx tsx ai-tools/qdrant-ingest.ts\n```\n\n## Tips\n\n- Use `markdown` format for LLM input (cleaner than HTML)\n- Truncate content if it exceeds token limits\n- For production, consider chunking large documents before embedding\n"
  },
  {
    "path": "examples/ai-tools/anthropic-summary.ts",
    "content": "/**\n * Anthropic (Claude) Summarization Example\n *\n * Scrapes a webpage and uses Claude to summarize the content.\n *\n * Usage:\n *   npx tsx ai-tools/anthropic-summary.ts https://example.com\n *\n * Requirements:\n *   - Set ANTHROPIC_API_KEY environment variable\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport Anthropic from \"@anthropic-ai/sdk\";\n\nasync function main() {\n  const url = process.argv[2] || \"https://example.com\";\n\n  console.log(`Scraping ${url}...\\n`);\n\n  // Check for API key\n  if (!process.env.ANTHROPIC_API_KEY) {\n    console.error(\"Error: ANTHROPIC_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n\n  const reader = new ReaderClient();\n\n  try {\n    // Step 1: Scrape the webpage\n    const result = await reader.scrape({\n      urls: [url],\n      formats: [\"markdown\"], // Markdown is best for LLM consumption\n    });\n\n    const content = result.data[0]?.markdown;\n    if (!content) {\n      console.error(\"No content scraped\");\n      process.exit(1);\n    }\n\n    console.log(`Scraped ${content.length} characters`);\n    console.log(\"Sending to Claude for summarization...\\n\");\n\n    // Step 2: Summarize with Claude\n    const anthropic = new Anthropic();\n\n    const message = await anthropic.messages.create({\n      model: \"claude-3-haiku-20240307\",\n      max_tokens: 500,\n      messages: [\n        {\n          role: \"user\",\n          content: `Please summarize the following webpage content in 2-3 paragraphs:\\n\\n${content.slice(0, 10000)}`,\n        },\n      ],\n    });\n\n    const summary = message.content[0].type === \"text\" ? message.content[0].text : \"\";\n\n    console.log(\"=== SUMMARY ===\\n\");\n    console.log(summary);\n    console.log(\"\\n=== METADATA ===\");\n    console.log(`Source: ${url}`);\n    console.log(`Content length: ${content.length} chars`);\n    console.log(`Model: ${message.model}`);\n    console.log(`Tokens: ${message.usage.input_tokens} in / ${message.usage.output_tokens} out`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/ai-tools/langchain-loader.ts",
    "content": "/**\n * LangChain Document Loader Example\n *\n * Creates a custom LangChain document loader using Reader.\n *\n * Usage:\n *   npx tsx ai-tools/langchain-loader.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\n/**\n * Custom LangChain document loader powered by Reader\n */\nclass ReaderEngineLoader extends BaseDocumentLoader {\n  private urls: string[];\n  private crawlMode: boolean;\n  private maxPages: number;\n  private depth: number;\n  private reader: ReaderClient;\n\n  constructor(options: {\n    urls: string[];\n    crawl?: boolean;\n    maxPages?: number;\n    depth?: number;\n    reader: ReaderClient;\n  }) {\n    super();\n    this.urls = options.urls;\n    this.crawlMode = options.crawl ?? false;\n    this.maxPages = options.maxPages ?? 20;\n    this.depth = options.depth ?? 1;\n    this.reader = options.reader;\n  }\n\n  async load(): Promise<Document[]> {\n    const documents: Document[] = [];\n\n    if (this.crawlMode && this.urls.length === 1) {\n      // Crawl mode: discover pages from a single seed URL\n      const result = await this.reader.crawl({\n        url: this.urls[0],\n        depth: this.depth,\n        maxPages: this.maxPages,\n        scrape: true,\n      });\n\n      if (result.scraped) {\n        for (const page of result.scraped.data) {\n          documents.push(\n            new Document({\n              pageContent: page.markdown || \"\",\n              metadata: {\n                source: page.metadata.baseUrl,\n                title: page.metadata.website.title,\n                description: page.metadata.website.description,\n                scrapedAt: page.metadata.scrapedAt,\n              },\n            })\n          );\n        }\n      }\n    } else {\n      // Scrape mode: scrape specific URLs\n      const result = await this.reader.scrape({\n        urls: this.urls,\n        formats: [\"markdown\"],\n        batchConcurrency: 2,\n      });\n\n      for (const page of result.data) {\n        documents.push(\n          new Document({\n            pageContent: page.markdown || \"\",\n            metadata: {\n              source: page.metadata.baseUrl,\n              title: page.metadata.website.title,\n              description: page.metadata.website.description,\n              scrapedAt: page.metadata.scrapedAt,\n            },\n          })\n        );\n      }\n    }\n\n    return documents;\n  }\n}\n\n// Example usage\nasync function main() {\n  console.log(\"LangChain Document Loader Example\\n\");\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Example 1: Load specific URLs\n    console.log(\"--- Example 1: Load specific URLs ---\");\n    const loader1 = new ReaderEngineLoader({\n      urls: [\"https://example.com\", \"https://example.org\"],\n      reader,\n    });\n\n    const docs1 = await loader1.load();\n    console.log(`Loaded ${docs1.length} documents`);\n    for (const doc of docs1) {\n      console.log(`  - ${doc.metadata.source}: ${doc.pageContent.length} chars`);\n    }\n\n    // Example 2: Crawl a website\n    console.log(\"\\n--- Example 2: Crawl a website ---\");\n    const loader2 = new ReaderEngineLoader({\n      urls: [\"https://example.com\"],\n      crawl: true,\n      depth: 1,\n      maxPages: 5,\n      reader,\n    });\n\n    const docs2 = await loader2.load();\n    console.log(`Crawled and loaded ${docs2.length} documents`);\n    for (const doc of docs2) {\n      console.log(`  - ${doc.metadata.source}: ${doc.pageContent.length} chars`);\n    }\n\n    // The documents can now be used with LangChain:\n    // - Text splitters for chunking\n    // - Vector stores for embeddings\n    // - RAG pipelines\n    // - etc.\n  } finally {\n    await reader.close();\n  }\n}\n\nmain().catch(console.error);\n"
  },
  {
    "path": "examples/ai-tools/llamaindex-loader.ts",
    "content": "/**\n * LlamaIndex Document Loader Example\n *\n * Creates a custom LlamaIndex document loader using Reader.\n *\n * Usage:\n *   npx tsx ai-tools/llamaindex-loader.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { Document } from \"llamaindex\";\n\n/**\n * Load documents from URLs using Reader\n */\nasync function loadDocuments(reader: ReaderClient, urls: string[]): Promise<Document[]> {\n  const result = await reader.scrape({\n    urls,\n    formats: [\"markdown\"],\n    batchConcurrency: 2,\n  });\n\n  return result.data.map(\n    (page) =>\n      new Document({\n        text: page.markdown || \"\",\n        metadata: {\n          source: page.metadata.baseUrl,\n          title: page.metadata.website.title ?? undefined,\n          description: page.metadata.website.description ?? undefined,\n          scrapedAt: page.metadata.scrapedAt,\n        },\n      })\n  );\n}\n\n/**\n * Crawl a website and load all discovered pages as documents\n */\nasync function crawlAndLoadDocuments(\n  reader: ReaderClient,\n  url: string,\n  options: { depth?: number; maxPages?: number } = {}\n): Promise<Document[]> {\n  const result = await reader.crawl({\n    url,\n    depth: options.depth ?? 1,\n    maxPages: options.maxPages ?? 20,\n    scrape: true,\n  });\n\n  if (!result.scraped) {\n    return [];\n  }\n\n  return result.scraped.data.map(\n    (page) =>\n      new Document({\n        text: page.markdown || \"\",\n        metadata: {\n          source: page.metadata.baseUrl,\n          title: page.metadata.website.title ?? undefined,\n          description: page.metadata.website.description ?? undefined,\n          scrapedAt: page.metadata.scrapedAt,\n        },\n      })\n  );\n}\n\n// Example usage\nasync function main() {\n  console.log(\"LlamaIndex Document Loader Example\\n\");\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Example 1: Load specific URLs\n    console.log(\"--- Example 1: Load specific URLs ---\");\n    const docs1 = await loadDocuments(reader, [\"https://example.com\", \"https://example.org\"]);\n    console.log(`Loaded ${docs1.length} documents`);\n    for (const doc of docs1) {\n      console.log(`  - ${doc.metadata.source}: ${doc.getText().length} chars`);\n    }\n\n    // Example 2: Crawl a website\n    console.log(\"\\n--- Example 2: Crawl a website ---\");\n    const docs2 = await crawlAndLoadDocuments(reader, \"https://example.com\", {\n      depth: 1,\n      maxPages: 5,\n    });\n    console.log(`Crawled and loaded ${docs2.length} documents`);\n    for (const doc of docs2) {\n      console.log(`  - ${doc.metadata.source}: ${doc.getText().length} chars`);\n    }\n\n    // The documents can now be used with LlamaIndex:\n    // - VectorStoreIndex for similarity search\n    // - SummaryIndex for summarization\n    // - KnowledgeGraphIndex for graph-based retrieval\n  } finally {\n    await reader.close();\n  }\n}\n\nmain().catch(console.error);\n"
  },
  {
    "path": "examples/ai-tools/openai-summary.ts",
    "content": "/**\n * OpenAI Summarization Example\n *\n * Scrapes a webpage and uses OpenAI to summarize the content.\n *\n * Usage:\n *   npx tsx ai-tools/openai-summary.ts https://example.com\n *\n * Requirements:\n *   - Set OPENAI_API_KEY environment variable\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport OpenAI from \"openai\";\n\nasync function main() {\n  const url = process.argv[2] || \"https://example.com\";\n\n  console.log(`Scraping ${url}...\\n`);\n\n  // Check for API key\n  if (!process.env.OPENAI_API_KEY) {\n    console.error(\"Error: OPENAI_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n\n  const reader = new ReaderClient();\n\n  try {\n    // Step 1: Scrape the webpage\n    const result = await reader.scrape({\n      urls: [url],\n      formats: [\"markdown\"], // Markdown is best for LLM consumption\n    });\n\n    const content = result.data[0]?.markdown;\n    if (!content) {\n      console.error(\"No content scraped\");\n      process.exit(1);\n    }\n\n    console.log(`Scraped ${content.length} characters`);\n    console.log(\"Sending to OpenAI for summarization...\\n\");\n\n    // Step 2: Summarize with OpenAI\n    const openai = new OpenAI();\n\n    const completion = await openai.chat.completions.create({\n      model: \"gpt-4o-mini\",\n      messages: [\n        {\n          role: \"system\",\n          content:\n            \"You are a helpful assistant that summarizes web content. Provide a concise summary in 2-3 paragraphs.\",\n        },\n        {\n          role: \"user\",\n          content: `Please summarize the following webpage content:\\n\\n${content.slice(0, 10000)}`,\n        },\n      ],\n      max_tokens: 500,\n    });\n\n    const summary = completion.choices[0]?.message?.content;\n\n    console.log(\"=== SUMMARY ===\\n\");\n    console.log(summary);\n    console.log(\"\\n=== METADATA ===\");\n    console.log(`Source: ${url}`);\n    console.log(`Content length: ${content.length} chars`);\n    console.log(`Model: ${completion.model}`);\n    console.log(`Tokens used: ${completion.usage?.total_tokens}`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/ai-tools/pinecone-ingest.ts",
    "content": "/**\n * Pinecone Vector Store Ingestion Example\n *\n * Scrapes webpages and ingests them into Pinecone for semantic search.\n *\n * Usage:\n *   npx tsx ai-tools/pinecone-ingest.ts\n *\n * Requirements:\n *   - Set PINECONE_API_KEY environment variable\n *   - Set OPENAI_API_KEY environment variable (for embeddings)\n *   - Create a Pinecone index with dimension 1536 (for text-embedding-3-small)\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { Pinecone } from \"@pinecone-database/pinecone\";\nimport OpenAI from \"openai\";\n\nconst INDEX_NAME = \"reader-docs\";\n\nasync function main() {\n  // Check for required API keys\n  if (!process.env.PINECONE_API_KEY) {\n    console.error(\"Error: PINECONE_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n  if (!process.env.OPENAI_API_KEY) {\n    console.error(\"Error: OPENAI_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n\n  console.log(\"Pinecone Vector Store Ingestion Example\\n\");\n\n  // Initialize clients\n  const pinecone = new Pinecone();\n  const openai = new OpenAI();\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Step 1: Scrape webpages\n    const urls = [\"https://example.com\", \"https://example.org\"];\n\n    console.log(`Scraping ${urls.length} URLs...`);\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"],\n      batchConcurrency: 2,\n    });\n\n    console.log(`Scraped ${result.batchMetadata.successfulUrls} pages`);\n\n    // Step 2: Generate embeddings and prepare vectors\n    console.log(\"\\nGenerating embeddings...\");\n    const index = pinecone.index(INDEX_NAME);\n\n    const vectors = [];\n    for (const page of result.data) {\n      const content = page.markdown || \"\";\n      if (!content) continue;\n\n      // Truncate content to fit embedding model limits\n      const truncatedContent = content.slice(0, 8000);\n\n      // Generate embedding\n      const embeddingResponse = await openai.embeddings.create({\n        model: \"text-embedding-3-small\",\n        input: truncatedContent,\n      });\n\n      const embedding = embeddingResponse.data[0].embedding;\n\n      vectors.push({\n        id: Buffer.from(page.metadata.baseUrl).toString(\"base64\"),\n        values: embedding,\n        metadata: {\n          url: page.metadata.baseUrl,\n          title: page.metadata.website.title || \"\",\n          description: page.metadata.website.description || \"\",\n          content: truncatedContent.slice(0, 1000), // Store preview in metadata\n          scrapedAt: page.metadata.scrapedAt,\n        },\n      });\n\n      console.log(`  - Embedded: ${page.metadata.baseUrl}`);\n    }\n\n    // Step 3: Upsert to Pinecone\n    console.log(`\\nUpserting ${vectors.length} vectors to Pinecone...`);\n    await index.upsert(vectors);\n\n    console.log(\"\\nDone! Vectors are now searchable in Pinecone.\");\n    console.log(`Index: ${INDEX_NAME}`);\n\n    // Example: Query the index\n    console.log(\"\\n--- Example Query ---\");\n    const queryText = \"example domain\";\n    const queryEmbedding = await openai.embeddings.create({\n      model: \"text-embedding-3-small\",\n      input: queryText,\n    });\n\n    const queryResponse = await index.query({\n      vector: queryEmbedding.data[0].embedding,\n      topK: 3,\n      includeMetadata: true,\n    });\n\n    console.log(`Query: \"${queryText}\"`);\n    console.log(\"Results:\");\n    for (const match of queryResponse.matches) {\n      console.log(`  - ${match.metadata?.title} (score: ${match.score?.toFixed(3)})`);\n      console.log(`    URL: ${match.metadata?.url}`);\n    }\n  } finally {\n    await reader.close();\n  }\n}\n\nmain().catch(console.error);\n"
  },
  {
    "path": "examples/ai-tools/qdrant-ingest.ts",
    "content": "/**\n * Qdrant Vector Store Ingestion Example\n *\n * Scrapes webpages and ingests them into Qdrant for semantic search.\n *\n * Usage:\n *   npx tsx ai-tools/qdrant-ingest.ts\n *\n * Requirements:\n *   - Set QDRANT_URL environment variable (default: http://localhost:6333)\n *   - Set QDRANT_API_KEY environment variable (optional, for Qdrant Cloud)\n *   - Set OPENAI_API_KEY environment variable (for embeddings)\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { QdrantClient } from \"@qdrant/js-client-rest\";\nimport OpenAI from \"openai\";\n\nconst COLLECTION_NAME = \"reader-docs\";\nconst VECTOR_SIZE = 1536; // text-embedding-3-small dimension\n\nasync function main() {\n  // Check for required API keys\n  if (!process.env.OPENAI_API_KEY) {\n    console.error(\"Error: OPENAI_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n\n  console.log(\"Qdrant Vector Store Ingestion Example\\n\");\n\n  // Initialize clients\n  const qdrantUrl = process.env.QDRANT_URL || \"http://localhost:6333\";\n  const qdrant = new QdrantClient({\n    url: qdrantUrl,\n    apiKey: process.env.QDRANT_API_KEY,\n  });\n  const openai = new OpenAI();\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Ensure collection exists\n    try {\n      await qdrant.getCollection(COLLECTION_NAME);\n      console.log(`Using existing collection: ${COLLECTION_NAME}`);\n    } catch {\n      console.log(`Creating collection: ${COLLECTION_NAME}`);\n      await qdrant.createCollection(COLLECTION_NAME, {\n        vectors: {\n          size: VECTOR_SIZE,\n          distance: \"Cosine\",\n        },\n      });\n    }\n\n    // Step 1: Scrape webpages\n    const urls = [\"https://example.com\", \"https://example.org\"];\n\n    console.log(`\\nScraping ${urls.length} URLs...`);\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"],\n      batchConcurrency: 2,\n    });\n\n    console.log(`Scraped ${result.batchMetadata.successfulUrls} pages`);\n\n    // Step 2: Generate embeddings and prepare points\n    console.log(\"\\nGenerating embeddings...\");\n    const points = [];\n\n    for (let i = 0; i < result.data.length; i++) {\n      const page = result.data[i];\n      const content = page.markdown || \"\";\n      if (!content) continue;\n\n      // Truncate content to fit embedding model limits\n      const truncatedContent = content.slice(0, 8000);\n\n      // Generate embedding\n      const embeddingResponse = await openai.embeddings.create({\n        model: \"text-embedding-3-small\",\n        input: truncatedContent,\n      });\n\n      const embedding = embeddingResponse.data[0].embedding;\n\n      points.push({\n        id: i + 1, // Qdrant requires positive integers or UUIDs\n        vector: embedding,\n        payload: {\n          url: page.metadata.baseUrl,\n          title: page.metadata.website.title || \"\",\n          description: page.metadata.website.description || \"\",\n          content: truncatedContent.slice(0, 1000), // Store preview in payload\n          scrapedAt: page.metadata.scrapedAt,\n        },\n      });\n\n      console.log(`  - Embedded: ${page.metadata.baseUrl}`);\n    }\n\n    // Step 3: Upsert to Qdrant\n    console.log(`\\nUpserting ${points.length} points to Qdrant...`);\n    await qdrant.upsert(COLLECTION_NAME, {\n      wait: true,\n      points,\n    });\n\n    console.log(\"\\nDone! Points are now searchable in Qdrant.\");\n    console.log(`Collection: ${COLLECTION_NAME}`);\n    console.log(`Qdrant URL: ${qdrantUrl}`);\n\n    // Example: Query the collection\n    console.log(\"\\n--- Example Query ---\");\n    const queryText = \"example domain\";\n    const queryEmbedding = await openai.embeddings.create({\n      model: \"text-embedding-3-small\",\n      input: queryText,\n    });\n\n    const searchResponse = await qdrant.search(COLLECTION_NAME, {\n      vector: queryEmbedding.data[0].embedding,\n      limit: 3,\n      with_payload: true,\n    });\n\n    console.log(`Query: \"${queryText}\"`);\n    console.log(\"Results:\");\n    for (const result of searchResponse) {\n      console.log(`  - ${result.payload?.title} (score: ${result.score.toFixed(3)})`);\n      console.log(`    URL: ${result.payload?.url}`);\n    }\n  } finally {\n    await reader.close();\n  }\n}\n\nmain().catch(console.error);\n"
  },
  {
    "path": "examples/ai-tools/vercel-ai-stream.ts",
    "content": "/**\n * Vercel AI SDK Streaming Example\n *\n * Scrapes a webpage and streams a summary using the Vercel AI SDK.\n *\n * Usage:\n *   npx tsx ai-tools/vercel-ai-stream.ts https://example.com\n *\n * Requirements:\n *   - Set OPENAI_API_KEY environment variable\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { openai } from \"@ai-sdk/openai\";\nimport { streamText } from \"ai\";\n\nasync function main() {\n  const url = process.argv[2] || \"https://example.com\";\n\n  console.log(`Scraping ${url}...\\n`);\n\n  // Check for API key\n  if (!process.env.OPENAI_API_KEY) {\n    console.error(\"Error: OPENAI_API_KEY environment variable is required\");\n    process.exit(1);\n  }\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Step 1: Scrape the webpage\n    const result = await reader.scrape({\n      urls: [url],\n      formats: [\"markdown\"],\n    });\n\n    const content = result.data[0]?.markdown;\n    if (!content) {\n      console.error(\"No content scraped\");\n      process.exit(1);\n    }\n\n    console.log(`Scraped ${content.length} characters`);\n    console.log(\"Streaming summary...\\n\");\n    console.log(\"=== STREAMING SUMMARY ===\\n\");\n\n    // Step 2: Stream summary with Vercel AI SDK\n    const { textStream } = await streamText({\n      model: openai(\"gpt-4o-mini\"),\n      system:\n        \"You are a helpful assistant that summarizes web content. Provide a concise summary in 2-3 paragraphs.\",\n      prompt: `Please summarize the following webpage content:\\n\\n${content.slice(0, 10000)}`,\n      maxTokens: 500,\n    });\n\n    // Stream the response to stdout\n    for await (const chunk of textStream) {\n      process.stdout.write(chunk);\n    }\n\n    console.log(\"\\n\\n=== METADATA ===\");\n    console.log(`Source: ${url}`);\n    console.log(`Content length: ${content.length} chars`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/README.md",
    "content": "# Basic Examples\n\nSimple examples demonstrating core Reader functionality.\n\n## Running Examples\n\nAll commands run from the `reader` directory. Requires Node v22+ (`nvm use v22`).\n\n```bash\nnpx tsx --tsconfig examples/tsconfig.json examples/basic/<example>.ts\n```\n\nIf Hero's bundled Chrome binary isn't available (e.g. Apple Silicon), point to your local Chrome:\n\n```bash\nexport CHROME_139_BIN=\"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome\"\n```\n\n## Scraping\n\n| Example | Description |\n|---------|-------------|\n| `basic-scrape.ts` | Scrape a single URL and display markdown output |\n| `batch-scrape.ts` | Scrape multiple URLs concurrently with progress tracking |\n| `all-formats.ts` | Output content in all supported formats (markdown, html) |\n\n## Crawling\n\n| Example | Description |\n|---------|-------------|\n| `crawl-website.ts` | Crawl a website to discover and optionally scrape pages |\n\n## Browser Sessions\n\nBrowser sessions launch a stealthed Chrome and return a CDP WebSocket URL.\nConnect with Playwright, Puppeteer, or any CDP client. Anti-bot stealth is\nactive (`webdriver=false`, navigator spoofing, WebRTC masking).\n\n| Example | Description |\n|---------|-------------|\n| `browser-session.ts` | Playwright: navigate, extract data, screenshot |\n| `browser-session-actions.ts` | Playwright: click, type, search, wait for elements |\n| `browser-session-puppeteer.ts` | Puppeteer: same flow via `connect({ browserWSEndpoint })` |\n| `browser-session-selenium.ts` | Raw CDP: direct WebSocket commands, no framework needed |\n\n### Dependencies\n\n```bash\nnpm install --save-dev playwright-core   # for Playwright examples\nnpm install --save-dev puppeteer-core    # for Puppeteer example\nnpm install --save-dev ws                # for raw CDP example\n```\n\n## Configuration\n\n| Example | Description |\n|---------|-------------|\n| `with-proxy.ts` | Scrape using a proxy server |\n| `proxy-pool.ts` | Rotate through multiple proxies |\n| `browser-pool-config.ts` | Configure pool size, retirement, and queue limits |\n| `cloudflare-bypass.ts` | Scrape a Cloudflare-protected site |\n"
  },
  {
    "path": "examples/basic/all-formats.ts",
    "content": "#!/usr/bin/env node\n/**\n * All Formats Example\n *\n * Demonstrates outputting content in all supported formats (markdown and html)\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting all-formats example\\n\");\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    const result = await reader.scrape({\n      urls: [\"https://example.com\"],\n      formats: [\"markdown\", \"html\"],\n    });\n\n    const page = result.data[0];\n\n    if (!page) {\n      console.error(\"No data returned - scrape may have failed\");\n      console.log(\"Errors:\", result.batchMetadata.errors);\n      process.exit(1);\n    }\n\n    console.log(\"\\nScrape completed!\");\n    console.log(\"\\nFormat Lengths:\");\n    console.log(`  Markdown: ${page.markdown?.length || 0} chars`);\n    console.log(`  HTML: ${page.html?.length || 0} chars`);\n\n    console.log(\"\\n--- MARKDOWN OUTPUT ---\");\n    console.log(page.markdown?.slice(0, 500));\n\n    console.log(\"\\n--- HTML OUTPUT (first 500 chars) ---\");\n    console.log(page.html?.slice(0, 500));\n\n    console.log(\"\\n--- FULL RESULT (JSON) ---\");\n    console.log(JSON.stringify(result, null, 2).slice(0, 1000));\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/basic-scrape.ts",
    "content": "#!/usr/bin/env node\n/**\n * Basic Scraping Example\n *\n * Demonstrates simple single-URL scraping with reader\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting basic scrape example\\n\");\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    const result = await reader.scrape({\n      urls: [\"https://example.com\"],\n      formats: [\"markdown\", \"html\"],\n    });\n\n    const page = result.data[0];\n\n    if (!page) {\n      console.error(\"No data returned - scrape may have failed\");\n      console.log(\"Errors:\", result.batchMetadata.errors);\n      process.exit(1);\n    }\n\n    console.log(\"\\nScrape completed!\");\n    console.log(\"\\nResults:\");\n    console.log(`  URL: ${page.metadata.baseUrl}`);\n    console.log(`  Title: ${page.metadata.website.title}`);\n    console.log(`  Duration: ${page.metadata.duration}ms`);\n    console.log(`  Markdown length: ${page.markdown?.length || 0} chars`);\n    console.log(`  HTML length: ${page.html?.length || 0} chars`);\n\n    console.log(\"\\nMarkdown Preview (first 500 chars):\");\n    console.log(page.markdown?.slice(0, 500));\n\n    console.log(\"\\nBatch Metadata:\");\n    console.log(`  Total URLs: ${result.batchMetadata.totalUrls}`);\n    console.log(`  Successful: ${result.batchMetadata.successfulUrls}`);\n    console.log(`  Failed: ${result.batchMetadata.failedUrls}`);\n    console.log(`  Total Duration: ${result.batchMetadata.totalDuration}ms`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/batch-scrape.ts",
    "content": "#!/usr/bin/env node\n/**\n * Batch Scraping Example\n *\n * Demonstrates concurrent scraping of multiple URLs\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting batch scrape example\\n\");\n\n  const urls = [\"https://example.com\", \"https://example.org\", \"https://example.net\"];\n\n  console.log(`Scraping ${urls.length} URLs with concurrency=2\\n`);\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"],\n      batchConcurrency: 2, // Process 2 URLs in parallel\n      onProgress: (progress) => {\n        console.log(`\\nProgress: ${progress.completed}/${progress.total} - ${progress.currentUrl}`);\n      },\n    });\n\n    console.log(\"\\nBatch scrape completed!\\n\");\n    console.log(\"Results:\");\n\n    for (const page of result.data) {\n      console.log(`\\n  ${page.metadata.baseUrl}`);\n      console.log(`     Title: ${page.metadata.website.title}`);\n      console.log(`     Duration: ${page.metadata.duration}ms`);\n      console.log(`     Content: ${page.markdown?.length || 0} chars`);\n    }\n\n    console.log(\"\\nBatch Metadata:\");\n    console.log(`  Total URLs: ${result.batchMetadata.totalUrls}`);\n    console.log(`  Successful: ${result.batchMetadata.successfulUrls}`);\n    console.log(`  Failed: ${result.batchMetadata.failedUrls}`);\n    console.log(`  Total Duration: ${result.batchMetadata.totalDuration}ms`);\n    console.log(\n      `  Avg Per URL: ${Math.round(\n        result.batchMetadata.totalDuration / result.batchMetadata.totalUrls\n      )}ms`\n    );\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/browser-pool-config.ts",
    "content": "#!/usr/bin/env node\n/**\n * Browser Pool Configuration Example\n *\n * Demonstrates configuring the browser pool for high-throughput scraping.\n * Useful when scraping many URLs to optimize performance and resource usage.\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting browser pool configuration example\\n\");\n\n  // Configure browser pool for high-throughput scraping\n  const reader = new ReaderClient({\n    verbose: true,\n\n    // Browser pool configuration\n    browserPool: {\n      size: 5, // Run 5 browser instances in parallel\n      retireAfterPages: 50, // Recycle browser after 50 pages (prevents memory leaks)\n      retireAfterMinutes: 15, // Recycle browser after 15 minutes\n      maxQueueSize: 200, // Allow up to 200 pending requests in queue\n    },\n  });\n\n  // Sample URLs to scrape\n  const urls = [\n    \"https://example.com\",\n    \"https://example.org\",\n    \"https://example.net\",\n  ];\n\n  console.log(`Scraping ${urls.length} URLs with pool size=5, concurrency=3\\n`);\n\n  try {\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"],\n      batchConcurrency: 3, // Process 3 URLs in parallel\n      onProgress: (progress) => {\n        console.log(`Progress: ${progress.completed}/${progress.total} - ${progress.currentUrl}`);\n      },\n    });\n\n    console.log(\"\\nScrape completed!\\n\");\n    console.log(\"Results:\");\n\n    for (const page of result.data) {\n      console.log(`\\n  ${page.metadata.baseUrl}`);\n      console.log(`     Title: ${page.metadata.website.title}`);\n      console.log(`     Duration: ${page.metadata.duration}ms`);\n      console.log(`     Content: ${page.markdown?.length || 0} chars`);\n    }\n\n    console.log(\"\\nBatch Metadata:\");\n    console.log(`  Total URLs: ${result.batchMetadata.totalUrls}`);\n    console.log(`  Successful: ${result.batchMetadata.successfulUrls}`);\n    console.log(`  Failed: ${result.batchMetadata.failedUrls}`);\n    console.log(`  Total Duration: ${result.batchMetadata.totalDuration}ms`);\n    console.log(\n      `  Avg Per URL: ${Math.round(\n        result.batchMetadata.totalDuration / result.batchMetadata.totalUrls\n      )}ms`\n    );\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/browser-session-actions.ts",
    "content": "#!/usr/bin/env node\n/**\n * Browser Session — Actions Example\n *\n * Demonstrates performing browser actions: clicking, typing, form\n * submission, waiting for elements, and extracting structured data.\n *\n * Uses Playwright to search Hacker News and extract results.\n *\n * Install: npm install playwright-core\n * Run:     npx tsx --tsconfig examples/tsconfig.json examples/basic/browser-session-actions.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nasync function main() {\n  const reader = new ReaderClient();\n\n  try {\n    // Create a browser session\n    const session = await reader.browser({ timeoutMs: 60_000, verbose: true, showChrome: true });\n    console.log(`Session: ${session.wsEndpoint}\\n`);\n\n    // Connect Playwright — one-line change from a local script\n    const browser = await chromium.connectOverCDP(session.wsEndpoint);\n    const context = await browser.newContext();\n    const page = await context.newPage();\n\n    // 1. Navigate to Hacker News\n    console.log(\"1. Navigating to Hacker News...\");\n    await page.goto(\"https://news.ycombinator.com/\", {\n      waitUntil: \"domcontentloaded\",\n    });\n    console.log(`   Title: ${await page.title()}\\n`);\n\n    // 2. Click the \"past\" link in the nav\n    console.log(\"2. Clicking 'past' link...\");\n    await page.click('a[href=\"front\"]');\n    await page.waitForLoadState(\"domcontentloaded\");\n    console.log(`   URL: ${page.url()}`);\n    console.log(`   Title: ${await page.title()}\\n`);\n\n    // 3. Go to the search page (Algolia-powered)\n    console.log(\"3. Navigating to HN Search...\");\n    await page.goto(\"https://hn.algolia.com/\", {\n      waitUntil: \"domcontentloaded\",\n    });\n\n    // 4. Type a search query (use type() for character-by-character input\n    //    so Algolia's instant search triggers properly)\n    console.log('4. Typing search query \"web scraping\"...');\n    await page.locator('input[type=\"search\"]').pressSequentially(\"web scraping\", { delay: 50 });\n\n    // 5. Wait for search results to settle\n    console.log(\"5. Waiting for search results...\");\n    await page.waitForTimeout(3_000);\n\n    // 6. Extract search results\n    console.log(\"6. Extracting results...\\n\");\n    const results = await page.evaluate(() => {\n      return Array.from(document.querySelectorAll(\".Story\"))\n        .slice(0, 5)\n        .map((el) => {\n          const titleEl = el.querySelector(\".Story_title a\");\n          const metaLinks = el.querySelectorAll(\".Story_meta a\");\n          return {\n            title: titleEl?.textContent?.trim(),\n            points: metaLinks[0]?.textContent?.trim() ?? null,\n            author: metaLinks[1]?.textContent?.trim() ?? null,\n          };\n        });\n    });\n\n    console.log('Search results for \"web scraping\":');\n    console.log(\"─\".repeat(60));\n    for (const r of results) {\n      console.log(`  ${r.title}`);\n      console.log(`    ${r.points} | by ${r.author}`);\n      console.log();\n    }\n\n    // 7. Take a screenshot of the search results\n    await page.screenshot({ path: \"hn-search-results.png\" });\n    console.log(\"Screenshot saved to hn-search-results.png\\n\");\n\n    // 8. Get cookies\n    const cookies = await context.cookies();\n    console.log(`Cookies: ${cookies.length} cookies set`);\n\n    // Cleanup\n    await browser.close();\n    await session.close();\n    console.log(\"\\nDone.\");\n  } finally {\n    await reader.close();\n    process.exit(0);\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/browser-session-puppeteer.ts",
    "content": "#!/usr/bin/env node\n/**\n * Browser Session — Puppeteer Example\n *\n * Same browser session primitive, but using Puppeteer instead of\n * Playwright. Puppeteer connects via browserWSEndpoint.\n *\n * Install: npm install puppeteer-core\n * Run:     npx tsx --tsconfig examples/tsconfig.json examples/basic/browser-session-puppeteer.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { connect } from \"puppeteer-core\";\n\nasync function main() {\n  const reader = new ReaderClient();\n\n  try {\n    // Create a browser session\n    const session = await reader.browser({ timeoutMs: 60_000, verbose: true, showChrome: true });\n    console.log(`Session: ${session.wsEndpoint}\\n`);\n\n    // Connect Puppeteer — uses browserWSEndpoint instead of connectOverCDP\n    const browser = await connect({\n      browserWSEndpoint: session.wsEndpoint,\n      defaultViewport: null,\n    });\n\n    const page = await browser.newPage();\n\n    // Navigate to Hacker News\n    console.log(\"Navigating to Hacker News...\");\n    await page.goto(\"https://news.ycombinator.com/\", {\n      waitUntil: \"domcontentloaded\",\n    });\n    console.log(`Title: ${await page.title()}\\n`);\n\n    // Extract top stories using Puppeteer's evaluate\n    const stories = await page.evaluate(() => {\n      return Array.from(document.querySelectorAll(\".athing\"))\n        .slice(0, 5)\n        .map((row) => {\n          const titleEl = row.querySelector(\".titleline > a\");\n          const scoreRow = row.nextElementSibling;\n          const scoreEl = scoreRow?.querySelector(\".score\");\n          return {\n            rank: row.querySelector(\".rank\")?.textContent?.trim(),\n            title: titleEl?.textContent?.trim(),\n            points: scoreEl?.textContent?.trim() ?? null,\n          };\n        });\n    });\n\n    console.log(\"Top 5 stories:\");\n    for (const s of stories) {\n      console.log(`  ${s.rank} ${s.title} (${s.points ?? \"no score\"})`);\n    }\n\n    // Take a screenshot\n    await page.screenshot({ path: \"hn-puppeteer.png\", fullPage: true });\n    console.log(\"\\nScreenshot saved to hn-puppeteer.png\");\n\n    // Stealth check\n    const webdriver = await page.evaluate(() => (navigator as any).webdriver);\n    console.log(`\\nwebdriver: ${webdriver}`);\n\n    // Cleanup\n    await browser.close();\n    await session.close();\n    console.log(\"\\nDone.\");\n  } finally {\n    await reader.close();\n    process.exit(0);\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/browser-session-selenium.ts",
    "content": "#!/usr/bin/env node\n/**\n * Browser Session — Selenium CDP Example\n *\n * Selenium 4+ supports direct CDP connections, bypassing chromedriver.\n * This uses Chrome's CDP WebSocket directly to navigate, extract data,\n * and take screenshots.\n *\n * Note: This bypasses chromedriver and uses raw CDP commands. For a\n * higher-level API, use Playwright or Puppeteer (see other examples).\n *\n * Install: npm install ws\n * Run:     npx tsx --tsconfig examples/tsconfig.json examples/basic/browser-session-selenium.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport WebSocket from \"ws\";\nimport { writeFileSync } from \"fs\";\n\n/** Send a CDP command over a WebSocket */\nfunction sendCDP(\n  ws: WebSocket,\n  cmdId: { value: number },\n  method: string,\n  params: any = {},\n  sessionId?: string\n): Promise<any> {\n  const id = ++cmdId.value;\n  return new Promise((resolve, reject) => {\n    const timeout = setTimeout(() => reject(new Error(`CDP timeout: ${method}`)), 15_000);\n    const handler = (data: WebSocket.Data) => {\n      const msg = JSON.parse(data.toString());\n      if (msg.id === id) {\n        ws.off(\"message\", handler);\n        clearTimeout(timeout);\n        if (msg.error) reject(new Error(msg.error.message));\n        else resolve(msg.result);\n      }\n    };\n    ws.on(\"message\", handler);\n    ws.send(JSON.stringify({ id, method, params, ...(sessionId && { sessionId }) }));\n  });\n}\n\nasync function main() {\n  const reader = new ReaderClient();\n\n  try {\n    // Create a browser session\n    const session = await reader.browser({ timeoutMs: 60_000, verbose: true, showChrome: true });\n    console.log(`Session: ${session.wsEndpoint}\\n`);\n\n    const url = new URL(session.wsEndpoint);\n    const baseUrl = `http://${url.hostname}:${url.port}`;\n\n    // Get browser info via Chrome's HTTP debug API\n    const versionResp = await fetch(`${baseUrl}/json/version`);\n    const version = await versionResp.json();\n    console.log(`Browser: ${version.Browser}`);\n\n    // Connect to the browser via CDP WebSocket\n    const ws = new WebSocket(session.wsEndpoint);\n    await new Promise<void>((resolve) => ws.on(\"open\", resolve));\n\n    const cmdId = { value: 0 };\n    const send = (method: string, params: any = {}) => sendCDP(ws, cmdId, method, params);\n\n    // Create a new page target via CDP\n    const target = await send(\"Target.createTarget\", {\n      url: \"about:blank\",\n    });\n    console.log(`Page created: ${target.targetId}\\n`);\n\n    // Attach to the page target to get a session\n    const attached = await send(\"Target.attachToTarget\", {\n      targetId: target.targetId,\n      flatten: true,\n    });\n    const pageSessionId = attached.sessionId;\n\n    // Helper to send commands to the page session\n    const sendPage = (method: string, params: any = {}) =>\n      sendCDP(ws, cmdId, method, params, pageSessionId);\n\n    // Enable page events\n    await sendPage(\"Page.enable\");\n    await sendPage(\"Runtime.enable\");\n\n    // Navigate to Hacker News\n    console.log(\"Navigating to Hacker News...\");\n    await sendPage(\"Page.navigate\", {\n      url: \"https://news.ycombinator.com/\",\n    });\n\n    // Wait for load\n    await new Promise<void>((resolve) => {\n      const handler = (data: WebSocket.Data) => {\n        const msg = JSON.parse(data.toString());\n        if (msg.method === \"Page.loadEventFired\") {\n          ws.off(\"message\", handler);\n          resolve();\n        }\n      };\n      ws.on(\"message\", handler);\n    });\n\n    // Get page title\n    const titleResult = await sendPage(\"Runtime.evaluate\", {\n      expression: \"document.title\",\n    });\n    console.log(`Title: ${titleResult.result.value}\\n`);\n\n    // Extract top 5 stories\n    const storiesResult = await sendPage(\"Runtime.evaluate\", {\n      expression: `JSON.stringify(\n        Array.from(document.querySelectorAll('.athing')).slice(0, 5).map(row => {\n          const rank = row.querySelector('.rank')?.textContent?.trim();\n          const title = row.querySelector('.titleline > a')?.textContent?.trim();\n          return rank + ' ' + title;\n        })\n      )`,\n    });\n    const stories = JSON.parse(storiesResult.result.value);\n    console.log(\"Top 5 stories:\");\n    for (const s of stories) {\n      console.log(`  ${s}`);\n    }\n\n    // Stealth check\n    const wdResult = await sendPage(\"Runtime.evaluate\", {\n      expression: \"navigator.webdriver\",\n    });\n    console.log(`\\nwebdriver: ${wdResult.result.value}`);\n\n    // Take a screenshot\n    const screenshotResult = await sendPage(\"Page.captureScreenshot\", {\n      format: \"png\",\n    });\n    writeFileSync(\"hn-selenium-cdp.png\", Buffer.from(screenshotResult.data, \"base64\"));\n    console.log(\"Screenshot saved to hn-selenium-cdp.png\");\n\n    // Cleanup\n    ws.close();\n    await session.close();\n    console.log(\"\\nDone.\");\n  } finally {\n    await reader.close();\n    process.exit(0);\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/browser-session.ts",
    "content": "#!/usr/bin/env node\n/**\n * Browser Session Example\n *\n * Demonstrates the browser() primitive — launches a Hero-stealthed\n * Chrome and returns a CDP WebSocket URL for Playwright/Puppeteer.\n *\n * This example:\n * 1. Creates a browser session via ReaderClient\n * 2. Connects Playwright via connectOverCDP (one-line change)\n * 3. Navigates to Hacker News and extracts the top stories\n * 4. Takes a screenshot\n * 5. Cleans up the session\n *\n * Install: npm install playwright-core\n * Run:     npx tsx examples/basic/browser-session.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { chromium } from \"playwright-core\";\n\nasync function main() {\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    // Create a browser session — returns a CDP WebSocket URL\n    console.log(\"Creating browser session...\\n\");\n    const session = await reader.browser({\n      timeoutMs: 60_000,\n      verbose: true,\n      showChrome: true,\n    });\n    console.log(`\\nSession ready: ${session.wsEndpoint}\\n`);\n\n    // Connect Playwright — this is the only line that changes\n    // from a normal Playwright script\n    const browser = await chromium.connectOverCDP(session.wsEndpoint);\n    const context = await browser.newContext();\n    const page = await context.newPage();\n\n    // Navigate to Hacker News\n    console.log(\"Navigating to Hacker News...\");\n    await page.goto(\"https://news.ycombinator.com/\", {\n      waitUntil: \"domcontentloaded\",\n      timeout: 15_000,\n    });\n\n    console.log(`Title: ${await page.title()}`);\n    console.log(`URL: ${page.url()}\\n`);\n\n    // Extract the top 10 stories\n    const stories = await page.evaluate(() => {\n      const rows = document.querySelectorAll(\".athing\");\n      return Array.from(rows)\n        .slice(0, 10)\n        .map((row) => {\n          const titleEl = row.querySelector(\".titleline > a\");\n          const siteEl = row.querySelector(\".sitestr\");\n          const scoreRow = row.nextElementSibling;\n          const scoreEl = scoreRow?.querySelector(\".score\");\n          return {\n            rank: row.querySelector(\".rank\")?.textContent?.trim(),\n            title: titleEl?.textContent?.trim(),\n            url: titleEl?.getAttribute(\"href\"),\n            site: siteEl?.textContent?.trim() ?? null,\n            points: scoreEl?.textContent?.trim() ?? null,\n          };\n        });\n    });\n\n    console.log(\"Top 10 Hacker News stories:\");\n    console.log(\"─\".repeat(60));\n    for (const story of stories) {\n      console.log(`${story.rank} ${story.title}`);\n      if (story.site) console.log(`   ${story.site} | ${story.points ?? \"no score\"}`);\n      console.log();\n    }\n\n    // Take a screenshot\n    await page.screenshot({ fullPage: true, path: \"hn-screenshot.png\" });\n    console.log(`Screenshot saved to hn-screenshot.png\\n`);\n\n    // Stealth check\n    const stealth = await page.evaluate(() => ({\n      webdriver: (navigator as any).webdriver,\n      languages: navigator.languages,\n    }));\n    console.log(\n      `Stealth: webdriver=${stealth.webdriver}, languages=${JSON.stringify(stealth.languages)}`\n    );\n\n    // Cleanup\n    await browser.close();\n    await session.close();\n    console.log(\"\\nDone.\");\n  } finally {\n    await reader.close();\n    process.exit(0);\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/cloudflare-bypass.ts",
    "content": "#!/usr/bin/env node\n/**\n * Cloudflare Bypass Example\n *\n * Demonstrates scraping a Cloudflare-protected website.\n * Reader automatically detects and handles Cloudflare challenges\n * using TLS fingerprinting, DNS over TLS, and WebRTC masking.\n *\n * Test URL: https://www.scrapingcourse.com/cloudflare-challenge\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting Cloudflare bypass example\\n\");\n\n  // Cloudflare-protected test URL\n  const url = process.argv[2] || \"https://www.scrapingcourse.com/cloudflare-challenge\";\n\n  console.log(`Target: ${url}`);\n  console.log(\"This site is protected by Cloudflare challenge.\\n\");\n\n  const reader = new ReaderClient({\n    verbose: true,\n    showChrome: false, // Set to true to watch the bypass in action\n  });\n\n  try {\n    console.log(\"Scraping (Cloudflare bypass handled automatically)...\\n\");\n\n    const result = await reader.scrape({\n      urls: [url],\n      formats: [\"markdown\"],\n      timeoutMs: 5000, // Allow extra time for challenge resolution\n    });\n\n    const page = result.data[0];\n\n    if (!page) {\n      console.error(\"No data returned - scrape may have failed\");\n      console.log(\"Errors:\", result.batchMetadata.errors);\n      process.exit(1);\n    }\n\n    console.log(\"\\nScrape completed successfully!\");\n    console.log(\"\\nResults:\");\n    console.log(`  URL: ${page.metadata.baseUrl}`);\n    console.log(`  Title: ${page.metadata.website.title}`);\n    console.log(`  Duration: ${page.metadata.duration}ms`);\n    console.log(`  Content length: ${page.markdown?.length || 0} chars`);\n\n    console.log(\"\\n--- CONTENT PREVIEW (first 500 chars) ---\\n\");\n    console.log(page.markdown?.slice(0, 500));\n\n    console.log(\"\\n--- METADATA ---\");\n    console.log(`  Description: ${page.metadata.website.description || \"N/A\"}`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    console.log(\"\\nTip: If the challenge fails, try:\");\n    console.log(\"  - Increasing timeoutMs\");\n    console.log(\"  - Using --show-chrome to debug visually\");\n    console.log(\"  - Using a residential proxy\");\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/crawl-website.ts",
    "content": "#!/usr/bin/env node\n/**\n * Crawling Example\n *\n * Demonstrates website crawling with link discovery\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting crawl example\\n\");\n\n  const seedUrl = process.argv[2] || \"https://example.com\";\n\n  console.log(`Crawling: ${seedUrl}`);\n  console.log(`   Depth: 2`);\n  console.log(`   Max Pages: 10`);\n  console.log(`   Scrape Content: true\\n`);\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    const result = await reader.crawl({\n      url: seedUrl,\n      depth: 2,\n      maxPages: 10,\n      scrape: true,\n    });\n\n    console.log(\"\\nCrawl completed!\\n\");\n    console.log(\"Discovered URLs:\");\n\n    for (const crawlUrl of result.urls) {\n      console.log(`\\n  ${crawlUrl.url}`);\n      console.log(`     Title: ${crawlUrl.title}`);\n      if (crawlUrl.description) {\n        console.log(`     Description: ${crawlUrl.description.slice(0, 100)}...`);\n      }\n    }\n\n    console.log(\"\\nCrawl Metadata:\");\n    console.log(`  Total URLs: ${result.metadata.totalUrls}`);\n    console.log(`  Max Depth: ${result.metadata.maxDepth}`);\n    console.log(`  Duration: ${result.metadata.totalDuration}ms`);\n    console.log(`  Seed URL: ${result.metadata.seedUrl}`);\n\n    if (result.scraped) {\n      console.log(\"\\nScraped Content:\");\n      console.log(`  Pages Scraped: ${result.scraped.batchMetadata.successfulUrls}`);\n      console.log(\n        `  Total Content: ${result.scraped.data.reduce(\n          (acc, page) => acc + (page.markdown?.length || 0),\n          0\n        )} chars`\n      );\n    }\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/large-batch-scrape.ts",
    "content": "#!/usr/bin/env node\n/**\n * Large-Scale Batch Scraping Example (1000 URLs)\n *\n * Demonstrates how to configure Reader for scraping\n * large batches of URLs efficiently.\n *\n * Key configurations for large batches:\n * - browserPool.size: More browsers = more parallelism\n * - browserPool.maxQueueSize: Must exceed total URL count\n * - batchConcurrency: How many URLs to process in parallel\n * - batchTimeoutMs: Must be long enough for all URLs\n *\n * Configuration Guide:\n * | URLs  | Pool Size | Concurrency | Queue Size | Timeout  | Est. Time   |\n * |-------|-----------|-------------|------------|----------|-------------|\n * | 100   | 5         | 5           | 100        | 10 min   | 3-5 min     |\n * | 500   | 8         | 8           | 500        | 30 min   | 15-25 min   |\n * | 1000  | 10        | 10          | 1000       | 1 hour   | 25-50 min   |\n * | 5000  | 10        | 10          | 5000       | 3 hours  | 2-4 hours   |\n *\n * Memory requirements:\n * - Each browser: ~100-300MB RAM\n * - 10 browsers: ~1-3GB RAM\n * - Recommended: 8GB+ system RAM for 10 browser instances\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\n/**\n * Generate sample URLs for demonstration\n * In production, you'd load these from a file, database, or API\n */\nfunction generateSampleUrls(count: number): string[] {\n  // Using httpbin.org endpoints which are safe for testing\n  const urls: string[] = [];\n  for (let i = 0; i < count; i++) {\n    // Rotate through different endpoints to simulate variety\n    urls.push(`https://httpbin.org/html?page=${i}`);\n  }\n  return urls;\n}\n\nasync function main() {\n  // For demo purposes, use a smaller batch (10 URLs)\n  // Change to 1000 for actual large-scale scraping\n  const BATCH_SIZE = 10; // Set to 1000 for real large-scale scraping\n\n  console.log(`\\n╔══════════════════════════════════════════════════════════╗`);\n  console.log(`║         Large-Scale Batch Scraping Example               ║`);\n  console.log(`╚══════════════════════════════════════════════════════════╝\\n`);\n\n  const urls = generateSampleUrls(BATCH_SIZE);\n  console.log(`Preparing to scrape ${urls.length} URLs\\n`);\n\n  // Configure for large-scale scraping\n  const reader = new ReaderClient({\n    verbose: true,\n\n    browserPool: {\n      // More browsers = more parallelism (adjust based on RAM)\n      // Each browser uses ~100-300MB RAM\n      size: 10,\n\n      // Queue must be large enough for all URLs\n      maxQueueSize: 1000,\n\n      // Recycle browsers more frequently with large batches\n      retireAfterPages: 200,\n      retireAfterMinutes: 30,\n    },\n  });\n\n  const startTime = Date.now();\n  let lastProgressUpdate = 0;\n\n  try {\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"], // Use single format for efficiency\n\n      // Match concurrency to browser pool size\n      batchConcurrency: 10,\n\n      // Long timeout for large batches (1 hour)\n      batchTimeoutMs: 3600000,\n\n      // Progress tracking\n      onProgress: (progress) => {\n        const now = Date.now();\n        // Update every 5 seconds to avoid console spam\n        if (now - lastProgressUpdate > 5000 || progress.completed === progress.total) {\n          const elapsed = Math.round((now - startTime) / 1000);\n          const rate = progress.completed / (elapsed || 1);\n          const eta = Math.round((progress.total - progress.completed) / rate);\n\n          console.log(\n            `[${elapsed}s] Progress: ${progress.completed}/${progress.total} ` +\n              `(${Math.round((progress.completed / progress.total) * 100)}%) ` +\n              `| Rate: ${rate.toFixed(1)} URLs/s | ETA: ${eta}s`\n          );\n          lastProgressUpdate = now;\n        }\n      },\n    });\n\n    const duration = Date.now() - startTime;\n\n    console.log(`\\n╔══════════════════════════════════════════════════════════╗`);\n    console.log(`║                    Batch Complete                        ║`);\n    console.log(`╚══════════════════════════════════════════════════════════╝\\n`);\n\n    console.log(`Summary:`);\n    console.log(`  Total URLs:      ${result.batchMetadata.totalUrls}`);\n    console.log(`  Successful:      ${result.batchMetadata.successfulUrls}`);\n    console.log(`  Failed:          ${result.batchMetadata.failedUrls}`);\n    console.log(`  Total Duration:  ${Math.round(duration / 1000)}s`);\n    console.log(`  Avg Per URL:     ${Math.round(duration / result.batchMetadata.totalUrls)}ms`);\n    console.log(\n      `  Throughput:      ${(result.batchMetadata.totalUrls / (duration / 1000)).toFixed(2)} URLs/s`\n    );\n\n    // Show failed URLs if any\n    if (result.batchMetadata.errors && result.batchMetadata.errors.length > 0) {\n      console.log(`\\nFailed URLs:`);\n      for (const error of result.batchMetadata.errors.slice(0, 10)) {\n        console.log(`  - ${error.url}: ${error.error}`);\n      }\n      if (result.batchMetadata.errors.length > 10) {\n        console.log(`  ... and ${result.batchMetadata.errors.length - 10} more`);\n      }\n    }\n\n    // Sample output from successful scrapes\n    if (result.data.length > 0) {\n      console.log(`\\nSample Results (first 3):`);\n      for (const page of result.data.slice(0, 3)) {\n        console.log(`  - ${page.metadata.baseUrl}`);\n        console.log(`    Title: ${page.metadata.website.title || \"N/A\"}`);\n        console.log(`    Content: ${page.markdown?.length || 0} chars`);\n      }\n    }\n  } catch (error: any) {\n    console.error(\"\\nError:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n    console.log(\"\\nDone!\");\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/proxy-pool.ts",
    "content": "#!/usr/bin/env node\n/**\n * Proxy Pool Example\n *\n * Demonstrates configuring multiple proxies with rotation for scraping.\n * Useful for avoiding rate limits and IP blocks when scraping at scale.\n *\n * Usage:\n *   Set your proxy credentials and run:\n *   npx tsx basic/proxy-pool.ts\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting proxy pool example\\n\");\n\n  // Configure proxy pool with rotation\n  // Replace with your actual proxy credentials\n  const reader = new ReaderClient({\n    verbose: true,\n\n    // List of proxies to rotate through\n    proxies: [\n      {\n        host: \"proxy1.example.com\",\n        port: 8080,\n        username: \"user1\",\n        password: \"pass1\",\n        type: \"datacenter\",\n      },\n      {\n        host: \"proxy2.example.com\",\n        port: 8080,\n        username: \"user2\",\n        password: \"pass2\",\n        type: \"datacenter\",\n      },\n      {\n        host: \"residential.example.com\",\n        port: 9000,\n        username: \"user3\",\n        password: \"pass3\",\n        type: \"residential\",\n        country: \"us\", // Geo-target to US\n      },\n    ],\n\n    // Rotation strategy: \"round-robin\" (default) or \"random\"\n    proxyRotation: \"round-robin\",\n  });\n\n  // URLs to scrape - each will use a different proxy from the pool\n  const urls = [\n    \"https://example.com\",\n    \"https://example.org\",\n    \"https://example.net\",\n  ];\n\n  console.log(`Scraping ${urls.length} URLs with proxy rotation\\n`);\n  console.log(\"Proxy rotation: round-robin\");\n  console.log(\"Proxy pool size: 3\\n\");\n\n  try {\n    const result = await reader.scrape({\n      urls,\n      formats: [\"markdown\"],\n      batchConcurrency: 1, // Sequential to demonstrate rotation\n      onProgress: (progress) => {\n        console.log(`Progress: ${progress.completed}/${progress.total} - ${progress.currentUrl}`);\n      },\n    });\n\n    console.log(\"\\nScrape completed!\\n\");\n    console.log(\"Results:\");\n\n    for (const page of result.data) {\n      console.log(`\\n  ${page.metadata.baseUrl}`);\n      console.log(`     Title: ${page.metadata.website.title}`);\n      console.log(`     Duration: ${page.metadata.duration}ms`);\n\n      // Show which proxy was used (if available)\n      if (page.metadata.proxy) {\n        console.log(`     Proxy: ${page.metadata.proxy.host}:${page.metadata.proxy.port}`);\n        if (page.metadata.proxy.country) {\n          console.log(`     Country: ${page.metadata.proxy.country}`);\n        }\n      }\n    }\n\n    console.log(\"\\nBatch Metadata:\");\n    console.log(`  Total URLs: ${result.batchMetadata.totalUrls}`);\n    console.log(`  Successful: ${result.batchMetadata.successfulUrls}`);\n    console.log(`  Failed: ${result.batchMetadata.failedUrls}`);\n    console.log(`  Total Duration: ${result.batchMetadata.totalDuration}ms`);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/basic/with-proxy.ts",
    "content": "#!/usr/bin/env node\n/**\n * Proxy Example\n *\n * Demonstrates scraping with a proxy configuration\n */\n\nimport { ReaderClient } from \"@vakra-dev/reader\";\n\nasync function main() {\n  console.log(\"Starting proxy example\\n\");\n\n  // Example proxy configurations:\n  //\n  // 1. Simple proxy URL:\n  // proxy: { url: \"http://user:pass@proxy.example.com:8080\" }\n  //\n  // 2. Residential proxy with country targeting:\n  // proxy: {\n  //   type: \"residential\",\n  //   host: \"geo.iproyal.com\",\n  //   port: 12321,\n  //   username: \"customer-user\",\n  //   password: \"password\",\n  //   country: \"us\"\n  // }\n  //\n  // 3. Datacenter proxy:\n  // proxy: {\n  //   type: \"datacenter\",\n  //   host: \"proxy.example.com\",\n  //   port: 8080,\n  //   username: \"user\",\n  //   password: \"pass\"\n  // }\n\n  // For this example, we'll skip the proxy if not configured\n  const proxyUrl = process.env.PROXY_URL;\n\n  if (!proxyUrl) {\n    console.log(\"No PROXY_URL environment variable set.\");\n    console.log(\"Set PROXY_URL=http://user:pass@host:port to test proxy scraping.\");\n    console.log(\"\\nRunning without proxy...\\n\");\n  }\n\n  const reader = new ReaderClient({ verbose: true });\n\n  try {\n    const result = await reader.scrape({\n      urls: [\"https://httpbin.org/ip\"], // Shows your IP address\n      formats: [\"markdown\"],\n      proxy: proxyUrl ? { url: proxyUrl } : undefined,\n    });\n\n    const page = result.data[0];\n\n    if (!page) {\n      console.error(\"No data returned - scrape may have failed\");\n      console.log(\"Errors:\", result.batchMetadata.errors);\n      process.exit(1);\n    }\n\n    console.log(\"\\nScrape completed!\");\n    console.log(\"\\nResponse (should show proxy IP if configured):\");\n    console.log(page.markdown);\n  } catch (error: any) {\n    console.error(\"Error:\", error.message);\n    process.exit(1);\n  } finally {\n    await reader.close();\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/package.json",
    "content": "{\n  \"name\": \"reader-examples\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Examples for @vakra-dev/reader\",\n  \"type\": \"module\",\n  \"dependencies\": {\n    \"@vakra-dev/reader\": \"file:..\"\n  },\n  \"devDependencies\": {\n    \"@ai-sdk/openai\": \"^1.0.0\",\n    \"@anthropic-ai/sdk\": \"^0.39.0\",\n    \"@langchain/core\": \"^0.3.0\",\n    \"@pinecone-database/pinecone\": \"^4.0.0\",\n    \"@qdrant/js-client-rest\": \"^1.12.0\",\n    \"@types/aws-lambda\": \"^8.10.145\",\n    \"@types/express\": \"^4.17.21\",\n    \"@types/node\": \"^20.10.6\",\n    \"@ulixee/hero\": \"^2.0.0-alpha.34\",\n    \"@ulixee/hero-core\": \"^2.0.0-alpha.34\",\n    \"@ulixee/net\": \"^2.0.0-alpha.29\",\n    \"@vercel/node\": \"^3.2.0\",\n    \"ai\": \"^4.0.0\",\n    \"express\": \"^4.18.2\",\n    \"llamaindex\": \"^0.8.0\",\n    \"openai\": \"^4.0.0\",\n    \"tsx\": \"^4.7.0\",\n    \"typescript\": \"^5.3.3\"\n  }\n}\n"
  },
  {
    "path": "examples/production/README.md",
    "content": "# Production Examples\n\nProduction-ready setups for running Reader at scale.\n\n## Available Examples\n\n### [Express Server](./express-server/)\n\nA full-featured REST API server with:\n- Health checks and graceful shutdown\n- Scrape and crawl endpoints\n- Shared Hero Core for efficiency\n- Request validation and error handling\n\n### [Job Queue (BullMQ)](./job-queue-bullmq/)\n\nAsync job processing with Redis:\n- Submit jobs via API, process in background\n- Progress tracking and webhook notifications\n- Automatic retries with exponential backoff\n- Horizontally scalable workers\n\n### [Browser Pool Scaling](./browser-pool-scaling/)\n\nAdvanced browser pool management:\n- Pool metrics (JSON and Prometheus formats)\n- Health checks with auto-recovery\n- Browser recycling to prevent memory leaks\n- Graceful degradation under load\n\n## Best Practices\n\n1. **Use a Shared Core**: Initialize Hero Core once and share across requests\n2. **Implement Health Checks**: Monitor browser pool health\n3. **Add Rate Limiting**: Protect against abuse\n4. **Use Caching**: Cache scrape results (Redis, Memcached)\n5. **Queue Long Operations**: Use job queues for batch scraping\n6. **Monitor Resources**: Track memory, CPU, and pool metrics\n\n## Quick Comparison\n\n| Example | Use Case | Dependencies |\n|---------|----------|--------------|\n| Express Server | Simple REST API | Express |\n| Job Queue | Async batch processing | BullMQ, Redis |\n| Pool Scaling | High-throughput scraping | Express |\n\n## Getting Started\n\nEach example has its own README with setup instructions:\n\n```bash\n# Express Server\ncd express-server && npm install && npm start\n\n# Job Queue\ncd job-queue-bullmq && npm install\nnpm run start   # API server\nnpm run worker  # Worker process\n\n# Pool Scaling\ncd browser-pool-scaling && npm install && npm start\n```\n"
  },
  {
    "path": "examples/production/browser-pool-scaling/README.md",
    "content": "# Browser Pool Scaling\n\nAdvanced browser pool configuration with metrics, health monitoring, and scaling.\n\n## Overview\n\nThis example demonstrates production-grade browser pool management:\n\n- **Pool metrics**: Monitor browser utilization, queue depth, and request latency\n- **Health checks**: Detect and recover from unhealthy browsers\n- **Auto-recycling**: Prevent memory leaks by retiring browsers after use\n- **Prometheus integration**: Export metrics for monitoring dashboards\n- **Graceful degradation**: Handle overload without crashing\n\n## Setup\n\n1. Install dependencies:\n   ```bash\n   cd examples/production/browser-pool-scaling\n   npm install\n   ```\n\n2. Start the server:\n   ```bash\n   npm run start\n   ```\n\n## API Endpoints\n\n### Health Check\n\n```bash\ncurl http://localhost:3003/health\n```\n\nResponse:\n```json\n{\n  \"status\": \"healthy\",\n  \"timestamp\": \"2024-01-01T00:00:00.000Z\",\n  \"uptime\": 3600000,\n  \"uptimeFormatted\": \"1h 0m\",\n  \"pool\": {\n    \"healthy\": true,\n    \"issues\": []\n  }\n}\n```\n\n### Metrics (JSON)\n\n```bash\ncurl http://localhost:3003/metrics\n```\n\nResponse:\n```json\n{\n  \"pool\": {\n    \"total\": 4,\n    \"available\": 2,\n    \"busy\": 2,\n    \"recycling\": 0,\n    \"unhealthy\": 0,\n    \"queueLength\": 0\n  },\n  \"performance\": {\n    \"totalRequests\": 150,\n    \"avgRequestDurationMs\": 2500\n  },\n  \"utilization\": {\n    \"percentage\": 50,\n    \"status\": \"moderate\"\n  },\n  \"config\": {\n    \"poolSize\": 4,\n    \"retireAfterPageCount\": 50,\n    \"retireAfterAgeMs\": 900000,\n    \"maxQueueSize\": 200,\n    \"queueTimeout\": 120000\n  }\n}\n```\n\n### Metrics (Prometheus)\n\n```bash\ncurl \"http://localhost:3003/metrics?format=prometheus\"\n```\n\nResponse:\n```\n# HELP reader_pool_total Total browser instances in pool\n# TYPE reader_pool_total gauge\nreader_pool_total 4\n\n# HELP reader_pool_available Available browser instances\n# TYPE reader_pool_available gauge\nreader_pool_available 2\n\n# HELP reader_pool_busy Busy browser instances\n# TYPE reader_pool_busy gauge\nreader_pool_busy 2\n...\n```\n\n### Scrape URL\n\n```bash\ncurl -X POST http://localhost:3003/scrape \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"url\": \"https://example.com\"}'\n```\n\nResponse:\n```json\n{\n  \"success\": true,\n  \"url\": \"https://example.com\",\n  \"title\": \"Example Domain\",\n  \"htmlLength\": 1256,\n  \"durationMs\": 1523\n}\n```\n\n### Batch Scrape\n\n```bash\ncurl -X POST http://localhost:3003/batch \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"urls\": [\"https://example.com\", \"https://httpbin.org/html\"],\n    \"concurrency\": 2\n  }'\n```\n\nResponse:\n```json\n{\n  \"success\": true,\n  \"summary\": {\n    \"total\": 2,\n    \"successful\": 2,\n    \"failed\": 0,\n    \"durationMs\": 3200,\n    \"avgPerUrl\": 1600\n  },\n  \"results\": [...]\n}\n```\n\n## Configuration\n\n### Environment Variables\n\n| Variable | Default | Description |\n|----------|---------|-------------|\n| `PORT` | 3003 | Server port |\n| `POOL_SIZE` | 4 | Number of browser instances |\n| `RETIRE_AFTER_PAGES` | 50 | Recycle browser after N pages |\n| `RETIRE_AFTER_MS` | 900000 | Recycle browser after 15 minutes |\n| `MAX_QUEUE_SIZE` | 200 | Maximum pending requests |\n| `QUEUE_TIMEOUT` | 120000 | Request timeout in queue (2 min) |\n\n### Scaling Recommendations\n\n| Use Case | Pool Size | Notes |\n|----------|-----------|-------|\n| Development | 2 | Low memory usage |\n| Small API | 4-8 | Handles ~10 req/min |\n| Medium traffic | 8-16 | Handles ~50 req/min |\n| High traffic | 16-32+ | Use multiple instances |\n\n### Memory Considerations\n\nEach browser instance uses approximately 100-300MB RAM. Plan accordingly:\n\n| Pool Size | Memory (approx) |\n|-----------|-----------------|\n| 2 | 400-600 MB |\n| 4 | 800 MB - 1.2 GB |\n| 8 | 1.6 - 2.4 GB |\n| 16 | 3.2 - 4.8 GB |\n\n## Prometheus & Grafana\n\n### Prometheus Configuration\n\nAdd to `prometheus.yml`:\n\n```yaml\nscrape_configs:\n  - job_name: 'reader'\n    scrape_interval: 15s\n    metrics_path: /metrics\n    params:\n      format: ['prometheus']\n    static_configs:\n      - targets: ['localhost:3003']\n```\n\n### Grafana Dashboard\n\nKey metrics to monitor:\n\n1. **Pool Utilization**: `reader_pool_busy / reader_pool_total`\n2. **Queue Depth**: `reader_pool_queue_length`\n3. **Unhealthy Instances**: `reader_pool_unhealthy`\n4. **Request Latency**: `reader_pool_request_duration_avg_ms`\n\n### Alerting Rules\n\n```yaml\ngroups:\n  - name: reader\n    rules:\n      - alert: HighPoolUtilization\n        expr: reader_pool_busy / reader_pool_total > 0.9\n        for: 5m\n        annotations:\n          summary: \"Browser pool near capacity\"\n\n      - alert: UnhealthyBrowsers\n        expr: reader_pool_unhealthy > 0\n        for: 2m\n        annotations:\n          summary: \"Unhealthy browser instances detected\"\n\n      - alert: HighQueueDepth\n        expr: reader_pool_queue_length > 50\n        for: 1m\n        annotations:\n          summary: \"Request queue growing\"\n```\n\n## Architecture\n\n```\n┌─────────────────────────────────────────────────────────────┐\n│                     Browser Pool                            │\n├─────────────────────────────────────────────────────────────┤\n│  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐    │\n│  │ Browser  │  │ Browser  │  │ Browser  │  │ Browser  │    │\n│  │   #1     │  │   #2     │  │   #3     │  │   #4     │    │\n│  │  (busy)  │  │ (avail)  │  │  (busy)  │  │ (avail)  │    │\n│  └──────────┘  └──────────┘  └──────────┘  └──────────┘    │\n├─────────────────────────────────────────────────────────────┤\n│  Request Queue: [req5] [req6] [req7] ...                    │\n├─────────────────────────────────────────────────────────────┤\n│  Recycler: Checks every 60s, retires old/heavy browsers     │\n│  Health Check: Every 5min, marks unhealthy browsers         │\n└─────────────────────────────────────────────────────────────┘\n```\n\n## Files\n\n```\nbrowser-pool-scaling/\n├── README.md           # This file\n├── package.json        # Dependencies\n└── src/\n    └── index.ts        # Server with pool management\n```\n"
  },
  {
    "path": "examples/production/browser-pool-scaling/package.json",
    "content": "{\n  \"name\": \"browser-pool-scaling-example\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Browser pool scaling example with metrics and health monitoring\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"start\": \"npx tsx src/index.ts\"\n  },\n  \"dependencies\": {\n    \"@vakra-dev/reader\": \"file:../../..\",\n    \"express\": \"^4.18.2\"\n  },\n  \"devDependencies\": {\n    \"@types/express\": \"^4.17.21\",\n    \"@types/node\": \"^20.10.6\",\n    \"tsx\": \"^4.7.0\",\n    \"typescript\": \"^5.3.3\"\n  }\n}\n"
  },
  {
    "path": "examples/production/browser-pool-scaling/src/index.ts",
    "content": "/**\n * Browser Pool Scaling Example\n *\n * Demonstrates advanced browser pool configuration with:\n * - Pool metrics endpoint for monitoring\n * - Health checks with detailed status\n * - Graceful degradation under load\n * - Resource cleanup on shutdown\n *\n * Usage: npx tsx src/index.ts\n */\n\nimport express, { Request, Response, NextFunction } from \"express\";\nimport { BrowserPool } from \"@vakra-dev/reader\";\nimport type { PoolConfig } from \"@vakra-dev/reader\";\nimport HeroCore from \"@ulixee/hero-core\";\nimport { TransportBridge } from \"@ulixee/net\";\nimport { ConnectionToHeroCore } from \"@ulixee/hero\";\n\n// Global HeroCore instance\nlet heroCore: HeroCore | null = null;\n\nfunction createConnectionToCore(): ConnectionToHeroCore {\n  if (!heroCore) {\n    throw new Error(\"HeroCore not initialized\");\n  }\n  const bridge = new TransportBridge();\n  heroCore.addConnection(bridge.transportToClient);\n  return new ConnectionToHeroCore(bridge.transportToCore);\n}\n\n// ============================================================================\n// Pool Configuration\n// ============================================================================\n\nconst poolConfig: Partial<PoolConfig> = {\n  // Number of browser instances to maintain\n  size: parseInt(process.env.POOL_SIZE || \"4\"),\n\n  // Retire browser after N pages (prevents memory leaks)\n  retireAfterPageCount: parseInt(process.env.RETIRE_AFTER_PAGES || \"50\"),\n\n  // Retire browser after N milliseconds (15 minutes default)\n  retireAfterAgeMs: parseInt(process.env.RETIRE_AFTER_MS || String(15 * 60 * 1000)),\n\n  // How often to check for browsers to recycle (1 minute)\n  recycleCheckInterval: 60 * 1000,\n\n  // Health check interval (5 minutes)\n  healthCheckInterval: 5 * 60 * 1000,\n\n  // Max failures before marking browser unhealthy\n  maxConsecutiveFailures: 3,\n\n  // Request queue settings\n  maxQueueSize: parseInt(process.env.MAX_QUEUE_SIZE || \"200\"),\n  queueTimeout: parseInt(process.env.QUEUE_TIMEOUT || String(120 * 1000)),\n};\n\n// Pool instance (created after HeroCore starts)\nlet pool: BrowserPool;\n\nconst app = express();\nconst PORT = process.env.PORT || 3003;\nconst serverStartTime = Date.now();\n\n// Middleware\napp.use(express.json({ limit: \"1mb\" }));\n\n// Request logging\napp.use((req: Request, res: Response, next: NextFunction) => {\n  console.log(`[${new Date().toISOString()}] ${req.method} ${req.path}`);\n  next();\n});\n\n// ============================================================================\n// Routes\n// ============================================================================\n\n/**\n * GET /health - Basic health check\n */\napp.get(\"/health\", async (req: Request, res: Response) => {\n  try {\n    const health = await pool.healthCheck();\n    const uptime = Date.now() - serverStartTime;\n\n    res.status(health.healthy ? 200 : 503).json({\n      status: health.healthy ? \"healthy\" : \"degraded\",\n      timestamp: new Date().toISOString(),\n      uptime,\n      uptimeFormatted: formatDuration(uptime),\n      pool: {\n        healthy: health.healthy,\n        issues: health.issues,\n      },\n    });\n  } catch (error: any) {\n    res.status(503).json({\n      status: \"unhealthy\",\n      error: error.message,\n    });\n  }\n});\n\n/**\n * GET /metrics - Detailed pool metrics (Prometheus-compatible format available)\n */\napp.get(\"/metrics\", (req: Request, res: Response) => {\n  const stats = pool.getStats();\n  const format = req.query.format;\n\n  if (format === \"prometheus\") {\n    // Prometheus exposition format\n    const lines = [\n      `# HELP reader_pool_total Total browser instances in pool`,\n      `# TYPE reader_pool_total gauge`,\n      `reader_pool_total ${stats.total}`,\n      ``,\n      `# HELP reader_pool_available Available browser instances`,\n      `# TYPE reader_pool_available gauge`,\n      `reader_pool_available ${stats.available}`,\n      ``,\n      `# HELP reader_pool_busy Busy browser instances`,\n      `# TYPE reader_pool_busy gauge`,\n      `reader_pool_busy ${stats.busy}`,\n      ``,\n      `# HELP reader_pool_recycling Browser instances being recycled`,\n      `# TYPE reader_pool_recycling gauge`,\n      `reader_pool_recycling ${stats.recycling}`,\n      ``,\n      `# HELP reader_pool_unhealthy Unhealthy browser instances`,\n      `# TYPE reader_pool_unhealthy gauge`,\n      `reader_pool_unhealthy ${stats.unhealthy}`,\n      ``,\n      `# HELP reader_pool_queue_length Pending requests in queue`,\n      `# TYPE reader_pool_queue_length gauge`,\n      `reader_pool_queue_length ${stats.queueLength}`,\n      ``,\n      `# HELP reader_pool_requests_total Total requests processed`,\n      `# TYPE reader_pool_requests_total counter`,\n      `reader_pool_requests_total ${stats.totalRequests}`,\n      ``,\n      `# HELP reader_pool_request_duration_avg_ms Average request duration`,\n      `# TYPE reader_pool_request_duration_avg_ms gauge`,\n      `reader_pool_request_duration_avg_ms ${stats.avgRequestDuration.toFixed(2)}`,\n    ];\n\n    res.set(\"Content-Type\", \"text/plain; version=0.0.4\");\n    res.send(lines.join(\"\\n\"));\n  } else {\n    // JSON format\n    res.json({\n      pool: {\n        total: stats.total,\n        available: stats.available,\n        busy: stats.busy,\n        recycling: stats.recycling,\n        unhealthy: stats.unhealthy,\n        queueLength: stats.queueLength,\n      },\n      performance: {\n        totalRequests: stats.totalRequests,\n        avgRequestDurationMs: Math.round(stats.avgRequestDuration),\n      },\n      utilization: {\n        percentage: stats.total > 0 ? Math.round((stats.busy / stats.total) * 100) : 0,\n        status: getUtilizationStatus(stats),\n      },\n      config: {\n        poolSize: poolConfig.size,\n        retireAfterPageCount: poolConfig.retireAfterPageCount,\n        retireAfterAgeMs: poolConfig.retireAfterAgeMs,\n        maxQueueSize: poolConfig.maxQueueSize,\n        queueTimeout: poolConfig.queueTimeout,\n      },\n    });\n  }\n});\n\n/**\n * POST /scrape - Scrape a URL using the pool\n */\napp.post(\"/scrape\", async (req: Request, res: Response) => {\n  const { url, waitForSelector, timeout } = req.body;\n\n  // Validation\n  if (!url || typeof url !== \"string\") {\n    return res.status(400).json({\n      success: false,\n      error: \"url is required and must be a string\",\n    });\n  }\n\n  try {\n    new URL(url);\n  } catch {\n    return res.status(400).json({\n      success: false,\n      error: `Invalid URL: ${url}`,\n    });\n  }\n\n  const startTime = Date.now();\n\n  try {\n    const result = await pool.withBrowser(async (hero) => {\n      // Navigate to URL\n      await hero.goto(url);\n\n      // Wait for selector if specified\n      if (waitForSelector) {\n        await hero.waitForElement(hero.document.querySelector(waitForSelector), {\n          timeoutMs: timeout || 30000,\n        });\n      } else {\n        await hero.waitForLoad(\"AllContentLoaded\");\n      }\n\n      // Extract content\n      const html = await hero.document.documentElement.outerHTML;\n      const title = await hero.document.title;\n\n      return { html, title };\n    });\n\n    const duration = Date.now() - startTime;\n\n    res.json({\n      success: true,\n      url,\n      title: result.title,\n      htmlLength: result.html.length,\n      durationMs: duration,\n    });\n  } catch (error: any) {\n    const duration = Date.now() - startTime;\n\n    console.error(`[Scrape] Error for ${url}:`, error.message);\n\n    res.status(500).json({\n      success: false,\n      url,\n      error: error.message,\n      durationMs: duration,\n    });\n  }\n});\n\n/**\n * POST /batch - Scrape multiple URLs concurrently\n */\napp.post(\"/batch\", async (req: Request, res: Response) => {\n  const { urls, concurrency = 2 } = req.body;\n\n  // Validation\n  if (!urls || !Array.isArray(urls) || urls.length === 0) {\n    return res.status(400).json({\n      success: false,\n      error: \"urls is required and must be a non-empty array\",\n    });\n  }\n\n  const startTime = Date.now();\n  const results: Array<{ url: string; success: boolean; title?: string; error?: string }> = [];\n\n  // Process URLs with limited concurrency\n  const chunks: string[][] = [];\n  for (let i = 0; i < urls.length; i += concurrency) {\n    chunks.push(urls.slice(i, i + concurrency));\n  }\n\n  for (const chunk of chunks) {\n    const chunkResults = await Promise.allSettled(\n      chunk.map(async (url: string) => {\n        try {\n          const result = await pool.withBrowser(async (hero) => {\n            await hero.goto(url);\n            await hero.waitForLoad(\"AllContentLoaded\");\n            const title = await hero.document.title;\n            return { url, success: true, title };\n          });\n          return result;\n        } catch (error: any) {\n          return { url, success: false, error: error.message };\n        }\n      })\n    );\n\n    for (const result of chunkResults) {\n      if (result.status === \"fulfilled\") {\n        results.push(result.value);\n      } else {\n        results.push({ url: \"unknown\", success: false, error: result.reason?.message });\n      }\n    }\n  }\n\n  const duration = Date.now() - startTime;\n  const successCount = results.filter((r) => r.success).length;\n\n  res.json({\n    success: true,\n    summary: {\n      total: urls.length,\n      successful: successCount,\n      failed: urls.length - successCount,\n      durationMs: duration,\n      avgPerUrl: Math.round(duration / urls.length),\n    },\n    results,\n  });\n});\n\n// ============================================================================\n// Helpers\n// ============================================================================\n\nfunction formatDuration(ms: number): string {\n  const seconds = Math.floor(ms / 1000);\n  const minutes = Math.floor(seconds / 60);\n  const hours = Math.floor(minutes / 60);\n  const days = Math.floor(hours / 24);\n\n  if (days > 0) return `${days}d ${hours % 24}h`;\n  if (hours > 0) return `${hours}h ${minutes % 60}m`;\n  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;\n  return `${seconds}s`;\n}\n\nfunction getUtilizationStatus(stats: { total: number; busy: number; queueLength: number }): string {\n  const utilization = stats.total > 0 ? stats.busy / stats.total : 0;\n\n  if (stats.queueLength > 0) return \"saturated\";\n  if (utilization > 0.8) return \"high\";\n  if (utilization > 0.5) return \"moderate\";\n  if (utilization > 0) return \"low\";\n  return \"idle\";\n}\n\n// ============================================================================\n// Error handling\n// ============================================================================\n\napp.use((err: Error, req: Request, res: Response, _next: NextFunction) => {\n  console.error(\"[Server Error]\", err);\n  res.status(500).json({\n    success: false,\n    error: err.message || \"Internal server error\",\n  });\n});\n\n// 404 handler\napp.use((req: Request, res: Response) => {\n  res.status(404).json({\n    success: false,\n    error: `Not found: ${req.method} ${req.path}`,\n  });\n});\n\n// ============================================================================\n// Start server\n// ============================================================================\n\nasync function startServer() {\n  try {\n    // Start HeroCore first\n    console.log(\"[Pool] Starting HeroCore...\");\n    heroCore = new HeroCore();\n    await heroCore.start();\n    console.log(\"[Pool] HeroCore started\");\n\n    // Create pool with connection to HeroCore\n    console.log(\"[Pool] Initializing browser pool...\");\n    pool = new BrowserPool(\n      poolConfig,\n      undefined, // proxy\n      false, // showChrome\n      createConnectionToCore()\n    );\n    await pool.initialize();\n    console.log(`[Pool] Pool initialized with ${poolConfig.size} browsers`);\n\n    app.listen(PORT, () => {\n      console.log(`\n╔════════════════════════════════════════════════════════════════╗\n║       Reader - Browser Pool Scaling Example             ║\n╠════════════════════════════════════════════════════════════════╣\n║  Server running on http://localhost:${PORT}                       ║\n╠════════════════════════════════════════════════════════════════╣\n║  Endpoints:                                                    ║\n║    GET  /health           - Health check with pool status      ║\n║    GET  /metrics          - Pool metrics (JSON or Prometheus)  ║\n║    POST /scrape           - Scrape a single URL                ║\n║    POST /batch            - Scrape multiple URLs               ║\n╠════════════════════════════════════════════════════════════════╣\n║  Pool Configuration:                                           ║\n║    Size: ${poolConfig.size} browsers                                        ║\n║    Retire after: ${poolConfig.retireAfterPageCount} pages or ${Math.round((poolConfig.retireAfterAgeMs || 0) / 60000)}min             ║\n║    Max queue: ${poolConfig.maxQueueSize} requests                                ║\n╚════════════════════════════════════════════════════════════════╝\n      `);\n    });\n\n    // Graceful shutdown\n    const shutdown = async () => {\n      console.log(\"\\n[Pool] Shutting down...\");\n      await pool.shutdown();\n      if (heroCore) {\n        await heroCore.close();\n      }\n      console.log(\"[Pool] Pool shutdown complete\");\n      process.exit(0);\n    };\n\n    process.on(\"SIGINT\", shutdown);\n    process.on(\"SIGTERM\", shutdown);\n  } catch (error: any) {\n    console.error(\"[Pool] Failed to start:\", error.message);\n    process.exit(1);\n  }\n}\n\nstartServer();\n"
  },
  {
    "path": "examples/production/express-server/README.md",
    "content": "# Express Server Example\n\nA production-ready Express server exposing Reader as a REST API.\n\n## Features\n\n- Health check endpoint\n- Scrape endpoint (single and batch)\n- Crawl endpoint\n- Shared Hero Core for efficiency\n- Graceful shutdown handling\n\n## Setup\n\n```bash\ncd examples\nnpm install\n```\n\n## Usage\n\n```bash\n# Start the server\nnpx tsx production/express-server/src/index.ts\n```\n\nServer runs on http://localhost:3001\n\n## API Endpoints\n\n### GET /health\n\nHealth check endpoint.\n\n```bash\ncurl http://localhost:3001/health\n```\n\n### POST /scrape\n\nScrape one or more URLs.\n\n```bash\ncurl -X POST http://localhost:3001/scrape \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"urls\": [\"https://example.com\"],\n    \"formats\": [\"markdown\", \"html\"]\n  }'\n```\n\n**Request body:**\n| Field | Type | Default | Description |\n|-------|------|---------|-------------|\n| urls | string[] | required | URLs to scrape |\n| formats | string[] | [\"markdown\"] | Output formats |\n| batchConcurrency | number | 1 | Parallel requests |\n| verbose | boolean | false | Enable logging |\n\n### POST /crawl\n\nCrawl a website to discover pages.\n\n```bash\ncurl -X POST http://localhost:3001/crawl \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"url\": \"https://example.com\",\n    \"depth\": 2,\n    \"maxPages\": 20,\n    \"scrape\": true\n  }'\n```\n\n**Request body:**\n| Field | Type | Default | Description |\n|-------|------|---------|-------------|\n| url | string | required | Seed URL |\n| depth | number | 1 | Max depth (0-5) |\n| maxPages | number | 20 | Max pages (1-100) |\n| scrape | boolean | false | Also scrape content |\n\n## Why Shared Hero Core?\n\nThis server uses a shared Hero Core instance instead of letting each request create its own:\n\n| Approach | Startup Time | Memory | Best For |\n|----------|--------------|--------|----------|\n| Per-request Core | ~5-10s | High (each request) | Scripts, CLI |\n| Shared Core | Once at startup | Shared across requests | Servers |\n\nThe shared Core is initialized once when the server starts, and all incoming requests share it via `TransportBridge`. This approach:\n\n- **Eliminates cold starts** - No browser startup delay per request\n- **Reduces memory usage** - Single Core instance shared across all requests\n- **Improves throughput** - Requests don't wait for Core initialization\n\nSee [src/index.ts](./src/index.ts) for the implementation.\n\n## Docker\n\nSee the [Docker deployment example](../../deployment/docker) for containerized deployment.\n\n## Production Considerations\n\n1. **Rate Limiting**: Add rate limiting middleware\n2. **Authentication**: Add API key authentication\n3. **Caching**: Cache scrape results (Redis, etc.)\n4. **Queue**: Use job queue for async processing\n5. **Monitoring**: Add metrics and logging\n"
  },
  {
    "path": "examples/production/express-server/package.json",
    "content": "{\n  \"name\": \"reader-express-server\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Express server example for @vakra-dev/reader\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"start\": \"npx tsx src/index.ts\",\n    \"dev\": \"npx tsx --watch src/index.ts\"\n  },\n  \"dependencies\": {\n    \"@ulixee/hero\": \"^2.0.0-alpha.34\",\n    \"@ulixee/hero-core\": \"^2.0.0-alpha.34\",\n    \"@ulixee/net\": \"^2.0.0-alpha.29\",\n    \"@vakra-dev/reader\": \"^1.0.0\",\n    \"express\": \"^4.18.2\"\n  },\n  \"devDependencies\": {\n    \"@types/express\": \"^4.17.21\",\n    \"@types/node\": \"^20.10.6\",\n    \"tsx\": \"^4.7.0\",\n    \"typescript\": \"^5.3.3\"\n  }\n}\n"
  },
  {
    "path": "examples/production/express-server/src/index.ts",
    "content": "/**\n * Express Server Example for Reader\n *\n * Demonstrates how to run Reader as a REST API.\n * Uses ReaderClient which manages the HeroCore lifecycle internally.\n *\n * Key concepts:\n * - Initialize ReaderClient once at startup\n * - Reuse the same client for all requests\n * - Graceful shutdown to properly close the client\n */\n\nimport express, { Request, Response, NextFunction } from \"express\";\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport type { ScrapeResult, CrawlResult } from \"@vakra-dev/reader\";\n\n// Global ReaderClient instance (initialized in startServer)\nlet reader: ReaderClient | null = null;\n\nconst app = express();\nconst PORT = process.env.PORT || 3001;\nconst serverStartTime = Date.now();\n\n// Middleware\napp.use(express.json({ limit: \"10mb\" }));\n\n// Request logging\napp.use((req: Request, res: Response, next: NextFunction) => {\n  console.log(`[${new Date().toISOString()}] ${req.method} ${req.path}`);\n  next();\n});\n\n// ============================================================================\n// Routes\n// ============================================================================\n\n/**\n * GET /health - Health check endpoint\n */\napp.get(\"/health\", (req: Request, res: Response) => {\n  const uptime = Date.now() - serverStartTime;\n\n  res.json({\n    status: \"healthy\",\n    timestamp: new Date().toISOString(),\n    uptime,\n    uptimeFormatted: `${Math.floor(uptime / 1000)}s`,\n  });\n});\n\n/**\n * POST /scrape - Scrape one or more URLs\n *\n * Request body:\n * {\n *   urls: string[]              // Required\n *   formats?: string[]          // Default: ['markdown']\n *   batchConcurrency?: number   // Default: 1\n *   waitForSelector?: string\n *   screenshot?: boolean\n *   verbose?: boolean\n *   showChrome?: boolean\n *   proxy?: ProxyConfig\n * }\n */\napp.post(\"/scrape\", async (req: Request, res: Response) => {\n  try {\n    const { urls, formats, ...options } = req.body;\n\n    // Validation\n    if (!urls || !Array.isArray(urls) || urls.length === 0) {\n      return res.status(400).json({\n        success: false,\n        error: \"urls is required and must be a non-empty array\",\n      });\n    }\n\n    // Validate URLs\n    for (const url of urls) {\n      try {\n        new URL(url);\n      } catch {\n        return res.status(400).json({\n          success: false,\n          error: `Invalid URL: ${url}`,\n        });\n      }\n    }\n\n    // Validate formats if provided\n    if (formats) {\n      const validFormats = [\"markdown\", \"html\"];\n      if (!Array.isArray(formats) || !formats.every((f: string) => validFormats.includes(f))) {\n        return res.status(400).json({\n          success: false,\n          error: \"formats must be an array of: markdown, html\",\n        });\n      }\n    }\n\n    console.log(`[scrape] Starting scrape of ${urls.length} URL(s)`);\n\n    if (!reader) {\n      throw new Error(\"ReaderClient not initialized\");\n    }\n\n    const result: ScrapeResult = await reader.scrape({\n      urls,\n      formats: formats || [\"markdown\"],\n      ...options,\n    });\n\n    console.log(\n      `[scrape] Completed: ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls} successful`\n    );\n\n    res.json({\n      success: true,\n      data: result.data,\n      batchMetadata: result.batchMetadata,\n    });\n  } catch (error: any) {\n    console.error(\"[scrape] Error:\", error);\n    res.status(500).json({\n      success: false,\n      error: error.message || \"Scrape failed\",\n    });\n  }\n});\n\n/**\n * POST /crawl - Crawl a website\n *\n * Request body:\n * {\n *   url: string        // Required - seed URL\n *   depth?: number     // Default: 1, max: 5\n *   maxPages?: number  // Default: 20, max: 100\n *   scrape?: boolean   // Also scrape full content\n * }\n */\napp.post(\"/crawl\", async (req: Request, res: Response) => {\n  try {\n    const { url, depth, maxPages, scrape: shouldScrape } = req.body;\n\n    // Validation\n    if (!url || typeof url !== \"string\") {\n      return res.status(400).json({\n        success: false,\n        error: \"url is required and must be a string\",\n      });\n    }\n\n    try {\n      new URL(url);\n    } catch {\n      return res.status(400).json({\n        success: false,\n        error: `Invalid URL: ${url}`,\n      });\n    }\n\n    // Validate depth\n    if (depth !== undefined && (typeof depth !== \"number\" || depth < 0 || depth > 5)) {\n      return res.status(400).json({\n        success: false,\n        error: \"depth must be a number between 0 and 5\",\n      });\n    }\n\n    // Validate maxPages\n    if (\n      maxPages !== undefined &&\n      (typeof maxPages !== \"number\" || maxPages < 1 || maxPages > 100)\n    ) {\n      return res.status(400).json({\n        success: false,\n        error: \"maxPages must be a number between 1 and 100\",\n      });\n    }\n\n    console.log(`[crawl] Starting crawl of ${url} (depth: ${depth || 1})`);\n\n    if (!reader) {\n      throw new Error(\"ReaderClient not initialized\");\n    }\n\n    const result: CrawlResult = await reader.crawl({\n      url,\n      depth: depth || 1,\n      maxPages: maxPages || 20,\n      scrape: shouldScrape || false,\n    });\n\n    console.log(`[crawl] Completed: found ${result.urls.length} URLs`);\n\n    res.json({\n      success: true,\n      urls: result.urls,\n      scraped: result.scraped\n        ? {\n            success: true,\n            data: result.scraped.data,\n            batchMetadata: result.scraped.batchMetadata,\n          }\n        : undefined,\n      metadata: result.metadata,\n    });\n  } catch (error: any) {\n    console.error(\"[crawl] Error:\", error);\n    res.status(500).json({\n      success: false,\n      error: error.message || \"Crawl failed\",\n    });\n  }\n});\n\n// ============================================================================\n// Error handling\n// ============================================================================\n\napp.use((err: Error, req: Request, res: Response, _next: NextFunction) => {\n  console.error(\"[Server Error]\", err);\n  res.status(500).json({\n    success: false,\n    error: err.message || \"Internal server error\",\n  });\n});\n\n// 404 handler\napp.use((req: Request, res: Response) => {\n  res.status(404).json({\n    success: false,\n    error: `Not found: ${req.method} ${req.path}`,\n  });\n});\n\n// ============================================================================\n// Start server\n// ============================================================================\n\n// Initialize ReaderClient and start Express server\nasync function startServer() {\n  try {\n    // Initialize ReaderClient (starts HeroCore internally)\n    reader = new ReaderClient({ verbose: true });\n    await reader.start();\n    console.log(\"[reader] ReaderClient started\");\n\n    app.listen(PORT, () => {\n      console.log(`\n╔════════════════════════════════════════════════════════════════╗\n║       Reader - Express Server Example                   ║\n╠════════════════════════════════════════════════════════════════╣\n║  Server running on http://localhost:${PORT}                    ║\n╠════════════════════════════════════════════════════════════════╣\n║  Endpoints:                                                    ║\n║    GET  /health  - Health check                                ║\n║    POST /scrape  - Scrape URLs                                 ║\n║    POST /crawl   - Crawl website                               ║\n╚════════════════════════════════════════════════════════════════╝\n      `);\n    });\n\n    // Graceful shutdown\n    const shutdown = async () => {\n      console.log(\"\\n[reader] Shutting down...\");\n      if (reader) {\n        await reader.close();\n      }\n      process.exit(0);\n    };\n\n    process.on(\"SIGINT\", shutdown);\n    process.on(\"SIGTERM\", shutdown);\n  } catch (err: any) {\n    console.error(\"[reader] Failed to start:\", err.message);\n    process.exit(1);\n  }\n}\n\nstartServer();\n"
  },
  {
    "path": "examples/production/job-queue-bullmq/README.md",
    "content": "# Job Queue with BullMQ\n\nAsync job processing for Reader using BullMQ and Redis.\n\n## Overview\n\nThis example demonstrates how to run scrape operations asynchronously using a job queue. This is ideal for:\n\n- **Batch processing**: Submit hundreds of URLs and process them in the background\n- **Webhook notifications**: Get notified when jobs complete\n- **Horizontal scaling**: Run multiple workers to increase throughput\n- **Retry logic**: Automatically retry failed jobs with exponential backoff\n- **Progress tracking**: Monitor job progress in real-time\n\n## Architecture\n\n```\n┌─────────────┐     ┌─────────────┐     ┌─────────────┐\n│   Client    │────▶│  API Server │────▶│    Redis    │\n└─────────────┘     └─────────────┘     └──────┬──────┘\n                                               │\n                    ┌──────────────────────────┼──────────────────────────┐\n                    │                          │                          │\n              ┌─────▼─────┐            ┌───────▼───────┐           ┌──────▼──────┐\n              │  Worker 1 │            │   Worker 2    │           │  Worker N   │\n              └───────────┘            └───────────────┘           └─────────────┘\n```\n\n## Prerequisites\n\n- Redis server running (local or remote)\n- Node.js >= 18\n\n## Setup\n\n1. Install dependencies:\n   ```bash\n   cd examples/production/job-queue-bullmq\n   npm install\n   ```\n\n2. Start Redis (if not running):\n   ```bash\n   # Using Docker\n   docker run -d -p 6379:6379 redis:alpine\n\n   # Or using Homebrew (macOS)\n   brew services start redis\n   ```\n\n3. Start the API server:\n   ```bash\n   npm run start\n   ```\n\n4. Start the worker (in a separate terminal):\n   ```bash\n   npm run worker\n   ```\n\n5. Or run both together:\n   ```bash\n   npm run dev\n   ```\n\n## API Endpoints\n\n### Submit a Job\n\n```bash\ncurl -X POST http://localhost:3002/jobs \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"urls\": [\"https://example.com\", \"https://httpbin.org/html\"],\n    \"formats\": [\"markdown\"],\n    \"webhookUrl\": \"https://your-server.com/webhook\"\n  }'\n```\n\nResponse:\n```json\n{\n  \"jobId\": \"1\",\n  \"status\": \"queued\",\n  \"urls\": 2\n}\n```\n\n### Check Job Status\n\n```bash\ncurl http://localhost:3002/jobs/1\n```\n\nResponse:\n```json\n{\n  \"id\": \"1\",\n  \"state\": \"completed\",\n  \"progress\": 100,\n  \"data\": {\n    \"urls\": [\"https://example.com\"],\n    \"formats\": [\"markdown\"]\n  },\n  \"result\": {\n    \"success\": true,\n    \"data\": {\n      \"batchMetadata\": {\n        \"totalUrls\": 1,\n        \"successfulUrls\": 1,\n        \"failedUrls\": 0,\n        \"totalDurationMs\": 2500\n      },\n      \"results\": [...]\n    }\n  },\n  \"timestamps\": {\n    \"created\": 1704067200000,\n    \"processed\": 1704067201000,\n    \"finished\": 1704067203500\n  },\n  \"attempts\": 1\n}\n```\n\n### Queue Statistics\n\n```bash\ncurl http://localhost:3002/stats\n```\n\nResponse:\n```json\n{\n  \"waiting\": 5,\n  \"active\": 2,\n  \"completed\": 150,\n  \"failed\": 3,\n  \"delayed\": 0\n}\n```\n\n### Retry a Failed Job\n\n```bash\ncurl -X POST http://localhost:3002/jobs/1/retry\n```\n\n### Remove a Job\n\n```bash\ncurl -X DELETE http://localhost:3002/jobs/1\n```\n\n## Configuration\n\n### Environment Variables\n\n| Variable | Default | Description |\n|----------|---------|-------------|\n| `PORT` | 3002 | API server port |\n| `REDIS_URL` | redis://localhost:6379 | Redis connection URL |\n| `WORKER_CONCURRENCY` | 2 | Jobs processed simultaneously |\n\n### Job Options\n\nWhen submitting a job, you can configure:\n\n```json\n{\n  \"urls\": [\"...\"],\n  \"formats\": [\"markdown\", \"html\"],\n  \"webhookUrl\": \"https://...\",\n  \"priority\": 1,\n  \"delay\": 5000\n}\n```\n\n- **priority**: Lower number = higher priority (default: undefined)\n- **delay**: Milliseconds to wait before processing (default: 0)\n\n## Webhook Notifications\n\nWhen a `webhookUrl` is provided, the worker sends notifications:\n\n### Job Completed\n```json\n{\n  \"event\": \"job.completed\",\n  \"jobId\": \"1\",\n  \"timestamp\": \"2024-01-01T00:00:00.000Z\",\n  \"result\": {\n    \"success\": true,\n    \"batchMetadata\": {...},\n    \"urlCount\": 2\n  }\n}\n```\n\n### Job Failed\n```json\n{\n  \"event\": \"job.failed\",\n  \"jobId\": \"1\",\n  \"timestamp\": \"2024-01-01T00:00:00.000Z\",\n  \"error\": \"Timeout waiting for page\"\n}\n```\n\n## Scaling Workers\n\nRun multiple workers to increase throughput:\n\n```bash\n# Terminal 1\nWORKER_CONCURRENCY=4 npm run worker\n\n# Terminal 2\nWORKER_CONCURRENCY=4 npm run worker\n\n# Terminal 3\nWORKER_CONCURRENCY=4 npm run worker\n```\n\nEach worker processes jobs independently. BullMQ ensures no job is processed twice.\n\n## Production Considerations\n\n1. **Redis Persistence**: Configure Redis with AOF or RDB persistence for durability\n2. **Memory Limits**: Set Redis maxmemory to prevent OOM\n3. **Worker Health**: Use process managers like PM2 to restart crashed workers\n4. **Monitoring**: Use BullMQ's built-in dashboard or integrate with observability tools\n5. **Rate Limiting**: The worker is configured to process max 10 jobs/second\n\n## Files\n\n```\njob-queue-bullmq/\n├── README.md           # This file\n├── package.json        # Dependencies\n└── src/\n    ├── index.ts        # API server\n    ├── queue.ts        # Queue configuration\n    └── worker.ts       # Job processor\n```\n"
  },
  {
    "path": "examples/production/job-queue-bullmq/package.json",
    "content": "{\n  \"name\": \"job-queue-bullmq-example\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Async job queue example using BullMQ and Redis\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"start\": \"npx tsx src/index.ts\",\n    \"worker\": \"npx tsx src/worker.ts\",\n    \"dev\": \"concurrently \\\"npm run start\\\" \\\"npm run worker\\\"\"\n  },\n  \"dependencies\": {\n    \"@vakra-dev/reader\": \"file:../../..\",\n    \"@ulixee/hero\": \"^2.0.0-alpha.34\",\n    \"@ulixee/hero-core\": \"^2.0.0-alpha.34\",\n    \"@ulixee/net\": \"^2.0.0-alpha.29\",\n    \"bullmq\": \"^5.0.0\",\n    \"express\": \"^4.18.2\",\n    \"ioredis\": \"^5.3.0\"\n  },\n  \"devDependencies\": {\n    \"@types/express\": \"^4.17.21\",\n    \"@types/node\": \"^20.10.6\",\n    \"concurrently\": \"^8.2.0\",\n    \"tsx\": \"^4.7.0\",\n    \"typescript\": \"^5.3.3\"\n  }\n}\n"
  },
  {
    "path": "examples/production/job-queue-bullmq/src/index.ts",
    "content": "/**\n * Job Queue API Server\n *\n * REST API for submitting and monitoring scrape jobs.\n * Jobs are processed asynchronously by the worker process.\n *\n * Usage: npx tsx src/index.ts\n */\n\nimport express, { Request, Response, NextFunction } from \"express\";\nimport {\n  addScrapeJob,\n  getJob,\n  getQueueStats,\n  scrapeQueue,\n  connection,\n  ScrapeJobData,\n} from \"./queue.js\";\n\nconst app = express();\nconst PORT = process.env.PORT || 3002;\n\n// Middleware\napp.use(express.json({ limit: \"1mb\" }));\n\n// Request logging\napp.use((req: Request, res: Response, next: NextFunction) => {\n  console.log(`[${new Date().toISOString()}] ${req.method} ${req.path}`);\n  next();\n});\n\n// ============================================================================\n// Routes\n// ============================================================================\n\n/**\n * GET /health - Health check\n */\napp.get(\"/health\", async (req: Request, res: Response) => {\n  try {\n    // Check Redis connection\n    await connection.ping();\n\n    const stats = await getQueueStats();\n\n    res.json({\n      status: \"healthy\",\n      timestamp: new Date().toISOString(),\n      queue: stats,\n    });\n  } catch (error: any) {\n    res.status(503).json({\n      status: \"unhealthy\",\n      error: error.message,\n    });\n  }\n});\n\n/**\n * GET /stats - Queue statistics\n */\napp.get(\"/stats\", async (req: Request, res: Response) => {\n  try {\n    const stats = await getQueueStats();\n    res.json(stats);\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n/**\n * POST /jobs - Submit a new scrape job\n *\n * Request body:\n * {\n *   urls: string[]          // Required: URLs to scrape\n *   formats?: string[]      // Optional: Output formats (default: ['markdown'])\n *   webhookUrl?: string     // Optional: URL to notify on completion\n *   priority?: number       // Optional: Job priority (lower = higher priority)\n *   delay?: number          // Optional: Delay in ms before processing\n * }\n */\napp.post(\"/jobs\", async (req: Request, res: Response) => {\n  try {\n    const { urls, formats, webhookUrl, priority, delay } = req.body;\n\n    // Validation\n    if (!urls || !Array.isArray(urls) || urls.length === 0) {\n      return res.status(400).json({\n        error: \"urls is required and must be a non-empty array\",\n      });\n    }\n\n    // Validate URLs\n    for (const url of urls) {\n      try {\n        new URL(url);\n      } catch {\n        return res.status(400).json({\n          error: `Invalid URL: ${url}`,\n        });\n      }\n    }\n\n    // Validate formats if provided\n    const validFormats = [\"markdown\", \"html\"];\n    if (formats) {\n      if (!Array.isArray(formats) || !formats.every((f: string) => validFormats.includes(f))) {\n        return res.status(400).json({\n          error: `formats must be an array of: ${validFormats.join(\", \")}`,\n        });\n      }\n    }\n\n    // Validate webhook URL if provided\n    if (webhookUrl) {\n      try {\n        new URL(webhookUrl);\n      } catch {\n        return res.status(400).json({\n          error: `Invalid webhook URL: ${webhookUrl}`,\n        });\n      }\n    }\n\n    // Create job data\n    const jobData: ScrapeJobData = {\n      urls,\n      formats: formats || [\"markdown\"],\n      webhookUrl,\n      priority,\n    };\n\n    // Add job to queue\n    const jobId = await addScrapeJob(jobData, { priority, delay });\n\n    console.log(`[API] Job ${jobId} created: ${urls.length} URL(s)`);\n\n    res.status(201).json({\n      jobId,\n      status: \"queued\",\n      urls: urls.length,\n      estimatedWait: delay ? `${delay}ms` : undefined,\n    });\n  } catch (error: any) {\n    console.error(\"[API] Error creating job:\", error);\n    res.status(500).json({ error: error.message });\n  }\n});\n\n/**\n * GET /jobs/:id - Get job status and result\n */\napp.get(\"/jobs/:id\", async (req: Request, res: Response) => {\n  try {\n    const job = await getJob(req.params.id);\n\n    if (!job) {\n      return res.status(404).json({ error: \"Job not found\" });\n    }\n\n    const state = await job.getState();\n    const progress = job.progress;\n    const result = job.returnvalue;\n    const failedReason = job.failedReason;\n\n    res.json({\n      id: job.id,\n      state,\n      progress,\n      data: job.data,\n      result: result || undefined,\n      error: failedReason || undefined,\n      timestamps: {\n        created: job.timestamp,\n        processed: job.processedOn,\n        finished: job.finishedOn,\n      },\n      attempts: job.attemptsMade,\n    });\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n/**\n * DELETE /jobs/:id - Cancel/remove a job\n */\napp.delete(\"/jobs/:id\", async (req: Request, res: Response) => {\n  try {\n    const job = await getJob(req.params.id);\n\n    if (!job) {\n      return res.status(404).json({ error: \"Job not found\" });\n    }\n\n    const state = await job.getState();\n\n    if (state === \"active\") {\n      return res.status(400).json({\n        error: \"Cannot remove active job. Wait for it to complete or fail.\",\n      });\n    }\n\n    await job.remove();\n\n    res.json({\n      message: \"Job removed\",\n      id: req.params.id,\n    });\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n/**\n * POST /jobs/:id/retry - Retry a failed job\n */\napp.post(\"/jobs/:id/retry\", async (req: Request, res: Response) => {\n  try {\n    const job = await getJob(req.params.id);\n\n    if (!job) {\n      return res.status(404).json({ error: \"Job not found\" });\n    }\n\n    const state = await job.getState();\n\n    if (state !== \"failed\") {\n      return res.status(400).json({\n        error: `Cannot retry job in state: ${state}. Only failed jobs can be retried.`,\n      });\n    }\n\n    await job.retry();\n\n    res.json({\n      message: \"Job retried\",\n      id: req.params.id,\n      newState: \"waiting\",\n    });\n  } catch (error: any) {\n    res.status(500).json({ error: error.message });\n  }\n});\n\n// ============================================================================\n// Error handling\n// ============================================================================\n\napp.use((err: Error, req: Request, res: Response) => {\n  console.error(\"[API Error]\", err);\n  res.status(500).json({ error: err.message || \"Internal server error\" });\n});\n\n// 404 handler\napp.use((req: Request, res: Response) => {\n  res.status(404).json({ error: `Not found: ${req.method} ${req.path}` });\n});\n\n// ============================================================================\n// Start server\n// ============================================================================\n\nasync function startServer() {\n  try {\n    // Test Redis connection\n    await connection.ping();\n    console.log(\"[API] Redis connected\");\n\n    app.listen(PORT, () => {\n      console.log(`\n╔════════════════════════════════════════════════════════════════╗\n║       Reader - Job Queue API                            ║\n╠════════════════════════════════════════════════════════════════╣\n║  Server running on http://localhost:${PORT}                    ║\n╠════════════════════════════════════════════════════════════════╣\n║  Endpoints:                                                    ║\n║    GET  /health        - Health check with queue stats         ║\n║    GET  /stats         - Queue statistics                      ║\n║    POST /jobs          - Submit a new scrape job               ║\n║    GET  /jobs/:id      - Get job status and result             ║\n║    DELETE /jobs/:id    - Remove a job                          ║\n║    POST /jobs/:id/retry - Retry a failed job                   ║\n╠════════════════════════════════════════════════════════════════╣\n║  Note: Start the worker separately with: npm run worker        ║\n╚════════════════════════════════════════════════════════════════╝\n      `);\n    });\n\n    // Graceful shutdown\n    const shutdown = async () => {\n      console.log(\"\\n[API] Shutting down...\");\n      await scrapeQueue.close();\n      await connection.quit();\n      process.exit(0);\n    };\n\n    process.on(\"SIGINT\", shutdown);\n    process.on(\"SIGTERM\", shutdown);\n  } catch (error: any) {\n    console.error(\"[API] Failed to start:\", error.message);\n    process.exit(1);\n  }\n}\n\nstartServer();\n"
  },
  {
    "path": "examples/production/job-queue-bullmq/src/queue.ts",
    "content": "/**\n * Queue Configuration\n *\n * Defines the BullMQ queue and job types for async scraping.\n */\n\nimport { Queue } from \"bullmq\";\nimport IORedis from \"ioredis\";\n\n// Redis connection (shared across queue and workers)\nexport const connection = new IORedis(process.env.REDIS_URL || \"redis://localhost:6379\", {\n  maxRetriesPerRequest: null, // Required by BullMQ\n});\n\n// Scrape job queue\nexport const scrapeQueue = new Queue(\"scrape\", {\n  connection,\n  defaultJobOptions: {\n    attempts: 3,\n    backoff: {\n      type: \"exponential\",\n      delay: 1000,\n    },\n    removeOnComplete: {\n      age: 3600, // Keep completed jobs for 1 hour\n      count: 1000, // Keep last 1000 completed jobs\n    },\n    removeOnFail: {\n      age: 86400, // Keep failed jobs for 24 hours\n    },\n  },\n});\n\n/**\n * Scrape job input data\n */\nexport interface ScrapeJobData {\n  /** URLs to scrape */\n  urls: string[];\n  /** Output formats */\n  formats: string[];\n  /** Optional webhook URL to notify on completion */\n  webhookUrl?: string;\n  /** Optional priority (lower = higher priority) */\n  priority?: number;\n}\n\n/**\n * Scrape job result\n */\nexport interface ScrapeJobResult {\n  success: boolean;\n  data?: {\n    batchMetadata: {\n      totalUrls: number;\n      successfulUrls: number;\n      failedUrls: number;\n      totalDurationMs: number;\n    };\n    results: Array<{\n      url: string;\n      success: boolean;\n      markdown?: string;\n      html?: string;\n      json?: object;\n      error?: string;\n    }>;\n  };\n  error?: string;\n}\n\n/**\n * Add a scrape job to the queue\n */\nexport async function addScrapeJob(\n  data: ScrapeJobData,\n  options?: { priority?: number; delay?: number }\n): Promise<string> {\n  const job = await scrapeQueue.add(\"scrape\", data, {\n    priority: options?.priority ?? data.priority,\n    delay: options?.delay,\n  });\n  return job.id!;\n}\n\n/**\n * Get job by ID\n */\nexport async function getJob(jobId: string) {\n  return scrapeQueue.getJob(jobId);\n}\n\n/**\n * Get queue statistics\n */\nexport async function getQueueStats() {\n  const [waiting, active, completed, failed, delayed] = await Promise.all([\n    scrapeQueue.getWaitingCount(),\n    scrapeQueue.getActiveCount(),\n    scrapeQueue.getCompletedCount(),\n    scrapeQueue.getFailedCount(),\n    scrapeQueue.getDelayedCount(),\n  ]);\n\n  return { waiting, active, completed, failed, delayed };\n}\n"
  },
  {
    "path": "examples/production/job-queue-bullmq/src/worker.ts",
    "content": "/**\n * Scrape Worker\n *\n * Processes scrape jobs from the BullMQ queue.\n * Run this as a separate process from the API server.\n *\n * Usage: npx tsx src/worker.ts\n */\n\nimport { Worker, Job } from \"bullmq\";\nimport { ReaderClient } from \"@vakra-dev/reader\";\nimport { connection, ScrapeJobData, ScrapeJobResult } from \"./queue.js\";\n\n// Shared ReaderClient instance\nlet reader: ReaderClient | null = null;\n\n/**\n * Process a scrape job\n */\nasync function processJob(job: Job<ScrapeJobData>): Promise<ScrapeJobResult> {\n  const { urls, formats, webhookUrl } = job.data;\n\n  console.log(`[Worker] Processing job ${job.id}: ${urls.length} URL(s)`);\n\n  if (!reader) {\n    throw new Error(\"ReaderClient not initialized\");\n  }\n\n  try {\n    // Update progress: starting\n    await job.updateProgress(10);\n\n    // Perform scrape\n    const result = await reader.scrape({\n      urls,\n      formats: formats as Array<\"markdown\" | \"html\">,\n    });\n\n    // Update progress: scraping complete\n    await job.updateProgress(80);\n\n    // Send webhook notification if configured\n    if (webhookUrl) {\n      try {\n        await fetch(webhookUrl, {\n          method: \"POST\",\n          headers: { \"Content-Type\": \"application/json\" },\n          body: JSON.stringify({\n            event: \"job.completed\",\n            jobId: job.id,\n            timestamp: new Date().toISOString(),\n            result: {\n              success: true,\n              batchMetadata: result.batchMetadata,\n              urlCount: urls.length,\n            },\n          }),\n        });\n        console.log(`[Worker] Webhook sent to ${webhookUrl}`);\n      } catch (webhookError) {\n        console.error(`[Worker] Webhook failed:`, webhookError);\n        // Don't fail the job if webhook fails\n      }\n    }\n\n    // Update progress: complete\n    await job.updateProgress(100);\n\n    console.log(\n      `[Worker] Job ${job.id} completed: ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls} successful`\n    );\n\n    return {\n      success: true,\n      data: {\n        batchMetadata: {\n          totalUrls: result.batchMetadata.totalUrls,\n          successfulUrls: result.batchMetadata.successfulUrls,\n          failedUrls: result.batchMetadata.failedUrls,\n          totalDurationMs: result.batchMetadata.totalDuration,\n        },\n        results: result.data.map((r) => ({\n          url: r.metadata.baseUrl,\n          success: true,\n          markdown: r.markdown,\n          html: r.html,\n        })),\n      },\n    };\n  } catch (error: any) {\n    console.error(`[Worker] Job ${job.id} failed:`, error.message);\n\n    // Send failure webhook if configured\n    if (webhookUrl) {\n      try {\n        await fetch(webhookUrl, {\n          method: \"POST\",\n          headers: { \"Content-Type\": \"application/json\" },\n          body: JSON.stringify({\n            event: \"job.failed\",\n            jobId: job.id,\n            timestamp: new Date().toISOString(),\n            error: error.message,\n          }),\n        });\n      } catch {\n        // Ignore webhook errors on failure\n      }\n    }\n\n    throw error; // Re-throw to mark job as failed\n  }\n}\n\n/**\n * Start the worker\n */\nasync function startWorker() {\n  console.log(\"[Worker] Starting ReaderClient...\");\n\n  // Initialize ReaderClient\n  reader = new ReaderClient({ verbose: true });\n  await reader.start();\n\n  console.log(\"[Worker] ReaderClient started\");\n\n  // Create worker\n  const worker = new Worker<ScrapeJobData, ScrapeJobResult>(\"scrape\", processJob, {\n    connection,\n    concurrency: parseInt(process.env.WORKER_CONCURRENCY || \"2\"),\n    limiter: {\n      max: 10,\n      duration: 1000, // Max 10 jobs per second\n    },\n  });\n\n  // Event handlers\n  worker.on(\"completed\", (job) => {\n    console.log(`[Worker] Job ${job.id} completed successfully`);\n  });\n\n  worker.on(\"failed\", (job, error) => {\n    console.error(`[Worker] Job ${job?.id} failed:`, error.message);\n  });\n\n  worker.on(\"error\", (error) => {\n    console.error(\"[Worker] Worker error:\", error);\n  });\n\n  console.log(`\n╔════════════════════════════════════════════════════════════════╗\n║       Reader - BullMQ Worker                            ║\n╠════════════════════════════════════════════════════════════════╣\n║  Worker started and listening for jobs                         ║\n║  Concurrency: ${process.env.WORKER_CONCURRENCY || \"2\"} jobs                                          ║\n║  Redis: ${process.env.REDIS_URL || \"redis://localhost:6379\"}                            ║\n╚════════════════════════════════════════════════════════════════╝\n  `);\n\n  // Graceful shutdown\n  const shutdown = async () => {\n    console.log(\"\\n[Worker] Shutting down...\");\n\n    // Close worker (waits for active jobs to complete)\n    await worker.close();\n\n    // Close ReaderClient\n    if (reader) {\n      await reader.close();\n    }\n\n    // Close Redis connection\n    await connection.quit();\n\n    console.log(\"[Worker] Shutdown complete\");\n    process.exit(0);\n  };\n\n  process.on(\"SIGINT\", shutdown);\n  process.on(\"SIGTERM\", shutdown);\n}\n\n// Start worker\nstartWorker().catch((error) => {\n  console.error(\"[Worker] Failed to start:\", error);\n  process.exit(1);\n});\n"
  },
  {
    "path": "examples/tsconfig.json",
    "content": "{\n  \"compilerOptions\": {\n    \"ignoreDeprecations\": \"6.0\",\n    \"target\": \"ESNext\",\n    \"module\": \"ESNext\",\n    \"moduleResolution\": \"bundler\",\n    \"lib\": [\"ESNext\"],\n    \"baseUrl\": \"..\",\n    \"paths\": {\n      \"@vakra-dev/reader\": [\"./src/index.ts\"]\n    },\n    \"strict\": true,\n    \"esModuleInterop\": true,\n    \"allowSyntheticDefaultImports\": true,\n    \"skipLibCheck\": true,\n    \"noEmit\": true,\n    \"resolveJsonModule\": true,\n    \"types\": [\"node\"]\n  },\n  \"include\": [\n    \"basic/**/*.ts\",\n    \"ai-tools/**/*.ts\",\n    \"production/**/*.ts\",\n    \"deployment/**/*.ts\"\n  ],\n  \"exclude\": [\"node_modules\"]\n}\n"
  },
  {
    "path": "package.json",
    "content": "{\n  \"name\": \"@vakra-dev/reader\",\n  \"version\": \"0.2.0\",\n  \"description\": \"Open source, production grade web scraping engine for LLMs. Clean markdown output, ready for your agents.\",\n  \"license\": \"Apache-2.0\",\n  \"type\": \"module\",\n  \"main\": \"./dist/index.js\",\n  \"types\": \"./dist/index.d.ts\",\n  \"bin\": {\n    \"reader\": \"./dist/cli/index.js\"\n  },\n  \"exports\": {\n    \".\": {\n      \"import\": \"./dist/index.js\",\n      \"types\": \"./dist/index.d.ts\"\n    }\n  },\n  \"files\": [\n    \"dist\",\n    \"README.md\",\n    \"LICENSE\"\n  ],\n  \"keywords\": [\n    \"web-scraper\",\n    \"web-crawler\",\n    \"markdown\",\n    \"llm\",\n    \"rag\",\n    \"ai-agents\",\n    \"headless-browser\",\n    \"typescript\",\n    \"nodejs\",\n    \"web-data-extraction\",\n    \"content-extraction\",\n    \"html-to-markdown\",\n    \"web-scraping\",\n    \"browser-automation\",\n    \"cdp\",\n    \"ai\"\n  ],\n  \"author\": \"Nihal <nihal.codes@gmail.com>\",\n  \"repository\": {\n    \"type\": \"git\",\n    \"url\": \"https://github.com/vakra-dev/reader.git\"\n  },\n  \"scripts\": {\n    \"start\": \"node dist/cli/index.js\",\n    \"daemon\": \"node dist/cli/index.js start --port 6003\",\n    \"lint\": \"eslint src/\",\n    \"lint:fix\": \"eslint src/ --fix\",\n    \"format\": \"prettier --write 'src/**/*.ts'\",\n    \"format:check\": \"prettier --check 'src/**/*.ts'\",\n    \"todo\": \"leasot 'src/**/*.ts'\",\n    \"test\": \"vitest run\",\n    \"test:watch\": \"vitest\",\n    \"typecheck\": \"tsc --noEmit\",\n    \"build\": \"tsup\",\n    \"build:tsc\": \"tsc\",\n    \"dev\": \"tsup --watch\",\n    \"clean\": \"rm -rf dist\",\n    \"prepublishOnly\": \"npm run clean && npm run build\"\n  },\n  \"dependencies\": {\n    \"@ulixee/chrome-139-0\": \"^7258.155.11\",\n    \"@ulixee/hero\": \"^2.0.0-alpha.34\",\n    \"@ulixee/hero-core\": \"^2.0.0-alpha.34\",\n    \"@ulixee/net\": \"^2.0.0-alpha.34\",\n    \"@vakra-dev/supermarkdown\": \"^0.0.6\",\n    \"commander\": \"^12.0.0\",\n    \"dotenv\": \"^17.4.1\",\n    \"linkedom\": \"^0.18.12\",\n    \"p-limit\": \"^4.0.0\",\n    \"pino\": \"^9.0.0\",\n    \"pino-pretty\": \"^13.1.3\",\n    \"re2\": \"^1.23.0\",\n    \"undici\": \"^7.24.7\"\n  },\n  \"devDependencies\": {\n    \"@types/node\": \"^20.10.6\",\n    \"@types/selenium-webdriver\": \"^4.35.5\",\n    \"@typescript-eslint/eslint-plugin\": \"^7.0.0\",\n    \"@typescript-eslint/parser\": \"^7.0.0\",\n    \"chromedriver\": \"^147.0.4\",\n    \"eslint\": \"^8.57.0\",\n    \"leasot\": \"^13.3.0\",\n    \"playwright-core\": \"^1.59.1\",\n    \"prettier\": \"^3.2.0\",\n    \"puppeteer-core\": \"^24.42.0\",\n    \"selenium-webdriver\": \"^4.43.0\",\n    \"tsup\": \"^8.5.1\",\n    \"typescript\": \"^5.3.3\",\n    \"vitest\": \"^4.1.0\"\n  },\n  \"engines\": {\n    \"node\": \">=18\"\n  }\n}\n"
  },
  {
    "path": "result.md",
    "content": "{\n  \"data\": [\n    {\n      \"markdown\": \"Example Domain\\n\\n# Example Domain\\n\\nThis domain is for use in documentation examples without needing permission. Avoid use in operations.\\n\\n[Learn more](https://iana.org/domains/example)\",\n      \"metadata\": {\n        \"baseUrl\": \"https://example.com\",\n        \"totalPages\": 1,\n        \"scrapedAt\": \"2026-02-02T01:43:05.132Z\",\n        \"duration\": 256,\n        \"website\": {\n          \"title\": \"Example Domain\",\n          \"description\": null,\n          \"author\": null,\n          \"language\": \"en\",\n          \"charset\": null,\n          \"favicon\": \"https://example.com/favicon.ico\",\n          \"canonical\": null,\n          \"image\": null,\n          \"keywords\": null,\n          \"robots\": null,\n          \"themeColor\": null,\n          \"openGraph\": null,\n          \"twitter\": null\n        }\n      }\n    }\n  ],\n  \"batchMetadata\": {\n    \"totalUrls\": 1,\n    \"successfulUrls\": 1,\n    \"failedUrls\": 0,\n    \"scrapedAt\": \"2026-02-02T01:43:05.132Z\",\n    \"totalDuration\": 260,\n    \"errors\": []\n  }\n}"
  },
  {
    "path": "scripts/release.sh",
    "content": "#!/usr/bin/env bash\n#\n# Release script for reader\n#\n# Usage:\n#   ./scripts/release.sh 0.2.0\n#   ./scripts/release.sh 0.2.0 --dry-run\n#\n# What it does (in order):\n#   1. Validates: clean working tree, on main, tag doesn't exist\n#   2. Bumps version in package.json + package-lock.json\n#   3. Runs all checks (typecheck, lint, format, test, build)\n#   4. If checks fail: reverts version bump, exits\n#   5. If checks pass: commits, tags, pushes commit+tag, creates release\n#\n# Nothing is pushed until all checks pass. Dry run never modifies files.\n#\n\nset -euo pipefail\n\nVERSION=\"${1:-}\"\nDRY_RUN=\"${2:-}\"\n\nif [ -z \"$VERSION\" ]; then\n  echo \"Usage: ./scripts/release.sh <version> [--dry-run]\"\n  echo \"Example: ./scripts/release.sh 0.2.0\"\n  exit 1\nfi\n\nif ! echo \"$VERSION\" | grep -qE '^[0-9]+\\.[0-9]+\\.[0-9]+$'; then\n  echo \"Error: Version must be in X.Y.Z format, got: $VERSION\"\n  exit 1\nfi\n\nTAG=\"v$VERSION\"\nREPO_ROOT=\"$(cd \"$(dirname \"$0\")/..\" && pwd)\"\ncd \"$REPO_ROOT\"\n\n# Load nvm if available\nexport NVM_DIR=\"${NVM_DIR:-$HOME/.nvm}\"\n[ -s \"$NVM_DIR/nvm.sh\" ] && . \"$NVM_DIR/nvm.sh\"\nnvm use v22 > /dev/null 2>&1 || true\n\necho \"=== reader release $TAG ===\"\necho \"\"\n\n# ─── Preflight ────────────────────────────────────────────────────────\n\nif ! command -v gh &>/dev/null; then\n  echo \"Error: GitHub CLI (gh) is required. Install: brew install gh\"\n  exit 1\nfi\n\nBRANCH=$(git branch --show-current)\nif [ \"$BRANCH\" != \"main\" ]; then\n  echo \"Error: Must be on main branch (currently on $BRANCH)\"\n  exit 1\nfi\n\nif [ -n \"$(git status --porcelain)\" ]; then\n  echo \"Error: Working tree is dirty. Commit or stash changes first.\"\n  git status --short\n  exit 1\nfi\n\nif git rev-parse \"$TAG\" &>/dev/null; then\n  echo \"Error: Tag $TAG already exists\"\n  exit 1\nfi\n\nCURRENT_VERSION=$(node -p \"require('./package.json').version\")\necho \"Current: $CURRENT_VERSION\"\necho \"Release: $VERSION\"\necho \"\"\n\nif [ \"$DRY_RUN\" = \"--dry-run\" ]; then\n  echo \"[DRY RUN] No files will be modified.\"\n  echo \"\"\nfi\n\n# ─── Step 1: Bump version ────────────────────────────────────────────\n\necho \"[1/5] Bumping version...\"\nif [ \"$DRY_RUN\" != \"--dry-run\" ]; then\n  npm version \"$VERSION\" --no-git-tag-version --allow-same-version > /dev/null\nfi\necho \"  $CURRENT_VERSION -> $VERSION\"\n\n# ─── Step 2: Run all checks ──────────────────────────────────────────\n\necho \"[2/5] Running checks...\"\n\n# If any check fails, revert the version bump before exiting\nrevert_on_failure() {\n  if [ \"$DRY_RUN\" != \"--dry-run\" ]; then\n    git checkout -- package.json package-lock.json 2>/dev/null || true\n    echo \"\"\n    echo \"  Version bump reverted. Fix the issue and re-run.\"\n  fi\n}\ntrap revert_on_failure ERR\n\necho \"  Typecheck...\"\nnpx tsc --noEmit\n\necho \"  Lint...\"\nnpm run lint > /dev/null 2>&1\n\necho \"  Format...\"\nnpm run format:check > /dev/null 2>&1\n\necho \"  Test...\"\nTEST_OUTPUT=$(npm test 2>&1)\necho \"$TEST_OUTPUT\" | grep -E \"Test Files|Tests \" | sed 's/^/  /'\n\necho \"  Build...\"\nnpm run build > /dev/null 2>&1\n\ntrap - ERR\necho \"  All checks passed.\"\n\n# ─── Step 3: Commit + tag ────────────────────────────────────────────\n\necho \"[3/5] Committing...\"\nif [ \"$DRY_RUN\" = \"--dry-run\" ]; then\n  echo \"  Would commit: chore: release $TAG\"\nelse\n  git add package.json package-lock.json\n  git commit -m \"chore: release $TAG\"\n  git tag \"$TAG\"\n  echo \"  Committed and tagged $TAG\"\nfi\n\n# ─── Step 4: Push ────────────────────────────────────────────────────\n\necho \"[4/5] Pushing...\"\nif [ \"$DRY_RUN\" = \"--dry-run\" ]; then\n  echo \"  Would push main + $TAG\"\nelse\n  git push origin main --tags --no-verify\n  echo \"  Pushed main + $TAG\"\nfi\n\n# ─── Step 5: GitHub release ──────────────────────────────────────────\n\necho \"[5/5] Creating release...\"\n\nPREV_TAG=$(git describe --tags --abbrev=0 \"$TAG^\" 2>/dev/null || echo \"\")\nif [ -n \"$PREV_TAG\" ]; then\n  NOTES=$(git log \"$PREV_TAG..$TAG\" --pretty=format:\"- %s\" --no-merges)\nelse\n  NOTES=\"Initial release\"\nfi\n\nif [ \"$DRY_RUN\" = \"--dry-run\" ]; then\n  echo \"  Would create release $TAG with notes:\"\n  echo \"$NOTES\" | sed 's/^/    /'\n  echo \"\"\n  echo \"[DRY RUN] Nothing was modified.\"\nelse\n  gh release create \"$TAG\" --title \"$TAG\" --notes \"$NOTES\"\n  echo \"  https://github.com/vakra-dev/reader/releases/tag/$TAG\"\nfi\n\necho \"\"\necho \"=== Done ===\"\n"
  },
  {
    "path": "src/browser/hero-config.ts",
    "content": "import type { ProxyConfig } from \"../types\";\nimport { createProxyUrl } from \"../proxy/config\";\n\n/**\n * Hero configuration options\n */\nexport interface HeroConfigOptions {\n  /** Proxy configuration */\n  proxy?: ProxyConfig;\n  /** Show Chrome window (default: false) */\n  showChrome?: boolean;\n  /** IANA timezone ID to match proxy exit location (default: America/New_York) */\n  timezoneId?: string;\n  /** Connection to Core (for in-process Core) */\n  connectionToCore?: any;\n  /**\n   * Custom user agent string. Overrides Hero's default emulated UA.\n   *\n   * WARNING: Hero's default UA is matched to the Chromium TLS fingerprint.\n   * Overriding it can cause TLS/UA mismatches that anti-bot systems detect.\n   * Only set this if you know the target site doesn't check TLS fingerprints.\n   */\n  userAgent?: string;\n}\n\n/**\n * Create Hero configuration with optimal anti-bot bypass settings\n *\n * Extracted from proven hero-test implementation.\n * Includes:\n * - TLS fingerprint emulation (disableMitm: false)\n * - DNS over TLS (mimics Chrome)\n * - WebRTC IP masking\n * - Proper locale and timezone\n *\n * @param options - Configuration options\n * @returns Hero configuration object\n */\nexport function createHeroConfig(options: HeroConfigOptions = {}): any {\n  const config: any = {\n    // Show or hide Chrome window\n    showChrome: options.showChrome ?? false,\n\n    // ============================================================================\n    // CRITICAL: TLS fingerprint emulation\n    // ============================================================================\n    // Setting disableMitm to false enables TLS/TCP fingerprint emulation\n    // This is ESSENTIAL for bypassing Cloudflare and other anti-bot systems\n    disableMitm: false,\n\n    // ============================================================================\n    // Session management\n    // ============================================================================\n    // Use incognito for clean session state\n    disableIncognito: false,\n\n    // ============================================================================\n    // Docker compatibility\n    // ============================================================================\n    // Required when running in containerized environments\n    noChromeSandbox: true,\n\n    // ============================================================================\n    // DNS over TLS (mimics Chrome behavior)\n    // ============================================================================\n    // Using Cloudflare's DNS (1.1.1.1) over TLS makes the connection\n    // look more like a real Chrome browser\n    dnsOverTlsProvider: {\n      host: \"1.1.1.1\",\n      servername: \"cloudflare-dns.com\",\n    },\n\n    // ============================================================================\n    // WebRTC IP leak prevention\n    // ============================================================================\n    // Masks the real IP address in WebRTC connections\n    // Uses ipify.org to detect the public IP\n    upstreamProxyIpMask: {\n      ipLookupService: \"https://api.ipify.org?format=json\",\n    },\n\n    // ============================================================================\n    // Locale and timezone\n    // ============================================================================\n    locale: \"en-US\",\n    timezoneId: options.proxy?.timezoneId ?? options.timezoneId ?? \"America/New_York\",\n\n    // ============================================================================\n    // Viewport (standard desktop size)\n    // ============================================================================\n    viewport: {\n      width: 1920,\n      height: 1080,\n    },\n\n    // ============================================================================\n    // Connection to Core (if provided)\n    // ============================================================================\n    ...(options.connectionToCore && { connectionToCore: options.connectionToCore }),\n\n    // ============================================================================\n    // User agent override (if provided)\n    // ============================================================================\n    ...(options.userAgent && { userAgentString: options.userAgent }),\n  };\n\n  // ============================================================================\n  // Proxy configuration\n  // ============================================================================\n  if (options.proxy) {\n    config.upstreamProxyUrl = createProxyUrl(options.proxy);\n    // Don't use system DNS when using proxy\n    config.upstreamProxyUseSystemDns = false;\n  }\n\n  return config;\n}\n\n/**\n * Default Hero configuration (no proxy)\n */\nexport function getDefaultHeroConfig(): any {\n  return createHeroConfig();\n}\n"
  },
  {
    "path": "src/browser/pool.ts",
    "content": "import Hero from \"@ulixee/hero\";\nimport { createHeroConfig } from \"./hero-config\";\nimport type {\n  BrowserInstance,\n  QueueItem,\n  PoolConfig,\n  PoolStats,\n  HealthStatus,\n  IBrowserPool,\n} from \"./types\";\nimport type { ProxyConfig } from \"../types\";\nimport { createLogger } from \"../utils/logger\";\n\n/**\n * Default pool configuration\n */\nconst DEFAULT_POOL_CONFIG: PoolConfig = {\n  size: 2,\n  retireAfterPageCount: 100,\n  retireAfterAgeMs: 30 * 60 * 1000, // 30 minutes\n  recycleCheckInterval: 60 * 1000, // 1 minute\n  healthCheckInterval: 5 * 60 * 1000, // 5 minutes\n  maxConsecutiveFailures: 3,\n  maxQueueSize: 100,\n  queueTimeout: 60 * 1000, // 1 minute\n};\n\n/**\n * Generate unique ID\n */\nfunction generateId(): string {\n  return `browser_${Date.now()}_${Math.random().toString(36).slice(2, 9)}`;\n}\n\n/**\n * Browser Pool\n *\n * Manages a pool of Hero browser instances with:\n * - Auto-recycling based on age/request count\n * - Request queuing when pool is full\n * - Health monitoring\n *\n * @example\n * const pool = new BrowserPool({ size: 5 });\n * await pool.initialize();\n *\n * // Use withBrowser for automatic acquire/release\n * await pool.withBrowser(async (hero) => {\n *   await hero.goto('https://example.com');\n *   const title = await hero.document.title;\n *   return title;\n * });\n *\n * await pool.shutdown();\n */\nexport class BrowserPool implements IBrowserPool {\n  private instances: BrowserInstance[] = [];\n  private available: BrowserInstance[] = [];\n  private inUse: Set<BrowserInstance> = new Set();\n  private queue: QueueItem[] = [];\n  private config: PoolConfig;\n  private proxy?: ProxyConfig;\n  private recycleTimer?: NodeJS.Timeout;\n  private healthTimer?: NodeJS.Timeout;\n  private totalRequests = 0;\n  private totalRequestDuration = 0;\n  private showChrome: boolean;\n  private connectionToCore?: any;\n  private verbose: boolean;\n  private logger = createLogger(\"pool\");\n\n  constructor(\n    config: Partial<PoolConfig> = {},\n    proxy?: ProxyConfig,\n    showChrome: boolean = false,\n    connectionToCore?: any,\n    _userAgent?: string,\n    verbose: boolean = false\n  ) {\n    this.config = { ...DEFAULT_POOL_CONFIG, ...config };\n    this.proxy = proxy;\n    this.showChrome = showChrome;\n    this.connectionToCore = connectionToCore;\n    this.verbose = verbose;\n  }\n\n  /**\n   * Initialize the pool by pre-launching browsers\n   */\n  async initialize(): Promise<void> {\n    if (this.verbose) {\n      this.logger.info(`Initializing pool with ${this.config.size} browsers...`);\n    }\n\n    // Pre-launch browsers\n    const launchPromises: Promise<BrowserInstance>[] = [];\n    for (let i = 0; i < this.config.size; i++) {\n      launchPromises.push(this.createInstance());\n    }\n\n    this.instances = await Promise.all(launchPromises);\n    this.available = [...this.instances];\n\n    // Start background tasks\n    this.startRecycling();\n    this.startHealthChecks();\n\n    if (this.verbose) {\n      this.logger.info(`Pool ready: ${this.instances.length} browsers available`);\n    }\n  }\n\n  /**\n   * Shutdown the pool and close all browsers\n   */\n  async shutdown(): Promise<void> {\n    if (this.verbose) {\n      const stats = this.getStats();\n      this.logger.info(\n        `Shutting down pool: ${stats.totalRequests} total requests processed, ` +\n          `${Math.round(stats.avgRequestDuration)}ms avg duration`\n      );\n    }\n\n    // Stop background tasks\n    if (this.recycleTimer) clearInterval(this.recycleTimer);\n    if (this.healthTimer) clearInterval(this.healthTimer);\n\n    // Reject all queued requests\n    for (const item of this.queue) {\n      item.reject(new Error(\"Pool shutting down\"));\n    }\n    this.queue = [];\n\n    // Close all browsers\n    const closePromises = this.instances.map((instance) => instance.hero.close().catch(() => {}));\n    await Promise.all(closePromises);\n\n    // Disconnect the connection to core to release event listeners\n    if (this.connectionToCore) {\n      try {\n        await this.connectionToCore.disconnect();\n      } catch {\n        // Ignore disconnect errors\n      }\n      this.connectionToCore = undefined;\n    }\n\n    // Clear instances\n    this.instances = [];\n    this.available = [];\n    this.inUse.clear();\n  }\n\n  /**\n   * Acquire a browser from the pool\n   */\n  async acquire(): Promise<Hero> {\n    // Get available instance\n    const instance = this.available.shift();\n    if (!instance) {\n      // No available instances, queue the request\n      if (this.verbose) {\n        this.logger.info(\n          `No browsers available, queuing request (queue: ${this.queue.length + 1})`\n        );\n      }\n      return this.queueRequest();\n    }\n\n    // Mark as busy\n    instance.status = \"busy\";\n    instance.lastUsed = Date.now();\n    this.inUse.add(instance);\n\n    if (this.verbose) {\n      this.logger.info(\n        `Acquired browser ${instance.id} (available: ${this.available.length}, busy: ${this.inUse.size})`\n      );\n    }\n\n    return instance.hero;\n  }\n\n  /**\n   * Release a browser back to the pool\n   */\n  release(hero: Hero): void {\n    const instance = this.instances.find((i) => i.hero === hero);\n    if (!instance) return;\n\n    // Update stats\n    instance.status = \"idle\";\n    instance.requestCount++;\n    this.inUse.delete(instance);\n\n    if (this.verbose) {\n      this.logger.info(\n        `Released browser ${instance.id} (requests: ${instance.requestCount}, available: ${this.available.length + 1})`\n      );\n    }\n\n    // Check if needs recycling\n    if (this.shouldRecycle(instance)) {\n      if (this.verbose) {\n        this.logger.info(`Recycling browser ${instance.id} (age or request limit reached)`);\n      }\n      this.recycleInstance(instance).catch(() => {});\n    } else {\n      this.available.push(instance);\n      this.processQueue();\n    }\n  }\n\n  /**\n   * Execute callback with auto-managed browser\n   */\n  async withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T> {\n    const startTime = Date.now();\n    const hero = await this.acquire();\n\n    try {\n      const result = await callback(hero);\n\n      // Update request stats\n      this.totalRequests++;\n      this.totalRequestDuration += Date.now() - startTime;\n\n      return result;\n    } finally {\n      this.release(hero);\n    }\n  }\n\n  /**\n   * Get pool statistics\n   */\n  getStats(): PoolStats {\n    const recycling = this.instances.filter((i) => i.status === \"recycling\").length;\n    const unhealthy = this.instances.filter((i) => i.status === \"unhealthy\").length;\n\n    return {\n      total: this.instances.length,\n      available: this.available.length,\n      busy: this.inUse.size,\n      recycling,\n      unhealthy,\n      queueLength: this.queue.length,\n      totalRequests: this.totalRequests,\n      avgRequestDuration:\n        this.totalRequests > 0 ? this.totalRequestDuration / this.totalRequests : 0,\n    };\n  }\n\n  /**\n   * Run health check\n   */\n  async healthCheck(): Promise<HealthStatus> {\n    const issues: string[] = [];\n    const stats = this.getStats();\n\n    // Check for unhealthy instances\n    if (stats.unhealthy > 0) {\n      issues.push(`${stats.unhealthy} unhealthy instances`);\n    }\n\n    // Check queue size\n    if (stats.queueLength > this.config.maxQueueSize * 0.8) {\n      issues.push(`Queue near capacity: ${stats.queueLength}/${this.config.maxQueueSize}`);\n    }\n\n    // Check if pool is saturated\n    if (stats.available === 0 && stats.queueLength > 0) {\n      issues.push(\"Pool saturated - all browsers busy with pending requests\");\n    }\n\n    return {\n      healthy: issues.length === 0,\n      issues,\n      stats,\n    };\n  }\n\n  // =========================================================================\n  // Private methods\n  // =========================================================================\n\n  /**\n   * Create a new browser instance\n   */\n  private async createInstance(): Promise<BrowserInstance> {\n    const heroConfig = createHeroConfig({\n      proxy: this.proxy,\n      showChrome: this.showChrome,\n      connectionToCore: this.connectionToCore,\n    });\n\n    const hero = new Hero(heroConfig);\n\n    return {\n      hero,\n      id: generateId(),\n      createdAt: Date.now(),\n      lastUsed: Date.now(),\n      requestCount: 0,\n      status: \"idle\",\n    };\n  }\n\n  /**\n   * Check if instance should be recycled\n   */\n  private shouldRecycle(instance: BrowserInstance): boolean {\n    const age = Date.now() - instance.createdAt;\n    return (\n      instance.requestCount >= this.config.retireAfterPageCount ||\n      age >= this.config.retireAfterAgeMs\n    );\n  }\n\n  /**\n   * Recycle an instance (close old, create new)\n   */\n  private async recycleInstance(instance: BrowserInstance): Promise<void> {\n    instance.status = \"recycling\";\n\n    try {\n      // Close old instance\n      await instance.hero.close().catch(() => {});\n\n      // Create new instance\n      const newInstance = await this.createInstance();\n\n      // Replace in instances array\n      const index = this.instances.indexOf(instance);\n      if (index !== -1) {\n        this.instances[index] = newInstance;\n      }\n\n      // Add to available pool\n      this.available.push(newInstance);\n\n      if (this.verbose) {\n        this.logger.info(`Recycled browser: ${instance.id} → ${newInstance.id}`);\n      }\n\n      // Process queue\n      this.processQueue();\n    } catch (error) {\n      // Failed to recycle, mark as unhealthy\n      instance.status = \"unhealthy\";\n      if (this.verbose) {\n        this.logger.warn(`Failed to recycle browser ${instance.id}`);\n      }\n    }\n  }\n\n  /**\n   * Queue a request when no browsers available\n   */\n  private queueRequest(): Promise<Hero> {\n    return new Promise<Hero>((resolve, reject) => {\n      // Check queue size\n      if (this.queue.length >= this.config.maxQueueSize) {\n        reject(new Error(\"Queue full\"));\n        return;\n      }\n\n      // Add to queue\n      const item: QueueItem = {\n        resolve,\n        reject,\n        queuedAt: Date.now(),\n      };\n      this.queue.push(item);\n\n      // Set timeout\n      setTimeout(() => {\n        const index = this.queue.indexOf(item);\n        if (index !== -1) {\n          this.queue.splice(index, 1);\n          reject(new Error(\"Queue timeout\"));\n        }\n      }, this.config.queueTimeout);\n    });\n  }\n\n  /**\n   * Process queued requests\n   */\n  private processQueue(): void {\n    while (this.queue.length > 0 && this.available.length > 0) {\n      const item = this.queue.shift()!;\n\n      // Check if still valid (not timed out)\n      const age = Date.now() - item.queuedAt;\n      if (age > this.config.queueTimeout) {\n        item.reject(new Error(\"Queue timeout\"));\n        continue;\n      }\n\n      // Acquire and resolve\n      this.acquire().then(item.resolve).catch(item.reject);\n    }\n  }\n\n  /**\n   * Start background recycling task\n   */\n  private startRecycling(): void {\n    this.recycleTimer = setInterval(() => {\n      for (const instance of this.instances) {\n        if (instance.status === \"idle\" && this.shouldRecycle(instance)) {\n          this.recycleInstance(instance).catch(() => {});\n        }\n      }\n    }, this.config.recycleCheckInterval);\n    // Allow process to exit even if timer is still running\n    this.recycleTimer.unref();\n  }\n\n  /**\n   * Start background health checks\n   */\n  private startHealthChecks(): void {\n    this.healthTimer = setInterval(async () => {\n      const health = await this.healthCheck();\n      if (!health.healthy && health.issues.length > 0) {\n        console.warn(\"[BrowserPool] Health issues:\", health.issues);\n      }\n    }, this.config.healthCheckInterval);\n    // Allow process to exit even if timer is still running\n    this.healthTimer.unref();\n  }\n}\n\n// Backward compatibility alias\nexport { BrowserPool as HeroBrowserPool };\n"
  },
  {
    "path": "src/browser/proxy-bound-browser.ts",
    "content": "/**\n * ProxyBoundBrowser — a single Hero instance pinned to exactly one proxy URL.\n *\n * This is the per-IP unit of the new TieredBrowserPool. Each instance owns:\n *   - one Hero process (launched with `upstreamProxyUrl` = this.proxyUrl)\n *   - a deterministic fingerprint derived from the proxy URL\n *   - an internal pLimit gate that caps concurrent `withPage` calls\n *   - a four-state lifecycle (launching / active / retired / closed)\n *\n * Design rules (from the architecture review with Nihal):\n *   1. 1 IP = 1 Hero process. Never two browsers on the same proxy URL —\n *      the TieredBrowserPool enforces the 1:1 map above us.\n *   2. Max 2 concurrent tabs per browser by default. This is the per-browser\n *      mirror of the scraper-level PerProxyGate cap.\n *   3. Fingerprint is paired with the proxy, not random per request. Hero is\n *      launched with a stable UA derived from `hash(proxyUrl) -> USER_AGENTS`.\n *   4. Retirement drains. Calling `retire()` stops accepting new work, lets\n *      in-flight tabs finish, then hard-closes Hero. The returned Promise\n *      resolves once the browser is truly gone.\n *   5. Relaunch keeps the binding. When the health tracker revives a proxy\n *      or when the page-count threshold triggers recycling, `relaunch()`\n *      closes the old Hero and starts a fresh one through the same proxy\n *      with the same fingerprint. The browser's identity is the proxy URL,\n *      not the Hero process.\n *\n * Test seam: the constructor accepts a `HeroFactory` injection so unit tests\n * can pass a fake Hero without launching a real Chromium process. Production\n * callers use `createDefaultHeroFactory()` which imports `@ulixee/hero`.\n */\n\nimport pLimit from \"p-limit\";\nimport { createHeroConfig } from \"./hero-config\";\nimport { createLogger, type Logger } from \"../utils/logger\";\n\n/**\n * The subset of a Hero Tab that callers of `withPage` interact with.\n * Kept minimal so tests can fake it. At runtime this is a real\n * `@ulixee/hero` Tab object with goto, document, waitForLoad, etc.\n */\nexport interface TabLike {\n  goto(href: string, options?: { timeoutMs?: number; referrer?: string }): Promise<unknown>;\n  get url(): Promise<string>;\n  get document(): unknown;\n  waitForLoad(status: string, options?: { timeoutMs?: number }): Promise<void>;\n  waitForPaintingStable(options?: { timeoutMs?: number }): Promise<void>;\n  waitForElement(element: unknown, options?: { timeoutMs?: number }): Promise<unknown>;\n  close(): Promise<void>;\n}\n\n/**\n * The subset of the Hero API that ProxyBoundBrowser relies on. Kept minimal\n * so tests can fake it without importing @ulixee/hero.\n */\nexport interface HeroLike {\n  newTab(): Promise<TabLike>;\n  closeTab(tab: TabLike): Promise<void>;\n  close(): Promise<void>;\n}\n\n/**\n * Factory for Hero instances. Production uses `createDefaultHeroFactory()`\n * which lazily imports @ulixee/hero; tests inject a fake that returns a\n * mock Hero.\n */\nexport interface HeroFactory {\n  create(config: Record<string, unknown>): HeroLike;\n  /**\n   * Optional async initializer. Production factory uses this to\n   * `await import(\"@ulixee/hero\")` before the first `create()` call.\n   * Test factories can omit it (they don't need async loading).\n   */\n  init?(): Promise<void>;\n}\n\n/**\n * Lazy-loaded real Hero factory. `@ulixee/hero` is a heavy dependency; we\n * only import it when first actually asked to create a browser, so unit\n * tests that stick to the fake factory don't pay the import cost.\n *\n * Uses dynamic `import()` because the project runs as ESM (via tsx).\n * `require()` is not available in ESM context.\n */\nexport function createDefaultHeroFactory(): HeroFactory {\n  let HeroCtor: new (config: Record<string, unknown>) => HeroLike;\n  return {\n    create(config) {\n      if (!HeroCtor) {\n        throw new Error(\"HeroFactory: Hero constructor not loaded yet. Call factory.init() first.\");\n      }\n      return new HeroCtor(config);\n    },\n    /**\n     * Pre-load the Hero constructor. Must be called (and awaited) once\n     * before the first `create()` call. The TieredBrowserPool constructor\n     * can't be async, so we expose this as a separate init step that the\n     * caller (ReaderClient.initializeCore) awaits before building the pool.\n     */\n    async init() {\n      if (!HeroCtor) {\n        const mod = await import(\"@ulixee/hero\");\n        HeroCtor = mod.default;\n      }\n    },\n  };\n}\n\n/**\n * Lifecycle state of a ProxyBoundBrowser.\n */\nexport type BrowserState = \"launching\" | \"active\" | \"retired\" | \"closed\";\n\n/**\n * Stats snapshot. Not a full StatsPool type — just what we need for logging\n * and tests.\n */\nexport interface ProxyBoundBrowserStats {\n  proxyUrl: string | null;\n  state: BrowserState;\n  activeTabs: number;\n  totalPages: number;\n  createdAt: number;\n  fingerprintIndex: number;\n}\n\n/**\n * Options for a ProxyBoundBrowser.\n */\nexport interface ProxyBoundBrowserOptions {\n  /**\n   * The proxy URL this browser is bound to. `null` represents the direct\n   * lane (no proxy — the browser scrapes from the host's own IP).\n   */\n  proxyUrl: string | null;\n\n  /** IANA timezone ID for this proxy's exit location (e.g., 'America/Los_Angeles') */\n  timezoneId?: string;\n\n  /**\n   * Max concurrent `withPage` calls allowed on this browser. Default: 2.\n   * This is the \"N tabs per browser\" knob — matches the scraper-level\n   * PerProxyGate cap by default, and can be tightened per-domain via\n   * domain profiles that set `maxConcurrentPerProxy: 1`.\n   */\n  maxTabs?: number;\n\n  /**\n   * Retire (drain + relaunch) after this many total `withPage` calls. Fresh\n   * Chromium processes prevent memory leaks. Default: 100.\n   */\n  retireAfterPages?: number;\n\n  /**\n   * Factory to create the underlying Hero instance. Defaults to the real\n   * `@ulixee/hero` import. Tests pass a fake.\n   */\n  heroFactory?: HeroFactory;\n\n  /**\n   * Show the Chrome window. Forwarded to `createHeroConfig`.\n   */\n  showChrome?: boolean;\n\n  /**\n   * A shared Hero `connectionToCore`. Optional — when present, every Hero\n   * created by this browser is routed through the same HeroCore, which is\n   * how ReaderClient currently shares one Core across many browsers.\n   */\n  connectionToCore?: unknown;\n\n  /**\n   * Custom user agent string. Overrides Hero's default emulated UA.\n   * WARNING: Can cause TLS/UA mismatches that anti-bot systems detect.\n   */\n  userAgent?: string;\n\n  /**\n   * Logger. Defaults to a fresh \"proxy-bound-browser\" logger. Tests can pass\n   * a silent logger to keep output clean.\n   */\n  logger?: Logger;\n\n  /**\n   * Clock. Defaults to `Date.now`. Tests inject a fake clock to keep\n   * `createdAt` deterministic.\n   */\n  now?: () => number;\n}\n\n/**\n * A single Hero instance bound to exactly one proxy URL.\n */\nexport class ProxyBoundBrowser {\n  readonly proxyUrl: string | null;\n  readonly timezoneId: string | undefined;\n  readonly maxTabs: number;\n  readonly retireAfterPages: number;\n  readonly createdAt: number;\n\n  private state: BrowserState = \"launching\";\n  private totalPages = 0;\n  private recycling = false;\n  private readonly limit: ReturnType<typeof pLimit>;\n  private readonly heroFactory: HeroFactory;\n  private readonly heroConfig: Record<string, unknown>;\n  private readonly logger: Logger;\n  private readonly now: () => number;\n  private hero: HeroLike | null = null;\n\n  /**\n   * Resolves when the Hero instance is ready for use. Rejects if launch\n   * fails. Callers should `await browser.ready` before their first `withPage`.\n   */\n  readonly ready: Promise<void>;\n  private resolveReady!: () => void;\n  private rejectReady!: (err: Error) => void;\n\n  /**\n   * Resolves when the browser is fully closed (drained and Hero.close()\n   * has returned). A fresh Promise is created on each `relaunch()`.\n   */\n  private closedDeferred: { promise: Promise<void>; resolve: () => void };\n\n  constructor(options: ProxyBoundBrowserOptions) {\n    this.proxyUrl = options.proxyUrl;\n    this.timezoneId = options.timezoneId;\n    this.maxTabs = options.maxTabs ?? 2;\n    this.retireAfterPages = options.retireAfterPages ?? 100;\n    this.heroFactory = options.heroFactory ?? createDefaultHeroFactory();\n    this.logger = options.logger ?? createLogger(\"proxy-bound-browser\");\n    this.now = options.now ?? Date.now;\n    this.createdAt = this.now();\n\n    if (!Number.isInteger(this.maxTabs) || this.maxTabs < 1) {\n      throw new Error(`ProxyBoundBrowser: maxTabs must be an integer >= 1, got ${this.maxTabs}`);\n    }\n    if (!Number.isInteger(this.retireAfterPages) || this.retireAfterPages < 1) {\n      throw new Error(\n        `ProxyBoundBrowser: retireAfterPages must be an integer >= 1, got ${this.retireAfterPages}`\n      );\n    }\n\n    this.limit = pLimit(this.maxTabs);\n    // Build the Hero config once. Proxy URL and timezone are burned in\n    // at construction — if you want a different proxy, make a different\n    // ProxyBoundBrowser.\n    //\n    // By default we do NOT override userAgent — Hero's default-browser-emulator\n    // picks a UA that matches the Chromium TLS/TCP fingerprint and platform.\n    // Overriding it can cause TLS/UA mismatches that anti-bot systems detect.\n    // Only pass userAgent if the caller explicitly set it.\n    this.heroConfig = createHeroConfig({\n      proxy: this.proxyUrl ? { url: this.proxyUrl, timezoneId: this.timezoneId } : undefined,\n      showChrome: options.showChrome ?? false,\n      timezoneId: this.timezoneId,\n      connectionToCore: options.connectionToCore,\n      userAgent: options.userAgent,\n    });\n\n    this.ready = new Promise<void>((resolve, reject) => {\n      this.resolveReady = resolve;\n      this.rejectReady = reject;\n    });\n\n    this.closedDeferred = makeDeferred<void>();\n\n    // Kick off the launch. We don't await it here — callers await\n    // `this.ready` explicitly. This lets the pool create N browsers in\n    // parallel and wait on all their ready promises with one Promise.all.\n    void this.launch();\n  }\n\n  /**\n   * Get the current lifecycle state. Read-only from outside the class.\n   */\n  getState(): BrowserState {\n    return this.state;\n  }\n\n  /**\n   * Whether this browser is accepting new work. Returns true only in the\n   * `active` state.\n   */\n  isAvailable(): boolean {\n    return this.state === \"active\";\n  }\n\n  /**\n   * Number of in-flight `withPage` calls on this browser. Used by the\n   * TieredBrowserPool to pick the least-loaded browser for a new request.\n   */\n  getActiveTabs(): number {\n    return this.limit.activeCount;\n  }\n\n  /**\n   * Stats snapshot for logging and /status.\n   */\n  getStats(): ProxyBoundBrowserStats {\n    return {\n      proxyUrl: this.proxyUrl,\n      state: this.state,\n      activeTabs: this.limit.activeCount,\n      totalPages: this.totalPages,\n      createdAt: this.createdAt,\n      fingerprintIndex: 0,\n    };\n  }\n\n  /**\n   * Execute `fn` with the Hero instance. Acquires an internal tab slot;\n   * at most `maxTabs` calls can be running at once. Throws if the browser\n   * is not in the `active` state when `fn` is scheduled to run — callers\n   * who want to wait for launch should await `ready` first.\n   *\n   * Increments `totalPages` after `fn` completes (success or failure). If\n   * the post-completion count hits `retireAfterPages`, triggers `retire()`\n   * in the background.\n   */\n  async withPage<T>(fn: (tab: TabLike) => Promise<T>): Promise<T> {\n    if (this.state === \"closed\") {\n      throw new Error(\n        `ProxyBoundBrowser[${redactProxyUrl(this.proxyUrl)}]: cannot withPage on closed browser`\n      );\n    }\n    if (this.state === \"retired\") {\n      throw new Error(\n        `ProxyBoundBrowser[${redactProxyUrl(this.proxyUrl)}]: cannot withPage on retired browser`\n      );\n    }\n\n    // Wait for launch to complete (no-op if already active). If launch\n    // failed, `ready` has already rejected and the state is `closed`.\n    await this.ready;\n\n    // After awaiting ready, the browser might have been retired — re-check.\n    if (this.state !== \"active\") {\n      throw new Error(\n        `ProxyBoundBrowser[${redactProxyUrl(this.proxyUrl)}]: browser became ${this.state} before withPage could run`\n      );\n    }\n\n    return this.limit(async () => {\n      // Re-check inside the limit — another in-flight withPage may have\n      // triggered retirement.\n      if (this.state !== \"active\" || !this.hero) {\n        throw new Error(\n          `ProxyBoundBrowser[${redactProxyUrl(this.proxyUrl)}]: browser became unavailable while waiting for tab slot`\n        );\n      }\n\n      // Open a fresh tab in the warm Hero browser. Each tab gets a clean\n      // navigation context — no leftover JS state from previous scrapes.\n      // The Hero instance (= Chromium process) stays alive across scrapes;\n      // only the tab is created and destroyed per call.\n      const tab = await this.hero.newTab();\n\n      try {\n        return await fn(tab);\n      } finally {\n        // Close the tab to free Chromium resources. Swallow errors —\n        // the scrape result is already captured.\n        try {\n          await this.hero.closeTab(tab);\n        } catch {\n          /* swallow */\n        }\n        this.totalPages += 1;\n        // If we hit the recycle threshold, kick off retire+relaunch in the\n        // background. The `recycling` flag prevents two concurrent handlers\n        // from both triggering a relaunch when they all cross the threshold\n        // together. `retire()` inside relaunch will drain the remaining\n        // in-flight tabs before closing.\n        if (\n          this.state === \"active\" &&\n          !this.recycling &&\n          this.totalPages >= this.retireAfterPages\n        ) {\n          this.recycling = true;\n          // Schedule via setImmediate so the current task fully exits the\n          // pLimit slot before relaunch starts draining — otherwise we'd\n          // deadlock on ourselves (drainLimit waits for activeCount to hit\n          // 0, but we're still in a pLimit task).\n          setImmediate(() => {\n            void this.relaunch()\n              .catch((err) => {\n                this.logger.error({ err, proxy: redactProxyUrl(this.proxyUrl) }, \"recycle failed\");\n              })\n              .finally(() => {\n                this.recycling = false;\n              });\n          });\n        }\n      }\n    });\n  }\n\n  /**\n   * Gracefully drain and close the browser. Stops accepting new work. In-\n   * flight tabs run to completion. Returns a Promise that resolves once the\n   * underlying Hero is closed. After this resolves, `withPage` will throw.\n   *\n   * Safe to call multiple times — subsequent calls return the same promise.\n   */\n  async retire(): Promise<void> {\n    if (this.state === \"closed\") return;\n    if (this.state === \"retired\") {\n      return this.closedDeferred.promise;\n    }\n    this.state = \"retired\";\n    this.logger.debug(\n      { proxy: redactProxyUrl(this.proxyUrl), activeTabs: this.limit.activeCount },\n      \"retiring browser\"\n    );\n\n    // Drain: wait until the limit has 0 active and 0 pending.\n    await this.drainLimit();\n\n    // Close the underlying Hero. Swallow errors — we're shutting down\n    // anyway and a failed close shouldn't block the caller.\n    if (this.hero) {\n      try {\n        await this.hero.close();\n      } catch (err) {\n        this.logger.warn(\n          { err, proxy: redactProxyUrl(this.proxyUrl) },\n          \"error while closing Hero during retire\"\n        );\n      }\n      this.hero = null;\n    }\n\n    this.state = \"closed\";\n    this.closedDeferred.resolve();\n    return this.closedDeferred.promise;\n  }\n\n  /**\n   * Retire and relaunch with the same proxy URL and fingerprint. Used for:\n   *   - Recycling after `retireAfterPages`\n   *   - Reviving a proxy that was benched and then cleared the cooldown\n   *   - Recovering from a Hero crash (launch fails → state goes closed →\n   *     the pool can call relaunch to try again)\n   *\n   * Resets `totalPages` to 0. Creates a fresh `ready` promise so callers\n   * can await the new Hero.\n   */\n  async relaunch(): Promise<void> {\n    // Tear down current instance if any.\n    if (this.state !== \"closed\") {\n      await this.retire();\n    }\n\n    // Reset state for a fresh launch.\n    this.state = \"launching\";\n    this.totalPages = 0;\n    this.closedDeferred = makeDeferred<void>();\n\n    // Create a new ready promise. The old one is already resolved/rejected\n    // so overwriting it is safe — callers who held a reference to the old\n    // `ready` just see the old outcome.\n    (this as { ready: Promise<void> }).ready = new Promise<void>((resolve, reject) => {\n      this.resolveReady = resolve;\n      this.rejectReady = reject;\n    });\n\n    void this.launch();\n    await this.ready;\n  }\n\n  /**\n   * Launch the underlying Hero instance. Called by the constructor and by\n   * `relaunch`. On failure, marks the browser as closed and rejects the\n   * ready promise — the pool can then call relaunch to retry.\n   */\n  private async launch(): Promise<void> {\n    try {\n      this.logger.debug({ proxy: redactProxyUrl(this.proxyUrl) }, \"launching browser\");\n      // Ensure the factory has loaded its constructor (async import for ESM).\n      // No-op for test factories that don't define init().\n      if (this.heroFactory.init) {\n        await this.heroFactory.init();\n      }\n      this.hero = this.heroFactory.create(this.heroConfig);\n      this.state = \"active\";\n      this.resolveReady();\n    } catch (err) {\n      this.state = \"closed\";\n      this.closedDeferred.resolve();\n      this.logger.error({ err, proxy: redactProxyUrl(this.proxyUrl) }, \"browser launch failed\");\n      this.rejectReady(err instanceof Error ? err : new Error(String(err)));\n    }\n  }\n\n  /**\n   * Wait until `limit` has no active or pending tasks. Polls — there's no\n   * \"all done\" event in p-limit, but the wait is short in practice (a few\n   * in-flight scrapes finish their current navigation).\n   */\n  private async drainLimit(): Promise<void> {\n    while (this.limit.activeCount > 0 || this.limit.pendingCount > 0) {\n      await new Promise((r) => setImmediate(r));\n    }\n  }\n}\n\n/**\n * Redact credentials from a proxy URL for logging. `http://user:pass@host:port`\n * becomes `http://***@host:port`. Never log the raw URL — it contains secrets.\n */\nexport function redactProxyUrl(proxyUrl: string | null): string {\n  if (!proxyUrl) return \"direct\";\n  try {\n    const u = new URL(proxyUrl);\n    const creds = u.username ? \"***@\" : \"\";\n    return `${u.protocol}//${creds}${u.host}`;\n  } catch {\n    // Malformed URL — at least don't accidentally dump credentials.\n    return \"<invalid-proxy-url>\";\n  }\n}\n\n/**\n * Tiny deferred helper — creates a promise together with its resolve/reject\n * handles, so we can resolve from inside an async method without wrapping.\n */\nfunction makeDeferred<T>(): { promise: Promise<T>; resolve: (v: T) => void } {\n  let resolve!: (v: T) => void;\n  const promise = new Promise<T>((r) => (resolve = r));\n  return { promise, resolve };\n}\n"
  },
  {
    "path": "src/browser/tiered-pool.ts",
    "content": "/**\n * TieredBrowserPool — the top-level browser pool for Reader.\n *\n * Composes N ProxyBoundBrowser instances grouped by tier\n * (datacenter / residential / direct), with one browser per proxy URL. The\n * pool owns the lifecycle of its browsers: it pre-warms every browser at\n * startup, routes `acquire(tier)` to the least-loaded healthy browser in\n * that tier, and reacts to `proxy-benched` / `proxy-revived` events from the\n * injected ProxyHealthTracker by retiring or relaunching browsers.\n *\n * Architecture rules (from the design review):\n *   - 1 proxy URL = 1 ProxyBoundBrowser. Never two browsers on the same URL.\n *   - Browsers are pre-warmed at startup in parallel (Promise.all across all\n *     ready promises). `ready` on the pool resolves when all browsers have\n *     reported ready — success or failure — so the daemon can fail loud at\n *     startup via a separate `api.ipify.org` verification step.\n *   - `acquire(tier)` picks the least-loaded healthy browser in the tier. If\n *     none exist, it throws — callers should check `hasTier(tier)` first or\n *     handle the error as a tier-unavailable case (e.g., fall back to a\n *     different tier or return a structured error to the API).\n *   - The direct tier is only populated when no proxies are configured at\n *     all (see `buildFromPools` below). Mixing direct with proxies is a\n *     config error that leaks your real IP.\n *\n * This is *not* a drop-in replacement for the old `BrowserPool` — the API is\n * new (`acquire(tier)` instead of `withBrowser(fn)`). The scraper and hero\n * engine are updated separately in a later phase to use this shape.\n */\n\nimport {\n  ProxyBoundBrowser,\n  type HeroFactory,\n  type ProxyBoundBrowserOptions,\n  type ProxyBoundBrowserStats,\n  redactProxyUrl,\n} from \"./proxy-bound-browser\";\nimport type { ProxyHealthTracker } from \"../proxy/health-tracker\";\nimport { createLogger, type Logger } from \"../utils/logger\";\n\n/**\n * The three tiers we support. `direct` is only populated when there are no\n * configured proxies (local dev, CI without secrets).\n */\nexport type PoolTier = \"datacenter\" | \"residential\" | \"direct\";\n\n/**\n * Input to the pool: a tier name and the list of proxy URLs for that tier.\n *\n * A null URL inside `direct` represents the actual direct connection. For\n * `datacenter` and `residential`, the URLs are real proxy URLs.\n */\nexport interface TierConfig {\n  tier: PoolTier;\n  proxyUrls: Array<string | null>;\n  /** Map of proxy URL -> IANA timezone ID for Hero fingerprint consistency. */\n  timezones?: Record<string, string>;\n}\n\n/**\n * Options for the TieredBrowserPool.\n */\nexport interface TieredBrowserPoolOptions {\n  /**\n   * The tiers and their proxy URLs. Use `buildFromPools()` helper to\n   * convert a ProxyPoolConfig into this shape.\n   */\n  tiers: TierConfig[];\n\n  /**\n   * Max concurrent tabs per browser. Default: 2. Matches the scraper-level\n   * PerProxyGate default; the two layers together give us defence in depth.\n   */\n  maxTabsPerBrowser?: number;\n\n  /**\n   * Page-count threshold for browser recycling. Default: 100 (matches the\n   * old pool).\n   */\n  retireAfterPages?: number;\n\n  /**\n   * Optional ProxyHealthTracker. When supplied, the pool subscribes to its\n   * `proxy-benched` and `proxy-revived` events: benched proxies get their\n   * browser retired, revived proxies get a fresh one launched. Without a\n   * tracker, the pool ignores proxy health and relies purely on the\n   * scraper's retry loop.\n   */\n  healthTracker?: ProxyHealthTracker;\n\n  /**\n   * Factory for Hero instances. Passed through to every ProxyBoundBrowser.\n   * Tests inject a fake; production leaves it undefined (uses the real\n   * `@ulixee/hero`).\n   */\n  heroFactory?: HeroFactory;\n\n  /**\n   * Show Chrome window. Forwarded to every browser.\n   */\n  showChrome?: boolean;\n\n  /**\n   * Shared Hero `connectionToCore`. One HeroCore shared across all browsers\n   * avoids spinning up N Core processes.\n   */\n  connectionToCore?: unknown;\n\n  /**\n   * Custom user agent string. Forwarded to every browser.\n   * Overrides Hero's default emulated UA.\n   */\n  userAgent?: string;\n\n  /**\n   * Logger. Defaults to a fresh \"tiered-pool\" logger.\n   */\n  logger?: Logger;\n}\n\n/**\n * A result from `acquire(tier)`. Callers should `await lease.ready` (no-op\n * if already ready) and then use `lease.withPage(fn)` for the actual work.\n * Release is implicit — withPage releases its own slot.\n */\nexport interface BrowserLease {\n  /** The ProxyBoundBrowser you're using. */\n  browser: ProxyBoundBrowser;\n  /** The tier it was leased from. */\n  tier: PoolTier;\n}\n\n/**\n * Stats for a tier.\n */\nexport interface TierStats {\n  tier: PoolTier;\n  browsers: ProxyBoundBrowserStats[];\n}\n\n/**\n * Stats for the whole pool.\n */\nexport interface PoolStatsSnapshot {\n  tiers: TierStats[];\n}\n\n/**\n * The pool.\n */\nexport class TieredBrowserPool {\n  private readonly tiers = new Map<PoolTier, Map<string, ProxyBoundBrowser>>();\n  private readonly healthTracker?: ProxyHealthTracker;\n  private readonly maxTabsPerBrowser: number;\n  private readonly retireAfterPages: number;\n  private readonly heroFactory?: HeroFactory;\n  private readonly showChrome: boolean;\n  private readonly connectionToCore?: unknown;\n  private readonly userAgent?: string;\n  private readonly logger: Logger;\n  /** Keyed by proxy URL (\"\" for null/direct) -> tier, so event handlers can find the right tier. */\n  private readonly proxyToTier = new Map<string, PoolTier>();\n  private closed = false;\n\n  /**\n   * Resolves when every browser has completed its initial launch attempt\n   * (success or failure). Success failures are NOT thrown here — this is\n   * not the health check, it's the \"pre-warm finished\" gate. The separate\n   * `api.ipify.org` verification step in daemon startup is responsible for\n   * actually validating that traffic flows through each proxy.\n   */\n  readonly ready: Promise<void>;\n\n  constructor(options: TieredBrowserPoolOptions) {\n    this.maxTabsPerBrowser = options.maxTabsPerBrowser ?? 2;\n    this.retireAfterPages = options.retireAfterPages ?? 100;\n    this.healthTracker = options.healthTracker;\n    this.heroFactory = options.heroFactory;\n    this.showChrome = options.showChrome ?? false;\n    this.connectionToCore = options.connectionToCore;\n    this.userAgent = options.userAgent;\n    this.logger = options.logger ?? createLogger(\"tiered-pool\");\n\n    // Build every browser up front. No lazy launch.\n    const readyPromises: Promise<unknown>[] = [];\n\n    for (const tierConfig of options.tiers) {\n      const map = new Map<string, ProxyBoundBrowser>();\n      for (const proxyUrl of tierConfig.proxyUrls) {\n        const key = proxyUrlKey(proxyUrl);\n        if (map.has(key)) {\n          this.logger.warn(\n            { proxy: redactProxyUrl(proxyUrl), tier: tierConfig.tier },\n            \"duplicate proxy URL in tier; skipping duplicate\"\n          );\n          continue;\n        }\n        const timezoneId = proxyUrl ? tierConfig.timezones?.[proxyUrl] : undefined;\n        const browser = this.createBrowser(proxyUrl, timezoneId);\n        map.set(key, browser);\n        this.proxyToTier.set(key, tierConfig.tier);\n        // Swallow per-browser launch failures — one dead browser shouldn't\n        // block the pool's ready promise. The startup health check in the\n        // daemon is responsible for failing loud.\n        readyPromises.push(\n          browser.ready.catch((err) => {\n            this.logger.error(\n              { err, proxy: redactProxyUrl(proxyUrl), tier: tierConfig.tier },\n              \"browser failed to launch during pool startup\"\n            );\n          })\n        );\n      }\n      this.tiers.set(tierConfig.tier, map);\n    }\n\n    this.ready = Promise.all(readyPromises).then(() => undefined);\n\n    // Subscribe to health events if a tracker was provided.\n    if (this.healthTracker) {\n      this.attachHealthListeners(this.healthTracker);\n    }\n  }\n\n  /**\n   * Acquire the least-loaded healthy browser from a tier. Does NOT hold a\n   * lock — the caller must invoke `lease.browser.withPage(fn)` to actually\n   * run something, and `withPage` takes the tab slot.\n   *\n   * Throws if the tier has no browsers at all, or if every browser in the\n   * tier is unavailable (launching, retired, closed, or benched). Callers\n   * should catch and either fall back to another tier or return a structured\n   * error.\n   */\n  acquire(tier: PoolTier): BrowserLease {\n    if (this.closed) {\n      throw new Error(\"TieredBrowserPool: pool is closed\");\n    }\n    const map = this.tiers.get(tier);\n    if (!map || map.size === 0) {\n      throw new Error(`TieredBrowserPool: no browsers configured for tier \"${tier}\"`);\n    }\n\n    // Pick least-loaded among browsers that are active (not launching,\n    // retired, closed) and — if we have a tracker — healthy.\n    let best: ProxyBoundBrowser | null = null;\n    let bestLoad = Infinity;\n\n    for (const browser of map.values()) {\n      if (!browser.isAvailable()) continue;\n      if (this.healthTracker && !this.healthTracker.isHealthy(browser.proxyUrl ?? \"\")) {\n        continue;\n      }\n      const load = browser.getActiveTabs();\n      if (load < bestLoad) {\n        best = browser;\n        bestLoad = load;\n      }\n    }\n\n    if (!best) {\n      throw new Error(\n        `TieredBrowserPool: no available browsers in tier \"${tier}\" ` +\n          `(all launching, retired, or benched)`\n      );\n    }\n\n    return { browser: best, tier };\n  }\n\n  /**\n   * Whether this tier has any configured browsers (not whether they're\n   * available right now). Useful for caller-side tier fallback logic.\n   */\n  hasTier(tier: PoolTier): boolean {\n    const map = this.tiers.get(tier);\n    return !!map && map.size > 0;\n  }\n\n  /**\n   * Look up the browser bound to a specific proxy URL, regardless of tier.\n   * Returns null if no such browser exists. Used by the Hero engine when\n   * the scraper has already resolved a proxy URL and needs the exact\n   * browser bound to it.\n   */\n  getBrowserByProxy(proxyUrl: string | null): ProxyBoundBrowser | null {\n    const tier = this.proxyToTier.get(proxyUrlKey(proxyUrl));\n    if (!tier) return null;\n    const map = this.tiers.get(tier);\n    if (!map) return null;\n    return map.get(proxyUrlKey(proxyUrl)) ?? null;\n  }\n\n  /**\n   * Snapshot stats for every browser in every tier.\n   */\n  getStats(): PoolStatsSnapshot {\n    const tiers: TierStats[] = [];\n    for (const [tier, map] of this.tiers.entries()) {\n      const browsers: ProxyBoundBrowserStats[] = [];\n      for (const browser of map.values()) {\n        browsers.push(browser.getStats());\n      }\n      tiers.push({ tier, browsers });\n    }\n    return { tiers };\n  }\n\n  /**\n   * Shut down the whole pool. Retires every browser in parallel.\n   */\n  async close(): Promise<void> {\n    if (this.closed) return;\n    this.closed = true;\n    const retirements: Promise<void>[] = [];\n    for (const map of this.tiers.values()) {\n      for (const browser of map.values()) {\n        retirements.push(browser.retire().catch(() => undefined));\n      }\n    }\n    await Promise.all(retirements);\n  }\n\n  /**\n   * Create a fresh ProxyBoundBrowser with the pool's shared config.\n   */\n  private createBrowser(proxyUrl: string | null, timezoneId?: string): ProxyBoundBrowser {\n    const opts: ProxyBoundBrowserOptions = {\n      proxyUrl,\n      timezoneId,\n      maxTabs: this.maxTabsPerBrowser,\n      retireAfterPages: this.retireAfterPages,\n      heroFactory: this.heroFactory,\n      showChrome: this.showChrome,\n      connectionToCore: this.connectionToCore,\n      userAgent: this.userAgent,\n      logger: this.logger,\n    };\n    return new ProxyBoundBrowser(opts);\n  }\n\n  /**\n   * Wire up event listeners on the ProxyHealthTracker so the pool reacts to\n   * runtime bench/revive signals:\n   *\n   *   proxy-benched  -> retire() the corresponding browser (drain + close).\n   *                     The browser stays in the map but is in the \"closed\"\n   *                     state, so acquire() will skip it.\n   *\n   *   proxy-revived  -> relaunch() the corresponding browser, restoring it\n   *                     to \"active\" with a fresh Hero process.\n   */\n  private attachHealthListeners(tracker: ProxyHealthTracker): void {\n    tracker.on(\"proxy-benched\", ({ proxyUrl }) => {\n      const browser = this.getBrowserByProxy(proxyUrl);\n      if (!browser) return;\n      this.logger.warn({ proxy: redactProxyUrl(proxyUrl) }, \"proxy benched, retiring browser\");\n      void browser.retire().catch((err) => {\n        this.logger.error(\n          { err, proxy: redactProxyUrl(proxyUrl) },\n          \"failed to retire benched browser\"\n        );\n      });\n    });\n\n    tracker.on(\"proxy-revived\", ({ proxyUrl }) => {\n      const browser = this.getBrowserByProxy(proxyUrl);\n      if (!browser) return;\n      this.logger.info({ proxy: redactProxyUrl(proxyUrl) }, \"proxy revived, relaunching browser\");\n      void browser.relaunch().catch((err) => {\n        this.logger.error(\n          { err, proxy: redactProxyUrl(proxyUrl) },\n          \"failed to relaunch revived browser\"\n        );\n      });\n    });\n  }\n}\n\n/**\n * Build a TieredBrowserPool config from the existing ProxyPoolConfig shape\n * used by the daemon's env parser. Applies the rule:\n *\n *   - If datacenter OR residential proxies are configured, the direct tier\n *     is EMPTY. We never leak the host IP when proxies exist.\n *   - If no proxies are configured anywhere, create a single direct browser\n *     (sized by `directPoolSize`, default 1).\n *\n * This matches the mental model we agreed on earlier in the design review.\n */\nexport function buildTierConfigsFromPools(\n  pools:\n    | {\n        datacenter?: Array<{ url?: string; timezoneId?: string }>;\n        residential?: Array<{ url?: string; timezoneId?: string }>;\n      }\n    | undefined,\n  opts: { directPoolSize?: number } = {}\n): TierConfig[] {\n  const directSize = opts.directPoolSize ?? 1;\n\n  function extract(list: Array<{ url?: string; timezoneId?: string }> | undefined) {\n    const urls: string[] = [];\n    const timezones: Record<string, string> = {};\n    for (const p of list ?? []) {\n      const url = p.url ?? \"\";\n      if (url.length === 0) continue;\n      urls.push(url);\n      if (p.timezoneId) timezones[url] = p.timezoneId;\n    }\n    return { urls, timezones: Object.keys(timezones).length > 0 ? timezones : undefined };\n  }\n\n  const dc = extract(pools?.datacenter);\n  const res = extract(pools?.residential);\n\n  const tiers: TierConfig[] = [];\n\n  if (dc.urls.length > 0 || res.urls.length > 0) {\n    if (dc.urls.length > 0) {\n      tiers.push({ tier: \"datacenter\", proxyUrls: dc.urls, timezones: dc.timezones });\n    }\n    if (res.urls.length > 0) {\n      tiers.push({ tier: \"residential\", proxyUrls: res.urls, timezones: res.timezones });\n    }\n    // No direct tier when proxies exist — direct: 0.\n  } else {\n    // No proxies configured anywhere. Spin up a direct-only pool.\n    const directUrls: Array<string | null> = Array.from({ length: directSize }, () => null);\n    tiers.push({ tier: \"direct\", proxyUrls: directUrls });\n  }\n\n  return tiers;\n}\n\n/**\n * Canonical key for a proxy URL in the pool's maps. null/undefined collapse\n * to the empty string so the direct lane has a stable key.\n */\nfunction proxyUrlKey(proxyUrl: string | null | undefined): string {\n  return proxyUrl ?? \"\";\n}\n\n/**\n * Re-export TabLike so callers who only import from `tiered-pool` don't\n * also need to import from `proxy-bound-browser`.\n */\nexport type { TabLike } from \"./proxy-bound-browser\";\n"
  },
  {
    "path": "src/browser/types.ts",
    "content": "import type Hero from \"@ulixee/hero\";\n\n/**\n * Browser instance in the pool\n */\nexport interface BrowserInstance {\n  /** Hero instance */\n  hero: Hero;\n\n  /** Unique identifier */\n  id: string;\n\n  /** When the instance was created */\n  createdAt: number;\n\n  /** When the instance was last used */\n  lastUsed: number;\n\n  /** Number of requests handled */\n  requestCount: number;\n\n  /** Current status */\n  status: \"idle\" | \"busy\" | \"recycling\" | \"unhealthy\";\n}\n\n/**\n * Queue item for pending requests\n */\nexport interface QueueItem {\n  /** Promise resolve function */\n  resolve: (hero: Hero) => void;\n\n  /** Promise reject function */\n  reject: (error: Error) => void;\n\n  /** When the request was queued */\n  queuedAt: number;\n}\n\n/**\n * Pool configuration\n */\nexport interface PoolConfig {\n  /** Pool size (number of browser instances) */\n  size: number;\n\n  /** Retire browser after this many page loads */\n  retireAfterPageCount: number;\n\n  /** Retire browser after this age in milliseconds */\n  retireAfterAgeMs: number;\n\n  /** How often to check for recycling (ms) */\n  recycleCheckInterval: number;\n\n  /** How often to run health checks (ms) */\n  healthCheckInterval: number;\n\n  /** Max consecutive failures before marking unhealthy */\n  maxConsecutiveFailures: number;\n\n  /** Maximum queue size */\n  maxQueueSize: number;\n\n  /** Queue timeout in milliseconds */\n  queueTimeout: number;\n}\n\n/**\n * Pool statistics\n */\nexport interface PoolStats {\n  /** Total instances */\n  total: number;\n\n  /** Available instances */\n  available: number;\n\n  /** Busy instances */\n  busy: number;\n\n  /** Recycling instances */\n  recycling: number;\n\n  /** Unhealthy instances */\n  unhealthy: number;\n\n  /** Queue length */\n  queueLength: number;\n\n  /** Total requests handled */\n  totalRequests: number;\n\n  /** Average request duration */\n  avgRequestDuration: number;\n}\n\n/**\n * Health status\n */\nexport interface HealthStatus {\n  /** Overall health */\n  healthy: boolean;\n\n  /** Issues found */\n  issues: string[];\n\n  /** Stats snapshot */\n  stats: PoolStats;\n}\n\n/**\n * Browser pool interface\n */\nexport interface IBrowserPool {\n  /** Initialize the pool */\n  initialize(): Promise<void>;\n\n  /** Shutdown the pool */\n  shutdown(): Promise<void>;\n\n  /** Acquire a browser instance */\n  acquire(): Promise<Hero>;\n\n  /** Release a browser instance back to the pool */\n  release(hero: Hero): void;\n\n  /** Execute callback with auto-managed browser */\n  withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T>;\n\n  /** Get pool statistics */\n  getStats(): PoolStats;\n\n  /** Run health check */\n  healthCheck?(): Promise<HealthStatus>;\n}\n"
  },
  {
    "path": "src/browser-session.ts",
    "content": "/**\n * Browser Session\n *\n * Launches a Chrome instance directly and returns a CDP WebSocket URL.\n * No Hero involvement — Chrome is the product, not Hero.\n *\n * For authenticated proxies, a lightweight local proxy forwarder is\n * started per session. Chrome connects to `localhost:PORT` (no auth),\n * the forwarder adds credentials and forwards to the upstream proxy.\n *\n * Architecture at scale:\n * - 1 Chrome process per session\n * - 1 local proxy forwarder per session (if proxy has auth)\n * - No Hero overhead\n * - Clean lifecycle: close = kill processes, done\n *\n * @example\n * ```typescript\n * const session = await createBrowserSession({ verbose: true });\n * const browser = await chromium.connectOverCDP(session.wsEndpoint);\n * const page = (await browser.newContext()).newPage();\n * await page.goto('https://example.com');\n * await session.close();\n * ```\n */\n\nimport { spawn } from \"child_process\";\nimport { createInterface } from \"readline\";\nimport { createServer, type Server } from \"http\";\nimport net from \"net\";\nimport { randomUUID } from \"crypto\";\nimport { mkdtempSync, rmSync, accessSync } from \"fs\";\nimport { join } from \"path\";\nimport { tmpdir } from \"os\";\nimport { createRequire } from \"module\";\nimport { createProxyUrl } from \"./proxy/config\";\nimport { createLogger } from \"./utils/logger\";\nimport type { BrowserSession, BrowserSessionInternalOptions } from \"./browser-types\";\n\nconst logger = createLogger(\"browser-session\");\n\nconst DEFAULT_SESSION_TIMEOUT_MS = 300_000; // 5 minutes\nconst CHROME_LAUNCH_TIMEOUT_MS = 15_000;\n\n/**\n * Find the Chrome executable path.\n * Priority: CHROME_139_BIN env var → Hero's bundled Chrome → system Chrome.\n */\nfunction findChromePath(): string {\n  if (process.env.CHROME_139_BIN) {\n    return process.env.CHROME_139_BIN;\n  }\n\n  try {\n    const req = createRequire(import.meta.url);\n    const ChromeEngine = req(\"@ulixee/chrome-139-0\");\n    const chrome = new ChromeEngine();\n    if (chrome.executablePath) return chrome.executablePath;\n  } catch {\n    // Not available\n  }\n\n  if (process.platform === \"darwin\") {\n    return \"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome\";\n  }\n  if (process.platform === \"linux\") {\n    for (const p of [\n      \"/usr/bin/google-chrome-stable\",\n      \"/usr/bin/google-chrome\",\n      \"/usr/bin/chromium-browser\",\n      \"/usr/bin/chromium\",\n    ]) {\n      try {\n        accessSync(p);\n        return p;\n      } catch {\n        /* continue */\n      }\n    }\n  }\n  return \"google-chrome\";\n}\n\n// ─── Local Auth Proxy Forwarder ──────────────────────────────────────\n\n/**\n * Start a lightweight local HTTP CONNECT proxy that adds auth to an\n * upstream proxy. Chrome connects to localhost:PORT (no auth needed),\n * the forwarder adds Proxy-Authorization and forwards to the real proxy.\n *\n * Handles both CONNECT (HTTPS tunneling) and plain HTTP requests.\n */\nfunction startAuthProxy(\n  upstreamHost: string,\n  upstreamPort: number,\n  username: string,\n  password: string\n): Promise<{ server: Server; port: number }> {\n  return new Promise((resolve, reject) => {\n    const authHeader = \"Basic \" + Buffer.from(`${username}:${password}`).toString(\"base64\");\n\n    const server = createServer((req, res) => {\n      // Plain HTTP proxy (non-CONNECT)\n      const upstream = net.createConnection(upstreamPort, upstreamHost, () => {\n        const reqLine = `${req.method} ${req.url} HTTP/${req.httpVersion}\\r\\n`;\n        let headers = \"\";\n        for (let i = 0; i < req.rawHeaders.length; i += 2) {\n          headers += `${req.rawHeaders[i]}: ${req.rawHeaders[i + 1]}\\r\\n`;\n        }\n        headers += `Proxy-Authorization: ${authHeader}\\r\\n`;\n        upstream.write(reqLine + headers + \"\\r\\n\");\n        req.pipe(upstream);\n        upstream.pipe(res);\n      });\n      upstream.on(\"error\", () => res.destroy());\n    });\n\n    // CONNECT method (HTTPS tunneling)\n    server.on(\"connect\", (req, clientSocket, head) => {\n      const upstream = net.createConnection(upstreamPort, upstreamHost, () => {\n        // Send CONNECT to upstream with auth\n        upstream.write(\n          `CONNECT ${req.url} HTTP/1.1\\r\\n` +\n            `Host: ${req.url}\\r\\n` +\n            `Proxy-Authorization: ${authHeader}\\r\\n` +\n            `\\r\\n`\n        );\n\n        // Wait for upstream's 200 response\n        let buf = Buffer.alloc(0);\n        const onData = (chunk: Buffer) => {\n          buf = Buffer.concat([buf, chunk]);\n          const headerEnd = buf.indexOf(\"\\r\\n\\r\\n\");\n          if (headerEnd === -1) return;\n\n          upstream.removeListener(\"data\", onData);\n\n          const statusLine = buf.subarray(0, buf.indexOf(\"\\r\\n\")).toString();\n          const remaining = buf.subarray(headerEnd + 4);\n\n          if (statusLine.includes(\"200\")) {\n            clientSocket.write(\"HTTP/1.1 200 Connection Established\\r\\n\\r\\n\");\n            if (remaining.length > 0) clientSocket.write(remaining);\n            if (head.length > 0) upstream.write(head);\n            clientSocket.pipe(upstream);\n            upstream.pipe(clientSocket);\n          } else {\n            clientSocket.write(`HTTP/1.1 502 Bad Gateway\\r\\n\\r\\n`);\n            clientSocket.destroy();\n            upstream.destroy();\n          }\n        };\n        upstream.on(\"data\", onData);\n      });\n\n      upstream.on(\"error\", () => {\n        clientSocket.write(\"HTTP/1.1 502 Bad Gateway\\r\\n\\r\\n\");\n        clientSocket.destroy();\n      });\n    });\n\n    server.listen(0, \"127.0.0.1\", () => {\n      const addr = server.address();\n      if (!addr || typeof addr === \"string\") {\n        reject(new Error(\"Failed to start auth proxy\"));\n        return;\n      }\n      resolve({ server, port: addr.port });\n    });\n\n    server.on(\"error\", reject);\n  });\n}\n\n/**\n * Parse a proxy URL into components.\n * Returns { host, port, username?, password?, hasAuth }\n */\nfunction parseProxy(proxyUrl: string): {\n  host: string;\n  port: number;\n  username?: string;\n  password?: string;\n  hasAuth: boolean;\n} {\n  const url = new URL(proxyUrl);\n  return {\n    host: url.hostname,\n    port: parseInt(url.port, 10),\n    username: url.username || undefined,\n    password: url.password || undefined,\n    hasAuth: !!url.username,\n  };\n}\n\n// ─── Main ────────────────────────────────────────────────────────────\n\n/**\n * Create a browser session with a CDP WebSocket endpoint.\n *\n * Launches Chrome directly with remote debugging enabled. Each session\n * gets an isolated user-data-dir. For authenticated proxies, a local\n * proxy forwarder is started to handle auth transparently.\n */\nexport async function createBrowserSession(\n  options: BrowserSessionInternalOptions\n): Promise<BrowserSession> {\n  const sessionId = randomUUID();\n  const timeoutMs = options.timeoutMs ?? DEFAULT_SESSION_TIMEOUT_MS;\n  const verbose = options.verbose ?? false;\n\n  if (verbose) {\n    logger.info(`Creating browser session ${sessionId}`);\n  }\n\n  // Resolve proxy from pool or explicit option\n  const proxyConfig = options.proxy ?? options.resolveProxy?.(options.proxyTier);\n  const proxyUrl = proxyConfig ? createProxyUrl(proxyConfig) : undefined;\n\n  // If proxy has auth, start a local forwarder\n  let authProxyServer: Server | undefined;\n  let chromeProxyArg: string | undefined;\n\n  if (proxyUrl) {\n    const parsed = parseProxy(proxyUrl);\n    if (parsed.hasAuth) {\n      // Start local auth proxy forwarder\n      const { server, port } = await startAuthProxy(\n        parsed.host,\n        parsed.port,\n        parsed.username!,\n        parsed.password!\n      );\n      authProxyServer = server;\n      chromeProxyArg = `http://127.0.0.1:${port}`;\n      if (verbose) {\n        logger.info(`Auth proxy forwarder on :${port} → ${parsed.host}:${parsed.port}`);\n      }\n    } else {\n      // No auth needed, pass directly\n      chromeProxyArg = proxyUrl;\n    }\n  }\n\n  // Each session gets its own profile directory for isolation\n  const userDataDir = mkdtempSync(join(tmpdir(), `reader-session-${sessionId}-`));\n\n  // Build Chrome launch args\n  const chromePath = findChromePath();\n  const args = [\n    `--remote-debugging-port=0`,\n    `--user-data-dir=${userDataDir}`,\n    \"--no-first-run\",\n    \"--no-default-browser-check\",\n    \"--use-mock-keychain\",\n    \"--disable-features=MediaRouter\",\n    \"--no-sandbox\",\n    \"--disable-dev-shm-usage\",\n    \"--disable-background-networking\",\n    \"--disable-default-apps\",\n    \"--disable-extensions\",\n    \"--disable-sync\",\n    \"--disable-translate\",\n    \"--metrics-recording-only\",\n    \"--mute-audio\",\n    \"--disable-blink-features=AutomationControlled\",\n  ];\n\n  if (!options.showChrome) {\n    args.push(\"--headless=new\");\n  }\n\n  if (chromeProxyArg) {\n    args.push(`--proxy-server=${chromeProxyArg}`);\n    args.push(\"--proxy-bypass-list=<-loopback>\");\n    // Accept self-signed certs from the proxy forwarder\n    args.push(\"--ignore-certificate-errors\");\n  }\n\n  // Open about:blank so there's a page ready for the user\n  args.push(\"about:blank\");\n\n  if (verbose) {\n    logger.info(\n      `Launching Chrome: ${chromePath} (${args.length} args, proxy: ${chromeProxyArg ?? \"none\"})`\n    );\n  }\n\n  // Launch Chrome process\n  const chromeProcess = spawn(chromePath, args, {\n    detached: process.platform !== \"win32\",\n    stdio: [\"ignore\", \"pipe\", \"pipe\"],\n  });\n\n  let closed = false;\n  let timeoutHandle: ReturnType<typeof setTimeout> | null = null;\n\n  // Extract the WebSocket URL from Chrome's stderr\n  let wsEndpoint: string;\n  try {\n    wsEndpoint = await new Promise<string>((resolve, reject) => {\n      const launchTimeout = setTimeout(() => {\n        reject(new Error(\"Timed out waiting for Chrome to start\"));\n      }, CHROME_LAUNCH_TIMEOUT_MS);\n\n      if (chromeProcess.stderr) {\n        const rl = createInterface({ input: chromeProcess.stderr });\n        rl.on(\"line\", (line) => {\n          const match = line.match(/DevTools listening on (ws:\\/\\/\\S+)/);\n          if (match) {\n            clearTimeout(launchTimeout);\n            rl.close();\n            resolve(match[1]);\n          }\n        });\n      }\n\n      chromeProcess.on(\"error\", (err) => {\n        clearTimeout(launchTimeout);\n        reject(new Error(`Failed to launch Chrome: ${err.message}`));\n      });\n\n      chromeProcess.on(\"exit\", (code) => {\n        if (!closed) {\n          clearTimeout(launchTimeout);\n          reject(new Error(`Chrome exited with code ${code} before ready`));\n        }\n      });\n    });\n  } catch (error: unknown) {\n    try {\n      chromeProcess.kill(\"SIGKILL\");\n    } catch {\n      /* ignore */\n    }\n    authProxyServer?.close();\n    try {\n      rmSync(userDataDir, { recursive: true, force: true });\n    } catch {\n      /* ignore */\n    }\n    throw new Error(`Failed to launch browser session: ${(error as Error).message}`);\n  }\n\n  if (verbose) {\n    logger.info(`Session ${sessionId} ready: ${wsEndpoint}`);\n  }\n\n  const createdAt = new Date().toISOString();\n\n  const close = async (): Promise<void> => {\n    if (closed) return;\n    closed = true;\n\n    if (timeoutHandle) {\n      clearTimeout(timeoutHandle);\n      timeoutHandle = null;\n    }\n\n    if (verbose) {\n      logger.info(`Closing browser session ${sessionId}`);\n    }\n\n    // Kill Chrome process group\n    try {\n      if (chromeProcess.pid && !chromeProcess.killed) {\n        if (process.platform !== \"win32\") {\n          try {\n            process.kill(-chromeProcess.pid, \"SIGTERM\");\n          } catch {\n            /* ignore */\n          }\n        } else {\n          chromeProcess.kill(\"SIGTERM\");\n        }\n      }\n    } catch {\n      /* ignore */\n    }\n\n    // Stop the auth proxy forwarder\n    authProxyServer?.close();\n\n    // Clean up temp profile directory (delayed so Chrome can release locks)\n    setTimeout(() => {\n      try {\n        rmSync(userDataDir, { recursive: true, force: true });\n      } catch {\n        /* ignore */\n      }\n    }, 1000);\n  };\n\n  // Auto-close on timeout\n  timeoutHandle = setTimeout(() => {\n    if (verbose) {\n      logger.info(`Session ${sessionId} timed out after ${timeoutMs}ms`);\n    }\n    close().catch(() => {});\n  }, timeoutMs);\n\n  if (timeoutHandle && typeof timeoutHandle === \"object\" && \"unref\" in timeoutHandle) {\n    timeoutHandle.unref();\n  }\n\n  // Clean up if Chrome crashes\n  chromeProcess.on(\"exit\", () => {\n    if (!closed) {\n      close().catch(() => {});\n    }\n  });\n\n  return {\n    sessionId,\n    wsEndpoint,\n    createdAt,\n    close,\n  };\n}\n"
  },
  {
    "path": "src/browser-types.ts",
    "content": "import type { ProxyConfig, ProxyTier } from \"./types\";\n\n/**\n * Options for creating a browser session.\n *\n * A browser session launches a Hero-stealthed Chrome instance and returns\n * a CDP WebSocket URL. Users connect Playwright/Puppeteer via\n * `chromium.connectOverCDP(wsEndpoint)` and get full anti-bot stealth\n * (TLS fingerprinting, navigator/WebGL spoofing, WebRTC masking).\n */\nexport interface BrowserOptions {\n  /** Proxy configuration (single proxy — use proxyTier for pool-based) */\n  proxy?: ProxyConfig;\n\n  /** Proxy tier selection (default: \"auto\") */\n  proxyTier?: ProxyTier;\n\n  /** Show Chrome browser window (default: false) */\n  showChrome?: boolean;\n\n  /**\n   * Maximum session lifetime in milliseconds (default: 300000 = 5 min).\n   * Session auto-closes after this duration.\n   */\n  timeoutMs?: number;\n\n  /** Enable verbose logging (default: false) */\n  verbose?: boolean;\n}\n\n/**\n * An active browser session with a CDP WebSocket endpoint.\n *\n * Connect to `wsEndpoint` using Playwright or Puppeteer:\n *\n * @example\n * ```typescript\n * import { chromium } from 'playwright';\n *\n * const session = await reader.browser({ proxyTier: 'stealth' });\n * const browser = await chromium.connectOverCDP(session.wsEndpoint);\n * const page = browser.contexts()[0].pages()[0];\n *\n * await page.goto('https://example.com');\n * console.log(await page.title());\n *\n * await session.close();\n * ```\n */\nexport interface BrowserSession {\n  /** Unique session identifier */\n  sessionId: string;\n\n  /** CDP WebSocket URL for Playwright/Puppeteer connection */\n  wsEndpoint: string;\n\n  /** ISO timestamp of session creation */\n  createdAt: string;\n\n  /** Close the session and release all resources */\n  close: () => Promise<void>;\n}\n\n/**\n * Internal options for createBrowserSession (includes injected deps).\n * Not part of the public API.\n */\nexport interface BrowserSessionInternalOptions extends BrowserOptions {\n  /** Connection to shared HeroCore instance */\n  connectionToCore?: any;\n\n  /** Proxy resolver callback (provided by ReaderClient) */\n  resolveProxy?: (tier: ProxyTier | undefined) => ProxyConfig | undefined;\n}\n"
  },
  {
    "path": "src/cli/index.ts",
    "content": "#!/usr/bin/env node\n// Load .env from cwd before any code reads process.env. This makes\n// `PROXY_DATACENTER` / `PROXY_RESIDENTIAL` / `READER_AUTH_TOKEN` etc.\n// work from a local `.env` file without the operator having to export\n// vars manually before starting the daemon.\nimport \"dotenv/config\";\n\n/**\n * Reader CLI\n *\n * Command-line interface for web scraping with Cloudflare bypass.\n *\n * @example\n * # Start daemon (once)\n * npx reader start --direct-pool-size 5\n *\n * # Scrape a single URL (auto-detects daemon)\n * npx reader scrape https://example.com\n *\n * # Scrape multiple URLs with markdown and text output\n * npx reader scrape https://example.com https://example.org -f markdown,text\n *\n * # Crawl a website\n * npx reader crawl https://example.com -d 2 -m 20\n *\n * # Force standalone mode (bypass daemon)\n * npx reader scrape https://example.com --standalone\n *\n * # Check daemon status\n * npx reader status\n *\n * # Stop daemon\n * npx reader stop\n */\n\nimport { Command } from \"commander\";\nimport { ReaderClient } from \"../client\";\nimport {\n  DaemonServer,\n  DaemonClient,\n  isDaemonRunning,\n  getDaemonInfo,\n  DEFAULT_DAEMON_PORT,\n} from \"../daemon\";\nimport { readFileSync, writeFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\n\n// Get version from package.json\nconst __dirname = dirname(fileURLToPath(import.meta.url));\nconst pkg = JSON.parse(readFileSync(join(__dirname, \"../../package.json\"), \"utf-8\"));\n\nconst program = new Command();\n\nprogram\n  .name(\"reader\")\n  .description(\n    \"Production-grade web scraping engine for LLMs. Clean markdown output, ready for your agents.\"\n  )\n  .version(pkg.version);\n\n// =============================================================================\n// Daemon Commands\n// =============================================================================\n\nprogram\n  .command(\"start\")\n  .description(\"Start the reader daemon server\")\n  .option(\n    \"-p, --port <n>\",\n    `Port to listen on (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  // The daemon now binds 1 browser per configured proxy URL, so --direct-pool-size\n  // only controls the size of the *direct* sub-pool that's used when no proxies\n  // are configured (local dev / CI). When PROXY_DATACENTER or PROXY_RESIDENTIAL\n  // is set, this flag is ignored — the proxy count determines the pool size.\n  .option(\n    \"--direct-pool-size <n>\",\n    \"Direct-tier browser pool size (only used when no proxies are configured)\",\n    \"1\"\n  )\n  // Backwards-compat alias for the pre-tiered-pool flag. Logs a deprecation\n  // warning at startup. Will be removed in a future release.\n  .option(\"--pool-size <n>\", \"(deprecated, use --direct-pool-size)\")\n  .option(\"--show-chrome\", \"Show browser windows for debugging\")\n  .option(\"-v, --verbose\", \"Enable verbose logging\")\n  .action(async (options) => {\n    const port = parseInt(options.port, 10);\n\n    // Resolve --direct-pool-size, accepting --pool-size as a deprecated alias.\n    // If both are provided, --direct-pool-size wins and we warn about the\n    // ambiguity. The legacy flag emits a deprecation notice the user can\n    // grep for in their startup logs.\n    let directPoolSize = parseInt(options.directPoolSize, 10);\n    if (options.poolSize !== undefined) {\n      console.warn(\n        \"Warning: --pool-size is deprecated; use --direct-pool-size instead. \" +\n          \"Note that with proxies configured, the pool is sized to match your proxy count \" +\n          \"and this flag is ignored.\"\n      );\n      // Honor the legacy flag only when --direct-pool-size wasn't explicitly set.\n      // Commander gives us the default (\"1\") when the user didn't pass it.\n      if (options.directPoolSize === \"1\") {\n        directPoolSize = parseInt(options.poolSize, 10);\n      }\n    }\n\n    // Check if daemon is already running\n    if (await isDaemonRunning(port)) {\n      console.error(`Error: Daemon is already running on port ${port}`);\n      process.exit(1);\n    }\n\n    const daemon = new DaemonServer({\n      port,\n      poolSize: directPoolSize,\n      verbose: options.verbose || false,\n      showChrome: options.showChrome || false,\n    });\n\n    try {\n      await daemon.start();\n      console.log(`Reader daemon started on port ${port} (direct-pool-size=${directPoolSize})`);\n      console.log(`Use \"npx reader stop\" to stop the daemon`);\n\n      // Keep process running\n      process.on(\"SIGINT\", async () => {\n        console.log(\"\\nShutting down daemon...\");\n        await daemon.stop();\n        process.exit(0);\n      });\n\n      process.on(\"SIGTERM\", async () => {\n        await daemon.stop();\n        process.exit(0);\n      });\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    }\n  });\n\nprogram\n  .command(\"stop\")\n  .description(\"Stop the running reader daemon\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .action(async (options) => {\n    const port = parseInt(options.port, 10);\n    const client = new DaemonClient({ port });\n\n    try {\n      if (!(await client.isRunning())) {\n        console.log(\"Daemon is not running\");\n        return;\n      }\n\n      await client.shutdown();\n      console.log(\"Daemon stopped\");\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    }\n  });\n\nprogram\n  .command(\"status\")\n  .description(\"Check daemon status\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .action(async (options) => {\n    // First check PID file\n    const daemonInfo = await getDaemonInfo();\n\n    if (!daemonInfo) {\n      console.log(\"Daemon is not running\");\n      return;\n    }\n\n    // Use port from options if specified, otherwise from PID file\n    const port = options.port ? parseInt(options.port, 10) : daemonInfo.port;\n\n    // Verify it's actually running by connecting\n    const client = new DaemonClient({ port });\n    try {\n      const status = await client.status();\n      console.log(\"Daemon is running:\");\n      console.log(`  Port: ${status.port}`);\n      console.log(`  PID: ${status.pid}`);\n      console.log(`  Pool size: ${status.poolSize}`);\n      console.log(`  Uptime: ${Math.round(status.uptime / 1000)}s`);\n    } catch {\n      console.log(\"Daemon is not running (stale PID file)\");\n    }\n  });\n\n// =============================================================================\n// Scrape Command\n// =============================================================================\n\nprogram\n  .command(\"scrape <urls...>\")\n  .description(\"Scrape one or more URLs\")\n  .option(\n    \"-f, --format <formats>\",\n    \"Content formats to include (comma-separated: markdown,html)\",\n    \"markdown\"\n  )\n  .option(\"-o, --output <file>\", \"Output file (stdout if omitted)\")\n  .option(\"-c, --concurrency <n>\", \"Parallel requests\", \"1\")\n  .option(\"-t, --timeout <ms>\", \"Request timeout in milliseconds\", \"30000\")\n  .option(\"--proxy <url>\", \"Proxy URL (e.g., http://user:pass@host:port)\")\n  .option(\"--user-agent <string>\", \"Custom user agent string\")\n  .option(\"--batch-timeout <ms>\", \"Total timeout for entire batch operation\", \"300000\")\n  .option(\"--show-chrome\", \"Show browser window for debugging\")\n  .option(\"--standalone\", \"Force standalone mode (bypass daemon)\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .option(\"-v, --verbose\", \"Enable verbose logging\")\n  .option(\"--no-main-content\", \"Disable main content extraction (include full page)\")\n  .option(\"--include-tags <selectors>\", \"CSS selectors for elements to include (comma-separated)\")\n  .option(\"--exclude-tags <selectors>\", \"CSS selectors for elements to exclude (comma-separated)\")\n  .action(async (urls: string[], options) => {\n    const port = parseInt(options.port, 10);\n    const useStandalone = options.standalone || false;\n\n    // Auto-detect daemon unless --standalone is specified\n    let useDaemon = false;\n    if (!useStandalone) {\n      useDaemon = await isDaemonRunning(port);\n      if (options.verbose && useDaemon) {\n        console.error(`Using daemon on port ${port}`);\n      }\n    }\n\n    // Create client (daemon or standalone)\n    const daemonClient = useDaemon ? new DaemonClient({ port }) : null;\n    const standaloneClient = !useDaemon\n      ? new ReaderClient({\n          verbose: options.verbose || false,\n          showChrome: options.showChrome || false,\n        })\n      : null;\n\n    try {\n      const formats = options.format.split(\",\").map((f: string) => f.trim());\n\n      // Validate formats\n      const validFormats = [\"markdown\", \"html\"];\n      for (const format of formats) {\n        if (!validFormats.includes(format)) {\n          console.error(\n            `Error: Invalid format \"${format}\". Valid formats: ${validFormats.join(\", \")}`\n          );\n          process.exit(1);\n        }\n      }\n\n      if (options.verbose) {\n        console.error(`Scraping ${urls.length} URL(s)...`);\n        console.error(`Formats: ${formats.join(\", \")}`);\n      }\n\n      // Parse tag selectors\n      const includeTags = options.includeTags\n        ? options.includeTags.split(\",\").map((s: string) => s.trim())\n        : undefined;\n      const excludeTags = options.excludeTags\n        ? options.excludeTags.split(\",\").map((s: string) => s.trim())\n        : undefined;\n\n      const scrapeOptions = {\n        urls,\n        formats,\n        batchConcurrency: parseInt(options.concurrency, 10),\n        timeoutMs: parseInt(options.timeout, 10),\n        batchTimeoutMs: parseInt(options.batchTimeout, 10),\n        proxy: options.proxy ? { url: options.proxy } : undefined,\n        userAgent: options.userAgent,\n        verbose: options.verbose || false,\n        showChrome: options.showChrome || false,\n        // Content cleaning options\n        onlyMainContent: options.mainContent !== false, // --no-main-content sets this to false\n        includeTags,\n        excludeTags,\n        onProgress: options.verbose\n          ? ({\n              completed,\n              total,\n              currentUrl,\n            }: {\n              completed: number;\n              total: number;\n              currentUrl: string;\n            }) => {\n              console.error(`[${completed}/${total}] ${currentUrl}`);\n            }\n          : undefined,\n      };\n\n      const result = useDaemon\n        ? await daemonClient!.scrape(scrapeOptions)\n        : await standaloneClient!.scrape(scrapeOptions);\n\n      // Always output JSON\n      const output = JSON.stringify(result, null, 2);\n\n      // Write output\n      if (options.output) {\n        writeFileSync(options.output, output);\n        if (options.verbose) {\n          console.error(`Output written to ${options.output}`);\n        }\n      } else {\n        console.log(output);\n      }\n\n      // Print summary to stderr\n      if (options.verbose) {\n        console.error(`\\nSummary:`);\n        console.error(\n          `  Successful: ${result.batchMetadata.successfulUrls}/${result.batchMetadata.totalUrls}`\n        );\n        console.error(`  Duration: ${result.batchMetadata.totalDuration}ms`);\n      }\n\n      // Exit with error code if any URLs failed\n      if (result.batchMetadata.failedUrls > 0) {\n        process.exit(1);\n      }\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    } finally {\n      if (standaloneClient) {\n        await standaloneClient.close();\n        process.exit(0);\n      }\n    }\n  });\n\n// =============================================================================\n// Crawl Command\n// =============================================================================\n\nprogram\n  .command(\"crawl <url>\")\n  .description(\"Crawl a website to discover and optionally scrape pages\")\n  .option(\"-d, --depth <n>\", \"Maximum crawl depth\", \"1\")\n  .option(\"-m, --max-pages <n>\", \"Maximum pages to discover\", \"20\")\n  .option(\"-s, --scrape\", \"Also scrape content of discovered pages\")\n  .option(\n    \"-f, --format <formats>\",\n    \"Content formats when scraping (comma-separated: markdown,html)\",\n    \"markdown\"\n  )\n  .option(\"-o, --output <file>\", \"Output file (stdout if omitted)\")\n  .option(\"--delay <ms>\", \"Delay between requests in milliseconds\", \"1000\")\n  .option(\"-t, --timeout <ms>\", \"Total timeout for crawl operation in milliseconds\")\n  .option(\"--include <patterns>\", \"URL patterns to include (comma-separated regex)\")\n  .option(\"--exclude <patterns>\", \"URL patterns to exclude (comma-separated regex)\")\n  .option(\"--proxy <url>\", \"Proxy URL (e.g., http://user:pass@host:port)\")\n  .option(\"--user-agent <string>\", \"Custom user agent string\")\n  .option(\"--show-chrome\", \"Show browser window for debugging\")\n  .option(\"--standalone\", \"Force standalone mode (bypass daemon)\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .option(\"-v, --verbose\", \"Enable verbose logging\")\n  .action(async (url: string, options) => {\n    const port = parseInt(options.port, 10);\n    const useStandalone = options.standalone || false;\n\n    // Auto-detect daemon unless --standalone is specified\n    let useDaemon = false;\n    if (!useStandalone) {\n      useDaemon = await isDaemonRunning(port);\n      if (options.verbose && useDaemon) {\n        console.error(`Using daemon on port ${port}`);\n      }\n    }\n\n    // Create client (daemon or standalone)\n    const daemonClient = useDaemon ? new DaemonClient({ port }) : null;\n    const standaloneClient = !useDaemon\n      ? new ReaderClient({\n          verbose: options.verbose || false,\n          showChrome: options.showChrome || false,\n        })\n      : null;\n\n    try {\n      if (options.verbose) {\n        console.error(`Crawling ${url}...`);\n        console.error(`Max depth: ${options.depth}, Max pages: ${options.maxPages}`);\n      }\n\n      // Parse include/exclude patterns\n      const includePatterns = options.include\n        ? options.include.split(\",\").map((p: string) => p.trim())\n        : undefined;\n      const excludePatterns = options.exclude\n        ? options.exclude.split(\",\").map((p: string) => p.trim())\n        : undefined;\n\n      const crawlOptions = {\n        url,\n        depth: parseInt(options.depth, 10),\n        maxPages: parseInt(options.maxPages, 10),\n        scrape: options.scrape || false,\n        delayMs: parseInt(options.delay, 10),\n        timeoutMs: options.timeout ? parseInt(options.timeout, 10) : undefined,\n        includePatterns,\n        excludePatterns,\n        proxy: options.proxy ? { url: options.proxy } : undefined,\n        userAgent: options.userAgent,\n        verbose: options.verbose || false,\n        showChrome: options.showChrome || false,\n      };\n\n      // Add formats to crawl options if scraping\n      const formats = options.format.split(\",\").map((f: string) => f.trim());\n      const crawlOptionsWithFormats = {\n        ...crawlOptions,\n        formats,\n      };\n\n      const result = useDaemon\n        ? await daemonClient!.crawl(crawlOptionsWithFormats)\n        : await standaloneClient!.crawl(crawlOptionsWithFormats);\n\n      // Always output JSON\n      const output = JSON.stringify(result, null, 2);\n\n      // Write output\n      if (options.output) {\n        writeFileSync(options.output, output);\n        if (options.verbose) {\n          console.error(`Output written to ${options.output}`);\n        }\n      } else {\n        console.log(output);\n      }\n\n      // Print summary to stderr\n      if (options.verbose) {\n        console.error(`\\nSummary:`);\n        console.error(`  Discovered: ${result.urls.length} URLs`);\n        console.error(`  Duration: ${result.metadata.totalDuration}ms`);\n      }\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    } finally {\n      if (standaloneClient) {\n        await standaloneClient.close();\n        process.exit(0);\n      }\n    }\n  });\n\n// =============================================================================\n// Browser Command\n// =============================================================================\n\nconst browserCmd = program\n  .command(\"browser\")\n  .description(\"Launch a browser session with CDP endpoint for Playwright/Puppeteer\");\n\nbrowserCmd\n  .command(\"create\", { isDefault: true })\n  .description(\"Create a new browser session and print the CDP WebSocket URL\")\n  .option(\"--proxy <url>\", \"Proxy URL (e.g., http://user:pass@host:port)\")\n  .option(\"-t, --timeout <ms>\", \"Session lifetime in milliseconds\", \"300000\")\n  .option(\"--show-chrome\", \"Show browser window for debugging\")\n  .option(\"--standalone\", \"Force standalone mode (bypass daemon)\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .option(\"-v, --verbose\", \"Enable verbose logging\")\n  .action(async (options) => {\n    const port = parseInt(options.port, 10);\n    const useStandalone = options.standalone || false;\n\n    let useDaemon = false;\n    if (!useStandalone) {\n      useDaemon = await isDaemonRunning(port);\n    }\n\n    if (useDaemon) {\n      // Daemon mode: create via RPC, print info, keep alive until Ctrl+C\n      const client = new DaemonClient({ port });\n      try {\n        const session = await client.browserCreate({\n          proxy: options.proxy ? { url: options.proxy } : undefined,\n          timeoutMs: parseInt(options.timeout, 10),\n          showChrome: options.showChrome || false,\n          verbose: options.verbose || false,\n        });\n\n        // Print JSON to stdout for programmatic consumption\n        console.log(JSON.stringify(session, null, 2));\n\n        // Print human-readable instructions to stderr\n        console.error(`\\nBrowser session started: ${session.sessionId}`);\n        console.error(`Connect with Playwright:`);\n        console.error(`  const browser = await chromium.connectOverCDP(\"${session.wsEndpoint}\");`);\n        console.error(`\\nPress Ctrl+C to stop the session.`);\n\n        // Block until Ctrl+C\n        await new Promise<void>((resolve) => {\n          process.on(\"SIGINT\", async () => {\n            console.error(\"\\nStopping session...\");\n            await client.browserStop(session.sessionId).catch(() => {});\n            resolve();\n          });\n        });\n      } catch (error: any) {\n        console.error(`Error: ${error.message}`);\n        process.exit(1);\n      }\n    } else {\n      // Standalone mode: create ReaderClient, launch session, block\n      const reader = new ReaderClient({\n        verbose: options.verbose || false,\n        showChrome: options.showChrome || false,\n      });\n\n      try {\n        const session = await reader.browser({\n          proxy: options.proxy ? { url: options.proxy } : undefined,\n          timeoutMs: parseInt(options.timeout, 10),\n          showChrome: options.showChrome || false,\n          verbose: options.verbose || false,\n        });\n\n        // Print JSON to stdout\n        console.log(\n          JSON.stringify(\n            {\n              sessionId: session.sessionId,\n              wsEndpoint: session.wsEndpoint,\n              createdAt: session.createdAt,\n            },\n            null,\n            2\n          )\n        );\n\n        // Print instructions to stderr\n        console.error(`\\nBrowser session started: ${session.sessionId}`);\n        console.error(`Connect with Playwright:`);\n        console.error(`  const browser = await chromium.connectOverCDP(\"${session.wsEndpoint}\");`);\n        console.error(`\\nPress Ctrl+C to stop the session.`);\n\n        // Block until Ctrl+C\n        await new Promise<void>((resolve) => {\n          process.on(\"SIGINT\", async () => {\n            console.error(\"\\nStopping session...\");\n            await session.close();\n            await reader.close();\n            resolve();\n          });\n        });\n\n        process.exit(0);\n      } catch (error: any) {\n        console.error(`Error: ${error.message}`);\n        await reader.close().catch(() => {});\n        process.exit(1);\n      }\n    }\n  });\n\nbrowserCmd\n  .command(\"stop <sessionId>\")\n  .description(\"Stop a browser session\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .action(async (sessionId: string, options) => {\n    const port = parseInt(options.port, 10);\n    const client = new DaemonClient({ port });\n\n    try {\n      await client.browserStop(sessionId);\n      console.log(`Session ${sessionId} stopped`);\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    }\n  });\n\nbrowserCmd\n  .command(\"list\")\n  .description(\"List active browser sessions\")\n  .option(\n    \"-p, --port <n>\",\n    `Daemon port (default: ${DEFAULT_DAEMON_PORT})`,\n    String(DEFAULT_DAEMON_PORT)\n  )\n  .action(async (options) => {\n    const port = parseInt(options.port, 10);\n    const client = new DaemonClient({ port });\n\n    try {\n      const sessions = await client.browserList();\n      if (sessions.length === 0) {\n        console.log(\"No active browser sessions\");\n      } else {\n        console.log(JSON.stringify(sessions, null, 2));\n      }\n    } catch (error: any) {\n      console.error(`Error: ${error.message}`);\n      process.exit(1);\n    }\n  });\n\n// =============================================================================\n// Parse and execute\n// =============================================================================\n\nprogram.parse();\n"
  },
  {
    "path": "src/client.ts",
    "content": "/**\n * ReaderClient\n *\n * A client wrapper that manages HeroCore lifecycle and provides\n * a simple interface for scraping and crawling.\n *\n * @example\n * const reader = new ReaderClient();\n *\n * const result = await reader.scrape({\n *   urls: ['https://example.com'],\n *   formats: ['markdown'],\n * });\n *\n * console.log(result.data[0].markdown);\n *\n * // When done (optional - auto-closes on process exit)\n * await reader.close();\n */\n\nimport HeroCore from \"@ulixee/hero-core\";\nimport { TransportBridge } from \"@ulixee/net\";\nimport { ConnectionToHeroCore } from \"@ulixee/hero\";\nimport { scrape } from \"./scraper\";\nimport { crawl } from \"./crawler\";\nimport { createBrowserSession } from \"./browser-session\";\nimport type { BrowserOptions, BrowserSession } from \"./browser-types\";\nimport { TieredBrowserPool, buildTierConfigsFromPools } from \"./browser/tiered-pool\";\nimport type { HeroFactory } from \"./browser/proxy-bound-browser\";\nimport { PerProxyGate } from \"./proxy/proxy-gate\";\nimport { ProxyHealthTracker } from \"./proxy/health-tracker\";\nimport type {\n  ScrapeOptions,\n  ScrapeResult,\n  ProxyConfig,\n  ProxyPoolConfig,\n  BrowserPoolConfig,\n  ProxyTier,\n} from \"./types\";\nimport type { CrawlOptions, CrawlResult } from \"./crawl-types\";\nimport { createLogger } from \"./utils/logger\";\n\nconst logger = createLogger(\"client\");\n\n/**\n * Proxy rotation strategy\n */\nexport type ProxyRotation = \"round-robin\" | \"random\";\n\n/**\n * Configuration options for ReaderClient\n */\nexport interface ReaderClientOptions {\n  /** Enable verbose logging (default: false) */\n  verbose?: boolean;\n  /** Show Chrome browser window (default: false) */\n  showChrome?: boolean;\n\n  /** Browser pool configuration */\n  browserPool?: BrowserPoolConfig;\n\n  /** List of proxies to rotate through (legacy — use proxyPools for tier-based) */\n  proxies?: ProxyConfig[];\n\n  /**\n   * Multi-tier proxy pools.\n   * When configured, proxy selection is based on the `proxyTier` option per-request.\n   *\n   * @example\n   * proxyPools: {\n   *   datacenter: [{ url: \"http://dc-proxy:port\" }],\n   *   residential: [{ url: \"http://res-proxy:port\" }],\n   * }\n   */\n  proxyPools?: ProxyPoolConfig;\n\n  /** Proxy rotation strategy (default: \"round-robin\") */\n  proxyRotation?: ProxyRotation;\n\n  /**\n   * Custom user agent string. Overrides Hero's default emulated UA.\n   * Applied to all browsers in the pool.\n   *\n   * WARNING: Hero's default UA matches the Chromium TLS fingerprint.\n   * Overriding can cause TLS/UA mismatches detected by anti-bot systems.\n   */\n  userAgent?: string;\n\n  /** Skip TLS/SSL certificate verification (default: true) */\n  skipTLSVerification?: boolean;\n}\n\n/**\n * ReaderClient manages the HeroCore lifecycle and provides\n * scrape/crawl methods with automatic initialization.\n */\nexport class ReaderClient {\n  private heroCore: HeroCore | null = null;\n  private tieredPool: TieredBrowserPool | null = null;\n  private proxyGate: PerProxyGate | null = null;\n  private healthTracker: ProxyHealthTracker | null = null;\n  private initialized = false;\n  private initializing: Promise<void> | null = null;\n  private closed = false;\n  private options: ReaderClientOptions;\n  private proxyIndex = 0;\n  private cleanupHandler: (() => Promise<void>) | null = null;\n  private activeSessions = new Map<string, BrowserSession>();\n\n  constructor(options: ReaderClientOptions = {}) {\n    this.options = options;\n\n    // Configure TLS verification\n    // Hero uses MITM_ALLOW_INSECURE env var to skip certificate verification\n    // Default is true (skip verification) for compatibility with various sites\n    const skipTLS = options.skipTLSVerification ?? true;\n    if (skipTLS) {\n      process.env.MITM_ALLOW_INSECURE = \"true\";\n    }\n\n    // Register cleanup on process exit\n    this.registerCleanup();\n  }\n\n  /**\n   * Get the next proxy from the legacy rotation pool\n   */\n  private getNextProxy(): ProxyConfig | undefined {\n    const { proxies, proxyRotation = \"round-robin\" } = this.options;\n\n    if (!proxies || proxies.length === 0) {\n      return undefined;\n    }\n\n    if (proxyRotation === \"random\") {\n      return proxies[Math.floor(Math.random() * proxies.length)];\n    }\n\n    // Round-robin (default)\n    const proxy = proxies[this.proxyIndex % proxies.length];\n    this.proxyIndex++;\n    return proxy;\n  }\n\n  /**\n   * Get a proxy from a specific tier pool.\n   * Falls back to legacy proxy pool if tier pools not configured.\n   */\n  getProxyForTier(tier: \"datacenter\" | \"residential\"): ProxyConfig | undefined {\n    const pools = this.options.proxyPools;\n\n    if (pools) {\n      const pool = tier === \"residential\" ? pools.residential : pools.datacenter;\n      if (pool && pool.length > 0) {\n        // Round-robin within the tier pool\n        const idx = this.proxyIndex % pool.length;\n        this.proxyIndex++;\n        return pool[idx];\n      }\n    }\n\n    // Fallback to legacy proxies\n    return this.getNextProxy();\n  }\n\n  /**\n   * Resolve which proxy to use based on tier preference.\n   *\n   * Priority: proxyTier pool > legacy proxy rotation > undefined\n   *\n   * For \"auto\" tier: starts with datacenter (caller handles escalation on block detection).\n   */\n  private resolveProxy(proxyTier?: import(\"./types\").ProxyTier): ProxyConfig | undefined {\n    if (!proxyTier || proxyTier === \"auto\") {\n      // Auto: prefer datacenter pool if available, else legacy rotation\n      if (this.hasProxyTier(\"datacenter\")) {\n        return this.getProxyForTier(\"datacenter\");\n      }\n      return this.getNextProxy();\n    }\n\n    if (proxyTier === \"residential\" || proxyTier === \"datacenter\") {\n      if (this.hasProxyTier(proxyTier)) {\n        return this.getProxyForTier(proxyTier);\n      }\n      // Tier requested but not configured — fall back to legacy\n      return this.getNextProxy();\n    }\n\n    return this.getNextProxy();\n  }\n\n  /**\n   * Check if a proxy tier is available\n   */\n  hasProxyTier(tier: \"datacenter\" | \"residential\"): boolean {\n    const pools = this.options.proxyPools;\n    if (!pools) return false;\n    const pool = tier === \"residential\" ? pools.residential : pools.datacenter;\n    return !!pool && pool.length > 0;\n  }\n\n  /**\n   * Initialize HeroCore. Called automatically on first scrape/crawl.\n   * Can be called explicitly if you want to pre-warm the client.\n   */\n  async start(): Promise<void> {\n    if (this.closed) {\n      throw new Error(\"ReaderClient has been closed. Create a new instance.\");\n    }\n\n    if (this.initialized) {\n      return;\n    }\n\n    // Prevent concurrent initialization\n    if (this.initializing) {\n      await this.initializing;\n      return;\n    }\n\n    this.initializing = this.initializeCore();\n    await this.initializing;\n    this.initializing = null;\n  }\n\n  /**\n   * Internal initialization logic.\n   *\n   * Builds (in order):\n   *   1. HeroCore  - shared Hero runtime for every browser in every tier.\n   *   2. PerProxyGate  - scraper-boundary concurrency cap keyed by proxy URL.\n   *   3. ProxyHealthTracker  - 10-strikes-5-min-cooldown circuit breaker.\n   *   4. TieredBrowserPool  - one ProxyBoundBrowser per proxy URL, grouped\n   *      by tier. Pre-warms all browsers in parallel; `pool.ready` awaits\n   *      every browser's initial launch attempt (success or failure).\n   *\n   * `this.options.browserPool?.directPoolSize` controls how many direct\n   * browsers to spin up when no proxies are configured (local dev, CI).\n   */\n  private async initializeCore(): Promise<void> {\n    try {\n      if (this.options.verbose) {\n        logger.info(\"Starting HeroCore...\");\n      }\n\n      this.heroCore = new HeroCore();\n      await this.heroCore.start();\n\n      if (this.options.verbose) {\n        logger.info(\"HeroCore started successfully\");\n      }\n\n      // Build the scraper-level primitives.\n      this.proxyGate = new PerProxyGate({\n        maxConcurrentPerProxy: 2, // default; domain profiles can tighten\n      });\n      this.healthTracker = new ProxyHealthTracker();\n\n      // Build the tiered browser pool from the configured proxy pools.\n      const tierConfigs = buildTierConfigsFromPools(this.options.proxyPools, {\n        directPoolSize: this.options.browserPool?.size ?? 1,\n      });\n\n      if (this.options.verbose) {\n        const summary = tierConfigs.map((t) => `${t.tier}:${t.proxyUrls.length}`).join(\" \");\n        logger.info(`Initializing tiered browser pool (${summary})`);\n      }\n\n      this.tieredPool = new TieredBrowserPool({\n        tiers: tierConfigs,\n        maxTabsPerBrowser: 2,\n        retireAfterPages: this.options.browserPool?.retireAfterPages ?? 100,\n        healthTracker: this.healthTracker,\n        heroFactory: undefined as HeroFactory | undefined, // use real factory\n        showChrome: this.options.showChrome,\n        connectionToCore: this.createConnection(),\n        userAgent: this.options.userAgent,\n        logger,\n      });\n\n      // Pre-warm: await every browser's initial launch attempt. Per-browser\n      // failures are already logged and swallowed; they don't block the\n      // pool's ready promise. The separate startup api.ipify.org check\n      // (added in a later item) will fail loud if any proxy is dead.\n      await this.tieredPool.ready;\n\n      this.initialized = true;\n\n      if (this.options.verbose) {\n        const stats = this.tieredPool.getStats();\n        const counts = stats.tiers.map((t) => `${t.tier}=${t.browsers.length}`).join(\" \");\n        logger.info(`Browser pool initialized (${counts})`);\n      }\n    } catch (error: any) {\n      // Clean up on failure\n      if (this.tieredPool) {\n        await this.tieredPool.close().catch(() => {});\n        this.tieredPool = null;\n      }\n      this.proxyGate = null;\n      this.healthTracker = null;\n      if (this.heroCore) {\n        await this.heroCore.close().catch(() => {});\n        this.heroCore = null;\n      }\n      this.initialized = false;\n\n      // Provide helpful error messages\n      const message = error.message || String(error);\n\n      if (message.includes(\"EADDRINUSE\")) {\n        throw new Error(\n          \"Failed to start HeroCore: Port already in use. \" +\n            \"Another instance may be running. \" +\n            \"Close it or use a different port.\"\n        );\n      }\n\n      if (message.includes(\"chrome\") || message.includes(\"Chrome\")) {\n        throw new Error(\n          \"Failed to start HeroCore: Chrome/Chromium not found. \" +\n            \"Please install Chrome or set CHROME_PATH environment variable.\"\n        );\n      }\n\n      throw new Error(`Failed to start HeroCore: ${message}`);\n    }\n  }\n\n  /**\n   * Create a connection to the HeroCore instance\n   */\n  private createConnection(): ConnectionToHeroCore {\n    if (!this.heroCore) {\n      throw new Error(\"HeroCore not initialized. This should not happen.\");\n    }\n\n    const bridge = new TransportBridge();\n    this.heroCore.addConnection(bridge.transportToClient);\n    return new ConnectionToHeroCore(bridge.transportToCore);\n  }\n\n  /**\n   * Ensure client is initialized before operation\n   */\n  private async ensureInitialized(): Promise<void> {\n    if (this.closed) {\n      throw new Error(\"ReaderClient has been closed. Create a new instance.\");\n    }\n\n    if (!this.initialized) {\n      await this.start();\n    }\n  }\n\n  /**\n   * Scrape one or more URLs\n   *\n   * @param options - Scrape options (urls, formats, etc.)\n   * @returns Scrape result with data and metadata\n   *\n   * @example\n   * const result = await reader.scrape({\n   *   urls: ['https://example.com'],\n   *   formats: ['markdown', 'html'],\n   * });\n   */\n  async scrape(options: Omit<ScrapeOptions, \"connectionToCore\" | \"pool\">): Promise<ScrapeResult> {\n    await this.ensureInitialized();\n\n    if (!this.tieredPool) {\n      throw new Error(\"Browser pool not initialized. This should not happen.\");\n    }\n\n    // Bind `resolveProxy` to `this` so the scraper can call it per-attempt\n    // without losing the client context.\n    const boundResolveProxy = (tier: ProxyTier | undefined) => this.resolveProxy(tier);\n\n    return await scrape({\n      ...options,\n      // Caller may still pass an explicit proxy to opt out of tier routing.\n      proxy: options.proxy,\n      showChrome: options.showChrome ?? this.options.showChrome,\n      verbose: options.verbose ?? this.options.verbose,\n      tieredPool: this.tieredPool,\n      proxyGate: this.proxyGate ?? undefined,\n      healthTracker: this.healthTracker ?? undefined,\n      resolveProxy: boundResolveProxy,\n    });\n  }\n\n  /**\n   * Crawl a website to discover URLs\n   *\n   * @param options - Crawl options (url, depth, maxPages, etc.)\n   * @returns Crawl result with discovered URLs and optional scraped content\n   *\n   * @example\n   * const result = await reader.crawl({\n   *   url: 'https://example.com',\n   *   depth: 2,\n   *   maxPages: 50,\n   *   scrape: true,\n   * });\n   */\n  async crawl(options: Omit<CrawlOptions, \"connectionToCore\" | \"pool\">): Promise<CrawlResult> {\n    await this.ensureInitialized();\n\n    if (!this.tieredPool) {\n      throw new Error(\"Browser pool not initialized. This should not happen.\");\n    }\n\n    const boundResolveProxy = (tier: ProxyTier | undefined) => this.resolveProxy(tier);\n\n    return await crawl({\n      ...options,\n      proxy: options.proxy,\n      tieredPool: this.tieredPool,\n      proxyGate: this.proxyGate ?? undefined,\n      healthTracker: this.healthTracker ?? undefined,\n      resolveProxy: boundResolveProxy,\n    });\n  }\n\n  /**\n   * Create a browser session with a CDP WebSocket endpoint.\n   *\n   * Launches a Hero-stealthed Chrome and returns a WebSocket URL that\n   * Playwright or Puppeteer can connect to via `connectOverCDP()`.\n   * Full anti-bot stealth is active (TLS fingerprinting, navigator\n   * spoofing, WebRTC masking, MITM proxy).\n   *\n   * @param options - Browser session options\n   * @returns Browser session with wsEndpoint and close() method\n   *\n   * @example\n   * ```typescript\n   * import { chromium } from 'playwright';\n   *\n   * const session = await reader.browser({ proxyTier: 'residential' });\n   * const browser = await chromium.connectOverCDP(session.wsEndpoint);\n   * const page = browser.contexts()[0].pages()[0];\n   *\n   * await page.goto('https://example.com');\n   * console.log(await page.title());\n   *\n   * await session.close();\n   * ```\n   */\n  async browser(options: Omit<BrowserOptions, \"connectionToCore\"> = {}): Promise<BrowserSession> {\n    // No ensureInitialized() — browser sessions create their own dedicated\n    // HeroCore instance. They don't need the shared pool or HeroCore.\n    if (this.closed) {\n      throw new Error(\"ReaderClient has been closed. Create a new instance.\");\n    }\n\n    const boundResolveProxy = (tier: ProxyTier | undefined) => this.resolveProxy(tier);\n\n    const session = await createBrowserSession({\n      ...options,\n      resolveProxy: boundResolveProxy,\n      showChrome: options.showChrome ?? this.options.showChrome,\n      verbose: options.verbose ?? this.options.verbose,\n    });\n\n    // Track active sessions so close() can clean them up\n    this.activeSessions.set(session.sessionId, session);\n\n    // Remove from tracking when the session closes\n    const originalClose = session.close;\n    session.close = async () => {\n      this.activeSessions.delete(session.sessionId);\n      await originalClose();\n    };\n\n    return session;\n  }\n\n  /**\n   * Check if the client is initialized and ready\n   */\n  isReady(): boolean {\n    return this.initialized && !this.closed;\n  }\n\n  /**\n   * Close the client and release resources\n   *\n   * Note: This is optional - the client will auto-close on process exit.\n   */\n  async close(): Promise<void> {\n    if (this.closed) {\n      return;\n    }\n\n    this.closed = true;\n\n    // Remove process event handlers to allow clean exit\n    this.removeCleanupHandlers();\n\n    // Close all active browser sessions first\n    if (this.activeSessions.size > 0) {\n      if (this.options.verbose) {\n        logger.info(`Closing ${this.activeSessions.size} active browser session(s)...`);\n      }\n      const sessionClosePromises = Array.from(this.activeSessions.values()).map((session) =>\n        session.close().catch(() => {})\n      );\n      await Promise.all(sessionClosePromises);\n      this.activeSessions.clear();\n    }\n\n    // Shutdown the tiered pool first (closes every browser in every tier)\n    if (this.tieredPool) {\n      if (this.options.verbose) {\n        logger.info(\"Shutting down tiered browser pool...\");\n      }\n\n      try {\n        await this.tieredPool.close();\n      } catch (error: any) {\n        if (this.options.verbose) {\n          logger.warn(`Error shutting down pool: ${error.message}`);\n        }\n      }\n\n      this.tieredPool = null;\n    }\n\n    this.proxyGate = null;\n    this.healthTracker = null;\n\n    // Then close HeroCore\n    if (this.heroCore) {\n      if (this.options.verbose) {\n        logger.info(\"Closing HeroCore...\");\n      }\n\n      try {\n        await this.heroCore.close();\n        // Also call static shutdown to clean up any remaining resources\n        await HeroCore.shutdown();\n      } catch (error: any) {\n        // Ignore close errors\n        if (this.options.verbose) {\n          logger.warn(`Error closing HeroCore: ${error.message}`);\n        }\n      }\n\n      this.heroCore = null;\n    }\n\n    this.initialized = false;\n\n    if (this.options.verbose) {\n      logger.info(\"ReaderClient closed\");\n    }\n  }\n\n  /**\n   * Register cleanup handlers for process exit\n   */\n  private registerCleanup(): void {\n    this.cleanupHandler = async () => {\n      await this.close();\n    };\n\n    // Handle various exit signals\n    process.once(\"beforeExit\", this.cleanupHandler);\n    process.once(\"SIGINT\", async () => {\n      await this.cleanupHandler?.();\n      process.exit(0);\n    });\n    process.once(\"SIGTERM\", async () => {\n      await this.cleanupHandler?.();\n      process.exit(0);\n    });\n  }\n\n  /**\n   * Remove process cleanup handlers\n   */\n  private removeCleanupHandlers(): void {\n    if (this.cleanupHandler) {\n      process.removeListener(\"beforeExit\", this.cleanupHandler);\n      this.cleanupHandler = null;\n    }\n  }\n}\n"
  },
  {
    "path": "src/cloudflare/detector.ts",
    "content": "import type Hero from \"@ulixee/hero\";\nimport type { ChallengeDetection } from \"./types\";\n\n/**\n * CLOUDFLARE-SPECIFIC DOM SELECTORS\n *\n * These are ONLY present during active Cloudflare challenges.\n * We query for actual DOM elements, not string matching.\n */\nconst CLOUDFLARE_CHALLENGE_SELECTORS = [\n  \"#challenge-running\",\n  \"#challenge-stage\",\n  \"#challenge-form\",\n  \".cf-browser-verification\",\n  \"#cf-wrapper\",\n  \"#cf-hcaptcha-container\",\n  \"#turnstile-wrapper\",\n];\n\n/**\n * CLOUDFLARE-SPECIFIC TEXT PATTERNS\n *\n * These phrases only appear during active Cloudflare challenges.\n * Must be combined with other Cloudflare signals to avoid false positives.\n */\nconst CLOUDFLARE_TEXT_PATTERNS = [\n  \"checking if the site connection is secure\",\n  \"this process is automatic. your browser will redirect\",\n  \"ray id:\",\n  \"performance & security by cloudflare\",\n];\n\n/**\n * CLOUDFLARE INFRASTRUCTURE SIGNALS\n *\n * Indicators that the page is served by Cloudflare\n */\nconst CLOUDFLARE_INFRA_PATTERNS = [\"/cdn-cgi/\", \"cloudflare\", \"__cf_bm\", \"cf-ray\"];\n\n/**\n * BLOCKED/403 SIGNALS (Cloudflare-specific)\n *\n * Detect when Cloudflare explicitly blocks access\n */\nconst CLOUDFLARE_BLOCKED_PATTERNS = [\"sorry, you have been blocked\", \"ray id:\"];\n\n/**\n * Detect if current page is a Cloudflare challenge\n *\n * Uses multi-signal approach requiring BOTH:\n * 1. Cloudflare infrastructure indicators (cdn-cgi, cf-ray, etc.)\n * 2. Challenge-specific elements or text\n *\n * This prevents false positives on login pages or other sites\n * that happen to use similar text.\n *\n * @param hero - Hero instance with loaded page\n * @returns Detection result with confidence score and signals\n */\nexport async function detectChallenge(hero: Hero): Promise<ChallengeDetection> {\n  const signals: string[] = [];\n  let type: ChallengeDetection[\"type\"] = \"none\";\n  let hasCloudflareInfra = false;\n  let hasChallengeIndicator = false;\n\n  try {\n    // Ensure we have access to document\n    if (!hero.document) {\n      return {\n        isChallenge: false,\n        type: \"none\",\n        confidence: 0,\n        signals: [\"No document available\"],\n      };\n    }\n\n    // =========================================================================\n    // CHECK 1: CLOUDFLARE INFRASTRUCTURE (required for any detection)\n    // =========================================================================\n    const html = await hero.document.documentElement.outerHTML;\n    const htmlLower = html.toLowerCase();\n\n    for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {\n      if (htmlLower.includes(pattern)) {\n        hasCloudflareInfra = true;\n        signals.push(`Cloudflare infra: \"${pattern}\"`);\n        break;\n      }\n    }\n\n    // If no Cloudflare infrastructure detected, it's not a Cloudflare challenge\n    if (!hasCloudflareInfra) {\n      return {\n        isChallenge: false,\n        type: \"none\",\n        confidence: 0,\n        signals: [\"No Cloudflare infrastructure detected\"],\n      };\n    }\n\n    // =========================================================================\n    // CHECK 2: CHALLENGE DOM ELEMENTS (using actual DOM queries)\n    // =========================================================================\n    for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {\n      try {\n        const element = await hero.document.querySelector(selector);\n        if (element) {\n          hasChallengeIndicator = true;\n          signals.push(`Challenge element: ${selector}`);\n          type = \"js_challenge\";\n        }\n      } catch {\n        // Element not found, continue\n      }\n    }\n\n    // =========================================================================\n    // CHECK 3: CHALLENGE-SPECIFIC TEXT\n    // =========================================================================\n    for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {\n      if (htmlLower.includes(pattern)) {\n        hasChallengeIndicator = true;\n        signals.push(`Challenge text: \"${pattern}\"`);\n        type = type === \"none\" ? \"js_challenge\" : type;\n      }\n    }\n\n    // =========================================================================\n    // CHECK 4: \"WAITING FOR\" + \"TO RESPOND\" (Cloudflare-specific combo)\n    // =========================================================================\n    if (htmlLower.includes(\"waiting for\") && htmlLower.includes(\"to respond\")) {\n      hasChallengeIndicator = true;\n      signals.push('Challenge text: \"waiting for...to respond\"');\n      type = type === \"none\" ? \"js_challenge\" : type;\n    }\n\n    // =========================================================================\n    // CHECK 5: CLOUDFLARE BLOCKED DETECTION\n    // =========================================================================\n    // Check for blocked page with Ray ID (Cloudflare-specific)\n    const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));\n    if (hasBlocked) {\n      hasChallengeIndicator = true;\n      signals.push(\"Cloudflare block page detected\");\n      type = \"blocked\";\n    }\n\n    // Challenge only if we have BOTH Cloudflare infra AND challenge indicators\n    const isChallenge = hasCloudflareInfra && hasChallengeIndicator;\n    const confidence = isChallenge ? 100 : 0;\n\n    return {\n      isChallenge,\n      type: isChallenge ? type : \"none\",\n      confidence,\n      signals,\n    };\n  } catch (error: any) {\n    return {\n      isChallenge: false,\n      type: \"none\",\n      confidence: 0,\n      signals: [`Error during detection: ${error.message}`],\n    };\n  }\n}\n\n/**\n * Quick check - just returns boolean\n *\n * @param hero - Hero instance\n * @returns True if challenge page detected\n */\nexport async function isChallengePage(hero: Hero): Promise<boolean> {\n  const detection = await detectChallenge(hero);\n  return detection.isChallenge;\n}\n"
  },
  {
    "path": "src/cloudflare/handler.ts",
    "content": "import type Hero from \"@ulixee/hero\";\nimport { detectChallenge } from \"./detector\";\nimport type { ChallengeResolutionResult, ChallengeWaitOptions } from \"./types\";\n\n/**\n * Wait for Cloudflare challenge to resolve\n *\n * Uses multiple detection strategies:\n * 1. URL redirect detection (page redirects after challenge)\n * 2. Signal polling (challenge-specific elements/text disappear)\n *\n * @param hero - Hero instance with challenge page loaded\n * @param options - Waiting options\n * @returns Resolution result with method and time waited\n *\n * @example\n * const result = await waitForChallengeResolution(hero, {\n *   maxWaitMs: 45000,\n *   pollIntervalMs: 500,\n *   verbose: true,\n *   initialUrl: 'https://example.com'\n * });\n *\n * if (result.resolved) {\n *   console.log(`Challenge resolved via ${result.method} in ${result.waitedMs}ms`);\n * }\n */\nexport async function waitForChallengeResolution(\n  hero: Hero,\n  options: ChallengeWaitOptions\n): Promise<ChallengeResolutionResult> {\n  const { maxWaitMs = 45000, pollIntervalMs = 500, verbose = false, initialUrl } = options;\n\n  const startTime = Date.now();\n  const log = (msg: string) => verbose && console.log(`   ${msg}`);\n\n  while (Date.now() - startTime < maxWaitMs) {\n    const elapsed = Date.now() - startTime;\n\n    // =========================================================================\n    // STRATEGY 1: Check for URL change (redirect after challenge)\n    // =========================================================================\n    try {\n      const currentUrl = await hero.url;\n      if (currentUrl !== initialUrl) {\n        log(`✓ URL changed: ${initialUrl} → ${currentUrl}`);\n        // Wait for the new page to fully load after redirect\n        log(`  Waiting for new page to load...`);\n        try {\n          await hero.waitForLoad(\"DomContentLoaded\", { timeoutMs: 30000 });\n          log(`  DOMContentLoaded`);\n        } catch {\n          log(`  DOMContentLoaded timeout, continuing...`);\n        }\n        // Additional wait for JS to execute and render\n        await hero.waitForPaintingStable().catch(() => {});\n        log(`  Page stabilized`);\n        return { resolved: true, method: \"url_redirect\", waitedMs: elapsed };\n      }\n    } catch {\n      // URL check failed, continue with other strategies\n    }\n\n    // =========================================================================\n    // STRATEGY 2: Check if challenge signals are gone\n    // =========================================================================\n    const detection = await detectChallenge(hero);\n\n    if (!detection.isChallenge) {\n      log(`✓ Challenge signals cleared (confidence dropped to ${detection.confidence})`);\n      // Wait for page to fully load after challenge clears\n      log(`  Waiting for page to load...`);\n      try {\n        await hero.waitForLoad(\"DomContentLoaded\", { timeoutMs: 30000 });\n        log(`  DOMContentLoaded`);\n      } catch {\n        log(`  DOMContentLoaded timeout, continuing...`);\n      }\n      await hero.waitForPaintingStable().catch(() => {});\n      log(`  Page stabilized`);\n      return { resolved: true, method: \"signals_cleared\", waitedMs: elapsed };\n    }\n\n    // Log progress\n    log(\n      `⏳ ${(elapsed / 1000).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`\n    );\n\n    // Wait before next poll\n    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));\n  }\n\n  // Timeout reached\n  return {\n    resolved: false,\n    method: \"timeout\",\n    waitedMs: Date.now() - startTime,\n  };\n}\n\n/**\n * Wait for a specific CSS selector to appear\n *\n * Useful when you know exactly what element should appear after challenge.\n *\n * @param hero - Hero instance\n * @param selector - CSS selector to wait for\n * @param maxWaitMs - Maximum time to wait\n * @param verbose - Enable logging\n * @returns Whether selector was found and time waited\n *\n * @example\n * const result = await waitForSelector(hero, '.content', 30000, true);\n * if (result.found) {\n *   console.log(`Content appeared after ${result.waitedMs}ms`);\n * }\n */\nexport async function waitForSelector(\n  hero: Hero,\n  selector: string,\n  maxWaitMs: number,\n  verbose: boolean = false\n): Promise<{ found: boolean; waitedMs: number }> {\n  const startTime = Date.now();\n  const log = (msg: string) => verbose && console.log(`   ${msg}`);\n\n  log(`Waiting for selector: \"${selector}\"`);\n\n  while (Date.now() - startTime < maxWaitMs) {\n    try {\n      const element = await hero.document.querySelector(selector);\n      if (element) {\n        const elapsed = Date.now() - startTime;\n        log(`✓ Selector found after ${(elapsed / 1000).toFixed(1)}s`);\n        return { found: true, waitedMs: elapsed };\n      }\n    } catch {\n      // Selector not found yet, continue\n    }\n\n    await new Promise((resolve) => setTimeout(resolve, 300));\n  }\n\n  log(`✗ Selector not found within timeout`);\n  return { found: false, waitedMs: Date.now() - startTime };\n}\n\n/**\n * Handle Cloudflare challenge with automatic detection and waiting\n *\n * High-level function that combines detection and resolution.\n *\n * @param hero - Hero instance\n * @param options - Wait options (without initialUrl)\n * @returns Resolution result\n *\n * @example\n * await hero.goto('https://example.com');\n * const result = await handleChallenge(hero, { verbose: true });\n * if (result.resolved) {\n *   // Challenge passed, continue scraping\n * }\n */\nexport async function handleChallenge(\n  hero: Hero,\n  options: Omit<ChallengeWaitOptions, \"initialUrl\"> = {}\n): Promise<ChallengeResolutionResult> {\n  // Get current URL\n  const initialUrl = await hero.url;\n\n  // Detect challenge\n  const detection = await detectChallenge(hero);\n\n  if (!detection.isChallenge) {\n    // No challenge, return immediately\n    return { resolved: true, method: \"signals_cleared\", waitedMs: 0 };\n  }\n\n  // Challenge detected, wait for resolution\n  return waitForChallengeResolution(hero, {\n    ...options,\n    initialUrl,\n  });\n}\n"
  },
  {
    "path": "src/cloudflare/types.ts",
    "content": "/**\n * Cloudflare challenge detection result\n */\nexport interface ChallengeDetection {\n  /** Whether a challenge was detected */\n  isChallenge: boolean;\n\n  /** Type of challenge */\n  type: \"js_challenge\" | \"turnstile\" | \"captcha\" | \"blocked\" | \"none\";\n\n  /** Confidence level (0-100) */\n  confidence: number;\n\n  /** Detection signals found */\n  signals: string[];\n}\n\n/**\n * Challenge resolution result\n */\nexport interface ChallengeResolutionResult {\n  /** Whether the challenge was resolved */\n  resolved: boolean;\n\n  /** Method used to detect resolution */\n  method: \"url_redirect\" | \"signals_cleared\" | \"timeout\";\n\n  /** Time waited in milliseconds */\n  waitedMs: number;\n}\n\n/**\n * Challenge waiting options\n */\nexport interface ChallengeWaitOptions {\n  /** Maximum time to wait for resolution (default: 45000ms) */\n  maxWaitMs?: number;\n\n  /** How often to poll for resolution (default: 500ms) */\n  pollIntervalMs?: number;\n\n  /** Enable verbose logging */\n  verbose?: boolean;\n\n  /** Initial URL before challenge */\n  initialUrl: string;\n}\n"
  },
  {
    "path": "src/config/domain-profiles.ts",
    "content": "/**\n * Domain Profiles\n *\n * Per-domain scrape configuration overrides. Reader ships with NO\n * built-in profiles — the caller provides them via ScrapeOptions.domainProfiles.\n *\n * Profiles are merged with user-provided options — user options\n * take precedence. If a user explicitly sets a value, the profile\n * won't override it.\n */\n\nimport type { ScrapeOptions } from \"../types\";\n\n/**\n * Subset of ScrapeOptions that can be overridden per domain\n */\nexport interface DomainProfile {\n  /** Override proxy tier for this domain */\n  proxyTier?: \"datacenter\" | \"residential\";\n  /** Override timeout for this domain */\n  timeoutMs?: number;\n  /** Override batch concurrency (limit parallel requests to this domain) */\n  batchConcurrency?: number;\n  /** Minimum delay between requests in ms (for rate-sensitive sites) */\n  minDelayMs?: number;\n  /**\n   * Tighten the per-proxy concurrency cap when scraping this domain.\n   */\n  maxConcurrentPerProxy?: number;\n}\n\n/**\n * Look up a domain profile by URL or hostname.\n *\n * @param urlOrHostname - Full URL or hostname\n * @param profiles - Domain profile map (from ScrapeOptions.domainProfiles)\n * @returns Domain profile if found, undefined otherwise\n */\nexport function getDomainProfile(\n  urlOrHostname: string,\n  profiles?: Record<string, DomainProfile>\n): DomainProfile | undefined {\n  if (!profiles || Object.keys(profiles).length === 0) return undefined;\n\n  let hostname: string;\n  try {\n    hostname = urlOrHostname.includes(\"://\") ? new URL(urlOrHostname).hostname : urlOrHostname;\n  } catch {\n    return undefined;\n  }\n\n  hostname = hostname.replace(/^www\\./, \"\");\n\n  // Exact match\n  if (profiles[hostname]) {\n    return profiles[hostname];\n  }\n\n  // Subdomain match (e.g., \"shop.amazon.com\" → \"amazon.com\")\n  for (const domain of Object.keys(profiles)) {\n    if (hostname.endsWith(`.${domain}`)) {\n      return profiles[domain];\n    }\n  }\n\n  return undefined;\n}\n\n/**\n * Merge a domain profile with user options.\n * User-provided options take precedence over profile values.\n */\nexport function applyDomainProfile<T extends Partial<ScrapeOptions>>(\n  options: T,\n  profile: DomainProfile\n): T {\n  const merged = { ...options };\n\n  if (profile.timeoutMs && !options.timeoutMs) {\n    merged.timeoutMs = profile.timeoutMs;\n  }\n  if (profile.batchConcurrency && !options.batchConcurrency) {\n    merged.batchConcurrency = profile.batchConcurrency;\n  }\n  if (profile.proxyTier && !options.proxyTier) {\n    merged.proxyTier = profile.proxyTier;\n  }\n\n  return merged;\n}\n"
  },
  {
    "path": "src/crawl-types.ts",
    "content": "import type { ScrapeResult, ProxyConfig, ProxyTier } from \"./types\";\nimport type { IBrowserPool } from \"./browser/types\";\n\n/**\n * Crawl options interface\n */\nexport interface CrawlOptions {\n  /** Single seed URL to start crawling from */\n  url: string;\n\n  /** Maximum depth to crawl (default: 1) */\n  depth?: number;\n\n  /** Maximum pages to discover (default: 20) */\n  maxPages?: number;\n\n  /** Also scrape full content (default: false) */\n  scrape?: boolean;\n\n  /** Delay between requests in milliseconds (default: 1000) */\n  delayMs?: number;\n\n  /** Total timeout for the entire crawl operation in milliseconds */\n  timeoutMs?: number;\n\n  /** URL patterns to include (regex strings) - if set, only matching URLs are crawled */\n  includePatterns?: string[];\n\n  /** URL patterns to exclude (regex strings) - matching URLs are skipped */\n  excludePatterns?: string[];\n\n  // ============================================================================\n  // Scrape options (used when scrape: true)\n  // ============================================================================\n\n  /** Output formats for scraped content (default: ['markdown']) */\n  formats?: Array<\"markdown\" | \"html\">;\n\n  /** Number of URLs to scrape in parallel (default: 2) */\n  scrapeConcurrency?: number;\n\n  // ============================================================================\n  // Content cleaning options\n  // ============================================================================\n\n  /** Remove ads and tracking elements (default: true) */\n  removeAds?: boolean;\n\n  /** Remove base64-encoded images to reduce output size (default: true) */\n  removeBase64Images?: boolean;\n\n  // ============================================================================\n  // Hero-specific options\n  // ============================================================================\n\n  /** Proxy configuration for Hero */\n  proxy?: ProxyConfig;\n\n  /** Proxy tier selection (default: \"auto\") */\n  proxyTier?: ProxyTier;\n\n  /** Custom user agent string */\n  userAgent?: string;\n\n  /** Enable verbose logging (default: false) */\n  verbose?: boolean;\n\n  /** Show Chrome window (default: false) */\n  showChrome?: boolean;\n\n  /** Connection to Hero Core (for shared Core usage) */\n  connectionToCore?: any;\n\n  /** Legacy single browser pool (internal). Kept for backward-compat during the migration. */\n  pool?: IBrowserPool;\n\n  /**\n   * Tiered browser pool (internal, provided by ReaderClient).\n   *\n   * When present, the crawler uses this instead of the legacy `pool`.\n   * Typed as `unknown` to avoid a type cycle; the crawler casts it to\n   * TieredBrowserPool at the use site.\n   */\n  tieredPool?: unknown;\n\n  /**\n   * Per-proxy concurrency gate (internal, provided by ReaderClient).\n   *\n   * When present, the crawler wraps every fetchPage in `proxyGate.withSlot`\n   * the same way the scraper does. Typed as `unknown` to avoid a cycle.\n   */\n  proxyGate?: unknown;\n\n  /**\n   * Per-proxy health tracker (internal, provided by ReaderClient).\n   */\n  healthTracker?: unknown;\n\n  /**\n   * Callback that resolves a proxy URL for a given tier. Provided by\n   * ReaderClient. Used per-fetch so escalation works.\n   */\n  resolveProxy?: (tier: ProxyTier | undefined) => ProxyConfig | undefined;\n}\n\n/**\n * Crawl URL result interface\n */\nexport interface CrawlUrl {\n  /** URL of the page */\n  url: string;\n\n  /** Page title */\n  title: string;\n\n  /** Page description or null if not found */\n  description: string | null;\n}\n\n/**\n * Crawl result interface\n */\nexport interface CrawlResult {\n  /** Array of discovered URLs with basic info */\n  urls: CrawlUrl[];\n\n  /** Full scrape results (only when scrape: true) */\n  scraped?: ScrapeResult;\n\n  /** Crawl operation metadata */\n  metadata: CrawlMetadata;\n}\n\n/**\n * Crawl metadata interface\n */\nexport interface CrawlMetadata {\n  /** Total URLs discovered */\n  totalUrls: number;\n\n  /** Maximum depth reached */\n  maxDepth: number;\n\n  /** Total crawl duration in milliseconds */\n  totalDuration: number;\n\n  /** Seed URL that started the crawl */\n  seedUrl: string;\n}\n"
  },
  {
    "path": "src/crawler.ts",
    "content": "import { parseHTML } from \"linkedom\";\nimport {\n  resolveUrl,\n  isValidUrl,\n  isSameDomain,\n  getUrlKey,\n  isContentUrl,\n  shouldIncludeUrl,\n} from \"./utils/url-helpers\";\nimport { fetchRobotsTxt, isUrlAllowed, type RobotsRules } from \"./utils/robots-parser\";\nimport { rateLimit } from \"./utils/rate-limiter\";\nimport { createLogger } from \"./utils/logger\";\nimport { scrape } from \"./scraper\";\nimport type { CrawlOptions, CrawlResult, CrawlUrl, CrawlMetadata } from \"./crawl-types\";\nimport type { ScrapeResult } from \"./types\";\n\n/**\n * Crawler class for discovering and optionally scraping pages.\n *\n * Discovery and scraping both go through the scraper, which handles\n * Hero, proxy escalation, and timeouts. The crawler owns BFS traversal,\n * link extraction, deduplication, robots.txt, and rate limiting.\n */\nexport class Crawler {\n  private options: CrawlOptions;\n  private visited: Set<string> = new Set();\n  private queue: Array<{ url: string; depth: number }> = [];\n  private urls: CrawlUrl[] = [];\n  private logger = createLogger(\"crawler\");\n  private robotsRules: RobotsRules | null = null;\n\n  constructor(options: CrawlOptions) {\n    this.options = {\n      depth: 1,\n      maxPages: 20,\n      scrape: false,\n      delayMs: 1000,\n      formats: [\"markdown\", \"html\"],\n      scrapeConcurrency: 2,\n      verbose: false,\n      showChrome: false,\n      ...options,\n    };\n  }\n\n  /**\n   * Start crawling\n   */\n  async crawl(): Promise<CrawlResult> {\n    const startTime = Date.now();\n\n    // Fetch robots.txt rules\n    this.robotsRules = await fetchRobotsTxt(this.options.url);\n    if (this.robotsRules) {\n      this.logger.info(\"Loaded robots.txt rules\");\n    }\n\n    // Add seed URL to queue\n    if (isUrlAllowed(this.options.url, this.robotsRules)) {\n      this.queue.push({ url: this.options.url, depth: 0 });\n    } else {\n      this.logger.warn(`Seed URL blocked by robots.txt: ${this.options.url}`);\n    }\n\n    // BFS crawl\n    while (this.queue.length > 0 && this.urls.length < (this.options.maxPages ?? 20)) {\n      if (this.options.timeoutMs && Date.now() - startTime > this.options.timeoutMs) {\n        this.logger.warn(`Crawl timed out after ${this.options.timeoutMs}ms`);\n        break;\n      }\n\n      const item = this.queue.shift()!;\n      const urlKey = getUrlKey(item.url);\n\n      if (this.visited.has(urlKey)) {\n        continue;\n      }\n\n      // Fetch page via scraper\n      const result = await this.fetchPage(item.url);\n\n      if (result) {\n        this.urls.push(result.crawlUrl);\n        this.visited.add(urlKey);\n\n        // Extract links if not at max depth\n        if (item.depth < (this.options.depth ?? 1)) {\n          const links = this.extractLinks(result.html, item.url, item.depth + 1);\n          this.queue.push(...links);\n        }\n      }\n\n      // Rate limit\n      const delay = this.robotsRules?.crawlDelay || (this.options.delayMs ?? 1000);\n      await rateLimit(delay);\n    }\n\n    const metadata: CrawlMetadata = {\n      totalUrls: this.urls.length,\n      maxDepth: this.options.depth ?? 1,\n      totalDuration: Date.now() - startTime,\n      seedUrl: this.options.url,\n    };\n\n    // Optionally scrape all discovered URLs for content\n    let scraped: ScrapeResult | undefined;\n    if (this.options.scrape) {\n      scraped = await this.scrapeDiscoveredUrls();\n    }\n\n    return {\n      urls: this.urls,\n      scraped,\n      metadata,\n    };\n  }\n\n  /**\n   * Fetch a single page for discovery using the scraper.\n   *\n   * Calls scrape() with onlyMainContent=false so link extraction gets\n   * the full page HTML. The scraper handles Hero, proxy escalation,\n   * and timeouts internally.\n   */\n  private async fetchPage(url: string): Promise<{ crawlUrl: CrawlUrl; html: string } | null> {\n    try {\n      const result = await scrape({\n        urls: [url],\n        formats: [], // We only need rawHtml for discovery\n        onlyMainContent: false,\n        proxy: this.options.proxy,\n        proxyTier: this.options.proxyTier,\n        timeoutMs: this.options.timeoutMs,\n        verbose: this.options.verbose,\n        showChrome: this.options.showChrome,\n        connectionToCore: this.options.connectionToCore,\n        pool: this.options.pool,\n        tieredPool: this.options.tieredPool,\n        proxyGate: this.options.proxyGate,\n        healthTracker: this.options.healthTracker,\n        resolveProxy: this.options.resolveProxy,\n      });\n\n      if (result.data.length === 0) {\n        this.logger.warn(`[crawler] No data returned for ${url}`);\n        return null;\n      }\n\n      const page = result.data[0];\n\n      return {\n        crawlUrl: {\n          url: page.metadata.baseUrl,\n          title: page.metadata.website?.title || \"Untitled\",\n          description: page.metadata.website?.description ?? null,\n        },\n        html: page.rawHtml,\n      };\n    } catch (error: unknown) {\n      const msg = error instanceof Error ? error.message : String(error);\n      this.logger.error(`[crawler] Failed to fetch ${url}: ${msg}`);\n      return null;\n    }\n  }\n\n  /**\n   * Extract links from HTML content using DOM parsing\n   */\n  private extractLinks(\n    html: string,\n    baseUrl: string,\n    depth: number\n  ): Array<{ url: string; depth: number }> {\n    const links: Array<{ url: string; depth: number }> = [];\n    const { document } = parseHTML(html);\n\n    document.querySelectorAll(\"a[href]\").forEach((anchor: Element) => {\n      const rawHref = anchor.getAttribute(\"href\");\n      if (!rawHref) return;\n\n      const href = rawHref.trim();\n      if (!href) return;\n\n      // Skip fragment-only links\n      if (href.startsWith(\"#\")) return;\n\n      // Skip non-HTTP schemes\n      const lowerHref = href.toLowerCase();\n      if (\n        lowerHref.startsWith(\"javascript:\") ||\n        lowerHref.startsWith(\"mailto:\") ||\n        lowerHref.startsWith(\"tel:\") ||\n        lowerHref.startsWith(\"data:\") ||\n        lowerHref.startsWith(\"blob:\") ||\n        lowerHref.startsWith(\"ftp:\")\n      ) {\n        return;\n      }\n\n      // Resolve relative URLs\n      let resolved = resolveUrl(href, baseUrl);\n      if (!resolved || !isValidUrl(resolved)) return;\n\n      // Strip hash fragments\n      try {\n        const parsed = new URL(resolved);\n        parsed.hash = \"\";\n        resolved = parsed.toString();\n      } catch {\n        return;\n      }\n\n      // Same domain only\n      if (!isSameDomain(resolved, this.options.url)) return;\n\n      // Content pages only\n      if (!isContentUrl(resolved)) return;\n\n      // Include/exclude patterns\n      if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns))\n        return;\n\n      // Robots.txt\n      if (!isUrlAllowed(resolved, this.robotsRules)) return;\n\n      // Deduplication\n      const urlKey = getUrlKey(resolved);\n      if (this.visited.has(urlKey) || this.queue.some((q) => getUrlKey(q.url) === urlKey)) {\n        return;\n      }\n\n      links.push({ url: resolved, depth });\n    });\n\n    return links;\n  }\n\n  /**\n   * Scrape all discovered URLs for content.\n   */\n  private async scrapeDiscoveredUrls(): Promise<ScrapeResult> {\n    const urls = this.urls.map((u) => u.url);\n\n    return scrape({\n      urls,\n      formats: this.options.formats || [\"markdown\", \"html\"],\n      batchConcurrency: this.options.scrapeConcurrency || 2,\n      proxy: this.options.proxy,\n      proxyTier: this.options.proxyTier,\n      userAgent: this.options.userAgent,\n      verbose: this.options.verbose,\n      showChrome: this.options.showChrome,\n      pool: this.options.pool,\n      tieredPool: this.options.tieredPool,\n      proxyGate: this.options.proxyGate,\n      healthTracker: this.options.healthTracker,\n      resolveProxy: this.options.resolveProxy,\n      removeAds: this.options.removeAds,\n      removeBase64Images: this.options.removeBase64Images,\n    });\n  }\n}\n\n/**\n * Convenience function to crawl a website\n */\nexport async function crawl(options: CrawlOptions): Promise<CrawlResult> {\n  const crawler = new Crawler(options);\n  return crawler.crawl();\n}\n"
  },
  {
    "path": "src/daemon/client.ts",
    "content": "/**\n * Daemon Client\n *\n * A client that connects to the daemon server via HTTP.\n * Used by CLI commands when a daemon is running.\n *\n * @example\n * const client = new DaemonClient({ port: 3847 });\n *\n * const result = await client.scrape({\n *   urls: ['https://example.com'],\n *   formats: ['markdown'],\n * });\n */\n\nimport http from \"http\";\nimport type { ScrapeOptions, ScrapeResult } from \"../types\";\nimport type { CrawlOptions, CrawlResult } from \"../crawl-types\";\nimport type { BrowserOptions } from \"../browser-types\";\nimport type { DaemonStatus, BrowserSessionInfo } from \"./server\";\nimport { DEFAULT_DAEMON_PORT } from \"./server\";\n\n/**\n * Daemon client configuration\n */\nexport interface DaemonClientOptions {\n  /** Port the daemon is running on (default: 3847) */\n  port?: number;\n  /** Request timeout in milliseconds (default: 600000 = 10 minutes) */\n  timeoutMs?: number;\n  /** Bearer token for daemon auth (default: READER_AUTH_TOKEN env var) */\n  authToken?: string;\n}\n\n/**\n * Daemon Client\n */\nexport class DaemonClient {\n  private options: Required<DaemonClientOptions>;\n\n  constructor(options: DaemonClientOptions = {}) {\n    this.options = {\n      port: options.port ?? DEFAULT_DAEMON_PORT,\n      timeoutMs: options.timeoutMs ?? 600000, // 10 minutes default\n      authToken: options.authToken ?? process.env.READER_AUTH_TOKEN ?? \"\",\n    };\n  }\n\n  /**\n   * Scrape URLs via daemon\n   */\n  async scrape(options: Omit<ScrapeOptions, \"connectionToCore\">): Promise<ScrapeResult> {\n    return this.request<ScrapeResult>({\n      action: \"scrape\",\n      options,\n    });\n  }\n\n  /**\n   * Crawl URL via daemon\n   */\n  async crawl(options: Omit<CrawlOptions, \"connectionToCore\">): Promise<CrawlResult> {\n    return this.request<CrawlResult>({\n      action: \"crawl\",\n      options,\n    });\n  }\n\n  /**\n   * Get daemon status\n   */\n  async status(): Promise<DaemonStatus> {\n    return this.request<DaemonStatus>({\n      action: \"status\",\n    });\n  }\n\n  /**\n   * Request daemon shutdown\n   */\n  async shutdown(): Promise<void> {\n    await this.request<{ message: string }>({\n      action: \"shutdown\",\n    });\n  }\n\n  /**\n   * Create a browser session via daemon\n   */\n  async browserCreate(\n    options: Omit<BrowserOptions, \"connectionToCore\"> = {}\n  ): Promise<BrowserSessionInfo> {\n    return this.request<BrowserSessionInfo>({\n      action: \"browser.create\",\n      options,\n    });\n  }\n\n  /**\n   * Stop a browser session via daemon\n   */\n  async browserStop(sessionId: string): Promise<void> {\n    await this.request<{ sessionId: string }>({\n      action: \"browser.stop\",\n      sessionId,\n    });\n  }\n\n  /**\n   * List active browser sessions via daemon\n   */\n  async browserList(): Promise<BrowserSessionInfo[]> {\n    return this.request<BrowserSessionInfo[]>({\n      action: \"browser.list\",\n    });\n  }\n\n  /**\n   * Check if daemon is reachable\n   */\n  async isRunning(): Promise<boolean> {\n    try {\n      await this.status();\n      return true;\n    } catch {\n      return false;\n    }\n  }\n\n  /**\n   * Make HTTP request to daemon\n   */\n  private request<T>(body: object): Promise<T> {\n    return new Promise((resolve, reject) => {\n      const data = JSON.stringify(body);\n\n      const req = http.request(\n        {\n          hostname: \"127.0.0.1\",\n          port: this.options.port,\n          path: \"/\",\n          method: \"POST\",\n          headers: {\n            \"Content-Type\": \"application/json\",\n            \"Content-Length\": Buffer.byteLength(data),\n            ...(this.options.authToken\n              ? { Authorization: `Bearer ${this.options.authToken}` }\n              : {}),\n          },\n          timeout: this.options.timeoutMs,\n        },\n        (res) => {\n          let responseBody = \"\";\n\n          res.on(\"data\", (chunk) => {\n            responseBody += chunk;\n          });\n\n          res.on(\"end\", () => {\n            try {\n              const response = JSON.parse(responseBody);\n\n              if (response.success) {\n                resolve(response.data);\n              } else {\n                reject(new Error(response.error || \"Unknown daemon error\"));\n              }\n            } catch (error) {\n              reject(new Error(`Failed to parse daemon response: ${responseBody}`));\n            }\n          });\n        }\n      );\n\n      req.on(\"error\", (error: NodeJS.ErrnoException) => {\n        if (error.code === \"ECONNREFUSED\") {\n          reject(\n            new Error(`Cannot connect to daemon on port ${this.options.port}. Is it running?`)\n          );\n        } else {\n          reject(error);\n        }\n      });\n\n      req.on(\"timeout\", () => {\n        req.destroy();\n        reject(new Error(`Request to daemon timed out after ${this.options.timeoutMs}ms`));\n      });\n\n      req.write(data);\n      req.end();\n    });\n  }\n}\n\n/**\n * Check if daemon is running on the specified port\n */\nexport async function isDaemonRunning(port: number = DEFAULT_DAEMON_PORT): Promise<boolean> {\n  const client = new DaemonClient({ port, timeoutMs: 5000 });\n  return client.isRunning();\n}\n"
  },
  {
    "path": "src/daemon/index.ts",
    "content": "/**\n * Daemon module exports\n */\n\nexport { DaemonServer, DEFAULT_DAEMON_PORT, getDaemonInfo, getPidFilePath } from \"./server\";\nexport type { DaemonServerOptions, DaemonStatus, BrowserSessionInfo } from \"./server\";\n\nexport { DaemonClient, isDaemonRunning } from \"./client\";\nexport type { DaemonClientOptions } from \"./client\";\n"
  },
  {
    "path": "src/daemon/server.ts",
    "content": "/**\n * Daemon Server\n *\n * An HTTP server that wraps ReaderClient, allowing multiple CLI\n * commands to share a single browser pool for efficient scraping.\n *\n * Endpoints:\n *   POST /          — Scrape/crawl/status/shutdown (JSON body with \"action\" field)\n *   GET  /health    — Liveness check (always 200 if server is up)\n *   GET  /ready     — Readiness check (200 only after browser pool is warm)\n *   GET  /status    — Pool stats, uptime, and engine info\n *\n * Auth:\n *   Set READER_AUTH_TOKEN env var to require Bearer token on all endpoints\n *   except /health (liveness should always be unauthenticated).\n *\n * @example\n * // Start daemon\n * const daemon = new DaemonServer({ port: 3847, poolSize: 5 });\n * await daemon.start();\n *\n * // Stop daemon\n * await daemon.stop();\n */\n\nimport http from \"http\";\nimport { ReaderClient, type ReaderClientOptions } from \"../client\";\nimport type { ScrapeOptions, ScrapeResult } from \"../types\";\nimport type { CrawlOptions, CrawlResult } from \"../crawl-types\";\nimport type { BrowserOptions, BrowserSession } from \"../browser-types\";\nimport { createLogger } from \"../utils/logger\";\nimport { parseProxyPoolsFromEnv } from \"../proxy/env\";\nimport { verifyProxiesOrThrow } from \"../proxy/verify\";\nimport { redactProxyUrl } from \"../browser/proxy-bound-browser\";\n\nconst logger = createLogger(\"daemon\");\n\nexport const DEFAULT_DAEMON_PORT = 6003;\nconst PID_FILE_NAME = \".reader-daemon.pid\";\nconst SHUTDOWN_TIMEOUT_MS = 30_000;\n\n/**\n * Daemon server configuration\n */\nexport interface DaemonServerOptions {\n  /** Port to listen on (default: 3847) */\n  port?: number;\n  /** Browser pool size (default: 5) */\n  poolSize?: number;\n  /** Enable verbose logging (default: false) */\n  verbose?: boolean;\n  /** Show Chrome browser windows (default: false) */\n  showChrome?: boolean;\n  /** Bearer token for API authentication (default: READER_AUTH_TOKEN env var) */\n  authToken?: string;\n}\n\n/**\n * Request body types\n */\ninterface ScrapeRequest {\n  action: \"scrape\";\n  options: Omit<ScrapeOptions, \"connectionToCore\">;\n}\n\ninterface CrawlRequest {\n  action: \"crawl\";\n  options: Omit<CrawlOptions, \"connectionToCore\">;\n}\n\ninterface StatusRequest {\n  action: \"status\";\n}\n\ninterface ShutdownRequest {\n  action: \"shutdown\";\n}\n\ninterface BrowserCreateRequest {\n  action: \"browser.create\";\n  options: Omit<BrowserOptions, \"connectionToCore\">;\n}\n\ninterface BrowserStopRequest {\n  action: \"browser.stop\";\n  sessionId: string;\n}\n\ninterface BrowserListRequest {\n  action: \"browser.list\";\n}\n\ntype DaemonRequest =\n  | ScrapeRequest\n  | CrawlRequest\n  | StatusRequest\n  | ShutdownRequest\n  | BrowserCreateRequest\n  | BrowserStopRequest\n  | BrowserListRequest;\n\n/**\n * Response types\n */\ninterface SuccessResponse<T> {\n  success: true;\n  data: T;\n}\n\ninterface ErrorResponse {\n  success: false;\n  error: string;\n}\n\ntype DaemonResponse<T> = SuccessResponse<T> | ErrorResponse;\n\n/**\n * Status response data\n */\nexport interface DaemonStatus {\n  running: true;\n  ready: boolean;\n  port: number;\n  poolSize: number;\n  uptime: number;\n  pid: number;\n  activeRequests: number;\n}\n\n/**\n * Serializable browser session info (without the close function)\n */\nexport interface BrowserSessionInfo {\n  sessionId: string;\n  wsEndpoint: string;\n  createdAt: string;\n}\n\n/**\n * Daemon Server\n */\nexport class DaemonServer {\n  private server: http.Server | null = null;\n  private client: ReaderClient | null = null;\n  private options: Required<DaemonServerOptions>;\n  private startTime: number = 0;\n  private activeRequests: number = 0;\n  private shuttingDown: boolean = false;\n  private browserSessions = new Map<string, BrowserSession>();\n\n  constructor(options: DaemonServerOptions = {}) {\n    this.options = {\n      port: options.port ?? DEFAULT_DAEMON_PORT,\n      poolSize: options.poolSize ?? 5,\n      verbose: options.verbose ?? false,\n      showChrome: options.showChrome ?? false,\n      authToken: options.authToken ?? process.env.READER_AUTH_TOKEN ?? \"\",\n    };\n  }\n\n  /**\n   * Start the daemon server\n   */\n  async start(): Promise<void> {\n    if (this.server) {\n      throw new Error(\"Daemon is already running\");\n    }\n\n    // Load proxy pools from PROXY_DATACENTER / PROXY_RESIDENTIAL env vars.\n    // Throws on malformed URLs — we refuse to start with a bad proxy config\n    // rather than silently falling through to direct connections, which\n    // would hide the misconfiguration behind partial successes.\n    const { pools: proxyPools, summary: proxySummary } = parseProxyPoolsFromEnv();\n    logger.info(proxySummary);\n\n    // Verify each configured proxy by GETting api.ipify.org through it.\n    // This catches dead URLs, wrong creds, and reachability problems BEFORE\n    // we spend the cost of launching N Hero instances. Throws a clear\n    // multi-line error if any proxy fails — the daemon won't start with a\n    // broken config.\n    if (proxyPools) {\n      logger.info(\"Verifying proxies via api.ipify.org...\");\n      const verified = await verifyProxiesOrThrow(proxyPools);\n      for (const v of verified) {\n        logger.info(`  ✓ [${v.tier}] ${redactProxyUrl(v.proxyUrl)} -> egress IP ${v.egressIp}`);\n      }\n    }\n\n    // Initialize ReaderClient\n    const clientOptions: ReaderClientOptions = {\n      verbose: this.options.verbose,\n      showChrome: this.options.showChrome,\n      browserPool: {\n        size: this.options.poolSize,\n      },\n      ...(proxyPools ? { proxyPools } : {}),\n    };\n\n    this.client = new ReaderClient(clientOptions);\n    await this.client.start();\n\n    // Guard against uncaught exceptions from Hero internals.\n    // Hero's MITM proxy can throw after a page closes (e.g.,\n    // Resources.onMitmError accessing null framesManager). These\n    // are non-fatal race conditions — the scrape already failed,\n    // this is cleanup code hitting a null reference. Log and continue.\n    process.on(\"uncaughtException\", (err) => {\n      logger.error({ err }, \"Uncaught exception (non-fatal, Hero internal)\");\n    });\n\n    // Create HTTP server\n    this.server = http.createServer(this.handleRequest.bind(this));\n\n    // Start listening\n    await new Promise<void>((resolve, reject) => {\n      this.server!.listen(this.options.port, () => {\n        this.startTime = Date.now();\n        if (this.options.verbose) {\n          logger.info(\n            `Daemon started on port ${this.options.port} with pool size ${this.options.poolSize}`\n          );\n        }\n        resolve();\n      });\n\n      this.server!.on(\"error\", (error: NodeJS.ErrnoException) => {\n        if (error.code === \"EADDRINUSE\") {\n          reject(\n            new Error(`Port ${this.options.port} is already in use. Is another daemon running?`)\n          );\n        } else {\n          reject(error);\n        }\n      });\n    });\n\n    // Write PID file\n    await this.writePidFile();\n  }\n\n  /**\n   * Stop the daemon server\n   */\n  async stop(): Promise<void> {\n    if (this.server) {\n      await new Promise<void>((resolve) => {\n        this.server!.close(() => resolve());\n      });\n      this.server = null;\n    }\n\n    if (this.client) {\n      await this.client.close();\n      this.client = null;\n    }\n\n    // Remove PID file\n    await this.removePidFile();\n\n    if (this.options.verbose) {\n      logger.info(\"Daemon stopped\");\n    }\n  }\n\n  /**\n   * Get the port the daemon is running on\n   */\n  getPort(): number {\n    return this.options.port;\n  }\n\n  /**\n   * Validate Bearer token if auth is configured\n   * Returns true if authorized, false if rejected (response already sent).\n   */\n  private checkAuth(req: http.IncomingMessage, res: http.ServerResponse): boolean {\n    if (!this.options.authToken) return true;\n\n    const authHeader = req.headers.authorization;\n    if (authHeader !== `Bearer ${this.options.authToken}`) {\n      this.sendResponse(res, 401, { success: false, error: \"Unauthorized\" });\n      return false;\n    }\n    return true;\n  }\n\n  /**\n   * Handle incoming HTTP requests\n   */\n  private async handleRequest(req: http.IncomingMessage, res: http.ServerResponse): Promise<void> {\n    const method = req.method ?? \"GET\";\n    const url = req.url ?? \"/\";\n    const requestId = req.headers[\"x-request-id\"] as string | undefined;\n    if (requestId) res.setHeader(\"x-request-id\", requestId);\n\n    // --- GET endpoints ---\n\n    // Liveness: always 200 if process is up (no auth required)\n    if (method === \"GET\" && url === \"/health\") {\n      this.sendResponse(res, 200, { success: true, data: { status: \"ok\" } });\n      return;\n    }\n\n    // Readiness: 200 only after pool is warm\n    if (method === \"GET\" && url === \"/ready\") {\n      if (!this.checkAuth(req, res)) return;\n      const ready = this.client?.isReady() ?? false;\n      if (ready) {\n        this.sendResponse(res, 200, { success: true, data: { ready: true } });\n      } else {\n        this.sendResponse(res, 503, { success: false, error: \"Not ready — pool is initializing\" });\n      }\n      return;\n    }\n\n    // Status: pool stats + uptime\n    if (method === \"GET\" && url === \"/status\") {\n      if (!this.checkAuth(req, res)) return;\n      this.handleStatus(res);\n      return;\n    }\n\n    // --- POST / (existing action-based RPC) ---\n\n    if (method !== \"POST\" || url !== \"/\") {\n      this.sendResponse(res, 404, { success: false, error: \"Not found\" });\n      return;\n    }\n\n    if (!this.checkAuth(req, res)) return;\n\n    // Reject new work during shutdown\n    if (this.shuttingDown) {\n      this.sendResponse(res, 503, { success: false, error: \"Server is shutting down\" });\n      return;\n    }\n\n    // Parse request body\n    let body = \"\";\n    for await (const chunk of req) {\n      body += chunk;\n    }\n\n    let request: DaemonRequest;\n    try {\n      request = JSON.parse(body);\n    } catch {\n      this.sendResponse(res, 400, { success: false, error: \"Invalid JSON\" });\n      return;\n    }\n\n    // Track in-flight requests for graceful shutdown\n    this.activeRequests++;\n    try {\n      switch (request.action) {\n        case \"scrape\":\n          await this.handleScrape(res, request.options);\n          break;\n        case \"crawl\":\n          await this.handleCrawl(res, request.options);\n          break;\n        case \"status\":\n          this.handleStatus(res);\n          break;\n        case \"shutdown\":\n          await this.handleShutdown(res);\n          break;\n        case \"browser.create\":\n          await this.handleBrowserCreate(res, request.options);\n          break;\n        case \"browser.stop\":\n          await this.handleBrowserStop(res, request.sessionId);\n          break;\n        case \"browser.list\":\n          this.handleBrowserList(res);\n          break;\n        default:\n          this.sendResponse(res, 400, { success: false, error: \"Unknown action\" });\n      }\n    } catch (error: any) {\n      this.sendResponse(res, 500, { success: false, error: error.message });\n    } finally {\n      this.activeRequests--;\n    }\n  }\n\n  /**\n   * Handle scrape request\n   */\n  private async handleScrape(\n    res: http.ServerResponse,\n    options: Omit<ScrapeOptions, \"connectionToCore\">\n  ): Promise<void> {\n    if (!this.client) {\n      this.sendResponse(res, 500, { success: false, error: \"Client not initialized\" });\n      return;\n    }\n\n    const result = await this.client.scrape(options);\n    this.sendResponse<ScrapeResult>(res, 200, { success: true, data: result });\n  }\n\n  /**\n   * Handle crawl request\n   */\n  private async handleCrawl(\n    res: http.ServerResponse,\n    options: Omit<CrawlOptions, \"connectionToCore\">\n  ): Promise<void> {\n    if (!this.client) {\n      this.sendResponse(res, 500, { success: false, error: \"Client not initialized\" });\n      return;\n    }\n\n    const result = await this.client.crawl(options);\n    this.sendResponse<CrawlResult>(res, 200, { success: true, data: result });\n  }\n\n  /**\n   * Handle status request\n   */\n  private handleStatus(res: http.ServerResponse): void {\n    const status: DaemonStatus = {\n      running: true,\n      ready: this.client?.isReady() ?? false,\n      port: this.options.port,\n      poolSize: this.options.poolSize,\n      uptime: Date.now() - this.startTime,\n      pid: process.pid,\n      activeRequests: this.activeRequests,\n    };\n    this.sendResponse<DaemonStatus>(res, 200, { success: true, data: status });\n  }\n\n  /**\n   * Handle shutdown request\n   */\n  private async handleShutdown(res: http.ServerResponse): Promise<void> {\n    this.sendResponse(res, 200, { success: true, data: { message: \"Shutting down\" } });\n\n    // Graceful shutdown: wait for in-flight requests, then stop\n    setTimeout(() => {\n      this.gracefulStop().then(() => process.exit(0));\n    }, 100);\n  }\n\n  /**\n   * Graceful shutdown: stop accepting new requests, drain in-flight, then close.\n   */\n  async gracefulStop(): Promise<void> {\n    if (this.shuttingDown) return;\n    this.shuttingDown = true;\n\n    logger.info(\"Graceful shutdown initiated...\");\n\n    // 1. Stop accepting new connections\n    if (this.server) {\n      this.server.close();\n    }\n\n    // 2. Wait for in-flight requests to complete (with timeout)\n    const drainStart = Date.now();\n    while (this.activeRequests > 0 && Date.now() - drainStart < SHUTDOWN_TIMEOUT_MS) {\n      if (this.options.verbose) {\n        logger.info(`Waiting for ${this.activeRequests} in-flight request(s) to complete...`);\n      }\n      await new Promise((resolve) => setTimeout(resolve, 500));\n    }\n\n    if (this.activeRequests > 0) {\n      logger.warn(\n        `Shutdown timeout reached with ${this.activeRequests} requests still in-flight — forcing close`\n      );\n    }\n\n    // 3. Close all browser sessions\n    for (const session of this.browserSessions.values()) {\n      await session.close().catch(() => {});\n    }\n    this.browserSessions.clear();\n\n    // 4. Close client and pool\n    await this.stop();\n\n    logger.info(\"Graceful shutdown complete\");\n  }\n\n  /**\n   * Handle browser.create request\n   */\n  private async handleBrowserCreate(\n    res: http.ServerResponse,\n    options: Omit<BrowserOptions, \"connectionToCore\">\n  ): Promise<void> {\n    if (!this.client) {\n      this.sendResponse(res, 500, { success: false, error: \"Client not initialized\" });\n      return;\n    }\n\n    const session = await this.client.browser(options);\n    this.browserSessions.set(session.sessionId, session);\n\n    // Return serializable info (no close function)\n    const info: BrowserSessionInfo = {\n      sessionId: session.sessionId,\n      wsEndpoint: session.wsEndpoint,\n      createdAt: session.createdAt,\n    };\n    this.sendResponse<BrowserSessionInfo>(res, 200, { success: true, data: info });\n  }\n\n  /**\n   * Handle browser.stop request\n   */\n  private async handleBrowserStop(res: http.ServerResponse, sessionId: string): Promise<void> {\n    const session = this.browserSessions.get(sessionId);\n    if (!session) {\n      this.sendResponse(res, 404, { success: false, error: `Session ${sessionId} not found` });\n      return;\n    }\n\n    await session.close();\n    this.browserSessions.delete(sessionId);\n    this.sendResponse(res, 200, { success: true, data: { sessionId } });\n  }\n\n  /**\n   * Handle browser.list request\n   */\n  private handleBrowserList(res: http.ServerResponse): void {\n    const sessions: BrowserSessionInfo[] = Array.from(this.browserSessions.values()).map((s) => ({\n      sessionId: s.sessionId,\n      wsEndpoint: s.wsEndpoint,\n      createdAt: s.createdAt,\n    }));\n    this.sendResponse<BrowserSessionInfo[]>(res, 200, { success: true, data: sessions });\n  }\n\n  /**\n   * Send JSON response\n   */\n  private sendResponse<T>(\n    res: http.ServerResponse,\n    statusCode: number,\n    data: DaemonResponse<T>\n  ): void {\n    res.writeHead(statusCode, { \"Content-Type\": \"application/json\" });\n    res.end(JSON.stringify(data));\n  }\n\n  /**\n   * Write PID file\n   */\n  private async writePidFile(): Promise<void> {\n    const fs = await import(\"fs/promises\");\n    const path = await import(\"path\");\n    const os = await import(\"os\");\n\n    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);\n    const data = JSON.stringify({\n      pid: process.pid,\n      port: this.options.port,\n      startedAt: new Date().toISOString(),\n    });\n\n    await fs.writeFile(pidFile, data);\n  }\n\n  /**\n   * Remove PID file\n   */\n  private async removePidFile(): Promise<void> {\n    const fs = await import(\"fs/promises\");\n    const path = await import(\"path\");\n    const os = await import(\"os\");\n\n    const pidFile = path.join(os.tmpdir(), PID_FILE_NAME);\n    try {\n      await fs.unlink(pidFile);\n    } catch {\n      // Ignore errors\n    }\n  }\n}\n\n/**\n * Get path to PID file\n */\nexport async function getPidFilePath(): Promise<string> {\n  const path = await import(\"path\");\n  const os = await import(\"os\");\n  return path.join(os.tmpdir(), PID_FILE_NAME);\n}\n\n/**\n * Check if daemon is running by reading PID file\n */\nexport async function getDaemonInfo(): Promise<{\n  pid: number;\n  port: number;\n  startedAt: string;\n} | null> {\n  const fs = await import(\"fs/promises\");\n  const pidFile = await getPidFilePath();\n\n  try {\n    const data = await fs.readFile(pidFile, \"utf-8\");\n    const info = JSON.parse(data);\n\n    // Check if process is still running\n    try {\n      process.kill(info.pid, 0); // Signal 0 tests if process exists\n      return info;\n    } catch {\n      // Process not running, clean up stale PID file\n      await fs.unlink(pidFile).catch(() => {});\n      return null;\n    }\n  } catch {\n    return null;\n  }\n}\n"
  },
  {
    "path": "src/engines/errors.ts",
    "content": "/**\n * Engine error classes\n *\n * Used by the Hero engine and orchestrator to signal specific failure\n * conditions. Consumed by the scraper's retry/escalation logic.\n */\n\nimport type { EngineName } from \"./types.js\";\n\n/**\n * Base error for all engine errors\n */\nexport class EngineError extends Error {\n  readonly engine: EngineName;\n  readonly retryable: boolean;\n\n  constructor(\n    engine: EngineName,\n    message: string,\n    options?: { cause?: Error; retryable?: boolean }\n  ) {\n    super(`[${engine}] ${message}`);\n    this.name = \"EngineError\";\n    this.engine = engine;\n    this.retryable = options?.retryable ?? true;\n    this.cause = options?.cause;\n\n    if (Error.captureStackTrace) {\n      Error.captureStackTrace(this, this.constructor);\n    }\n  }\n}\n\n/**\n * Content too short or empty\n */\nexport class InsufficientContentError extends EngineError {\n  readonly contentLength: number;\n  readonly threshold: number;\n\n  constructor(engine: EngineName, contentLength: number, threshold: number = 100) {\n    super(engine, `Insufficient content: ${contentLength} chars (threshold: ${threshold})`, {\n      retryable: true,\n    });\n    this.name = \"InsufficientContentError\";\n    this.contentLength = contentLength;\n    this.threshold = threshold;\n  }\n}\n\n/**\n * HTTP error status (4xx, 5xx)\n */\nexport class HttpError extends EngineError {\n  readonly statusCode: number;\n\n  constructor(engine: EngineName, statusCode: number, statusText?: string) {\n    const retryable = statusCode >= 500 || statusCode === 429;\n    super(engine, `HTTP ${statusCode}${statusText ? `: ${statusText}` : \"\"}`, { retryable });\n    this.name = \"HttpError\";\n    this.statusCode = statusCode;\n  }\n}\n\n/**\n * Engine timeout\n */\nexport class EngineTimeoutError extends EngineError {\n  readonly timeoutMs: number;\n\n  constructor(engine: EngineName, timeoutMs: number) {\n    super(engine, `Timeout after ${timeoutMs}ms`, { retryable: true });\n    this.name = \"EngineTimeoutError\";\n    this.timeoutMs = timeoutMs;\n  }\n}\n\n/**\n * Engine not available (not configured, missing dependency)\n */\nexport class EngineUnavailableError extends EngineError {\n  constructor(engine: EngineName, reason?: string) {\n    super(engine, reason || \"Engine not available\", { retryable: false });\n    this.name = \"EngineUnavailableError\";\n  }\n}\n\n/**\n * Engine failed — wraps the underlying error with proxy block signals.\n *\n * The scraper uses `proxyBlock` to decide whether to escalate to a\n * stronger proxy tier.\n */\nexport class ScrapeFailedError extends Error {\n  /** True when the failure is a proxy-level block (HTTP 401/403/429, redirect loop) */\n  readonly proxyBlock: boolean;\n\n  constructor(error: Error, options?: { proxyBlock?: boolean }) {\n    super(error.message);\n    this.name = \"ScrapeFailedError\";\n    this.cause = error;\n    this.proxyBlock = options?.proxyBlock ?? false;\n  }\n}\n"
  },
  {
    "path": "src/engines/hero/index.ts",
    "content": "/**\n * Hero Engine - Full browser with JavaScript execution\n *\n * Uses Hero browser automation with a tiered browser pool. Each proxy\n * gets its own long-lived Hero instance (Chrome process); scrapes run\n * in fresh tabs that are opened and closed per request.\n *\n * Pool selection:\n *   - Prefers `options.tieredPool` (TieredBrowserPool) when present.\n *     Looks up the browser bound to `options.proxy?.url` and runs the\n *     scrape through `ProxyBoundBrowser.withPage`.\n *   - Falls back to `options.pool` (legacy IBrowserPool.withBrowser) so\n *     the crawler and any other remaining legacy caller keeps working.\n */\n\nimport Hero from \"@ulixee/hero\";\nimport type { Engine, EngineConfig, EngineMeta, EngineResult } from \"../types.js\";\nimport {\n  EngineError,\n  InsufficientContentError,\n  EngineTimeoutError,\n  EngineUnavailableError,\n} from \"../errors.js\";\nimport { ENGINE_CONFIG } from \"../types.js\";\nimport type { IBrowserPool } from \"../../browser/types.js\";\nimport type { TieredBrowserPool, PoolTier } from \"../../browser/tiered-pool.js\";\nimport { redactProxyUrl } from \"../../browser/proxy-bound-browser.js\";\n\n/**\n * Minimum content length threshold\n */\nconst MIN_CONTENT_LENGTH = 100;\n\n/**\n * Hero Engine implementation using browser pool\n */\nexport class HeroEngine implements Engine {\n  readonly config: EngineConfig = ENGINE_CONFIG;\n\n  async scrape(meta: EngineMeta): Promise<EngineResult> {\n    const startTime = Date.now();\n    const { url, options, logger, abortSignal } = meta;\n\n    const tieredPool = options.tieredPool as TieredBrowserPool | undefined;\n    const legacyPool = options.pool as IBrowserPool | undefined;\n    if (!tieredPool && !legacyPool) {\n      throw new EngineUnavailableError(\"hero\", \"Browser pool not available\");\n    }\n\n    if (abortSignal?.aborted) {\n      throw new EngineTimeoutError(\"hero\", 0);\n    }\n\n    const proxyUrl = options.proxy?.url ?? null;\n    logger?.debug(`[hero] Starting browser scrape of ${url} (proxy: ${redactProxyUrl(proxyUrl)})`);\n\n    // Runner: drives Hero/Tab to extract HTML. Both Hero and Tab expose\n    // the same navigation surface (goto, document, waitForLoad, etc.).\n    const runScrape = async (heroOrTab: any): Promise<EngineResult> => {\n      let aborted = false;\n      if (abortSignal) {\n        abortSignal.addEventListener(\n          \"abort\",\n          () => {\n            aborted = true;\n          },\n          { once: true }\n        );\n      }\n\n      const timeoutMs = options.timeoutMs || this.config.maxTimeout;\n      await heroOrTab.goto(url, { timeoutMs });\n\n      if (aborted) {\n        throw new EngineTimeoutError(\"hero\", Date.now() - startTime);\n      }\n\n      try {\n        await heroOrTab.waitForLoad(\"DomContentLoaded\", { timeoutMs });\n      } catch {\n        // Timeout is OK, continue anyway\n      }\n      await heroOrTab.waitForPaintingStable();\n\n      if (aborted) {\n        throw new EngineTimeoutError(\"hero\", Date.now() - startTime);\n      }\n\n      // Wait for selector if specified\n      if (options.waitForSelector) {\n        try {\n          await heroOrTab.waitForElement(\n            heroOrTab.document.querySelector(options.waitForSelector),\n            {\n              timeoutMs,\n            }\n          );\n        } catch {\n          logger?.debug(`[hero] Selector not found: ${options.waitForSelector}`);\n        }\n      }\n\n      // Extract content\n      const html = await heroOrTab.document.documentElement.outerHTML;\n      const finalUrl = await heroOrTab.url;\n\n      // Validate content length\n      const textContent = this.extractText(html);\n      if (textContent.length < MIN_CONTENT_LENGTH) {\n        logger?.debug(`[hero] Insufficient content: ${textContent.length} chars`);\n        throw new InsufficientContentError(\"hero\", textContent.length, MIN_CONTENT_LENGTH);\n      }\n\n      const duration = Date.now() - startTime;\n      logger?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);\n\n      return {\n        html,\n        url: finalUrl,\n        statusCode: 200,\n        engine: \"hero\" as const,\n        duration,\n      };\n    };\n\n    try {\n      let result: EngineResult;\n\n      if (tieredPool) {\n        const bound = tieredPool.getBrowserByProxy(proxyUrl);\n        if (bound && bound.isAvailable()) {\n          await bound.ready;\n          result = await bound.withPage(async (tab) => runScrape(tab));\n        } else {\n          const tier = resolveTierFromOptions(options.proxyTier);\n          if (!tieredPool.hasTier(tier)) {\n            throw new EngineUnavailableError(\n              \"hero\",\n              `no browser bound to ${redactProxyUrl(proxyUrl)} and tier \"${tier}\" has no browsers`\n            );\n          }\n          const lease = tieredPool.acquire(tier);\n          await lease.browser.ready;\n          result = await lease.browser.withPage(async (tab) => runScrape(tab));\n        }\n      } else {\n        result = await legacyPool!.withBrowser(async (hero: Hero) => runScrape(hero));\n      }\n\n      return result;\n    } catch (error: unknown) {\n      if (\n        error instanceof InsufficientContentError ||\n        error instanceof EngineTimeoutError ||\n        error instanceof EngineUnavailableError\n      ) {\n        throw error;\n      }\n\n      if (error instanceof Error) {\n        if (error.name === \"TimeoutError\" || error.message.includes(\"timeout\")) {\n          throw new EngineTimeoutError(\"hero\", this.config.maxTimeout);\n        }\n\n        if (error.message.includes(\"Navigation\") || error.message.includes(\"ERR_\")) {\n          throw new EngineError(\"hero\", `Navigation failed: ${error.message}`, { cause: error });\n        }\n\n        throw new EngineError(\"hero\", error.message, { cause: error });\n      }\n\n      throw new EngineError(\"hero\", String(error));\n    }\n  }\n\n  private extractText(html: string): string {\n    return html\n      .replace(/<script[^>]*>[\\s\\S]*?<\\/script>/gi, \"\")\n      .replace(/<style[^>]*>[\\s\\S]*?<\\/style>/gi, \"\")\n      .replace(/<[^>]+>/g, \" \")\n      .replace(/\\s+/g, \" \")\n      .trim();\n  }\n\n  isAvailable(): boolean {\n    return true;\n  }\n}\n\n/**\n * Singleton instance\n */\nexport const heroEngine = new HeroEngine();\n\n/**\n * Map a ScrapeOptions.proxyTier to a TieredBrowserPool PoolTier.\n */\nfunction resolveTierFromOptions(proxyTier: string | undefined): PoolTier {\n  if (proxyTier === \"residential\") return \"residential\";\n  if (proxyTier === \"direct\") return \"direct\";\n  return \"datacenter\";\n}\n"
  },
  {
    "path": "src/engines/index.ts",
    "content": "/**\n * Scraping Engine\n *\n * Hero-only engine with orchestrator for quality checks and\n * proxy block detection.\n */\n\n// Types\nexport type {\n  EngineName,\n  Engine,\n  EngineConfig,\n  EngineFeatures,\n  EngineMeta,\n  EngineResult,\n} from \"./types.js\";\n\nexport { ENGINE_CONFIG } from \"./types.js\";\n\n// Errors\nexport {\n  EngineError,\n  InsufficientContentError,\n  HttpError,\n  EngineTimeoutError,\n  EngineUnavailableError,\n  ScrapeFailedError,\n} from \"./errors.js\";\n\n// Hero engine\nexport { heroEngine, HeroEngine } from \"./hero/index.js\";\n\n// Orchestrator\nexport {\n  EngineOrchestrator,\n  createOrchestrator,\n  type OrchestratorOptions,\n  type OrchestratorResult,\n} from \"./orchestrator.js\";\n"
  },
  {
    "path": "src/engines/orchestrator.ts",
    "content": "/**\n * Engine Orchestrator\n *\n * Runs Hero against a URL, applies a minimal quality check, and returns\n * the result. Detects proxy-level blocks (HTTP 401/403/429, redirect\n * loops) so the scraper's retry loop can escalate to a stronger proxy.\n */\n\nimport type { EngineMeta, EngineResult } from \"./types.js\";\nimport { ScrapeFailedError, HttpError, EngineUnavailableError } from \"./errors.js\";\nimport { heroEngine } from \"./hero/index.js\";\nimport type { Logger } from \"../utils/logger.js\";\n\n/**\n * Orchestrator options\n */\nexport interface OrchestratorOptions {\n  /** Logger instance */\n  logger?: Logger;\n  /** Verbose logging */\n  verbose?: boolean;\n}\n\n/**\n * Orchestrator result with scrape metadata\n */\nexport interface OrchestratorResult extends EngineResult {\n  /** Whether the response was detected as a block page */\n  blocked: boolean;\n}\n\n/**\n * Engine Orchestrator\n *\n * @example\n * const orchestrator = new EngineOrchestrator({ verbose: true });\n * const result = await orchestrator.scrape({\n *   url: 'https://example.com',\n *   options: { timeoutMs: 30000 }\n * });\n */\nexport class EngineOrchestrator {\n  private options: OrchestratorOptions;\n\n  constructor(options: OrchestratorOptions = {}) {\n    this.options = options;\n  }\n\n  /**\n   * Assess result quality. Intentionally minimal — if there's any text\n   * content, it's a pass. Block detection is a proxy concern, not ours.\n   */\n  private assessQuality(result: EngineResult): {\n    passed: boolean;\n    reason?: \"empty_content\" | \"http_error\";\n  } {\n    const statusOk =\n      (result.statusCode >= 200 && result.statusCode < 300) || result.statusCode === 304;\n\n    const textContent =\n      result.html\n        ?.replace(/<script[^>]*>[\\s\\S]*?<\\/script>/gi, \"\")\n        .replace(/<style[^>]*>[\\s\\S]*?<\\/style>/gi, \"\")\n        .replace(/<[^>]*>/g, \" \")\n        .replace(/\\s+/g, \" \")\n        .trim() ?? \"\";\n\n    const hasContent = textContent.length > 0;\n\n    if (!statusOk && !hasContent) return { passed: false, reason: \"http_error\" };\n    if (statusOk && !hasContent) return { passed: false, reason: \"empty_content\" };\n\n    return { passed: true };\n  }\n\n  /**\n   * Scrape a URL using Hero.\n   *\n   * @throws ScrapeFailedError on failure (with proxyBlock flag for escalation)\n   */\n  async scrape(meta: EngineMeta): Promise<OrchestratorResult> {\n    const logger = meta.logger || this.options.logger;\n    const verbose = this.options.verbose || meta.options.verbose;\n\n    const log = (msg: string) => {\n      if (verbose) logger?.info(msg);\n      else logger?.debug(msg);\n    };\n\n    if (!heroEngine.isAvailable()) {\n      throw new ScrapeFailedError(new EngineUnavailableError(\"hero\", \"Hero engine not available\"));\n    }\n\n    log(`[orchestrator] Scraping ${meta.url} with Hero`);\n\n    try {\n      const result = await heroEngine.scrape(meta);\n\n      const quality = this.assessQuality(result);\n      if (!quality.passed) {\n        log(`[orchestrator] Quality check failed: ${quality.reason}`);\n        throw new ScrapeFailedError(new Error(`Quality check failed: ${quality.reason}`));\n      }\n\n      log(`[orchestrator] ✓ Hero succeeded in ${result.duration}ms`);\n      return { ...result, blocked: false };\n    } catch (error: unknown) {\n      // Already wrapped — re-throw\n      if (error instanceof ScrapeFailedError) throw error;\n\n      const err = error instanceof Error ? error : new Error(String(error));\n\n      // Detect proxy-level blocks for escalation\n      let proxyBlock = false;\n      if (err instanceof HttpError && [401, 403, 429].includes(err.statusCode)) {\n        proxyBlock = true;\n      }\n      if (err.message.includes(\"redirect\") || err.message.includes(\"ERR_TOO_MANY\")) {\n        proxyBlock = true;\n      }\n\n      log(`[orchestrator] Hero failed: ${err.message}${proxyBlock ? \" (proxy block)\" : \"\"}`);\n      throw new ScrapeFailedError(err, { proxyBlock });\n    }\n  }\n}\n\n/**\n * Create an orchestrator with default settings\n */\nexport function createOrchestrator(options: OrchestratorOptions = {}): EngineOrchestrator {\n  return new EngineOrchestrator(options);\n}\n"
  },
  {
    "path": "src/engines/types.ts",
    "content": "/**\n * Engine types for the scraping engine.\n *\n * Reader uses a single engine: Hero (Ulixee), a full browser with\n * JavaScript execution, TLS fingerprinting, and Cloudflare bypass.\n */\n\nimport type { ScrapeOptions } from \"../types.js\";\nimport type { Logger } from \"../utils/logger.js\";\n\n/**\n * Engine name — Hero is the only engine.\n */\nexport type EngineName = \"hero\";\n\n/**\n * Result returned by the engine after scraping\n */\nexport interface EngineResult {\n  /** Raw HTML content */\n  html: string;\n  /** Final URL after redirects */\n  url: string;\n  /** HTTP status code */\n  statusCode: number;\n  /** Content-Type header */\n  contentType?: string;\n  /** Response headers */\n  headers?: Record<string, string>;\n\n  /** Engine that produced this result */\n  engine: EngineName;\n  /** Time taken in milliseconds */\n  duration: number;\n}\n\n/**\n * Metadata passed to engine scrape method\n */\nexport interface EngineMeta {\n  /** URL to scrape */\n  url: string;\n  /** Scrape options */\n  options: ScrapeOptions;\n  /** Logger instance */\n  logger?: Logger;\n  /** Abort signal for cancellation */\n  abortSignal?: AbortSignal;\n}\n\n/**\n * Engine configuration\n */\nexport interface EngineConfig {\n  /** Engine name */\n  name: EngineName;\n  /** Default timeout (ms) */\n  timeout: number;\n  /** Absolute max time before killing (ms) */\n  maxTimeout: number;\n  /** Engine capabilities */\n  features: EngineFeatures;\n}\n\n/**\n * Engine feature flags\n */\nexport interface EngineFeatures {\n  /** Can execute JavaScript */\n  javascript: boolean;\n  /** Can handle Cloudflare challenges */\n  cloudflare: boolean;\n  /** Matches browser TLS fingerprint */\n  tlsFingerprint: boolean;\n  /** Supports waitFor selector */\n  waitFor: boolean;\n  /** Can take screenshots */\n  screenshots: boolean;\n}\n\n/**\n * Engine interface\n */\nexport interface Engine {\n  /** Engine configuration */\n  readonly config: EngineConfig;\n\n  /**\n   * Scrape a URL\n   */\n  scrape(meta: EngineMeta): Promise<EngineResult>;\n\n  /**\n   * Check if engine is available and configured\n   */\n  isAvailable(): boolean;\n}\n\n/**\n * Hero engine configuration\n */\nexport const ENGINE_CONFIG: EngineConfig = {\n  name: \"hero\",\n  timeout: 10000,\n  maxTimeout: 30000,\n  features: {\n    javascript: true,\n    cloudflare: true,\n    tlsFingerprint: true,\n    waitFor: true,\n    screenshots: true,\n  },\n};\n"
  },
  {
    "path": "src/errors.ts",
    "content": "/**\n * Typed error classes for Reader\n *\n * Provides actionable error messages and structured error information\n * for better debugging and error handling.\n */\n\n/**\n * Error codes for categorization\n */\nexport enum ReaderErrorCode {\n  // Network errors\n  NETWORK_ERROR = \"NETWORK_ERROR\",\n  TIMEOUT = \"TIMEOUT\",\n  CONNECTION_REFUSED = \"CONNECTION_REFUSED\",\n  DNS_ERROR = \"DNS_ERROR\",\n  TLS_ERROR = \"TLS_ERROR\",\n\n  // Cloudflare/bot detection\n  CLOUDFLARE_CHALLENGE = \"CLOUDFLARE_CHALLENGE\",\n  BOT_DETECTED = \"BOT_DETECTED\",\n  ACCESS_DENIED = \"ACCESS_DENIED\",\n\n  // Proxy errors\n  PROXY_CONNECTION_ERROR = \"PROXY_CONNECTION_ERROR\",\n  PROXY_EXHAUSTED = \"PROXY_EXHAUSTED\",\n\n  // Content errors\n  CONTENT_EXTRACTION_FAILED = \"CONTENT_EXTRACTION_FAILED\",\n  EMPTY_CONTENT = \"EMPTY_CONTENT\",\n  CONTENT_TOO_LARGE = \"CONTENT_TOO_LARGE\",\n  MARKDOWN_CONVERSION_FAILED = \"MARKDOWN_CONVERSION_FAILED\",\n\n  // Engine errors\n  ALL_ENGINES_FAILED = \"ALL_ENGINES_FAILED\",\n\n  // Validation errors\n  INVALID_URL = \"INVALID_URL\",\n  INVALID_OPTIONS = \"INVALID_OPTIONS\",\n\n  // Robots.txt\n  ROBOTS_BLOCKED = \"ROBOTS_BLOCKED\",\n\n  // Browser/pool errors\n  BROWSER_ERROR = \"BROWSER_ERROR\",\n  POOL_EXHAUSTED = \"POOL_EXHAUSTED\",\n\n  // Client errors\n  CLIENT_CLOSED = \"CLIENT_CLOSED\",\n  NOT_INITIALIZED = \"NOT_INITIALIZED\",\n\n  // Unknown\n  UNKNOWN = \"UNKNOWN\",\n}\n\n/**\n * Base error class for all Reader errors\n */\nexport class ReaderError extends Error {\n  readonly code: ReaderErrorCode;\n  readonly url?: string;\n  readonly cause?: Error;\n  readonly timestamp: string;\n  readonly retryable: boolean;\n\n  constructor(\n    message: string,\n    code: ReaderErrorCode,\n    options?: {\n      url?: string;\n      cause?: Error;\n      retryable?: boolean;\n    }\n  ) {\n    super(message);\n    this.name = \"ReaderError\";\n    this.code = code;\n    this.url = options?.url;\n    this.cause = options?.cause;\n    this.timestamp = new Date().toISOString();\n    this.retryable = options?.retryable ?? false;\n\n    // Maintain proper stack trace\n    if (Error.captureStackTrace) {\n      Error.captureStackTrace(this, this.constructor);\n    }\n  }\n\n  /**\n   * Convert to a plain object for serialization\n   */\n  toJSON(): Record<string, unknown> {\n    return {\n      name: this.name,\n      code: this.code,\n      message: this.message,\n      url: this.url,\n      timestamp: this.timestamp,\n      retryable: this.retryable,\n      cause: this.cause?.message,\n      stack: this.stack,\n    };\n  }\n}\n\n/**\n * Network-related errors (connection issues, DNS failures, etc.)\n */\nexport class NetworkError extends ReaderError {\n  constructor(message: string, options?: { url?: string; cause?: Error }) {\n    super(message, ReaderErrorCode.NETWORK_ERROR, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"NetworkError\";\n  }\n}\n\n/**\n * Timeout errors (page load, navigation, etc.)\n */\nexport class TimeoutError extends ReaderError {\n  readonly timeoutMs: number;\n\n  constructor(message: string, timeoutMs: number, options?: { url?: string; cause?: Error }) {\n    super(message, ReaderErrorCode.TIMEOUT, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"TimeoutError\";\n    this.timeoutMs = timeoutMs;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return {\n      ...super.toJSON(),\n      timeoutMs: this.timeoutMs,\n    };\n  }\n}\n\n/**\n * Cloudflare challenge errors\n */\nexport class CloudflareError extends ReaderError {\n  readonly challengeType: string;\n\n  constructor(challengeType: string, options?: { url?: string; cause?: Error }) {\n    super(\n      `Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`,\n      ReaderErrorCode.CLOUDFLARE_CHALLENGE,\n      {\n        ...options,\n        retryable: true,\n      }\n    );\n    this.name = \"CloudflareError\";\n    this.challengeType = challengeType;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return {\n      ...super.toJSON(),\n      challengeType: this.challengeType,\n    };\n  }\n}\n\n/**\n * Access denied errors (blocked, forbidden, etc.)\n */\nexport class AccessDeniedError extends ReaderError {\n  readonly statusCode?: number;\n\n  constructor(message: string, options?: { url?: string; statusCode?: number; cause?: Error }) {\n    super(message, ReaderErrorCode.ACCESS_DENIED, {\n      ...options,\n      retryable: false,\n    });\n    this.name = \"AccessDeniedError\";\n    this.statusCode = options?.statusCode;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return {\n      ...super.toJSON(),\n      statusCode: this.statusCode,\n    };\n  }\n}\n\n/**\n * Content extraction errors\n */\nexport class ContentExtractionError extends ReaderError {\n  constructor(message: string, options?: { url?: string; cause?: Error }) {\n    super(message, ReaderErrorCode.CONTENT_EXTRACTION_FAILED, {\n      ...options,\n      retryable: false,\n    });\n    this.name = \"ContentExtractionError\";\n  }\n}\n\n/**\n * Validation errors (invalid URLs, options, etc.)\n */\nexport class ValidationError extends ReaderError {\n  readonly field?: string;\n\n  constructor(message: string, options?: { field?: string; url?: string }) {\n    super(message, ReaderErrorCode.INVALID_OPTIONS, {\n      url: options?.url,\n      retryable: false,\n    });\n    this.name = \"ValidationError\";\n    this.field = options?.field;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return {\n      ...super.toJSON(),\n      field: this.field,\n    };\n  }\n}\n\n/**\n * URL validation error\n */\nexport class InvalidUrlError extends ReaderError {\n  constructor(url: string, reason?: string) {\n    super(\n      reason ? `Invalid URL \"${url}\": ${reason}` : `Invalid URL: ${url}`,\n      ReaderErrorCode.INVALID_URL,\n      {\n        url,\n        retryable: false,\n      }\n    );\n    this.name = \"InvalidUrlError\";\n  }\n}\n\n/**\n * Robots.txt blocked error\n */\nexport class RobotsBlockedError extends ReaderError {\n  constructor(url: string) {\n    super(\n      `URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`,\n      ReaderErrorCode.ROBOTS_BLOCKED,\n      {\n        url,\n        retryable: false,\n      }\n    );\n    this.name = \"RobotsBlockedError\";\n  }\n}\n\n/**\n * Browser pool errors\n */\nexport class BrowserPoolError extends ReaderError {\n  constructor(message: string, options?: { cause?: Error }) {\n    super(message, ReaderErrorCode.BROWSER_ERROR, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"BrowserPoolError\";\n  }\n}\n\n/**\n * Client state errors\n */\nexport class ClientClosedError extends ReaderError {\n  constructor() {\n    super(\n      \"ReaderClient has been closed. Create a new instance to continue.\",\n      ReaderErrorCode.CLIENT_CLOSED,\n      {\n        retryable: false,\n      }\n    );\n    this.name = \"ClientClosedError\";\n  }\n}\n\n/**\n * Not initialized error\n */\nexport class NotInitializedError extends ReaderError {\n  constructor(component: string) {\n    super(\n      `${component} not initialized. This should not happen - please report this bug.`,\n      ReaderErrorCode.NOT_INITIALIZED,\n      {\n        retryable: false,\n      }\n    );\n    this.name = \"NotInitializedError\";\n  }\n}\n\n// ============================================================================\n// DNS/TLS errors\n// ============================================================================\n\n/**\n * DNS resolution failure\n */\nexport class DNSError extends ReaderError {\n  readonly hostname: string;\n\n  constructor(hostname: string, options?: { url?: string; cause?: Error }) {\n    super(`Cannot resolve hostname: ${hostname}`, ReaderErrorCode.DNS_ERROR, {\n      ...options,\n      retryable: false,\n    });\n    this.name = \"DNSError\";\n    this.hostname = hostname;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return { ...super.toJSON(), hostname: this.hostname };\n  }\n}\n\n/**\n * TLS/SSL handshake failure\n */\nexport class TLSError extends ReaderError {\n  constructor(detail: string, options?: { url?: string; cause?: Error }) {\n    super(`TLS handshake failed: ${detail}`, ReaderErrorCode.TLS_ERROR, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"TLSError\";\n  }\n}\n\n// ============================================================================\n// Bot detection errors\n// ============================================================================\n\n/**\n * Bot detection triggered (distinct from Cloudflare — covers Amazon, etc.)\n */\nexport class BotDetectedError extends ReaderError {\n  readonly signal: string;\n\n  constructor(signal: string, options?: { url?: string; cause?: Error }) {\n    super(`Bot detection triggered: ${signal}`, ReaderErrorCode.BOT_DETECTED, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"BotDetectedError\";\n    this.signal = signal;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return { ...super.toJSON(), signal: this.signal };\n  }\n}\n\n// ============================================================================\n// Proxy errors\n// ============================================================================\n\n/**\n * Proxy connection failed\n */\nexport class ProxyConnectionError extends ReaderError {\n  readonly proxyTier: string;\n\n  constructor(proxyTier: string, options?: { url?: string; cause?: Error }) {\n    super(`Proxy connection failed (${proxyTier})`, ReaderErrorCode.PROXY_CONNECTION_ERROR, {\n      ...options,\n      retryable: true,\n    });\n    this.name = \"ProxyConnectionError\";\n    this.proxyTier = proxyTier;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return { ...super.toJSON(), proxyTier: this.proxyTier };\n  }\n}\n\n/**\n * All proxy tiers exhausted\n */\nexport class ProxyExhaustedError extends ReaderError {\n  constructor(options?: { url?: string; cause?: Error }) {\n    super(\n      \"All proxy tiers exhausted — unable to reach the target\",\n      ReaderErrorCode.PROXY_EXHAUSTED,\n      {\n        ...options,\n        retryable: false,\n      }\n    );\n    this.name = \"ProxyExhaustedError\";\n  }\n}\n\n// ============================================================================\n// Content errors\n// ============================================================================\n\n/**\n * Content too large for processing\n */\nexport class ContentTooLargeError extends ReaderError {\n  readonly sizeBytes: number;\n  readonly limitBytes: number;\n\n  constructor(sizeBytes: number, limitBytes: number, options?: { url?: string }) {\n    super(\n      `HTML content (${sizeBytes} bytes) exceeds processing limit (${limitBytes} bytes)`,\n      ReaderErrorCode.CONTENT_TOO_LARGE,\n      { ...options, retryable: false }\n    );\n    this.name = \"ContentTooLargeError\";\n    this.sizeBytes = sizeBytes;\n    this.limitBytes = limitBytes;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return { ...super.toJSON(), sizeBytes: this.sizeBytes, limitBytes: this.limitBytes };\n  }\n}\n\n/**\n * Markdown conversion failed (e.g., supermarkdown panic caught)\n */\nexport class MarkdownConversionError extends ReaderError {\n  constructor(detail: string, options?: { url?: string; cause?: Error }) {\n    super(`Markdown conversion failed: ${detail}`, ReaderErrorCode.MARKDOWN_CONVERSION_FAILED, {\n      ...options,\n      retryable: false,\n    });\n    this.name = \"MarkdownConversionError\";\n  }\n}\n\n/**\n * Content is empty or insufficient\n */\nexport class EmptyContentError extends ReaderError {\n  readonly contentLength: number;\n\n  constructor(contentLength: number, options?: { url?: string }) {\n    super(\n      `Content too short (${contentLength} chars) — page may require JavaScript rendering or may be bot-blocked`,\n      ReaderErrorCode.EMPTY_CONTENT,\n      { ...options, retryable: true }\n    );\n    this.name = \"EmptyContentError\";\n    this.contentLength = contentLength;\n  }\n\n  toJSON(): Record<string, unknown> {\n    return { ...super.toJSON(), contentLength: this.contentLength };\n  }\n}\n\n// ============================================================================\n// Engine/retry errors\n// ============================================================================\n\n// Note: ScrapeFailedError is defined in src/engines/errors.ts.\n// Re-exported from src/engines/index.ts for consumers.\n\n// ============================================================================\n// Utility\n// ============================================================================\n\n/**\n * Helper to wrap unknown errors in ReaderError\n */\nexport function wrapError(error: unknown, url?: string): ReaderError {\n  if (error instanceof ReaderError) {\n    return error;\n  }\n\n  if (error instanceof Error) {\n    const message = error.message.toLowerCase();\n\n    // Proxy patterns (check before timeout — \"tunnel timeout\" is a proxy error)\n    if (message.includes(\"proxy\") || (message.includes(\"tunnel\") && !message.includes(\"timeout\"))) {\n      return new ProxyConnectionError(\"unknown\", { url, cause: error });\n    }\n\n    // Timeout patterns\n    if (\n      message.includes(\"timeout\") ||\n      message.includes(\"timed out\") ||\n      message.includes(\"etimedout\")\n    ) {\n      // Check if this is actually a proxy tunnel timeout\n      if (message.includes(\"tunnel\")) {\n        return new ProxyConnectionError(\"unknown\", { url, cause: error });\n      }\n      return new TimeoutError(error.message, 30000, { url, cause: error });\n    }\n\n    // DNS patterns\n    if (message.includes(\"enotfound\") || message.includes(\"getaddrinfo\")) {\n      const hostname = url ? new URL(url).hostname : \"unknown\";\n      return new DNSError(hostname, { url, cause: error });\n    }\n\n    // TLS/SSL patterns\n    if (\n      message.includes(\"ssl\") ||\n      message.includes(\"certificate\") ||\n      message.includes(\"cert_\") ||\n      message.includes(\"unable to verify\") ||\n      message.includes(\"self signed\") ||\n      message.includes(\"err_tls\")\n    ) {\n      return new TLSError(error.message, { url, cause: error });\n    }\n\n    // Connection patterns\n    if (message.includes(\"econnrefused\") || message.includes(\"connection refused\")) {\n      return new NetworkError(`Connection refused: ${error.message}`, { url, cause: error });\n    }\n\n    if (\n      message.includes(\"econnreset\") ||\n      message.includes(\"socket hang up\") ||\n      message.includes(\"err_connection_reset\") ||\n      message.includes(\"err_connection_closed\")\n    ) {\n      return new NetworkError(`Connection reset: ${error.message}`, { url, cause: error });\n    }\n\n    // Too many redirects\n    if (\n      message.includes(\"too many redirects\") ||\n      message.includes(\"err_too_many_redirects\") ||\n      message.includes(\"maxredirects\")\n    ) {\n      return new NetworkError(`Too many redirects for ${url ?? \"URL\"}`, { url, cause: error });\n    }\n\n    // Empty response\n    if (message.includes(\"err_empty_response\") || message.includes(\"empty response\")) {\n      return new NetworkError(`Server returned empty response`, { url, cause: error });\n    }\n\n    // HTTP/2 protocol errors\n    if (message.includes(\"err_http2_protocol_error\") || message.includes(\"http2 protocol\")) {\n      return new NetworkError(`HTTP/2 protocol error: ${error.message}`, { url, cause: error });\n    }\n\n    // Client blocking patterns (ad blockers, extensions, etc.)\n    if (message.includes(\"err_blocked_by_client\") || message.includes(\"blocked by client\")) {\n      return new NetworkError(`Request blocked by client`, { url, cause: error });\n    }\n\n    // Proxy patterns\n    if (message.includes(\"proxy\") || message.includes(\"tunnel\")) {\n      return new ProxyConnectionError(\"unknown\", { url, cause: error });\n    }\n\n    // Cloudflare patterns\n    if (message.includes(\"cloudflare\") || message.includes(\"challenge\")) {\n      return new CloudflareError(\"unknown\", { url, cause: error });\n    }\n\n    // Markdown conversion patterns (supermarkdown panics caught by NAPI)\n    if (\n      message.includes(\"supermarkdown\") ||\n      message.includes(\"conversion failed\") ||\n      message.includes(\"formatting argument\")\n    ) {\n      return new MarkdownConversionError(error.message, { url, cause: error });\n    }\n\n    return new ReaderError(error.message, ReaderErrorCode.UNKNOWN, {\n      url,\n      cause: error,\n      retryable: false,\n    });\n  }\n\n  return new ReaderError(String(error), ReaderErrorCode.UNKNOWN, {\n    url,\n    retryable: false,\n  });\n}\n"
  },
  {
    "path": "src/formatters/html.ts",
    "content": "/**\n * HTML formatter\n *\n * Returns the cleaned HTML content as-is.\n * The content has already been processed by content-cleaner.ts\n * (ads removed, base64 images stripped, scripts/styles removed).\n */\n\n/**\n * Return HTML content as-is (already cleaned by content-cleaner)\n *\n * This is essentially a pass-through. The cleaning happens in scraper.ts\n * via cleanContent() before this is called.\n */\nexport function formatToHTML(html: string): string {\n  return html;\n}\n"
  },
  {
    "path": "src/formatters/index.ts",
    "content": "// Export all formatters\nexport { formatToMarkdown, htmlToMarkdown } from \"./markdown\";\nexport { formatToHTML } from \"./html\";\n"
  },
  {
    "path": "src/formatters/markdown.ts",
    "content": "import { convert } from \"@vakra-dev/supermarkdown\";\nimport { logger } from \"../utils/logger.js\";\n\nconst log = logger.child({ name: \"markdown\" });\n\n/**\n * Convert HTML to Markdown\n *\n * Simple conversion without any headers, metadata, or formatting wrappers.\n * Returns clean markdown content ready for LLM consumption.\n *\n * Uses supermarkdown (Rust-based) for high-performance conversion.\n *\n * Safety layers:\n * 1. Rust catch_unwind in NAPI wrapper catches most panics (returns empty string)\n * 2. JS try/catch catches any thrown errors from NAPI binding\n * 3. Timeout prevents hanging on pathological inputs\n * 4. Fallback text extraction if conversion fails entirely\n */\nexport function htmlToMarkdown(html: string): string {\n  try {\n    const result = convert(html, {\n      headingStyle: \"atx\",\n      bulletMarker: \"-\",\n      codeFence: \"`\",\n      linkStyle: \"inline\",\n    });\n\n    // catch_unwind returns empty string on Rust panic -- detect this\n    if (result === \"\" && html.length > 100) {\n      log.warn(\n        \"supermarkdown returned empty string for %d byte input -- possible Rust panic caught by NAPI wrapper. Falling back to text extraction.\",\n        html.length\n      );\n      return fallbackTextExtract(html);\n    }\n\n    return result;\n  } catch (error) {\n    log.error(\n      { err: error },\n      \"supermarkdown threw an error during conversion. Falling back to text extraction.\"\n    );\n    return fallbackTextExtract(html);\n  }\n}\n\n/**\n * Fallback: strip HTML tags and return plain text.\n * Used when supermarkdown fails (panic, error, or empty result on large input).\n * Not great output quality, but keeps the pipeline alive instead of crashing.\n */\nfunction fallbackTextExtract(html: string): string {\n  return html\n    .replace(/<script[\\s\\S]*?<\\/script>/gi, \"\")\n    .replace(/<style[\\s\\S]*?<\\/style>/gi, \"\")\n    .replace(/<[^>]*>/g, \" \")\n    .replace(/\\s+/g, \" \")\n    .trim();\n}\n\n/**\n * Alias for htmlToMarkdown (backward compatibility)\n */\nexport const formatToMarkdown = htmlToMarkdown;\n"
  },
  {
    "path": "src/formatters/postprocess.ts",
    "content": "/**\n * Markdown post-processing.\n *\n * Light-touch cleanup on the markdown output from supermarkdown. Only\n * fixes patterns that are clearly noise, not content.\n */\n\n/**\n * Apply all post-processing passes to a markdown string.\n */\nexport function postprocessMarkdown(md: string): string {\n  let result = md;\n\n  // 1. Remove [Skip to Content](#...) accessibility links. These are\n  //    CSS-hidden on the rendered page (only visible on keyboard focus for\n  //    screen readers) but Hero sees the full DOM. Never useful content.\n  result = result.replace(/\\[(?:Skip|Jump) to (?:main )?Content\\]\\(#[^)]*\\)/gi, \"\");\n\n  // 2. Collapse image-in-link patterns: [![alt](img)](url) where img === url\n  //    is a common pattern for clickable images that link to themselves.\n  //    The duplication is noise; collapse to just the image.\n  result = deduplicateImageLinks(result);\n\n  // 3. Collapse 3+ consecutive blank lines to 2 (standard markdown separator).\n  result = result.replace(/\\n{3,}/g, \"\\n\\n\");\n\n  // 4. Trim the document.\n  result = result.trim();\n\n  return result;\n}\n\n/**\n * Collapse [![alt](imgUrl)](linkUrl) to ![alt](imgUrl) when imgUrl and\n * linkUrl are the same (image links to itself).\n */\nfunction deduplicateImageLinks(md: string): string {\n  return md.replace(/\\[!\\[([^\\]]*)\\]\\(([^)]+)\\)\\]\\(([^)]+)\\)/g, (_match, alt, imgUrl, linkUrl) => {\n    const imgBase = imgUrl.split(/\\s+/)[0];\n    const linkBase = linkUrl.split(/\\s+/)[0];\n    if (imgBase === linkBase) {\n      return `![${alt}](${imgUrl})`;\n    }\n    return _match;\n  });\n}\n"
  },
  {
    "path": "src/index.ts",
    "content": "/**\n * @vakra-dev/reader\n *\n * Production-grade web scraping engine for LLMs.\n * Clean markdown output, ready for your agents.\n */\n\n// =============================================================================\n// Main API exports\n// =============================================================================\nexport { ReaderClient } from \"./client\";\nexport type { ReaderClientOptions, ProxyRotation } from \"./client\";\nexport { scrape, Scraper } from \"./scraper\";\nexport { crawl, Crawler } from \"./crawler\";\nexport { createBrowserSession } from \"./browser-session\";\nexport type { BrowserOptions, BrowserSession } from \"./browser-types\";\n\n// =============================================================================\n// Daemon exports\n// =============================================================================\nexport {\n  DaemonServer,\n  DaemonClient,\n  isDaemonRunning,\n  getDaemonInfo,\n  getPidFilePath,\n  DEFAULT_DAEMON_PORT,\n} from \"./daemon\";\nexport type { DaemonServerOptions, DaemonClientOptions, DaemonStatus } from \"./daemon\";\n\n// =============================================================================\n// Type exports\n// =============================================================================\nexport type {\n  ScrapeOptions,\n  ScrapeResult,\n  WebsiteScrapeResult,\n  BatchMetadata,\n  Page,\n  WebsiteMetadata,\n  ProxyConfig,\n  ProxyMetadata,\n  ProxyPoolConfig,\n  ProxyTier,\n  BrowserPoolConfig,\n} from \"./types\";\n\nexport type { CrawlOptions, CrawlResult, CrawlUrl, CrawlMetadata } from \"./crawl-types\";\n\n// =============================================================================\n// Formatter exports (for custom formatting)\n// =============================================================================\nexport { formatToMarkdown, htmlToMarkdown } from \"./formatters/markdown\";\nexport { formatToHTML } from \"./formatters/html\";\n\n// =============================================================================\n// Utility exports (for advanced usage)\n// =============================================================================\nexport { extractMetadata } from \"./utils/metadata-extractor\";\nexport { cleanContent } from \"./utils/content-cleaner\";\nexport {\n  isSameDomain,\n  resolveUrl,\n  isValidUrl,\n  validateUrls,\n  getUrlKey,\n  shouldCrawlUrl,\n} from \"./utils/url-helpers\";\nexport { rateLimit } from \"./utils/rate-limiter\";\n\n// =============================================================================\n// Browser pool exports (for advanced usage)\n// =============================================================================\nexport { BrowserPool, HeroBrowserPool } from \"./browser/pool\";\nexport { createHeroConfig } from \"./browser/hero-config\";\nexport type {\n  IBrowserPool,\n  PoolConfig,\n  BrowserInstance,\n  PoolStats,\n  HealthStatus,\n} from \"./browser/types\";\n\n// =============================================================================\n// Proxy exports (for advanced usage)\n// =============================================================================\nexport { createProxyUrl, parseProxyUrl } from \"./proxy/config\";\n\n// =============================================================================\n// Default options export\n// =============================================================================\nexport { DEFAULT_OPTIONS, isValidFormat, shouldCrawlUrl as shouldCrawlUrlFn } from \"./types\";\n\n// =============================================================================\n// Error exports\n// =============================================================================\nexport {\n  ReaderError,\n  ReaderErrorCode,\n  NetworkError,\n  TimeoutError,\n  CloudflareError,\n  AccessDeniedError,\n  ContentExtractionError,\n  ValidationError,\n  InvalidUrlError,\n  RobotsBlockedError,\n  BrowserPoolError,\n  ClientClosedError,\n  NotInitializedError,\n  DNSError,\n  TLSError,\n  BotDetectedError,\n  ProxyConnectionError,\n  ProxyExhaustedError,\n  ContentTooLargeError,\n  MarkdownConversionError,\n  EmptyContentError,\n  wrapError,\n} from \"./errors\";\n\n// Engine errors\nexport { ScrapeFailedError } from \"./engines/errors\";\n\n// =============================================================================\n// Block detection exports\n// =============================================================================\nexport { detectBotPage, detectBotTitle, isBlockedResponse } from \"./utils/block-detector\";\nexport type { BlockDetectionConfig } from \"./utils/block-detector\";\n\n// =============================================================================\n// URL rewriter exports\n// =============================================================================\nexport { rewriteUrl } from \"./utils/url-rewriter\";\nexport type { UrlRewriteRule, RewriteResult } from \"./utils/url-rewriter\";\n\n// =============================================================================\n// Domain profiles exports\n// =============================================================================\nexport { getDomainProfile, applyDomainProfile } from \"./config/domain-profiles\";\nexport type { DomainProfile } from \"./config/domain-profiles\";\n"
  },
  {
    "path": "src/proxy/config.ts",
    "content": "import type { ProxyConfig } from \"../types\";\n\n/**\n * Create proxy URL from configuration\n *\n * Supports both datacenter and residential proxies.\n * For residential proxies, generates a sticky session ID.\n *\n * @param config - Proxy configuration\n * @returns Formatted proxy URL\n *\n * @example\n * // Datacenter proxy\n * createProxyUrl({\n *   type: 'datacenter',\n *   username: 'user',\n *   password: 'pass',\n *   host: 'proxy.example.com',\n *   port: 8080\n * })\n * // Returns: \"http://user:pass@proxy.example.com:8080\"\n */\nexport function createProxyUrl(config: ProxyConfig): string {\n  // If full URL provided, use it directly\n  if (config.url) {\n    return config.url;\n  }\n\n  // Residential proxy with sticky session\n  if (config.type === \"residential\") {\n    // Generate unique session ID for sticky sessions\n    const sessionId = `hero_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;\n\n    // Format: customer-{username}_session-{sessionId}_country-{country}:{password}@{host}:{port}\n    return `http://customer-${config.username}_session-${sessionId}_country-${\n      config.country || \"us\"\n    }:${config.password}@${config.host}:${config.port}`;\n  }\n\n  // Datacenter proxy (simple authentication)\n  return `http://${config.username}:${config.password}@${config.host}:${config.port}`;\n}\n\n/**\n * Parse proxy URL into ProxyConfig\n *\n * @param url - Proxy URL string\n * @returns Parsed proxy configuration\n *\n * @example\n * parseProxyUrl(\"http://user:pass@proxy.example.com:8080\")\n * // Returns: { username: 'user', password: 'pass', host: 'proxy.example.com', port: 8080 }\n */\nexport function parseProxyUrl(url: string): ProxyConfig {\n  try {\n    const parsed = new URL(url);\n\n    return {\n      url,\n      username: parsed.username,\n      password: parsed.password,\n      host: parsed.hostname,\n      port: parsed.port ? parseInt(parsed.port, 10) : undefined,\n    };\n  } catch (error) {\n    throw new Error(`Invalid proxy URL: ${url}`);\n  }\n}\n"
  },
  {
    "path": "src/proxy/env.ts",
    "content": "/**\n * Environment-driven proxy pool configuration.\n *\n * Lets operators configure datacenter and residential proxy pools without\n * touching code — relevant for the daemon, which is run as a long-lived\n * process and gets its config from `.env`.\n *\n * Env vars:\n *   PROXY_DATACENTER   - one URL, or comma-separated list of URLs\n *   PROXY_RESIDENTIAL  - one URL, or comma-separated list of URLs\n *\n * Each URL must be of the form `http://user:pass@host:port`. Empty strings\n * and whitespace-only entries are ignored, so `PROXY_DATACENTER=,` or an\n * unset var both resolve to \"no proxies for that tier\". An unparseable\n * URL throws at startup — we fail loud here rather than silently fall\n * through to direct connections, which would hide a misconfiguration\n * behind scrape results that look mostly fine until they get blocked.\n *\n * Returns `undefined` when no proxy env vars are set, so the caller can\n * distinguish \"no proxies configured\" (pass-through) from \"empty pool\".\n */\n\nimport type { ProxyPoolConfig, ProxyConfig } from \"../types\";\nimport { parseProxyUrl } from \"./config\";\n\n/**\n * Parse a proxy entry which may include a timezone suffix: `url|timezone`\n * e.g. `http://user:pass@host:port|America/Los_Angeles`\n */\nfunction parseList(raw: string | undefined, tierLabel: string): ProxyConfig[] {\n  if (!raw) return [];\n  const items = raw\n    .split(\",\")\n    .map((s) => s.trim())\n    .filter((s) => s.length > 0);\n\n  return items.map((entry) => {\n    // Split on last pipe to separate URL from optional timezone\n    const pipeIdx = entry.lastIndexOf(\"|\");\n    const url = pipeIdx > 0 ? entry.slice(0, pipeIdx) : entry;\n    const timezoneId = pipeIdx > 0 ? entry.slice(pipeIdx + 1) : undefined;\n\n    try {\n      const config = parseProxyUrl(url);\n      if (timezoneId) config.timezoneId = timezoneId;\n      return config;\n    } catch (err) {\n      throw new Error(\n        `Invalid ${tierLabel} proxy URL (expected http://user:pass@host:port[|timezone]): ${entry}`\n      );\n    }\n  });\n}\n\nexport interface ParsedProxyPools {\n  /** Undefined means no proxy env vars were set at all. */\n  pools: ProxyPoolConfig | undefined;\n  /** Human-readable summary for startup logging. */\n  summary: string;\n}\n\n/**\n * Read PROXY_DATACENTER and PROXY_RESIDENTIAL from `env` (defaults to\n * `process.env`) and build a ProxyPoolConfig.\n */\nexport function parseProxyPoolsFromEnv(env: NodeJS.ProcessEnv = process.env): ParsedProxyPools {\n  const datacenter = parseList(env.PROXY_DATACENTER, \"datacenter\");\n  const residential = parseList(env.PROXY_RESIDENTIAL, \"residential\");\n\n  if (datacenter.length === 0 && residential.length === 0) {\n    return {\n      pools: undefined,\n      summary: \"no proxies configured — scrapes go direct\",\n    };\n  }\n\n  return {\n    pools: {\n      ...(datacenter.length > 0 ? { datacenter } : {}),\n      ...(residential.length > 0 ? { residential } : {}),\n    },\n    summary: `proxies loaded: ${datacenter.length} datacenter, ${residential.length} residential`,\n  };\n}\n"
  },
  {
    "path": "src/proxy/health-tracker.ts",
    "content": "/**\n * ProxyHealthTracker — minimal per-proxy circuit breaker.\n *\n * Goal: detect a dead or blacklisted proxy mid-session and take it out of\n * rotation for a fixed cooldown period, so the scraper stops burning attempts\n * on a proxy that's clearly broken. This is the runtime counterpart to the\n * startup-time `api.ipify.org` verification — startup catches dead creds and\n * misconfigured URLs; runtime tracking catches proxies that go bad after\n * they were healthy (IP got blacklisted, provider rate-limited us, etc.).\n *\n * Scope for the first cut (intentionally minimal):\n *   - Count consecutive failures per proxy URL.\n *   - After N (default 10) consecutive failures, the proxy is benched.\n *   - After M (default 5 minutes) the proxy is auto-revived and gets one\n *     \"probationary\" attempt. If that fails, it's benched again immediately.\n *   - A single success clears the failure counter.\n *   - Emits `proxy-benched` and `proxy-revived` events so the browser pool\n *     can react by retiring / relaunching the affected ProxyBoundBrowser.\n *\n * NOT in this version:\n *   - Failure-rate windows (just consecutive count).\n *   - Per-proxy cooldown escalation (exponential backoff, max cooldowns).\n *   - Per-destination-domain tracking (a proxy could be benched for amazon\n *     but healthy for github — we don't model that yet).\n *   - Persistence across daemon restarts.\n *   - Metrics / /status endpoint surface.\n *\n * All of those are easy extensions once the basic machinery is in place\n * and we have real e2e data showing what the thresholds should be.\n *\n * See backlog item in reader-context/BACKLOG.md for the full version.\n */\n\nimport { EventEmitter } from \"node:events\";\n\n/**\n * Default knobs.\n */\nexport const DEFAULT_FAILURE_THRESHOLD = 10;\nexport const DEFAULT_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes\n\n/**\n * Events emitted by the tracker.\n */\nexport interface ProxyHealthEvents {\n  \"proxy-benched\": (info: {\n    proxyUrl: string;\n    consecutiveFailures: number;\n    benchedUntil: number;\n  }) => void;\n  \"proxy-revived\": (info: { proxyUrl: string }) => void;\n}\n\n/**\n * Options for the tracker.\n */\nexport interface ProxyHealthTrackerOptions {\n  /** Consecutive failures before benching. Default: 10 */\n  failureThreshold?: number;\n  /** Cooldown duration in milliseconds. Default: 5 minutes */\n  cooldownMs?: number;\n  /**\n   * Time source for testability. Defaults to `Date.now`. Tests can inject\n   * a fake clock; we never reach for `Date.now()` elsewhere in this class.\n   */\n  now?: () => number;\n}\n\n/**\n * Per-proxy state. Not exported — the public API is `isHealthy` / `recordX`.\n */\ninterface ProxyState {\n  consecutiveFailures: number;\n  totalSuccesses: number;\n  totalFailures: number;\n  lastSuccessAt: number | null;\n  lastFailureAt: number | null;\n  /**\n   * If set, the proxy is benched until this timestamp (ms since epoch). A\n   * read after this timestamp auto-revives the proxy (no separate timer\n   * needed — revival is lazy).\n   */\n  benchedUntil: number | null;\n}\n\n/**\n * Snapshot of a proxy's current health. For logging / /status endpoint.\n */\nexport interface ProxyHealthSnapshot {\n  proxyUrl: string;\n  healthy: boolean;\n  consecutiveFailures: number;\n  totalSuccesses: number;\n  totalFailures: number;\n  lastSuccessAt: number | null;\n  lastFailureAt: number | null;\n  benchedUntil: number | null;\n}\n\n/**\n * ProxyHealthTracker\n *\n * ```ts\n * const tracker = new ProxyHealthTracker();\n * tracker.on(\"proxy-benched\", ({ proxyUrl }) => {\n *   browserPool.retire(proxyUrl);\n * });\n * tracker.on(\"proxy-revived\", ({ proxyUrl }) => {\n *   browserPool.relaunch(proxyUrl);\n * });\n *\n * // In the scrape loop:\n * if (tracker.isHealthy(proxyUrl)) {\n *   try {\n *     await scrape(proxyUrl);\n *     tracker.recordSuccess(proxyUrl);\n *   } catch {\n *     tracker.recordFailure(proxyUrl);\n *     throw;\n *   }\n * }\n * ```\n */\nexport class ProxyHealthTracker extends EventEmitter {\n  private readonly failureThreshold: number;\n  private readonly cooldownMs: number;\n  private readonly now: () => number;\n  private readonly states = new Map<string, ProxyState>();\n\n  constructor(options: ProxyHealthTrackerOptions = {}) {\n    super();\n    this.failureThreshold = options.failureThreshold ?? DEFAULT_FAILURE_THRESHOLD;\n    this.cooldownMs = options.cooldownMs ?? DEFAULT_COOLDOWN_MS;\n    this.now = options.now ?? Date.now;\n\n    if (this.failureThreshold < 1 || !Number.isInteger(this.failureThreshold)) {\n      throw new Error(\n        `ProxyHealthTracker: failureThreshold must be an integer >= 1, got ${this.failureThreshold}`\n      );\n    }\n    if (this.cooldownMs < 0) {\n      throw new Error(`ProxyHealthTracker: cooldownMs must be >= 0, got ${this.cooldownMs}`);\n    }\n  }\n\n  /**\n   * Strongly-typed `on`/`emit`/`once`. Allows TypeScript to know the event\n   * payload shape. `EventEmitter`'s default types are just `string | symbol`.\n   */\n  override on<E extends keyof ProxyHealthEvents>(event: E, listener: ProxyHealthEvents[E]): this {\n    return super.on(event, listener as (...args: unknown[]) => void);\n  }\n  override once<E extends keyof ProxyHealthEvents>(event: E, listener: ProxyHealthEvents[E]): this {\n    return super.once(event, listener as (...args: unknown[]) => void);\n  }\n  override emit<E extends keyof ProxyHealthEvents>(\n    event: E,\n    ...args: Parameters<ProxyHealthEvents[E]>\n  ): boolean {\n    return super.emit(event, ...args);\n  }\n\n  /**\n   * Whether this proxy is currently usable. Returns true for unknown proxies\n   * (innocent until proven guilty). A benched proxy whose cooldown has\n   * expired is auto-revived lazily here, which also emits `proxy-revived`.\n   */\n  isHealthy(proxyUrl: string): boolean {\n    const state = this.states.get(proxyUrl);\n    if (!state) return true;\n    if (state.benchedUntil === null) return true;\n    if (this.now() >= state.benchedUntil) {\n      // Cooldown expired — revive on probation (counter stays at threshold\n      // so that a single failure immediately re-benches).\n      state.benchedUntil = null;\n      this.emit(\"proxy-revived\", { proxyUrl });\n      return true;\n    }\n    return false;\n  }\n\n  /**\n   * Record a successful scrape through this proxy. Decrements the failure\n   * counter by 3 (so successes erode failures gradually rather than\n   * requiring a full reset). If the proxy was benched and we got a\n   * success anyway (probationary attempt after cooldown), clear the bench.\n   */\n  recordSuccess(proxyUrl: string): void {\n    const state = this.ensureState(proxyUrl);\n    const wasBenched = state.benchedUntil !== null;\n    // Decay: each success erodes 3 failure points instead of full reset.\n    // A proxy that alternates success/failure stays healthy (3:1 ratio).\n    // A proxy that gets 10 failures in a row still benches quickly.\n    state.consecutiveFailures = Math.max(0, state.consecutiveFailures - 3);\n    state.totalSuccesses += 1;\n    state.lastSuccessAt = this.now();\n    state.benchedUntil = null;\n    if (wasBenched) {\n      this.emit(\"proxy-revived\", { proxyUrl });\n    }\n  }\n\n  /**\n   * Record a failed scrape through this proxy. Increments the counter and\n   * benches the proxy if the threshold is reached. Emits `proxy-benched`\n   * exactly once per bench transition.\n   */\n  recordFailure(proxyUrl: string): void {\n    const state = this.ensureState(proxyUrl);\n    state.consecutiveFailures += 1;\n    state.totalFailures += 1;\n    state.lastFailureAt = this.now();\n\n    // Already benched — don't re-emit and don't extend the cooldown. A\n    // probationary failure (cooldown expired, isHealthy() re-activated it)\n    // will arrive with benchedUntil == null, so we fall through and re-bench\n    // below.\n    if (state.benchedUntil !== null) {\n      return;\n    }\n\n    if (state.consecutiveFailures >= this.failureThreshold) {\n      state.benchedUntil = this.now() + this.cooldownMs;\n      this.emit(\"proxy-benched\", {\n        proxyUrl,\n        consecutiveFailures: state.consecutiveFailures,\n        benchedUntil: state.benchedUntil,\n      });\n    }\n  }\n\n  /**\n   * Snapshot the health of a single proxy. Returns null for unknown URLs.\n   * Does NOT auto-revive — unlike `isHealthy`, this is a pure read.\n   */\n  snapshot(proxyUrl: string): ProxyHealthSnapshot | null {\n    const state = this.states.get(proxyUrl);\n    if (!state) return null;\n    return {\n      proxyUrl,\n      healthy: state.benchedUntil === null || this.now() >= state.benchedUntil,\n      consecutiveFailures: state.consecutiveFailures,\n      totalSuccesses: state.totalSuccesses,\n      totalFailures: state.totalFailures,\n      lastSuccessAt: state.lastSuccessAt,\n      lastFailureAt: state.lastFailureAt,\n      benchedUntil: state.benchedUntil,\n    };\n  }\n\n  /**\n   * Snapshot every tracked proxy.\n   */\n  allSnapshots(): ProxyHealthSnapshot[] {\n    return [...this.states.keys()]\n      .map((url) => this.snapshot(url))\n      .filter((s): s is ProxyHealthSnapshot => s !== null);\n  }\n\n  /**\n   * Manually reset a proxy's state. Used by `ipify` startup verification:\n   * if verification passes after a history of failures, clear the slate.\n   */\n  reset(proxyUrl: string): void {\n    this.states.delete(proxyUrl);\n  }\n\n  private ensureState(proxyUrl: string): ProxyState {\n    let state = this.states.get(proxyUrl);\n    if (!state) {\n      state = {\n        consecutiveFailures: 0,\n        totalSuccesses: 0,\n        totalFailures: 0,\n        lastSuccessAt: null,\n        lastFailureAt: null,\n        benchedUntil: null,\n      };\n      this.states.set(proxyUrl, state);\n    }\n    return state;\n  }\n}\n"
  },
  {
    "path": "src/proxy/proxy-gate.ts",
    "content": "/**\n * PerProxyGate — per-IP concurrency cap.\n *\n * Enforces a hard limit on the number of simultaneous scrapes that can share a\n * single proxy URL, across ALL engines (http, tlsclient, hero). Sitting above\n * the engines at the scraper boundary, this is what guarantees we never double-\n * book an IP even when a single scrape runs multiple engines in parallel via\n * the orchestrator waterfall.\n *\n * Design notes:\n * - Keyed by the raw proxy URL string. Same URL -> same gate. A `null` /\n *   undefined key means \"no proxy\" (direct connection); direct traffic is not\n *   capped per-request by this gate (the direct sub-pool's tab limit handles\n *   that downstream).\n * - Slots are acquired via pLimit, so queueing is FIFO and fair.\n * - The cap is configurable globally and overridable per-proxy, so Amazon's\n *   domain profile can drop it to 1 without affecting datacenter throughput\n *   elsewhere.\n *\n * The \"2 concurrent per IP\" default is a conservative starting point. It can\n * be overridden per-proxy via `setOverride(proxyUrl, max)` for domains that\n * need tighter caps (e.g. 1 concurrent for anti-bot sites).\n */\n\nimport pLimit from \"p-limit\";\n\n/**\n * Options for the PerProxyGate.\n */\nexport interface PerProxyGateOptions {\n  /**\n   * Global default for the number of concurrent scrapes allowed through a\n   * single proxy URL. Must be >= 1.\n   *\n   * Default: 2\n   */\n  maxConcurrentPerProxy?: number;\n}\n\n/**\n * A release function returned from `acquire()`. Call it exactly once when the\n * scrape is finished to free the slot. `acquire()` guarantees this function is\n * safe to call any number of times — only the first call has effect.\n */\nexport type PerProxyRelease = () => void;\n\n/**\n * Snapshot of a single proxy gate's current load.\n */\nexport interface PerProxyStats {\n  /** Proxy URL. `null` represents the direct-connection lane. */\n  proxyUrl: string | null;\n  /** Maximum concurrent slots for this proxy. */\n  max: number;\n  /** Slots currently in use. */\n  active: number;\n  /** Requests waiting for a slot. */\n  queued: number;\n}\n\n/**\n * Per-proxy concurrency gate.\n *\n * ```ts\n * const gate = new PerProxyGate({ maxConcurrentPerProxy: 2 });\n * const release = await gate.acquire(\"http://user:pass@dc1.example.com:8080\");\n * try {\n *   // do the scrape; at most 2 other acquires for the same URL can be active\n * } finally {\n *   release();\n * }\n * ```\n */\nexport class PerProxyGate {\n  private readonly defaultMax: number;\n  private readonly gates = new Map<string, { limit: ReturnType<typeof pLimit>; max: number }>();\n  private readonly overrides = new Map<string, number>();\n\n  constructor(options: PerProxyGateOptions = {}) {\n    const max = options.maxConcurrentPerProxy ?? 2;\n    if (!Number.isInteger(max) || max < 1) {\n      throw new Error(`PerProxyGate: maxConcurrentPerProxy must be an integer >= 1, got ${max}`);\n    }\n    this.defaultMax = max;\n  }\n\n  /**\n   * Override the concurrency cap for a specific proxy URL. Used by domain\n   * profiles that want to tighten the per-IP cap (e.g. Amazon → 1).\n   *\n   * Calling this after a gate already exists for the URL replaces the\n   * underlying pLimit. In-flight scrapes on the old gate are unaffected and\n   * continue to completion; new acquires use the new cap. This is fine for the\n   * expected use (startup-time configuration), but don't rely on it for\n   * hot-swapping under load.\n   */\n  setOverride(proxyUrl: string, max: number): void {\n    if (!Number.isInteger(max) || max < 1) {\n      throw new Error(`PerProxyGate: override must be an integer >= 1, got ${max}`);\n    }\n    this.overrides.set(proxyUrl, max);\n    // Reset the gate so the next acquire picks up the new cap\n    this.gates.delete(proxyUrl);\n  }\n\n  /**\n   * Acquire a slot for `proxyUrl`. Resolves to a release function when the\n   * slot is free. If `proxyUrl` is `null` / `undefined`, the direct-connection\n   * lane is used — a single shared lane with no cap (the direct sub-pool\n   * enforces its own tab limit downstream).\n   *\n   * Acquire never throws; queueing is unbounded. If you need a timeout, wrap\n   * the returned promise in `Promise.race`.\n   */\n  async acquire(proxyUrl: string | null | undefined): Promise<PerProxyRelease> {\n    if (!proxyUrl) {\n      // Direct lane: no per-URL cap here, the browser pool's tab limit is\n      // the downstream authority. Return a no-op release so callers don't\n      // branch on null.\n      return noopRelease();\n    }\n\n    const gate = this.gateFor(proxyUrl);\n\n    // pLimit.acquire would be cleaner but isn't in all versions of p-limit\n    // we might pin to. Use the \"held promise\" pattern: submit a task that\n    // blocks on a manual resolver. The task holds the slot until released.\n    let release!: PerProxyRelease;\n    const held = new Promise<void>((resolve) => {\n      release = makeRelease(resolve);\n    });\n\n    // Fire-and-forget the task that holds the slot. We must NOT await it\n    // here — we want to await only the \"slot acquired\" signal, not the\n    // \"task complete\" signal.\n    const acquired = new Promise<void>((resolveAcquired) => {\n      gate.limit(async () => {\n        resolveAcquired();\n        await held;\n      });\n    });\n\n    await acquired;\n    return release;\n  }\n\n  /**\n   * Wrap an async function in an `acquire`/`release` pair. Prefer this over\n   * bare `acquire()` in call sites so you can't forget to release on the\n   * error path.\n   */\n  async withSlot<T>(proxyUrl: string | null | undefined, fn: () => Promise<T>): Promise<T> {\n    const release = await this.acquire(proxyUrl);\n    try {\n      return await fn();\n    } finally {\n      release();\n    }\n  }\n\n  /**\n   * Inspect the current load of a specific proxy URL, or `null` if no gate\n   * exists for it yet. Useful for health-tracker and /status endpoint.\n   */\n  stats(proxyUrl: string): PerProxyStats | null {\n    const gate = this.gates.get(proxyUrl);\n    if (!gate) return null;\n    return {\n      proxyUrl,\n      max: gate.max,\n      active: gate.limit.activeCount,\n      queued: gate.limit.pendingCount,\n    };\n  }\n\n  /**\n   * Inspect the current load of every known proxy.\n   */\n  allStats(): PerProxyStats[] {\n    return [...this.gates.entries()].map(([proxyUrl, gate]) => ({\n      proxyUrl,\n      max: gate.max,\n      active: gate.limit.activeCount,\n      queued: gate.limit.pendingCount,\n    }));\n  }\n\n  /**\n   * Get or create the gate for a proxy URL.\n   */\n  private gateFor(proxyUrl: string): { limit: ReturnType<typeof pLimit>; max: number } {\n    const existing = this.gates.get(proxyUrl);\n    if (existing) return existing;\n    const max = this.overrides.get(proxyUrl) ?? this.defaultMax;\n    const gate = { limit: pLimit(max), max };\n    this.gates.set(proxyUrl, gate);\n    return gate;\n  }\n}\n\n/**\n * Create a release function that can be called multiple times safely.\n */\nfunction makeRelease(resolve: () => void): PerProxyRelease {\n  let released = false;\n  return () => {\n    if (released) return;\n    released = true;\n    resolve();\n  };\n}\n\n/**\n * A no-op release, used for the direct lane.\n */\nfunction noopRelease(): PerProxyRelease {\n  return () => {\n    /* no-op */\n  };\n}\n"
  },
  {
    "path": "src/proxy/verify.ts",
    "content": "/**\n * Startup-time proxy verification.\n *\n * Before the daemon declares itself ready, every configured proxy URL is\n * tested by making a real HTTP request to api.ipify.org through it. The\n * returned IP is the proxy's egress IP — confirming three things at once:\n *\n *   1. The proxy URL is reachable.\n *   2. The credentials are valid.\n *   3. Traffic actually flows through the proxy (the egress IP is not\n *      the host's own IP).\n *\n * If any proxy fails verification, `verifyProxiesOrThrow` rejects with a\n * clear multi-line error listing every failure. The daemon refuses to\n * start with a broken proxy configuration.\n *\n * The Fetcher abstraction lets unit tests inject a fake without spinning\n * up a real undici ProxyAgent or hitting api.ipify.org over the network.\n */\n\nimport { fetch as undiciFetch, ProxyAgent } from \"undici\";\nimport type { ProxyPoolConfig } from \"../types\";\nimport { redactProxyUrl } from \"../browser/proxy-bound-browser\";\n\nexport const IP_CHECK_URL = \"https://api.ipify.org?format=json\";\nexport const IP_CHECK_TIMEOUT_MS = 10_000;\n\nexport type ProxyTierName = \"datacenter\" | \"residential\";\n\nexport interface VerifiedProxy {\n  proxyUrl: string;\n  egressIp: string;\n  tier: ProxyTierName;\n}\n\nexport interface ProxyVerificationFailure {\n  proxyUrl: string;\n  tier: ProxyTierName;\n  error: string;\n}\n\nexport interface ProxyVerificationResult {\n  verified: VerifiedProxy[];\n  failed: ProxyVerificationFailure[];\n}\n\n/**\n * Function that fetches the egress IP for a proxy URL. Production uses\n * `defaultFetcher` (undici + ProxyAgent + api.ipify.org). Tests inject a\n * fake.\n */\nexport type EgressIpFetcher = (proxyUrl: string) => Promise<string>;\n\nexport interface VerifyProxiesOptions {\n  /** Override the fetcher for tests. */\n  fetcher?: EgressIpFetcher;\n}\n\n/**\n * Verify every proxy in the pool. Returns a result object containing both\n * successes and failures — the caller decides whether failures are fatal\n * (`verifyProxiesOrThrow` is the strict variant used at daemon startup).\n *\n * Verification runs in parallel across all proxies; total wall time is\n * bounded by `IP_CHECK_TIMEOUT_MS`, not by the number of proxies.\n */\nexport async function verifyProxies(\n  pools: ProxyPoolConfig | undefined,\n  options: VerifyProxiesOptions = {}\n): Promise<ProxyVerificationResult> {\n  const fetcher = options.fetcher ?? defaultFetcher;\n  const verified: VerifiedProxy[] = [];\n  const failed: ProxyVerificationFailure[] = [];\n\n  if (!pools) return { verified, failed };\n\n  const tasks: Array<Promise<void>> = [];\n\n  for (const tier of [\"datacenter\", \"residential\"] as const) {\n    for (const cfg of pools[tier] ?? []) {\n      const url = cfg.url;\n      if (!url) continue;\n      tasks.push(\n        fetcher(url).then(\n          (ip) => {\n            verified.push({ proxyUrl: url, egressIp: ip, tier });\n          },\n          (err: unknown) => {\n            const msg = err instanceof Error ? err.message : String(err);\n            failed.push({ proxyUrl: url, tier, error: msg });\n          }\n        )\n      );\n    }\n  }\n\n  await Promise.all(tasks);\n  return { verified, failed };\n}\n\n/**\n * Verify proxies and throw a clear multi-line error if any failed. Used by\n * the daemon at startup to fail loud on misconfiguration.\n */\nexport async function verifyProxiesOrThrow(\n  pools: ProxyPoolConfig | undefined,\n  options: VerifyProxiesOptions = {}\n): Promise<VerifiedProxy[]> {\n  const result = await verifyProxies(pools, options);\n  if (result.failed.length > 0) {\n    const lines = [\n      `Proxy verification failed for ${result.failed.length} proxy/proxies:`,\n      ...result.failed.map((f) => `  - [${f.tier}] ${redactProxyUrl(f.proxyUrl)}: ${f.error}`),\n      \"\",\n      \"The daemon refuses to start with a broken proxy configuration.\",\n      \"Fix or remove the failing proxy URLs in PROXY_DATACENTER / PROXY_RESIDENTIAL,\",\n      `or check whether ${IP_CHECK_URL} is reachable from this network.`,\n    ];\n    throw new Error(lines.join(\"\\n\"));\n  }\n  return result.verified;\n}\n\n/**\n * Production fetcher: build a ProxyAgent for the URL, GET api.ipify.org\n * through it, parse the JSON, return the egress IP.\n *\n * The agent is single-use — closed in `finally` to release the TLS pool.\n * If verification is something we end up running periodically (not just\n * at startup), it's worth caching agents instead.\n */\nasync function defaultFetcher(proxyUrl: string): Promise<string> {\n  const agent = new ProxyAgent(proxyUrl);\n  const controller = new AbortController();\n  const timer = setTimeout(() => controller.abort(), IP_CHECK_TIMEOUT_MS);\n  try {\n    const res = await undiciFetch(IP_CHECK_URL, {\n      dispatcher: agent,\n      signal: controller.signal,\n      headers: { \"User-Agent\": \"reader-daemon-startup-check/1.0\" },\n    });\n    if (!res.ok) {\n      throw new Error(`HTTP ${res.status} from ${IP_CHECK_URL}`);\n    }\n    const body = (await res.json()) as { ip?: string };\n    if (!body.ip || typeof body.ip !== \"string\") {\n      throw new Error(`Missing or invalid 'ip' field in api.ipify.org response`);\n    }\n    return body.ip;\n  } finally {\n    clearTimeout(timer);\n    await agent.close().catch(() => undefined);\n  }\n}\n"
  },
  {
    "path": "src/scraper.ts",
    "content": "import pLimit from \"p-limit\";\nimport { htmlToMarkdown } from \"./formatters/markdown\";\nimport { postprocessMarkdown } from \"./formatters/postprocess\";\nimport { cleanContent } from \"./utils/content-cleaner\";\nimport { extractMetadata } from \"./utils/metadata-extractor\";\nimport { createLogger } from \"./utils/logger\";\nimport { fetchRobotsTxt, isUrlAllowed, type RobotsRules } from \"./utils/robots-parser\";\nimport {\n  DEFAULT_OPTIONS,\n  type ScrapeOptions,\n  type ScrapeResult,\n  type WebsiteScrapeResult,\n  type BatchMetadata,\n  type ProxyMetadata,\n  type ProxyTier,\n} from \"./types\";\nimport { EngineOrchestrator, ScrapeFailedError } from \"./engines/index.js\";\nimport { getDomainProfile, applyDomainProfile } from \"./config/domain-profiles.js\";\nimport { isBlockedResponse } from \"./utils/block-detector.js\";\nimport { rewriteUrl } from \"./utils/url-rewriter.js\";\nimport {\n  wrapError,\n  ReaderError,\n  DNSError,\n  RobotsBlockedError,\n  InvalidUrlError,\n  ProxyConnectionError,\n} from \"./errors.js\";\nimport type { PerProxyGate } from \"./proxy/proxy-gate.js\";\nimport type { ProxyHealthTracker } from \"./proxy/health-tracker.js\";\nimport { redactProxyUrl } from \"./browser/proxy-bound-browser.js\";\n\n/** Default hard deadline for any single URL (ms). */\nconst DEFAULT_HARD_DEADLINE_MS = 30_000;\n\n/** Default timeout for the first datacenter proxy attempt (ms). */\nconst DEFAULT_DATACENTER_TIMEOUT_MS = 10_000;\n\n/**\n * Scraper class with built-in concurrency and proxy escalation.\n *\n * Retry strategy per URL:\n *   1. Hero on datacenter proxy, 10s timeout\n *   2. Any failure → Hero on residential proxy, remaining time (up to 30s total)\n *   3. Any failure → done, report error\n *\n * Non-retryable errors (DNS, invalid URL, robots.txt) skip directly to failure.\n */\nexport class Scraper {\n  private options: Required<ScrapeOptions>;\n  private logger = createLogger(\"scraper\");\n  private robotsCache: Map<string, RobotsRules | null> = new Map();\n\n  constructor(options: ScrapeOptions) {\n    this.options = {\n      ...DEFAULT_OPTIONS,\n      ...options,\n    } as Required<ScrapeOptions>;\n  }\n\n  /**\n   * Get robots.txt rules for a URL, cached per domain\n   */\n  private async getRobotsRules(url: string): Promise<RobotsRules | null> {\n    const origin = new URL(url).origin;\n    if (!this.robotsCache.has(origin)) {\n      const rules = await fetchRobotsTxt(origin);\n      this.robotsCache.set(origin, rules);\n    }\n    return this.robotsCache.get(origin) ?? null;\n  }\n\n  /**\n   * Scrape all URLs\n   */\n  async scrape(): Promise<ScrapeResult> {\n    const startTime = Date.now();\n    const results = await this.scrapeWithConcurrency();\n    return this.buildScrapeResult(results, startTime);\n  }\n\n  /**\n   * Scrape URLs with concurrency control\n   */\n  private async scrapeWithConcurrency(): Promise<\n    Array<{ result: WebsiteScrapeResult | null; error?: string }>\n  > {\n    const limit = pLimit(this.options.batchConcurrency || 1);\n    const tasks = this.options.urls.map((url, index) =>\n      limit(() => this.scrapeSingleUrlWithRetry(url, index))\n    );\n\n    const batchPromise = Promise.all(tasks);\n\n    if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {\n      const timeoutPromise = new Promise<never>((_, reject) => {\n        setTimeout(() => {\n          reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));\n        }, this.options.batchTimeoutMs);\n      });\n\n      return Promise.race([batchPromise, timeoutPromise]);\n    }\n\n    return batchPromise;\n  }\n\n  /**\n   * Scrape a single URL with proxy escalation.\n   *\n   *   1. Try datacenter proxy with 10s timeout\n   *   2. On ANY failure (timeout, empty, blocked, error) → residential with remaining time\n   *   3. On failure → done\n   */\n  private async scrapeSingleUrlWithRetry(\n    url: string,\n    index: number\n  ): Promise<{ result: WebsiteScrapeResult | null; error?: string }> {\n    const hardDeadlineMs = this.options.hardDeadlineMs ?? DEFAULT_HARD_DEADLINE_MS;\n    const datacenterTimeoutMs = this.options.datacenterTimeoutMs ?? DEFAULT_DATACENTER_TIMEOUT_MS;\n    const deadline = Date.now() + hardDeadlineMs;\n\n    // If domain profile or caller specifies residential, skip datacenter attempt entirely\n    const domainProfile = getDomainProfile(url, this.options.domainProfiles);\n    const profileTier = domainProfile?.proxyTier ?? this.options.proxyTier;\n    if (profileTier === \"residential\") {\n      try {\n        const result = await this.scrapeSingleUrl(url, index, \"residential\", hardDeadlineMs);\n        if (result) return { result };\n      } catch (error: any) {\n        this.logger.error(`[scraper] Residential attempt failed for ${url}: ${error.message}`);\n        return { result: null, error: error.message };\n      }\n      return { result: null, error: `Residential scrape returned no data for ${url}` };\n    }\n\n    // --- Attempt 1: datacenter, configurable timeout ---\n    try {\n      const result = await this.scrapeSingleUrl(url, index, undefined, datacenterTimeoutMs);\n\n      if (result) {\n        // Check for soft blocks (200 + bot page content)\n        const blockCheck = isBlockedResponse(\n          result.metadata?.statusCode,\n          result.rawHtml,\n          this.options.blockDetection\n        );\n\n        if (!blockCheck.blocked) {\n          return { result };\n        }\n\n        this.logger.warn(\n          `[scraper] Block detected for ${url} (${blockCheck.reason}), escalating to residential`\n        );\n        // Fall through to residential attempt\n      }\n    } catch (error: any) {\n      // Non-retryable errors — don't escalate\n      if (error instanceof ReaderError && error.retryable === false) {\n        this.logger.error(`Non-retryable error for ${url}: ${error.name} - ${error.message}`);\n        return { result: null, error: error.message };\n      }\n\n      this.logger.warn(\n        `[scraper] Datacenter attempt failed for ${url}: ${error.message}, escalating to residential`\n      );\n      // Fall through to residential attempt\n    }\n\n    // --- Attempt 2: residential, remaining time ---\n    const remaining = deadline - Date.now();\n    if (remaining <= 0) {\n      return {\n        result: null,\n        error: `Scrape exceeded ${hardDeadlineMs / 1000}s hard cap for ${url}`,\n      };\n    }\n\n    try {\n      const result = await this.scrapeSingleUrl(url, index, \"residential\", remaining);\n\n      if (result) {\n        return { result };\n      }\n\n      return { result: null, error: `No content returned for ${url} on residential proxy` };\n    } catch (error: any) {\n      this.logger.error(`[scraper] Residential attempt failed for ${url}: ${error.message}`);\n      return { result: null, error: error.message };\n    }\n  }\n\n  /**\n   * Scrape a single URL using the engine orchestrator.\n   *\n   * @param proxyOverride - Forces this proxy tier instead of the configured one.\n   * @param timeoutMs - Overrides the configured timeout.\n   */\n  private async scrapeSingleUrl(\n    url: string,\n    index: number,\n    proxyOverride?: ProxyTier,\n    timeoutMs?: number\n  ): Promise<WebsiteScrapeResult | null> {\n    const startTime = Date.now();\n\n    // Apply URL rewrite rules (caller-provided, e.g. Google Docs → export)\n    const rewrite = rewriteUrl(url, this.options.urlRewriters);\n    const scrapeTargetUrl = rewrite.url;\n    if (rewrite.rewritten && this.options.verbose) {\n      this.logger.info(`[scraper] Rewriting ${url} -> ${scrapeTargetUrl} (${rewrite.reason})`);\n    }\n\n    // Validate URL format\n    try {\n      new URL(url);\n    } catch {\n      throw new InvalidUrlError(url, \"malformed URL\");\n    }\n\n    // Check robots.txt\n    const robotsRules = await this.getRobotsRules(url);\n    if (!isUrlAllowed(url, robotsRules)) {\n      throw new RobotsBlockedError(url);\n    }\n\n    try {\n      // Apply domain-specific overrides (caller-provided profiles)\n      const domainProfile = getDomainProfile(url, this.options.domainProfiles);\n      let effectiveOptions = domainProfile\n        ? applyDomainProfile(this.options, domainProfile)\n        : { ...this.options };\n\n      // Apply proxy escalation override\n      if (proxyOverride) {\n        effectiveOptions = { ...effectiveOptions, proxyTier: proxyOverride };\n      }\n\n      // Apply timeout override\n      if (timeoutMs) {\n        effectiveOptions = { ...effectiveOptions, timeoutMs };\n      }\n\n      if (domainProfile && this.options.verbose) {\n        this.logger.info(\n          `[scraper] Applied domain profile for ${url}: ${JSON.stringify(domainProfile)}`\n        );\n      }\n\n      // --- Per-attempt proxy resolution ---\n      const resolveProxyFn = this.options.resolveProxy;\n      if (!this.options.proxy && resolveProxyFn) {\n        const resolved = resolveProxyFn(effectiveOptions.proxyTier);\n        if (resolved) {\n          effectiveOptions = { ...effectiveOptions, proxy: resolved };\n        }\n      }\n\n      const currentProxyUrl = effectiveOptions.proxy?.url ?? null;\n\n      // Domain-profile per-IP cap override\n      if (domainProfile?.maxConcurrentPerProxy && currentProxyUrl && this.options.proxyGate) {\n        (this.options.proxyGate as PerProxyGate).setOverride(\n          currentProxyUrl,\n          domainProfile.maxConcurrentPerProxy\n        );\n      }\n\n      if (this.options.verbose) {\n        this.logger.info(\n          `[scraper] ${url} using tier=${effectiveOptions.proxyTier ?? \"auto\"} ` +\n            `proxy=${redactProxyUrl(currentProxyUrl)}` +\n            (domainProfile?.maxConcurrentPerProxy\n              ? ` cap=${domainProfile.maxConcurrentPerProxy}`\n              : \"\")\n        );\n      }\n\n      // Create orchestrator\n      const orchestrator = new EngineOrchestrator({\n        logger: this.logger,\n        verbose: effectiveOptions.verbose,\n      });\n\n      // --- Gated scrape ---\n      const proxyGate = this.options.proxyGate as PerProxyGate | undefined;\n      const healthTracker = this.options.healthTracker as ProxyHealthTracker | undefined;\n\n      const runScrape = () =>\n        orchestrator.scrape({\n          url: scrapeTargetUrl,\n          options: effectiveOptions,\n          logger: this.logger,\n        });\n\n      let engineResult;\n      try {\n        engineResult = proxyGate\n          ? await proxyGate.withSlot(currentProxyUrl, runScrape)\n          : await runScrape();\n\n        if (currentProxyUrl) healthTracker?.recordSuccess(currentProxyUrl);\n      } catch (err: any) {\n        const isProxyFault =\n          err instanceof ProxyConnectionError ||\n          (err.code && [\"ECONNREFUSED\", \"ECONNRESET\", \"ETIMEDOUT\"].includes(err.code));\n        if (currentProxyUrl && isProxyFault) {\n          healthTracker?.recordFailure(currentProxyUrl);\n        }\n        throw err;\n      }\n\n      if (this.options.verbose) {\n        this.logger.info(`[scraper] ${url} scraped with Hero in ${engineResult.duration}ms`);\n      }\n\n      // Detect JSON responses\n      const jsonPayload = detectJsonPayload(engineResult.html, engineResult.statusCode);\n\n      // Extract metadata from raw HTML before cleaning\n      const websiteMetadata = extractMetadata(engineResult.html, engineResult.url);\n\n      // Clean content\n      const cleanedHtml = jsonPayload\n        ? engineResult.html\n        : cleanContent(engineResult.html, engineResult.url, {\n            removeAds: this.options.removeAds,\n            removeBase64Images: this.options.removeBase64Images,\n            onlyMainContent: this.options.onlyMainContent,\n            includeTags: this.options.includeTags,\n            excludeTags: this.options.excludeTags,\n            navigationSelectors: this.options.navigationSelectors,\n          });\n\n      const duration = Date.now() - startTime;\n\n      // Convert to markdown\n      const MAX_HTML_BYTES = parseInt(process.env.READER_MAX_HTML_SIZE || \"2097152\"); // 2MB\n      let markdown: string | undefined;\n\n      if (this.options.formats.includes(\"markdown\")) {\n        if (jsonPayload) {\n          markdown = \"```json\\n\" + jsonPayload + \"\\n```\";\n        } else {\n          try {\n            const htmlForConversion =\n              cleanedHtml.length > MAX_HTML_BYTES\n                ? (this.logger.warn(\n                    `HTML too large for conversion (${cleanedHtml.length} bytes), truncating to ${MAX_HTML_BYTES}`\n                  ),\n                  cleanedHtml.slice(0, MAX_HTML_BYTES))\n                : cleanedHtml;\n\n            markdown = postprocessMarkdown(htmlToMarkdown(htmlForConversion));\n\n            // onlyMainContent empty fallback\n            if (\n              this.options.onlyMainContent &&\n              markdown.trim().length < 50 &&\n              engineResult.html.length > 500\n            ) {\n              this.logger.warn(\n                `[scraper] onlyMainContent produced ${markdown.trim().length} chars for ${url}, ` +\n                  `retrying with full content`\n              );\n              const fullHtml = cleanContent(engineResult.html, engineResult.url, {\n                removeAds: this.options.removeAds,\n                removeBase64Images: this.options.removeBase64Images,\n                onlyMainContent: false,\n              });\n              const fullForConversion =\n                fullHtml.length > MAX_HTML_BYTES ? fullHtml.slice(0, MAX_HTML_BYTES) : fullHtml;\n              markdown = postprocessMarkdown(htmlToMarkdown(fullForConversion));\n            }\n          } catch (conversionError: unknown) {\n            const errMsg =\n              conversionError instanceof Error ? conversionError.message : String(conversionError);\n            this.logger.error(`Markdown conversion failed for ${url}: ${errMsg}`);\n            markdown = cleanedHtml\n              .replace(/<[^>]*>/g, \" \")\n              .replace(/\\s+/g, \" \")\n              .trim();\n          }\n        }\n      }\n\n      const htmlOutput = this.options.formats.includes(\"html\") ? cleanedHtml : undefined;\n\n      // Report progress\n      if (this.options.onProgress) {\n        this.options.onProgress({\n          completed: index + 1,\n          total: this.options.urls.length,\n          currentUrl: url,\n        });\n      }\n\n      // Build proxy metadata from effective options (after escalation)\n      let proxyMetadata: ProxyMetadata | undefined;\n      if (effectiveOptions.proxy) {\n        const proxy = effectiveOptions.proxy;\n        const tier = effectiveOptions.proxyTier as ProxyTier | undefined;\n        if (proxy.url) {\n          try {\n            const proxyUrl = new URL(proxy.url);\n            proxyMetadata = {\n              host: proxyUrl.hostname,\n              port: parseInt(proxyUrl.port, 10) || 80,\n              tier,\n              country: proxy.country,\n            };\n          } catch {\n            // Invalid URL, skip proxy metadata\n          }\n        } else if (proxy.host && proxy.port) {\n          proxyMetadata = {\n            host: proxy.host,\n            port: proxy.port,\n            tier,\n            country: proxy.country,\n          };\n        }\n      }\n\n      const finalUrl = engineResult.url !== scrapeTargetUrl ? engineResult.url : undefined;\n\n      const result: WebsiteScrapeResult = {\n        rawHtml: engineResult.html,\n        markdown,\n        html: htmlOutput,\n        metadata: {\n          baseUrl: url,\n          ...(finalUrl ? { finalUrl } : {}),\n          statusCode: engineResult.statusCode,\n          engine: engineResult.engine,\n          totalPages: 1,\n          scrapedAt: new Date().toISOString(),\n          duration,\n          website: websiteMetadata,\n          proxy: proxyMetadata,\n        },\n      };\n\n      return result;\n    } catch (error: unknown) {\n      // Report progress (failed) before re-throwing\n      if (this.options.onProgress) {\n        this.options.onProgress({\n          completed: index + 1,\n          total: this.options.urls.length,\n          currentUrl: url,\n        });\n      }\n\n      // Non-retryable typed errors — re-throw as-is\n      if (\n        error instanceof InvalidUrlError ||\n        error instanceof RobotsBlockedError ||\n        error instanceof DNSError\n      ) {\n        this.logger.error(`${error.name} for ${url}: ${error.message}`);\n        throw error;\n      }\n\n      // ScrapeFailedError from orchestrator — re-throw for retry loop\n      if (error instanceof ScrapeFailedError) {\n        this.logger.error(`Failed to scrape ${url}: ${error.message}`);\n        throw error;\n      }\n\n      // Unknown error — classify and re-throw\n      const classified = wrapError(error, url);\n      this.logger.error(\n        `${classified.name} for ${url}: ${classified.message}` +\n          (classified.retryable ? \" (retryable)\" : \"\")\n      );\n      throw classified;\n    }\n  }\n\n  /**\n   * Build final scrape result\n   */\n  private buildScrapeResult(\n    results: Array<{ result: WebsiteScrapeResult | null; error?: string }>,\n    startTime: number\n  ): ScrapeResult {\n    const successful = results\n      .filter((r) => r.result !== null)\n      .map((r) => r.result as WebsiteScrapeResult);\n\n    const errors: Array<{ url: string; error: string }> = [];\n    results.forEach((r, index) => {\n      if (r.result === null && r.error) {\n        errors.push({ url: this.options.urls[index], error: r.error });\n      }\n    });\n\n    const batchMetadata: BatchMetadata = {\n      totalUrls: this.options.urls.length,\n      successfulUrls: successful.length,\n      failedUrls: results.filter((r) => r.result === null).length,\n      scrapedAt: new Date().toISOString(),\n      totalDuration: Date.now() - startTime,\n      errors,\n    };\n\n    return {\n      data: successful,\n      batchMetadata,\n    };\n  }\n}\n\n/**\n * Detect if an engine response body is a JSON payload rather than HTML.\n */\nfunction detectJsonPayload(body: string, statusCode: number): string | null {\n  if (statusCode < 200 || statusCode >= 300) return null;\n  if (!body) return null;\n\n  const trimmed = body.trim();\n  if (trimmed.length === 0) return null;\n  if (trimmed.length > 500_000) return null;\n\n  const firstChar = trimmed[0];\n  const lastChar = trimmed[trimmed.length - 1];\n  const looksJson =\n    (firstChar === \"{\" && lastChar === \"}\") || (firstChar === \"[\" && lastChar === \"]\");\n  if (!looksJson) return null;\n\n  try {\n    const parsed = JSON.parse(trimmed);\n    return JSON.stringify(parsed, null, 2);\n  } catch {\n    return null;\n  }\n}\n\n/**\n * Convenience function to scrape URLs\n */\nexport async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {\n  const scraper = new Scraper(options);\n  return scraper.scrape();\n}\n"
  },
  {
    "path": "src/types.ts",
    "content": "import type { IBrowserPool } from \"./browser/types\";\n\n/**\n * Proxy configuration for Hero\n */\nexport interface ProxyConfig {\n  /** Full proxy URL (takes precedence over other fields) */\n  url?: string;\n  /** Proxy type */\n  type?: \"datacenter\" | \"residential\";\n  /** Proxy username */\n  username?: string;\n  /** Proxy password */\n  password?: string;\n  /** Proxy host */\n  host?: string;\n  /** Proxy port */\n  port?: number;\n  /** Country code for residential proxies (e.g., 'us', 'uk') */\n  country?: string;\n  /** IANA timezone ID matching the proxy's exit location (e.g., 'America/Los_Angeles') */\n  timezoneId?: string;\n}\n\n/**\n * Proxy tier — controls which proxy pool is used\n *\n * - \"datacenter\": Fast, cheap datacenter IPs — works for most sites\n * - \"residential\": Residential/mobile IPs — needed for anti-bot sites (Amazon, LinkedIn)\n * - \"auto\": Start with datacenter, auto-escalate to residential on block detection\n */\nexport type ProxyTier = \"datacenter\" | \"residential\" | \"auto\";\n\n/**\n * Multi-tier proxy pool configuration\n */\nexport interface ProxyPoolConfig {\n  /** Datacenter proxies (fast, cheap, most sites) */\n  datacenter?: ProxyConfig[];\n  /** Residential proxies (slower, expensive, anti-bot sites) */\n  residential?: ProxyConfig[];\n}\n\n/**\n * Proxy metadata in scrape results\n */\nexport interface ProxyMetadata {\n  /** Proxy host that was used */\n  host: string;\n  /** Proxy port that was used */\n  port: number;\n  /** Which proxy tier was actually used */\n  tier?: ProxyTier;\n  /** Country code if geo-targeting was used */\n  country?: string;\n}\n\n/**\n * Browser pool configuration for ReaderClient\n */\nexport interface BrowserPoolConfig {\n  /** Number of browser instances (default: 2) */\n  size?: number;\n  /** Retire browser after this many page loads (default: 100) */\n  retireAfterPages?: number;\n  /** Retire browser after this many minutes (default: 30) */\n  retireAfterMinutes?: number;\n  /** Maximum pending requests in queue (default: 100) */\n  maxQueueSize?: number;\n}\n\n/**\n * Main scraping options interface\n */\nexport interface ScrapeOptions {\n  /** Array of URLs to scrape */\n  urls: string[];\n\n  /** Output formats - which content fields to include (default: ['markdown']) */\n  formats?: Array<\"markdown\" | \"html\">;\n\n  /** Custom user agent string (overrides Hero's default emulated UA) */\n  userAgent?: string;\n\n  /** Request timeout in milliseconds (default: 30000) */\n  timeoutMs?: number;\n\n  // ============================================================================\n  // Content cleaning options\n  // ============================================================================\n\n  /** Remove ads and tracking elements (default: true) */\n  removeAds?: boolean;\n\n  /** Remove base64-encoded images to reduce output size (default: true) */\n  removeBase64Images?: boolean;\n\n  /** Extract only main content, removing nav/header/footer/sidebar (default: true) */\n  onlyMainContent?: boolean;\n\n  /** CSS selectors for elements to include (if set, only these elements are kept) */\n  includeTags?: string[];\n\n  /** CSS selectors for elements to exclude (removed from output) */\n  excludeTags?: string[];\n\n  /**\n   * Additional CSS selectors to remove when onlyMainContent is true.\n   * Merged with the built-in nav/footer/sidebar selectors.\n   */\n  navigationSelectors?: string[];\n\n  // ============================================================================\n  // Retry & escalation options\n  // ============================================================================\n\n  /**\n   * Hard deadline for a single URL in milliseconds (default: 30000).\n   * After this, the scraper gives up regardless of proxy tier.\n   */\n  hardDeadlineMs?: number;\n\n  /**\n   * Timeout for the first attempt on datacenter proxy in milliseconds (default: 10000).\n   * If no result in this time, the scraper escalates to residential.\n   */\n  datacenterTimeoutMs?: number;\n\n  // ============================================================================\n  // Pluggable config (injected by platform, not set by end users)\n  // ============================================================================\n\n  /**\n   * Domain-specific overrides. Keyed by domain (e.g. \"amazon.com\").\n   * Matched against the URL's hostname (www. stripped, subdomain matching).\n   * Reader ships with NO built-in profiles — the caller provides them.\n   */\n  domainProfiles?: Record<string, import(\"./config/domain-profiles.js\").DomainProfile>;\n\n  /**\n   * Block detection config. When provided, the scraper checks successful\n   * responses for bot-block signals and escalates to residential on match.\n   * Reader ships with NO built-in patterns — the caller provides them.\n   */\n  blockDetection?: {\n    /** Regex patterns matched against page text content */\n    patterns?: RegExp[];\n    /** Regex patterns matched against page title */\n    titlePatterns?: RegExp[];\n    /** Pages shorter than this (chars) with any signal = blocked (default: 500) */\n    shortContentThreshold?: number;\n    /** Longer pages need this many signals to be blocked (default: 3) */\n    longContentSignalThreshold?: number;\n  };\n\n  /**\n   * URL rewrite rules applied before scraping. Each rule has a `match`\n   * function and a `rewrite` function. Reader ships with NO built-in\n   * rules — the caller provides them (e.g. Google Docs → export URL).\n   */\n  urlRewriters?: Array<{\n    /** Name for diagnostics */\n    name: string;\n    /** Return true if this rewriter applies to the URL */\n    match: (url: URL) => boolean;\n    /** Return the rewritten URL string */\n    rewrite: (url: URL) => string;\n  }>;\n\n  // ============================================================================\n  // Batch processing options\n  // ============================================================================\n\n  /** Number of URLs to process in parallel (default: 1 - sequential) */\n  batchConcurrency?: number;\n\n  /** Total timeout for the entire batch operation in milliseconds (default: 300000) */\n  batchTimeoutMs?: number;\n\n  /** Progress callback for batch operations */\n  onProgress?: (progress: { completed: number; total: number; currentUrl: string }) => void;\n\n  // ============================================================================\n  // Hero-specific options\n  // ============================================================================\n\n  /** Proxy configuration for Hero (single proxy — use proxyTier for pool-based) */\n  proxy?: ProxyConfig;\n\n  /**\n   * Proxy tier selection (default: \"auto\")\n   * - \"datacenter\": Use datacenter proxy pool\n   * - \"residential\": Use residential proxy pool\n   * - \"auto\": Start with datacenter, escalate to residential on block detection\n   *\n   * Requires proxyPools to be configured on ReaderClient.\n   * If a single `proxy` is set, it takes precedence over pools.\n   */\n  proxyTier?: ProxyTier;\n\n  /** CSS selector to wait for before considering page loaded */\n  waitForSelector?: string;\n\n  /** Enable verbose logging (default: false) */\n  verbose?: boolean;\n\n  /** Show Chrome window (default: false) */\n  showChrome?: boolean;\n\n  /** Connection to Hero Core (for shared Core usage) */\n  connectionToCore?: any;\n\n  /** Browser pool configuration (passed from ReaderClient) */\n  browserPool?: BrowserPoolConfig;\n\n  /** Browser pool instance (internal, provided by ReaderClient, legacy single pool) */\n  pool?: IBrowserPool;\n\n  /**\n   * Tiered browser pool (internal, provided by ReaderClient).\n   *\n   * When present, this takes precedence over `pool` for the Hero engine.\n   * The Hero engine will ask the tiered pool for the browser bound to\n   * `options.proxy?.url` (falling back to the tier resolved from\n   * `options.proxyTier`).\n   *\n   * Typed as `unknown` to avoid a type cycle between types.ts and\n   * browser/tiered-pool.ts.\n   */\n  tieredPool?: unknown;\n\n  /**\n   * Per-proxy concurrency gate (internal, provided by ReaderClient).\n   *\n   * When present, the scraper wraps the entire engine waterfall in\n   * `proxyGate.withSlot(proxyUrl, ...)`, ensuring at most N simultaneous\n   * scrapes go through any single proxy URL at a time. All three engines\n   * share the slot because they race in parallel through the same proxy.\n   *\n   * Typed as `unknown` to avoid a type cycle.\n   */\n  proxyGate?: unknown;\n\n  /**\n   * Per-proxy health tracker (internal, provided by ReaderClient).\n   *\n   * Optional. When present, the scraper records success/failure after each\n   * attempt. The tracker emits bench/revive events that the TieredBrowserPool\n   * listens to; the scraper itself just reports outcomes.\n   */\n  healthTracker?: unknown;\n\n  /**\n   * Callback that resolves a proxy URL for a given tier.\n   *\n   * Provided by ReaderClient. Called per-attempt inside the scraper's\n   * retry loop so domain-profile and retry-loop escalation actually swap\n   * proxies between attempts (instead of just flipping a tier string in\n   * options and still using the original proxy).\n   *\n   * Returns the proxy to use, or `undefined` for the direct lane.\n   */\n  resolveProxy?: (tier: ProxyTier | undefined) => ProxyConfig | undefined;\n}\n\n/**\n * Website metadata extracted from the base page\n */\nexport interface WebsiteMetadata {\n  /** Basic meta tags */\n  title: string | null /** <title> or <meta property=\"og:title\"> */;\n  description: string | null /** <meta name=\"description\"> */;\n  author: string | null /** <meta name=\"author\"> */;\n  language: string | null /** <html lang=\"...\"> */;\n  charset: string | null /** <meta charset=\"...\"> */;\n\n  /** Links */\n  favicon: string | null /** <link rel=\"icon\"> */;\n  image: string | null /** <meta property=\"og:image\"> */;\n  canonical: string | null /** <link rel=\"canonical\"> */;\n\n  /** SEO */\n  keywords: string[] | null /** <meta name=\"keywords\"> */;\n  robots: string | null /** <meta name=\"robots\"> */;\n\n  /** Branding */\n  themeColor: string | null /** <meta name=\"theme-color\"> */;\n\n  /** Open Graph */\n  openGraph: {\n    title: string | null /** <meta property=\"og:title\"> */;\n    description: string | null /** <meta property=\"og:description\"> */;\n    type: string | null /** <meta property=\"og:type\"> */;\n    url: string | null /** <meta property=\"og:url\"> */;\n    image: string | null /** <meta property=\"og:image\"> */;\n    siteName: string | null /** <meta property=\"og:site_name\"> */;\n    locale: string | null /** <meta property=\"og:locale\"> */;\n  } | null;\n\n  /** Twitter Card */\n  twitter: {\n    card: string | null /** <meta name=\"twitter:card\"> */;\n    site: string | null /** <meta name=\"twitter:site\"> */;\n    creator: string | null /** <meta name=\"twitter:creator\"> */;\n    title: string | null /** <meta name=\"twitter:title\"> */;\n    description: string | null /** <meta name=\"twitter:description\"> */;\n    image: string | null /** <meta name=\"twitter:image\"> */;\n  } | null;\n}\n\n/**\n * Individual page data\n */\nexport interface Page {\n  /** Full URL of the page */\n  url: string;\n\n  /** Page title */\n  title: string;\n\n  /** Markdown content */\n  markdown: string;\n\n  /** HTML content */\n  html: string;\n\n  /** When the page was fetched */\n  fetchedAt: string;\n\n  /** Crawl depth from base URL */\n  depth: number;\n}\n\n/**\n * Individual website scrape result\n */\nexport interface WebsiteScrapeResult {\n  /** Raw HTML from the engine before cleaning (always present) */\n  rawHtml: string;\n\n  /** Markdown content (present if 'markdown' in formats) */\n  markdown?: string;\n\n  /** Cleaned HTML content (present if 'html' in formats) */\n  html?: string;\n\n  /** Metadata about the scraping operation */\n  metadata: {\n    /** Base URL that was scraped */\n    baseUrl: string;\n\n    /** HTTP status code from the response */\n    statusCode: number;\n\n    /** Engine that successfully scraped this URL */\n    engine: string;\n\n    /** Total number of pages scraped */\n    totalPages: number;\n\n    /** ISO timestamp when scraping started */\n    scrapedAt: string;\n\n    /** Duration in milliseconds */\n    duration: number;\n\n    /** Website metadata extracted from base page */\n    website: WebsiteMetadata;\n\n    /** Proxy used for this request (if proxy pooling was enabled) */\n    proxy?: ProxyMetadata;\n  };\n}\n\n/**\n * Batch metadata for multi-URL operations\n */\nexport interface BatchMetadata {\n  /** Total number of URLs provided */\n  totalUrls: number;\n\n  /** Number of URLs successfully scraped */\n  successfulUrls: number;\n\n  /** Number of URLs that failed */\n  failedUrls: number;\n\n  /** ISO timestamp when the batch operation started */\n  scrapedAt: string;\n\n  /** Total duration for the entire batch in milliseconds */\n  totalDuration: number;\n\n  /** Array of errors for failed URLs */\n  errors?: Array<{ url: string; error: string }>;\n}\n\n/**\n * Main scrape result interface\n */\nexport interface ScrapeResult {\n  /** Array of individual website results */\n  data: WebsiteScrapeResult[];\n\n  /** Metadata about the batch operation */\n  batchMetadata: BatchMetadata;\n}\n\n/**\n * Internal crawler state\n */\nexport interface CrawlerState {\n  /** Set of visited URLs to avoid duplicates */\n  visited: Set<string>;\n\n  /** Queue of URLs to process */\n  queue: Array<{ url: string; depth: number }>;\n\n  /** Completed pages */\n  pages: Page[];\n}\n\n/**\n * Internal scraper configuration\n */\nexport interface ScraperConfig {\n  /** Merged options with defaults */\n  options: Required<ScrapeOptions>;\n\n  /** Parsed base URL */\n  baseUrl: URL;\n\n  /** Base domain for same-origin checking */\n  baseDomain: string;\n}\n\n/**\n * Default scrape options\n */\nexport const DEFAULT_OPTIONS: Omit<\n  Required<ScrapeOptions>,\n  | \"proxy\"\n  | \"proxyTier\"\n  | \"waitForSelector\"\n  | \"connectionToCore\"\n  | \"userAgent\"\n  | \"browserPool\"\n  | \"pool\"\n  | \"tieredPool\"\n  | \"proxyGate\"\n  | \"healthTracker\"\n  | \"resolveProxy\"\n  | \"navigationSelectors\"\n  | \"hardDeadlineMs\"\n  | \"datacenterTimeoutMs\"\n  | \"domainProfiles\"\n  | \"blockDetection\"\n  | \"urlRewriters\"\n> & {\n  proxy?: ProxyConfig;\n  proxyTier?: ProxyTier;\n  waitForSelector?: string;\n  connectionToCore?: any;\n  userAgent?: string;\n  browserPool?: BrowserPoolConfig;\n  pool?: IBrowserPool;\n  tieredPool?: unknown;\n  proxyGate?: unknown;\n  healthTracker?: unknown;\n  resolveProxy?: (tier: ProxyTier | undefined) => ProxyConfig | undefined;\n  navigationSelectors?: string[];\n  hardDeadlineMs?: number;\n  datacenterTimeoutMs?: number;\n  domainProfiles?: Record<string, import(\"./config/domain-profiles.js\").DomainProfile>;\n  blockDetection?: ScrapeOptions[\"blockDetection\"];\n  urlRewriters?: ScrapeOptions[\"urlRewriters\"];\n} = {\n  urls: [],\n  formats: [\"markdown\"],\n  timeoutMs: 30000,\n  // Content cleaning defaults\n  removeAds: true,\n  removeBase64Images: true,\n  onlyMainContent: true,\n  includeTags: [],\n  excludeTags: [],\n  // Batch defaults\n  batchConcurrency: 5,\n  batchTimeoutMs: 300000,\n  onProgress: () => {}, // Default no-op progress callback\n  // Hero-specific defaults\n  verbose: false,\n  showChrome: false,\n};\n\n/**\n * Format type guard\n */\nexport function isValidFormat(format: string): format is \"markdown\" | \"html\" {\n  return format === \"markdown\" || format === \"html\";\n}\n\n/**\n * Check if a URL should be crawled based on base domain\n */\nexport function shouldCrawlUrl(url: URL, baseDomain: string): boolean {\n  return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);\n}\n"
  },
  {
    "path": "src/utils/block-detector.ts",
    "content": "/**\n * Block Detector\n *\n * Detects bot-block pages that return HTTP 200 but contain\n * anti-bot content instead of actual page content.\n *\n * Reader ships with NO built-in patterns. The caller provides\n * block detection config via ScrapeOptions.blockDetection.\n * Without config, no content-based block detection runs.\n */\n\n/**\n * Block detection configuration — provided by the caller.\n *\n * Patterns can be RegExp objects (in-process usage) or strings\n * (serialized over HTTP/JSON — compiled to RegExp internally).\n */\nexport interface BlockDetectionConfig {\n  /** Regex patterns matched against page text content (RegExp or string) */\n  patterns?: Array<RegExp | string>;\n  /** Regex patterns matched against page title (RegExp or string) */\n  titlePatterns?: Array<RegExp | string>;\n  /** Pages shorter than this (chars) with any signal = blocked (default: 500) */\n  shortContentThreshold?: number;\n  /** Longer pages need this many signals to be blocked (default: 3) */\n  longContentSignalThreshold?: number;\n}\n\n/** Compile a pattern (string or RegExp) into a RegExp */\nfunction toRegExp(p: RegExp | string): RegExp {\n  return typeof p === \"string\" ? new RegExp(p, \"i\") : p;\n}\n\n/**\n * Detect if an HTML page is a bot-block/challenge page.\n *\n * Returns false if no config is provided (unopinionated default).\n */\nexport function detectBotPage(html: string, config?: BlockDetectionConfig): boolean {\n  if (!html || html.trim().length === 0) return false;\n  if (!config?.patterns || config.patterns.length === 0) return false;\n\n  const text = stripTags(html);\n  const shortThreshold = config.shortContentThreshold ?? 500;\n  const longThreshold = config.longContentSignalThreshold ?? 3;\n\n  const signalCount = config.patterns.filter((p) => toRegExp(p).test(text)).length;\n\n  if (text.length < shortThreshold && signalCount >= 1) return true;\n  if (signalCount >= longThreshold) return true;\n\n  return false;\n}\n\n/**\n * Detect if a page title indicates a block page.\n *\n * Returns false if no config is provided.\n */\nexport function detectBotTitle(title: string, config?: BlockDetectionConfig): boolean {\n  if (!title) return false;\n  if (!config?.titlePatterns || config.titlePatterns.length === 0) return false;\n  return config.titlePatterns.some((p) => toRegExp(p).test(title));\n}\n\n/**\n * Check if an HTTP response looks like a blocked response.\n *\n * HTTP-level blocks (401/403/429/503) are always detected.\n * Content-based detection (200 + bot page) only runs when\n * block detection config is provided.\n */\nexport function isBlockedResponse(\n  statusCode: number,\n  html?: string,\n  config?: BlockDetectionConfig\n): { blocked: boolean; reason?: string } {\n  // HTTP-level blocks — always detected\n  if (statusCode === 401) return { blocked: true, reason: \"unauthorized\" };\n  if (statusCode === 403) return { blocked: true, reason: \"forbidden\" };\n  if (statusCode === 429) return { blocked: true, reason: \"rate_limited\" };\n  if (statusCode === 503) return { blocked: true, reason: \"service_unavailable\" };\n\n  // Content-based detection — only if config provided\n  if (statusCode >= 200 && statusCode < 300 && html && config) {\n    if (detectBotPage(html, config)) {\n      return { blocked: true, reason: \"bot_page_detected\" };\n    }\n  }\n\n  return { blocked: false };\n}\n\n/**\n * Strip HTML tags from content for text analysis\n */\nfunction stripTags(html: string): string {\n  return html\n    .replace(/<script[^>]*>[\\s\\S]*?<\\/script>/gi, \"\")\n    .replace(/<style[^>]*>[\\s\\S]*?<\\/style>/gi, \"\")\n    .replace(/<[^>]*>/g, \" \")\n    .replace(/\\s+/g, \" \")\n    .trim();\n}\n"
  },
  {
    "path": "src/utils/content-cleaner.ts",
    "content": "import { parseHTML } from \"linkedom\";\n\n/**\n * HTML content cleaning — minimal approach.\n *\n * Philosophy: strip only what is CERTAINLY not content, then let\n * supermarkdown handle the rest. Aggressive pre-cleaning with wildcard\n * selectors and heuristic scoring causes more damage than it prevents\n * (e.g. [class*=\"dialog\"] nuked Wikipedia's entire <body>). The markdown\n * converter is the real filter.\n *\n * Pipeline:\n *   1. Remove script, style, noscript, meta, head (always)\n *   2. Remove user-provided excludeTags\n *   3. If onlyMainContent: remove nav/header/footer/sidebar (exact selectors)\n *   4. If includeTags: whitelist mode — keep only matching elements\n *   5. Remove base64 images (if enabled)\n *   6. Resolve srcset to pick largest image\n *   7. Absolutify relative URLs\n */\n\n/**\n * Content cleaning options\n */\nexport interface CleaningOptions {\n  /** Remove ads and tracking elements (default: true) */\n  removeAds?: boolean;\n  /** Remove base64-encoded images (default: true) */\n  removeBase64Images?: boolean;\n  /** Extract only main content, removing nav/header/footer/sidebar (default: true) */\n  onlyMainContent?: boolean;\n  /** CSS selectors for elements to include (if set, only these elements are kept) */\n  includeTags?: string[];\n  /** CSS selectors for elements to exclude (removed from output) */\n  excludeTags?: string[];\n  /** Additional CSS selectors to remove when onlyMainContent is true. Merged with built-in selectors. */\n  navigationSelectors?: string[];\n}\n\n/**\n * Elements that are NEVER content. Safe to remove unconditionally.\n */\nconst ALWAYS_REMOVE_SELECTORS = [\"script\", \"style\", \"noscript\", \"meta\", \"head\"];\n\n/**\n * Navigation/boilerplate selectors — applied only when onlyMainContent\n * is true. Exact class/ID matches only, NO wildcards like [class*=\"...\"]\n * which risk matching legitimate content containers.\n */\nconst NAVIGATION_SELECTORS = [\n  // Semantic elements\n  \"header\",\n  \"footer\",\n  \"nav\",\n  \"aside\",\n\n  // Header variations\n  \".header\",\n  \".top\",\n  \".navbar\",\n  \"#header\",\n\n  // Footer variations\n  \".footer\",\n  \".bottom\",\n  \"#footer\",\n\n  // Sidebars\n  \".sidebar\",\n  \".side\",\n  \".aside\",\n  \"#sidebar\",\n\n  // Modals/popups (exact class only)\n  \".modal\",\n  \".popup\",\n  \"#modal\",\n  \".overlay\",\n\n  // Ads\n  \".ad\",\n  \".ads\",\n  \".advert\",\n  \"#ad\",\n\n  // Language selectors\n  \".lang-selector\",\n  \".language\",\n  \"#language-selector\",\n\n  // Social\n  \".social\",\n  \".social-media\",\n  \".social-links\",\n  \"#social\",\n\n  // Navigation/menus\n  \".menu\",\n  \".navigation\",\n  \"#nav\",\n\n  // Breadcrumbs\n  \".breadcrumbs\",\n  \"#breadcrumbs\",\n\n  // Share buttons\n  \".share\",\n  \"#share\",\n\n  // Widgets\n  \".widget\",\n  \"#widget\",\n\n  // Cookie notices\n  \".cookie\",\n  \"#cookie\",\n];\n\n/**\n * Elements containing these selectors are PROTECTED from nav removal.\n * Prevents stripping a <header> or <nav> that wraps the actual content\n * on sites with non-standard layouts (e.g. Wikipedia's #content lives\n * inside a structure that could match nav selectors on some themes).\n */\nconst FORCE_INCLUDE_SELECTORS = [\n  \"#main\",\n  \"#content\",\n  \"#main-content\",\n  \"#mw-content-text\",\n  \"#bodyContent\",\n  \"main\",\n  \"article\",\n  \"[role='main']\",\n  \"[data-page-content]\",\n];\n\n// ============================================================================\n// Removal Functions\n// ============================================================================\n\n/**\n * Simple removal — no protection checks.\n */\nfunction removeElements(document: Document, selectors: string[]): void {\n  for (const selector of selectors) {\n    try {\n      document.querySelectorAll(selector).forEach((el: Element) => el.remove());\n    } catch {\n      // Some selectors may not be supported by linkedom, skip them\n    }\n  }\n}\n\n/**\n * Remove elements WITH PROTECTION — checks each element before removing.\n * If an element IS or CONTAINS a protected selector, skip it.\n */\nfunction removeWithProtection(\n  document: Document,\n  selectorsToRemove: string[],\n  protectedSelectors: string[]\n): void {\n  for (const selector of selectorsToRemove) {\n    try {\n      document.querySelectorAll(selector).forEach((element: Element) => {\n        // Is this element itself protected?\n        const isProtected = protectedSelectors.some((ps) => {\n          try {\n            return element.matches(ps);\n          } catch {\n            return false;\n          }\n        });\n        if (isProtected) return;\n\n        // Does it CONTAIN protected content?\n        const containsProtected = protectedSelectors.some((ps) => {\n          try {\n            return element.querySelector(ps) !== null;\n          } catch {\n            return false;\n          }\n        });\n        if (containsProtected) return;\n\n        element.remove();\n      });\n    } catch {\n      // Skip invalid selector\n    }\n  }\n}\n\n// ============================================================================\n// Main Cleaning Function\n// ============================================================================\n\n/**\n * Clean HTML content with minimal, safe transformations.\n */\nexport function cleanHtml(html: string, baseUrl: string, options: CleaningOptions = {}): string {\n  const { removeBase64Images = true, onlyMainContent = true, includeTags, excludeTags } = options;\n\n  const { document } = parseHTML(html);\n\n  // Step 1: Always remove elements that are never content\n  removeElements(document, ALWAYS_REMOVE_SELECTORS);\n\n  // Step 2: Apply user-provided excludeTags\n  if (excludeTags && excludeTags.length > 0) {\n    removeElements(document, excludeTags);\n  }\n\n  // Step 3: Remove navigation/boilerplate (only when onlyMainContent is on)\n  if (onlyMainContent) {\n    const navSelectors = options.navigationSelectors\n      ? [...NAVIGATION_SELECTORS, ...options.navigationSelectors]\n      : NAVIGATION_SELECTORS;\n    removeWithProtection(document, navSelectors, FORCE_INCLUDE_SELECTORS);\n  }\n\n  // Step 4: Apply user-provided includeTags (whitelist mode)\n  if (includeTags && includeTags.length > 0) {\n    const matchedElements: Element[] = [];\n    for (const selector of includeTags) {\n      try {\n        document.querySelectorAll(selector).forEach((el: Element) => {\n          matchedElements.push(el.cloneNode(true) as Element);\n        });\n      } catch {\n        // Invalid selector, skip\n      }\n    }\n    if (matchedElements.length > 0) {\n      const body = document.body;\n      if (body) {\n        body.innerHTML = \"\";\n        matchedElements.forEach((el) => body.appendChild(el));\n      }\n    }\n  }\n\n  // Step 5: Remove base64 images\n  if (removeBase64Images) {\n    removeBase64ImagesFromDocument(document);\n  }\n\n  // Step 6: Remove HTML comments\n  const walker = document.createTreeWalker(document, 128 /* NodeFilter.SHOW_COMMENT */);\n  const comments: Node[] = [];\n  while (walker.nextNode()) {\n    comments.push(walker.currentNode);\n  }\n  comments.forEach((comment) => comment.parentNode?.removeChild(comment));\n\n  // Step 7: Resolve srcset to pick the largest image\n  resolveSrcsets(document);\n\n  // Step 8: Convert relative URLs to absolute\n  convertRelativeUrls(document, baseUrl);\n\n  return document.documentElement?.outerHTML || html;\n}\n\n// ============================================================================\n// Helper Functions\n// ============================================================================\n\n/**\n * Remove base64-encoded images from the document\n */\nfunction removeBase64ImagesFromDocument(document: Document): void {\n  document.querySelectorAll(\"img[src^='data:']\").forEach((el: Element) => {\n    el.remove();\n  });\n\n  document.querySelectorAll(\"[style*='data:image']\").forEach((el: Element) => {\n    const style = el.getAttribute(\"style\");\n    if (style) {\n      const cleanedStyle = style.replace(\n        /background(-image)?:\\s*url\\([^)]*data:image[^)]*\\)[^;]*;?/gi,\n        \"\"\n      );\n      if (cleanedStyle.trim()) {\n        el.setAttribute(\"style\", cleanedStyle);\n      } else {\n        el.removeAttribute(\"style\");\n      }\n    }\n  });\n\n  document\n    .querySelectorAll(\"source[src^='data:'], source[srcset*='data:']\")\n    .forEach((el: Element) => {\n      el.remove();\n    });\n}\n\n/**\n * Resolve srcset attributes to pick the largest image.\n */\nfunction resolveSrcsets(document: Document): void {\n  document.querySelectorAll(\"img[srcset]\").forEach((el: Element) => {\n    const srcset = el.getAttribute(\"srcset\");\n    if (!srcset) return;\n\n    const candidates = srcset\n      .split(\",\")\n      .map((entry) => {\n        const trimmed = entry.trim();\n        const parts = trimmed.split(/\\s+/);\n        const url = parts[0];\n        const descriptor = parts[1] || \"1x\";\n        let weight = 0;\n        if (descriptor.endsWith(\"w\")) {\n          weight = parseInt(descriptor.slice(0, -1), 10) || 0;\n        } else if (descriptor.endsWith(\"x\")) {\n          weight = (parseFloat(descriptor.slice(0, -1)) || 1) * 100;\n        }\n        return { url, weight };\n      })\n      .filter((c) => c.url)\n      .sort((a, b) => b.weight - a.weight);\n\n    if (candidates.length > 0) {\n      el.setAttribute(\"src\", candidates[0].url);\n    }\n  });\n}\n\n/**\n * Convert relative URLs to absolute URLs\n */\nfunction convertRelativeUrls(document: Document, baseUrl: string): void {\n  document.querySelectorAll(\"[src]\").forEach((el: Element) => {\n    const src = el.getAttribute(\"src\");\n    if (src && !src.startsWith(\"http\") && !src.startsWith(\"//\") && !src.startsWith(\"data:\")) {\n      try {\n        el.setAttribute(\"src\", new URL(src, baseUrl).toString());\n      } catch {\n        /* Invalid URL, leave as-is */\n      }\n    }\n  });\n\n  document.querySelectorAll(\"[href]\").forEach((el: Element) => {\n    const href = el.getAttribute(\"href\");\n    if (\n      href &&\n      !href.startsWith(\"http\") &&\n      !href.startsWith(\"//\") &&\n      !href.startsWith(\"#\") &&\n      !href.startsWith(\"mailto:\") &&\n      !href.startsWith(\"tel:\") &&\n      !href.startsWith(\"javascript:\")\n    ) {\n      try {\n        el.setAttribute(\"href\", new URL(href, baseUrl).toString());\n      } catch {\n        /* Invalid URL, leave as-is */\n      }\n    }\n  });\n}\n\n/**\n * Main export\n */\nexport function cleanContent(html: string, baseUrl: string, options: CleaningOptions = {}): string {\n  return cleanHtml(html, baseUrl, options);\n}\n"
  },
  {
    "path": "src/utils/logger.ts",
    "content": "import pino from \"pino\";\n\n/**\n * Logger type\n */\nexport type Logger = ReturnType<typeof createLogger>;\n\n/**\n * Check if pino-pretty is available\n */\nfunction hasPinoPretty(): boolean {\n  try {\n    require.resolve(\"pino-pretty\");\n    return true;\n  } catch {\n    return false;\n  }\n}\n\n/**\n * Create a logger instance\n *\n * @param name - Logger name\n * @param level - Log level (default: from env or 'info')\n * @returns Pino logger instance\n */\nexport function createLogger(\n  name: string = \"reader\",\n  level: string = process.env.LOG_LEVEL || \"info\"\n) {\n  const usePretty = process.env.NODE_ENV !== \"production\" && hasPinoPretty();\n\n  return pino({\n    name,\n    level,\n    redact: [\n      \"req.headers.authorization\",\n      \"req.headers.cookie\",\n      \"*.password\",\n      \"*.token\",\n      \"*.apiKey\",\n      \"*.secret\",\n    ],\n    transport: usePretty\n      ? {\n          target: \"pino-pretty\",\n          options: {\n            colorize: true,\n            translateTime: \"SYS:standard\",\n            ignore: \"pid,hostname\",\n          },\n        }\n      : undefined,\n  });\n}\n\n/**\n * Default logger instance\n */\nexport const logger = createLogger();\n"
  },
  {
    "path": "src/utils/metadata-extractor.ts",
    "content": "import { parseHTML } from \"linkedom\";\nimport type { WebsiteMetadata } from \"../types\";\nimport { normalizeUrl } from \"./url-helpers\";\n\n/**\n * Extract comprehensive website metadata from HTML content\n * Uses proper DOM parsing for reliable attribute extraction\n */\nexport function extractMetadata(html: string, baseUrl: string): WebsiteMetadata {\n  return extractWebsiteMetadata(html, baseUrl);\n}\n\n/**\n * Extract comprehensive website metadata from HTML content\n */\nexport function extractWebsiteMetadata(html: string, baseUrl: string): WebsiteMetadata {\n  const { document } = parseHTML(html);\n\n  const metadata: WebsiteMetadata = {\n    title: null,\n    description: null,\n    author: null,\n    language: null,\n    charset: null,\n    favicon: null,\n    canonical: null,\n    image: null,\n    keywords: null,\n    robots: null,\n    themeColor: null,\n    openGraph: null,\n    twitter: null,\n  };\n\n  // Extract basic meta tags\n  metadata.title = extractTitle(document);\n  metadata.description = extractMetaContent(document, \"description\");\n  metadata.author = extractMetaContent(document, \"author\");\n  metadata.language = extractLanguage(document);\n  metadata.charset = extractCharset(document);\n\n  // Extract links\n  metadata.favicon = extractFavicon(document, baseUrl);\n  metadata.canonical = extractCanonical(document, baseUrl);\n  metadata.image =\n    extractMetaContent(document, \"og:image\") || extractMetaContent(document, \"twitter:image\");\n\n  // Extract SEO metadata\n  metadata.keywords = extractKeywords(document);\n  metadata.robots = extractMetaContent(document, \"robots\");\n  metadata.themeColor = extractMetaContent(document, \"theme-color\");\n\n  // Extract Open Graph metadata\n  metadata.openGraph = extractOpenGraph(document);\n\n  // Extract Twitter Card metadata\n  metadata.twitter = extractTwitterCard(document);\n\n  return metadata;\n}\n\n/**\n * Extract page title from HTML\n */\nfunction extractTitle(document: Document): string | null {\n  // Try <title> tag first\n  const titleElement = document.querySelector(\"title\");\n  if (titleElement?.textContent) {\n    return titleElement.textContent.trim();\n  }\n\n  // Fallback to og:title\n  return extractMetaContent(document, \"og:title\");\n}\n\n/**\n * Extract content from meta tag by name or property\n * Works regardless of attribute order\n */\nfunction extractMetaContent(document: Document, name: string): string | null {\n  // Try name attribute first\n  const byName = document.querySelector(`meta[name=\"${name}\"]`);\n  if (byName) {\n    const content = byName.getAttribute(\"content\");\n    if (content) return content.trim();\n  }\n\n  // Try property attribute (for Open Graph)\n  const byProperty = document.querySelector(`meta[property=\"${name}\"]`);\n  if (byProperty) {\n    const content = byProperty.getAttribute(\"content\");\n    if (content) return content.trim();\n  }\n\n  return null;\n}\n\n/**\n * Extract language from HTML tag\n */\nfunction extractLanguage(document: Document): string | null {\n  const lang = document.documentElement?.getAttribute(\"lang\");\n  return lang?.trim() || null;\n}\n\n/**\n * Extract character set from meta tag\n */\nfunction extractCharset(document: Document): string | null {\n  // Try <meta charset=\"...\">\n  const charsetMeta = document.querySelector(\"meta[charset]\");\n  if (charsetMeta) {\n    const charset = charsetMeta.getAttribute(\"charset\");\n    if (charset) return charset.trim();\n  }\n\n  // Try <meta http-equiv=\"Content-Type\" content=\"...charset=...\">\n  const httpEquivMeta = document.querySelector('meta[http-equiv=\"Content-Type\"]');\n  if (httpEquivMeta) {\n    const content = httpEquivMeta.getAttribute(\"content\");\n    if (content) {\n      const charsetMatch = content.match(/charset=([^\\s;]+)/i);\n      if (charsetMatch) return charsetMatch[1].trim();\n    }\n  }\n\n  return null;\n}\n\n/**\n * Extract favicon URL\n */\nfunction extractFavicon(document: Document, baseUrl: string): string | null {\n  // Try various icon link types\n  const iconSelectors = [\n    'link[rel=\"icon\"]',\n    'link[rel=\"shortcut icon\"]',\n    'link[rel=\"apple-touch-icon\"]',\n    'link[rel*=\"icon\"]',\n  ];\n\n  for (const selector of iconSelectors) {\n    const iconLink = document.querySelector(selector);\n    if (iconLink) {\n      const href = iconLink.getAttribute(\"href\");\n      if (href) {\n        return normalizeUrl(href, baseUrl);\n      }\n    }\n  }\n\n  // Fallback to /favicon.ico\n  try {\n    return normalizeUrl(\"/favicon.ico\", baseUrl);\n  } catch {\n    return null;\n  }\n}\n\n/**\n * Extract canonical URL\n */\nfunction extractCanonical(document: Document, baseUrl: string): string | null {\n  const canonicalLink = document.querySelector('link[rel=\"canonical\"]');\n  if (canonicalLink) {\n    const href = canonicalLink.getAttribute(\"href\");\n    if (href) {\n      return normalizeUrl(href, baseUrl);\n    }\n  }\n\n  return null;\n}\n\n/**\n * Extract keywords from meta tag\n */\nfunction extractKeywords(document: Document): string[] | null {\n  const keywordsContent = extractMetaContent(document, \"keywords\");\n  if (!keywordsContent) {\n    return null;\n  }\n\n  return keywordsContent\n    .split(\",\")\n    .map((keyword) => keyword.trim())\n    .filter((keyword) => keyword.length > 0);\n}\n\n/**\n * Extract Open Graph metadata\n */\nfunction extractOpenGraph(document: Document): WebsiteMetadata[\"openGraph\"] {\n  const openGraph: WebsiteMetadata[\"openGraph\"] = {\n    title: null,\n    description: null,\n    type: null,\n    url: null,\n    image: null,\n    siteName: null,\n    locale: null,\n  };\n\n  openGraph.title = extractMetaContent(document, \"og:title\");\n  openGraph.description = extractMetaContent(document, \"og:description\");\n  openGraph.type = extractMetaContent(document, \"og:type\");\n  openGraph.url = extractMetaContent(document, \"og:url\");\n  openGraph.image = extractMetaContent(document, \"og:image\");\n  openGraph.siteName = extractMetaContent(document, \"og:site_name\");\n  openGraph.locale = extractMetaContent(document, \"og:locale\");\n\n  // Return null if no Open Graph data found\n  if (Object.values(openGraph).every((value) => !value)) {\n    return null;\n  }\n\n  return openGraph;\n}\n\n/**\n * Extract Twitter Card metadata\n */\nfunction extractTwitterCard(document: Document): WebsiteMetadata[\"twitter\"] {\n  const twitter: WebsiteMetadata[\"twitter\"] = {\n    card: null,\n    site: null,\n    creator: null,\n    title: null,\n    description: null,\n    image: null,\n  };\n\n  twitter.card = extractMetaContent(document, \"twitter:card\");\n  twitter.site = extractMetaContent(document, \"twitter:site\");\n  twitter.creator = extractMetaContent(document, \"twitter:creator\");\n  twitter.title = extractMetaContent(document, \"twitter:title\");\n  twitter.description = extractMetaContent(document, \"twitter:description\");\n  twitter.image = extractMetaContent(document, \"twitter:image\");\n\n  // Return null if no Twitter Card data found\n  if (Object.values(twitter).every((value) => !value)) {\n    return null;\n  }\n\n  return twitter;\n}\n\n/**\n * Extract structured data (JSON-LD) from HTML\n */\nexport function extractStructuredData(html: string): unknown[] {\n  const { document } = parseHTML(html);\n  const structuredData: unknown[] = [];\n\n  document.querySelectorAll('script[type=\"application/ld+json\"]').forEach((script: Element) => {\n    try {\n      const jsonData = JSON.parse(script.textContent || \"\");\n      structuredData.push(jsonData);\n    } catch {\n      // Invalid JSON, skip\n    }\n  });\n\n  return structuredData;\n}\n\n/**\n * Extract microdata from HTML (basic implementation)\n */\nexport function extractMicrodata(_html: string): unknown[] {\n  const microdata: unknown[] = [];\n  // This is a simplified implementation\n  // In a real-world scenario, you'd want to use a proper microdata parser\n  return microdata;\n}\n\n/**\n * Get a summary of the website metadata for debugging\n */\nexport function getMetadataSummary(metadata: WebsiteMetadata): string {\n  const parts: string[] = [];\n\n  if (metadata.title) parts.push(`Title: ${metadata.title}`);\n  if (metadata.description) parts.push(`Description: ${metadata.description.substring(0, 100)}...`);\n  if (metadata.author) parts.push(`Author: ${metadata.author}`);\n  if (metadata.language) parts.push(`Language: ${metadata.language}`);\n  if (metadata.keywords) parts.push(`Keywords: ${metadata.keywords.length} found`);\n  if (metadata.openGraph)\n    parts.push(`Open Graph: ${Object.keys(metadata.openGraph).length} fields`);\n  if (metadata.twitter) parts.push(`Twitter Card: ${Object.keys(metadata.twitter).length} fields`);\n\n  return parts.join(\" | \") || \"No metadata found\";\n}\n"
  },
  {
    "path": "src/utils/rate-limiter.ts",
    "content": "import pLimit from \"p-limit\";\n\n/**\n * Simple rate limit function\n */\nexport async function rateLimit(ms: number): Promise<void> {\n  return new Promise((resolve) => setTimeout(resolve, ms));\n}\n\n/**\n * Rate limiter using p-limit to control concurrent requests\n */\nexport class RateLimiter {\n  private limit: ReturnType<typeof pLimit>;\n\n  constructor(requestsPerSecond: number) {\n    // Convert requests per second to concurrency limit\n    // For rate limiting, we use pLimit with a delay between requests\n    this.limit = pLimit(1);\n    this.requestsPerSecond = requestsPerSecond;\n  }\n\n  private requestsPerSecond: number;\n  private lastRequestTime = 0;\n\n  /**\n   * Execute a function with rate limiting\n   */\n  async execute<T>(fn: () => Promise<T>): Promise<T> {\n    return this.limit(async () => {\n      await this.waitForNextSlot();\n      return fn();\n    });\n  }\n\n  /**\n   * Wait for the next available time slot based on rate limit\n   */\n  private async waitForNextSlot(): Promise<void> {\n    const now = Date.now();\n    const timeSinceLastRequest = now - this.lastRequestTime;\n    const minInterval = 1000 / this.requestsPerSecond;\n\n    if (timeSinceLastRequest < minInterval) {\n      const delay = minInterval - timeSinceLastRequest;\n      await new Promise((resolve) => setTimeout(resolve, delay));\n    }\n\n    this.lastRequestTime = Date.now();\n  }\n\n  /**\n   * Execute multiple functions concurrently with rate limiting\n   */\n  async executeAll<T>(functions: Array<() => Promise<T>>): Promise<T[]> {\n    return Promise.all(functions.map((fn) => this.execute(fn)));\n  }\n}\n"
  },
  {
    "path": "src/utils/robots-parser.ts",
    "content": "/**\n * Simple robots.txt parser for crawler compliance\n */\n\nexport interface RobotsRules {\n  disallowedPaths: string[];\n  allowedPaths: string[];\n  crawlDelay: number | null;\n}\n\n/**\n * Parse robots.txt content and extract rules for a specific user agent\n */\nexport function parseRobotsTxt(content: string, userAgent: string = \"*\"): RobotsRules {\n  const rules: RobotsRules = {\n    disallowedPaths: [],\n    allowedPaths: [],\n    crawlDelay: null,\n  };\n\n  const lines = content.split(\"\\n\").map((line) => line.trim());\n  let currentUserAgent = \"\";\n  let matchesUserAgent = false;\n\n  for (const line of lines) {\n    // Skip empty lines and comments\n    if (!line || line.startsWith(\"#\")) {\n      continue;\n    }\n\n    const colonIndex = line.indexOf(\":\");\n    if (colonIndex === -1) {\n      continue;\n    }\n\n    const directive = line.substring(0, colonIndex).trim().toLowerCase();\n    const value = line.substring(colonIndex + 1).trim();\n\n    if (directive === \"user-agent\") {\n      currentUserAgent = value.toLowerCase();\n      // Match specific user agent or wildcard\n      matchesUserAgent = currentUserAgent === \"*\" || currentUserAgent === userAgent.toLowerCase();\n    } else if (matchesUserAgent) {\n      if (directive === \"disallow\" && value) {\n        rules.disallowedPaths.push(value);\n      } else if (directive === \"allow\" && value) {\n        rules.allowedPaths.push(value);\n      } else if (directive === \"crawl-delay\") {\n        const delay = parseFloat(value);\n        if (!isNaN(delay)) {\n          rules.crawlDelay = delay * 1000; // Convert to milliseconds\n        }\n      }\n    }\n  }\n\n  return rules;\n}\n\n/**\n * Check if a URL path is allowed by robots.txt rules\n */\nexport function isPathAllowed(path: string, rules: RobotsRules): boolean {\n  // Normalize path\n  const normalizedPath = path.startsWith(\"/\") ? path : \"/\" + path;\n\n  // Check allow rules first (they take precedence)\n  for (const allowedPath of rules.allowedPaths) {\n    if (pathMatches(normalizedPath, allowedPath)) {\n      return true;\n    }\n  }\n\n  // Check disallow rules\n  for (const disallowedPath of rules.disallowedPaths) {\n    if (pathMatches(normalizedPath, disallowedPath)) {\n      return false;\n    }\n  }\n\n  // Default: allowed\n  return true;\n}\n\n/**\n * Check if a path matches a robots.txt pattern\n * Supports * (wildcard) and $ (end anchor)\n */\nfunction pathMatches(path: string, pattern: string): boolean {\n  // Empty pattern matches nothing\n  if (!pattern) {\n    return false;\n  }\n\n  // Convert robots.txt pattern to regex\n  let regexPattern = pattern\n    .replace(/[.+?^${}()|[\\]\\\\]/g, \"\\\\$&\") // Escape regex special chars except * and $\n    .replace(/\\*/g, \".*\"); // * becomes .*\n\n  // Handle $ end anchor\n  if (regexPattern.endsWith(\"\\\\$\")) {\n    regexPattern = regexPattern.slice(0, -2) + \"$\";\n  } else {\n    regexPattern = \"^\" + regexPattern;\n  }\n\n  try {\n    const regex = new RegExp(regexPattern);\n    return regex.test(path);\n  } catch {\n    // Invalid pattern, treat as literal prefix match\n    return path.startsWith(pattern);\n  }\n}\n\n/**\n * Fetch and parse robots.txt for a given base URL\n */\nexport async function fetchRobotsTxt(baseUrl: string): Promise<RobotsRules | null> {\n  try {\n    const url = new URL(\"/robots.txt\", baseUrl);\n    const response = await fetch(url.toString(), {\n      headers: {\n        \"User-Agent\": \"ReaderEngine/1.0\",\n      },\n    });\n\n    if (!response.ok) {\n      // No robots.txt or error - allow everything\n      return null;\n    }\n\n    const content = await response.text();\n    return parseRobotsTxt(content, \"ReaderEngine\");\n  } catch {\n    // Network error or invalid URL - allow everything\n    return null;\n  }\n}\n\n/**\n * Check if a URL is allowed by robots.txt\n */\nexport function isUrlAllowed(url: string, rules: RobotsRules | null): boolean {\n  if (!rules) {\n    return true;\n  }\n\n  try {\n    const parsedUrl = new URL(url);\n    return isPathAllowed(parsedUrl.pathname + parsedUrl.search, rules);\n  } catch {\n    return true;\n  }\n}\n"
  },
  {
    "path": "src/utils/url-helpers.ts",
    "content": "import { URL } from \"url\";\nimport RE2 from \"re2\";\n\n/**\n * URL validation and normalization utilities\n */\n\n/**\n * Resolve a relative URL against a base URL\n */\nexport function resolveUrl(relative: string, base: string): string {\n  try {\n    return new URL(relative, base).toString();\n  } catch {\n    return relative;\n  }\n}\n\n/**\n * Validate if a string is a valid URL\n */\nexport function isValidUrl(string: string): boolean {\n  try {\n    new URL(string);\n    return true;\n  } catch {\n    return false;\n  }\n}\n\n/**\n * Normalize a URL by removing fragments and ensuring proper format\n */\nexport function normalizeUrl(url: string, baseUrl?: string): string {\n  try {\n    let parsedUrl: URL;\n\n    if (url.startsWith(\"http://\") || url.startsWith(\"https://\")) {\n      parsedUrl = new URL(url);\n    } else if (baseUrl) {\n      parsedUrl = new URL(url, baseUrl);\n    } else {\n      throw new Error(\"Relative URL requires base URL\");\n    }\n\n    // Remove fragment and search params for consistency\n    parsedUrl.hash = \"\";\n\n    return parsedUrl.toString();\n  } catch {\n    throw new Error(`Invalid URL: ${url}`);\n  }\n}\n\n/**\n * Extract base domain from a URL\n */\nexport function extractBaseDomain(url: string): string {\n  try {\n    const parsedUrl = new URL(url);\n    return parsedUrl.hostname;\n  } catch {\n    throw new Error(`Invalid URL for domain extraction: ${url}`);\n  }\n}\n\n/**\n * Check if a URL belongs to the same domain as the base URL.\n *\n * Strict hostname match — `dashboard.stripe.com` does NOT match\n * `docs.stripe.com`. The only normalization is stripping `www.`.\n * Crawlers should stay on the exact hostname they were seeded with.\n */\nexport function isSameDomain(url: string, baseUrl: string): boolean {\n  try {\n    const urlHost = extractBaseDomain(url).replace(/^www\\./, \"\");\n    const baseHost = extractBaseDomain(baseUrl).replace(/^www\\./, \"\");\n\n    return urlHost === baseHost;\n  } catch {\n    return false;\n  }\n}\n\n/**\n * Generate a URL key for deduplication\n * Normalizes:\n * - Removes fragments (hash)\n * - Removes search params\n * - Removes trailing slashes (except root)\n * - Lowercases\n * - Normalizes www vs non-www\n * - Removes default ports (80 for http, 443 for https)\n * - Normalizes index files (index.html, index.htm, default.html)\n */\nexport function getUrlKey(url: string): string {\n  try {\n    const parsedUrl = new URL(url);\n\n    // Remove hash fragments\n    parsedUrl.hash = \"\";\n\n    // Remove search params for consistency\n    parsedUrl.search = \"\";\n\n    // Normalize www vs non-www (remove www. prefix for deduplication)\n    if (parsedUrl.hostname.startsWith(\"www.\")) {\n      parsedUrl.hostname = parsedUrl.hostname.slice(4);\n    }\n\n    // Remove default ports (80 for http, 443 for https)\n    if (\n      (parsedUrl.protocol === \"http:\" && parsedUrl.port === \"80\") ||\n      (parsedUrl.protocol === \"https:\" && parsedUrl.port === \"443\")\n    ) {\n      parsedUrl.port = \"\";\n    }\n\n    // Normalize index files (treat /path/index.html as /path/)\n    const indexFiles = [\"index.html\", \"index.htm\", \"default.html\", \"default.htm\", \"index.php\"];\n    for (const indexFile of indexFiles) {\n      if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {\n        parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);\n        break;\n      }\n    }\n\n    // Normalize trailing slashes (keep for root path only)\n    let normalized = parsedUrl.toString().toLowerCase();\n    if (normalized.endsWith(\"/\") && parsedUrl.pathname !== \"/\") {\n      normalized = normalized.slice(0, -1);\n    }\n\n    return normalized;\n  } catch {\n    return url.toLowerCase();\n  }\n}\n\n/**\n * Validate an array of URLs and return validation results\n */\nexport function validateUrls(urls: string[]): {\n  isValid: boolean;\n  validUrls: string[];\n  errors: Array<{ url: string; error: string }>;\n} {\n  const validUrls: string[] = [];\n  const errors: Array<{ url: string; error: string }> = [];\n\n  if (!urls || urls.length === 0) {\n    return {\n      isValid: false,\n      validUrls: [],\n      errors: [{ url: \"\", error: \"At least one URL is required\" }],\n    };\n  }\n\n  for (const url of urls) {\n    if (!url || typeof url !== \"string\") {\n      errors.push({\n        url: String(url),\n        error: \"URL must be a non-empty string\",\n      });\n      continue;\n    }\n\n    const trimmedUrl = url.trim();\n    if (trimmedUrl === \"\") {\n      errors.push({ url: String(url), error: \"URL cannot be empty\" });\n      continue;\n    }\n\n    if (!isValidUrl(trimmedUrl)) {\n      errors.push({ url: trimmedUrl, error: \"Invalid URL format\" });\n      continue;\n    }\n\n    if (!trimmedUrl.startsWith(\"http://\") && !trimmedUrl.startsWith(\"https://\")) {\n      errors.push({\n        url: trimmedUrl,\n        error: \"URL must start with http:// or https://\",\n      });\n      continue;\n    }\n\n    validUrls.push(trimmedUrl);\n  }\n\n  // Remove duplicates while preserving order\n  const uniqueValidUrls = Array.from(new Set(validUrls));\n\n  return {\n    isValid: uniqueValidUrls.length > 0 && errors.length === 0,\n    validUrls: uniqueValidUrls,\n    errors,\n  };\n}\n\n/**\n * Check if a URL matches any of the given regex patterns\n *\n * Uses Google's RE2 engine which guarantees linear time execution,\n * preventing ReDoS attacks from malicious or pathological patterns.\n */\nexport function matchesPatterns(url: string, patterns: string[]): boolean {\n  if (!patterns || patterns.length === 0) {\n    return false;\n  }\n\n  return patterns.some((pattern) => {\n    try {\n      const regex = new RE2(pattern, \"i\");\n      return regex.test(url);\n    } catch {\n      // Invalid regex pattern or unsupported RE2 syntax, skip it\n      return false;\n    }\n  });\n}\n\n/**\n * Check if a URL should be included based on include/exclude patterns\n * - If includePatterns is set, URL must match at least one\n * - If excludePatterns is set, URL must not match any\n */\nexport function shouldIncludeUrl(\n  url: string,\n  includePatterns?: string[],\n  excludePatterns?: string[]\n): boolean {\n  // If include patterns are specified, URL must match at least one\n  if (includePatterns && includePatterns.length > 0) {\n    if (!matchesPatterns(url, includePatterns)) {\n      return false;\n    }\n  }\n\n  // If exclude patterns are specified, URL must not match any\n  if (excludePatterns && excludePatterns.length > 0) {\n    if (matchesPatterns(url, excludePatterns)) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\n/**\n * Check if a URL is likely a content page (not legal, policy, or utility page)\n * Used by crawler to filter out non-content pages\n */\nexport function isContentUrl(url: string): boolean {\n  const lowerUrl = url.toLowerCase();\n\n  // Skip legal and policy pages\n  const nonContentPatterns = [\n    // Legal and policy pages\n    /\\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\\b/i,\n    /\\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\\b/i,\n    /\\/(cookie-policy|data-protection|acceptable-use|user-agreement)\\b/i,\n    /\\/(refund|cancellation|shipping|return)-?(policy)?\\b/i,\n    // Contact and support pages (usually not main content)\n    /\\/(contact|support|help|faq|feedback)\\/?$/i,\n    // About pages that are typically boilerplate\n    /\\/(about-us|careers|jobs|press|investors|team)\\/?$/i,\n    // Authentication and admin areas\n    /\\/(admin|login|auth|account|dashboard|profile|settings)\\//i,\n    // E-commerce utility pages\n    /\\/(cart|checkout|payment|subscription|wishlist)\\//i,\n    // File downloads and assets\n    /\\/(uploads|assets|files|static|media|resources)\\//i,\n    // API endpoints\n    /\\/(api|graphql|rest|webhook)\\//i,\n  ];\n\n  if (nonContentPatterns.some((pattern) => pattern.test(lowerUrl))) {\n    return false;\n  }\n\n  // Skip common non-content file extensions\n  const skipExtensions = [\".pdf\", \".doc\", \".docx\", \".xls\", \".xlsx\", \".zip\", \".exe\"];\n  if (skipExtensions.some((ext) => lowerUrl.endsWith(ext))) {\n    return false;\n  }\n\n  return true;\n}\n\n/**\n * Check if a URL should be crawled based on various criteria\n */\nexport function shouldCrawlUrl(\n  url: string,\n  baseUrl: string,\n  maxDepth: number,\n  currentDepth: number,\n  visited: Set<string>\n): boolean {\n  // Check depth limit - FIXED: use > instead of >=\n  if (currentDepth > maxDepth) {\n    return false;\n  }\n\n  // Check if already visited\n  const urlKey = getUrlKey(url);\n  if (visited.has(urlKey)) {\n    return false;\n  }\n\n  // Check if same domain\n  if (!isSameDomain(url, baseUrl)) {\n    return false;\n  }\n\n  // Enhanced filtering for non-content files and patterns\n  const lowerUrl = url.toLowerCase();\n\n  // Skip common non-content file extensions\n  const skipExtensions = [\n    \".pdf\",\n    \".doc\",\n    \".docx\",\n    \".xls\",\n    \".xlsx\",\n    \".ppt\",\n    \".pptx\",\n    \".zip\",\n    \".rar\",\n    \".tar\",\n    \".gz\",\n    \".exe\",\n    \".dmg\",\n    \".pkg\",\n    \".deb\",\n    \".rpm\",\n    \".apk\",\n    \".ipa\",\n    // Image files\n    \".jpg\",\n    \".jpeg\",\n    \".png\",\n    \".gif\",\n    \".bmp\",\n    \".svg\",\n    \".webp\",\n    \".ico\",\n    \".favicon\",\n    // Video files\n    \".mp4\",\n    \".avi\",\n    \".mov\",\n    \".wmv\",\n    \".flv\",\n    \".webm\",\n    // Audio files\n    \".mp3\",\n    \".wav\",\n    \".ogg\",\n    \".m4a\",\n    \".aac\",\n    // Font files\n    \".woff\",\n    \".woff2\",\n    \".ttf\",\n    \".otf\",\n    \".eot\",\n    // Style and script files\n    \".css\",\n    \".js\",\n    \".mjs\",\n    \".ts\",\n    \".jsx\",\n    \".tsx\",\n    // Data and config files\n    \".json\",\n    \".xml\",\n    \".txt\",\n    \".md\",\n    \".rss\",\n    \".atom\",\n    \".sitemap\",\n    \".robots\",\n    \".webmanifest\",\n    // Archive files\n    \".zip\",\n    \".tar\",\n    \".gz\",\n    \".bz2\",\n    \".7z\",\n  ];\n\n  if (skipExtensions.some((ext) => lowerUrl.includes(ext))) {\n    return false;\n  }\n\n  // Skip common non-content URL patterns\n  const skipPatterns = [\n    // File downloads and assets\n    /\\/(uploads|assets|files|static|media|resources)\\//i,\n    // Authentication and admin areas\n    /\\/(admin|login|auth|account|dashboard|profile|settings)\\//i,\n    // API endpoints\n    /\\/(api|graphql|rest|ws:|webhook)\\//i,\n    // Common tracking and analytics\n    /\\/(analytics|tracking|pixel|beacon|ads)\\//i,\n    // Development and testing areas\n    /\\/(test|dev|staging|beta|demo)\\//i,\n    // Common utility and service pages\n    /\\/(search|cart|checkout|payment|subscription)\\//i,\n    // Social media and external services\n    /\\/(facebook|twitter|instagram|youtube|linkedin|github)\\//i,\n    // Legal and policy pages\n    /\\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\\b/i,\n    /\\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\\b/i,\n    /\\/(cookie-policy|data-protection|acceptable-use|user-agreement)\\b/i,\n    /\\/(refund|cancellation|shipping|return)-?(policy)?\\b/i,\n    // Contact and support pages (usually not main content)\n    /\\/(contact|support|help|faq|feedback)\\/?$/i,\n    // About pages that are typically boilerplate\n    /\\/(about-us|careers|jobs|press|investors|team)\\/?$/i,\n  ];\n\n  if (skipPatterns.some((pattern) => pattern.test(url))) {\n    return false;\n  }\n\n  // Skip URLs with query parameters that indicate non-content\n  if (\n    url.includes(\"?\") &&\n    [\"download\", \"file\", \"attachment\", \"export\", \"print\", \"share\", \"email\"].some((param) =>\n      url.toLowerCase().includes(param)\n    )\n  ) {\n    return false;\n  }\n\n  // Skip very short URLs (likely navigation or utility)\n  if (url.split(\"/\").filter(Boolean).length < 2 && url.split(\"?\")[0].split(\"/\").length <= 2) {\n    return false;\n  }\n\n  return true;\n}\n"
  },
  {
    "path": "src/utils/url-rewriter.ts",
    "content": "/**\n * URL Rewriter\n *\n * Rewrites certain URLs to their export/download equivalents before scraping.\n * Reader ships with NO built-in rules. The caller provides rewrite rules\n * via ScrapeOptions.urlRewriters.\n */\n\nimport { createLogger } from \"./logger\";\n\nconst logger = createLogger(\"url-rewriter\");\n\n/**\n * A single URL rewrite rule.\n */\nexport interface UrlRewriteRule {\n  /** Name for diagnostics */\n  name: string;\n  /** Return true if this rewriter applies to the URL */\n  match: (url: URL) => boolean;\n  /** Return the rewritten URL string */\n  rewrite: (url: URL) => string;\n}\n\n/**\n * Result of a URL rewrite attempt.\n */\nexport interface RewriteResult {\n  /** The final URL to scrape (rewritten or original) */\n  url: string;\n  /** Whether the URL was actually rewritten */\n  rewritten: boolean;\n  /** Reason/source of the rewrite for diagnostics */\n  reason?: string;\n}\n\n/**\n * Attempt to rewrite a URL using the provided rules.\n *\n * Returns the original URL unchanged if no rule matches or no rules provided.\n */\nexport function rewriteUrl(inputUrl: string, rules?: UrlRewriteRule[]): RewriteResult {\n  if (!rules || rules.length === 0) {\n    return { url: inputUrl, rewritten: false };\n  }\n\n  let parsed: URL;\n  try {\n    parsed = new URL(inputUrl);\n  } catch {\n    return { url: inputUrl, rewritten: false };\n  }\n\n  for (const rule of rules) {\n    if (rule.match(parsed)) {\n      const rewritten = rule.rewrite(parsed);\n      logger.debug(`[url-rewriter] Rewrote (${rule.name}): ${inputUrl} -> ${rewritten}`);\n      return { url: rewritten, rewritten: true, reason: rule.name };\n    }\n  }\n\n  return { url: inputUrl, rewritten: false };\n}\n"
  },
  {
    "path": "tests/engines/orchestrator.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { EngineOrchestrator } from \"../../src/engines/orchestrator\";\nimport { ScrapeFailedError, HttpError } from \"../../src/engines/errors\";\nimport type { EngineResult } from \"../../src/engines/types\";\nimport type { ScrapeOptions } from \"../../src/types\";\n\nfunction createMeta(url = \"https://example.com\") {\n  return {\n    url,\n    options: { urls: [url] } as ScrapeOptions,\n  };\n}\n\ndescribe(\"EngineOrchestrator\", () => {\n  describe(\"quality assessment\", () => {\n    it(\"passes content with sufficient length and good status\", () => {\n      const orchestrator = new EngineOrchestrator();\n      const result: EngineResult = {\n        html: `<html><body><p>${\"Real content. \".repeat(20)}</p></body></html>`,\n        url: \"https://example.com\",\n        statusCode: 200,\n        engine: \"hero\",\n        duration: 100,\n      };\n\n      const quality = (orchestrator as any).assessQuality(result);\n      expect(quality.passed).toBe(true);\n    });\n\n    it(\"passes bot pages with content (quality gate is minimal)\", () => {\n      const orchestrator = new EngineOrchestrator();\n      const result: EngineResult = {\n        html: '<html><body><h4>Click the button below to continue shopping</h4></body></html>',\n        url: \"https://amazon.com/dp/123\",\n        statusCode: 200,\n        engine: \"hero\",\n        duration: 50,\n      };\n\n      const quality = (orchestrator as any).assessQuality(result);\n      expect(quality.passed).toBe(true);\n    });\n\n    it(\"fails empty content with good status\", () => {\n      const orchestrator = new EngineOrchestrator();\n      const result: EngineResult = {\n        html: \"<html><body></body></html>\",\n        url: \"https://example.com\",\n        statusCode: 200,\n        engine: \"hero\",\n        duration: 50,\n      };\n\n      const quality = (orchestrator as any).assessQuality(result);\n      expect(quality.passed).toBe(false);\n      expect(quality.reason).toBe(\"empty_content\");\n    });\n\n    it(\"fails on HTTP error with empty content\", () => {\n      const orchestrator = new EngineOrchestrator();\n      const result: EngineResult = {\n        html: \"\",\n        url: \"https://example.com\",\n        statusCode: 500,\n        engine: \"hero\",\n        duration: 50,\n      };\n\n      const quality = (orchestrator as any).assessQuality(result);\n      expect(quality.passed).toBe(false);\n      expect(quality.reason).toBe(\"http_error\");\n    });\n  });\n\n  describe(\"ScrapeFailedError\", () => {\n    it(\"has correct structure with proxyBlock=false\", () => {\n      const inner = new Error(\"timeout\");\n      const err = new ScrapeFailedError(inner);\n\n      expect(err.name).toBe(\"ScrapeFailedError\");\n      expect(err.proxyBlock).toBe(false);\n      expect(err.message).toBe(\"timeout\");\n      expect(err.cause).toBe(inner);\n    });\n\n    it(\"has correct structure with proxyBlock=true\", () => {\n      const inner = new HttpError(\"hero\", 403, \"Forbidden\");\n      const err = new ScrapeFailedError(inner, { proxyBlock: true });\n\n      expect(err.name).toBe(\"ScrapeFailedError\");\n      expect(err.proxyBlock).toBe(true);\n      expect(err.message).toContain(\"403\");\n    });\n\n    it(\"defaults proxyBlock to false\", () => {\n      const err = new ScrapeFailedError(new Error(\"something\"));\n      expect(err.proxyBlock).toBe(false);\n    });\n  });\n});\n"
  },
  {
    "path": "tests/fixtures/amazon-bot-page.html",
    "content": "<html class=\"a-no-js\" lang=\"en-us\"><head>\n<title dir=\"ltr\">Amazon.com</title>\n</head>\n<body>\n<div class=\"a-container a-padding-double-large\" style=\"min-width:350px;padding:44px 0 !important\">\n    <div class=\"a-row a-spacing-double-large\" style=\"width: 350px; margin: 0 auto\">\n        <div class=\"a-row a-spacing-medium a-text-center\"><i class=\"a-icon a-logo\" alt=\"Amazon logo\"></i></div>\n        <div class=\"a-box a-alert a-alert-info a-spacing-base\">\n            <div class=\"a-box-inner\">\n                <i class=\"a-icon a-icon-alert\" alt=\"Alert icon\"></i>\n                <h4>Click the button below to continue shopping</h4>\n            </div>\n        </div>\n        <div class=\"a-section\">\n            <div class=\"a-box a-color-offset-background\">\n                <div class=\"a-box-inner a-padding-extra-large\">\n                </div>\n            </div>\n        </div>\n    </div>\n    <div class=\"a-divider a-divider-section\"><div class=\"a-divider-inner\"></div></div>\n    <div class=\"a-text-center a-spacing-small a-size-mini\">\n        <a href=\"https://www.amazon.com/gp/help/customer/display.html/ref=footer_cou?ie=UTF8&nodeId=508088\">Conditions of Use</a>\n        <span class=\"a-letter-space\"></span>\n        <span class=\"a-letter-space\"></span>\n        <span class=\"a-letter-space\"></span>\n        <span class=\"a-letter-space\"></span>\n        <a href=\"https://www.amazon.com/gp/help/customer/display.html/ref=footer_privacy?ie=UTF8&nodeId=468496\">Privacy Policy</a>\n    </div>\n    <div class=\"a-text-center a-size-mini a-color-base\">\n      © 1996-2025, Amazon.com, Inc. or its affiliates\n    </div>\n</div>\n</body></html>\n"
  },
  {
    "path": "tests/fixtures/cloudflare-challenge.html",
    "content": "<!DOCTYPE html>\n<html>\n<head>\n  <title>Just a moment...</title>\n</head>\n<body>\n  <div id=\"challenge-running\">\n    <div class=\"cf-browser-verification\">\n      <noscript>\n        <h1>Enable JavaScript and cookies to continue</h1>\n      </noscript>\n      <div id=\"trk_jschal_js\" style=\"display:none;background-image:url('/cdn-cgi/images/trace/managed/js/transparent.gif?ray=abc123')\"></div>\n      <div id=\"challenge-body-text\">\n        Checking your browser before accessing the website.\n      </div>\n      <div id=\"turnstile-wrapper\">\n        <div class=\"cf-turnstile\"></div>\n      </div>\n    </div>\n    <div class=\"ray-id\">\n      <p>Performance &amp; security by Cloudflare</p>\n      <p>Ray ID: abc123def456</p>\n    </div>\n  </div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/fixtures/empty-page.html",
    "content": "<!DOCTYPE html>\n<html><head><title></title></head><body></body></html>\n"
  },
  {
    "path": "tests/fixtures/simple-static.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n  <meta charset=\"utf-8\">\n  <title>Simple Test Page</title>\n  <meta name=\"description\" content=\"A simple static test page for reader tests\">\n  <meta property=\"og:title\" content=\"Simple Test Page OG\">\n  <meta property=\"og:description\" content=\"Open Graph description\">\n  <link rel=\"canonical\" href=\"https://example.com/simple\">\n</head>\n<body>\n  <header>\n    <nav><a href=\"/\">Home</a> | <a href=\"/about\">About</a></nav>\n  </header>\n  <main>\n    <article>\n      <h1>Simple Test Page</h1>\n      <p>This is a simple static page used for testing the reader scraping engine.</p>\n      <p>It contains multiple paragraphs with <strong>bold text</strong> and <em>italic text</em>.</p>\n      <h2>Section Two</h2>\n      <p>More content in the second section. Here is a <a href=\"https://example.com/link\">link to another page</a>.</p>\n      <ul>\n        <li>First item</li>\n        <li>Second item</li>\n        <li>Third item</li>\n      </ul>\n    </article>\n  </main>\n  <footer>\n    <p>&copy; 2025 Test Site</p>\n  </footer>\n</body>\n</html>\n"
  },
  {
    "path": "tests/integration/daemon.test.ts",
    "content": "import { describe, it, expect, beforeAll, afterAll } from \"vitest\";\nimport http from \"http\";\n\n/**\n * Daemon integration tests\n *\n * These test the DaemonServer HTTP endpoints without starting a real\n * browser pool. They verify the request routing, auth, health/ready\n * endpoints, and graceful shutdown behavior.\n *\n * NOTE: These tests import the server class directly and mock the\n * ReaderClient to avoid needing Chrome/Hero installed.\n */\n\n// Helper to make HTTP requests\nfunction request(\n  port: number,\n  method: string,\n  path: string,\n  body?: object,\n  headers?: Record<string, string>,\n): Promise<{ status: number; body: any }> {\n  return new Promise((resolve, reject) => {\n    const options: http.RequestOptions = {\n      hostname: \"127.0.0.1\",\n      port,\n      path,\n      method,\n      headers: {\n        \"Content-Type\": \"application/json\",\n        ...headers,\n      },\n    };\n\n    const req = http.request(options, (res) => {\n      let data = \"\";\n      res.on(\"data\", (chunk) => (data += chunk));\n      res.on(\"end\", () => {\n        try {\n          resolve({ status: res.statusCode!, body: JSON.parse(data) });\n        } catch {\n          resolve({ status: res.statusCode!, body: data });\n        }\n      });\n    });\n\n    req.on(\"error\", reject);\n\n    if (body) {\n      req.write(JSON.stringify(body));\n    }\n    req.end();\n  });\n}\n\ndescribe(\"DaemonServer endpoints\", () => {\n  // These tests verify the HTTP routing logic.\n  // We test against a minimal HTTP server that mimics the daemon's routing.\n\n  let server: http.Server;\n  const PORT = 18847; // high port to avoid conflicts\n\n  beforeAll(async () => {\n    // Create a minimal server that mimics daemon routing\n    server = http.createServer((req, res) => {\n      const url = req.url ?? \"/\";\n      const method = req.method ?? \"GET\";\n\n      // Health — always 200, no auth\n      if (method === \"GET\" && url === \"/health\") {\n        res.writeHead(200, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({ success: true, data: { status: \"ok\" } }));\n        return;\n      }\n\n      // Ready — returns 503 (simulating cold pool)\n      if (method === \"GET\" && url === \"/ready\") {\n        res.writeHead(503, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({ success: false, error: \"Not ready — pool is initializing\" }));\n        return;\n      }\n\n      // Status — returns mock status\n      if (method === \"GET\" && url === \"/status\") {\n        res.writeHead(200, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({\n          success: true,\n          data: { running: true, ready: false, port: PORT, poolSize: 5, uptime: 1000, pid: process.pid, activeRequests: 0 },\n        }));\n        return;\n      }\n\n      // 404 for everything else\n      res.writeHead(404, { \"Content-Type\": \"application/json\" });\n      res.end(JSON.stringify({ success: false, error: \"Not found\" }));\n    });\n\n    await new Promise<void>((resolve) => server.listen(PORT, \"127.0.0.1\", resolve));\n  });\n\n  afterAll(async () => {\n    await new Promise<void>((resolve) => server.close(() => resolve()));\n  });\n\n  describe(\"GET /health\", () => {\n    it(\"returns 200 with ok status\", async () => {\n      const res = await request(PORT, \"GET\", \"/health\");\n      expect(res.status).toBe(200);\n      expect(res.body.success).toBe(true);\n      expect(res.body.data.status).toBe(\"ok\");\n    });\n  });\n\n  describe(\"GET /ready\", () => {\n    it(\"returns 503 when pool is not warm\", async () => {\n      const res = await request(PORT, \"GET\", \"/ready\");\n      expect(res.status).toBe(503);\n      expect(res.body.success).toBe(false);\n    });\n  });\n\n  describe(\"GET /status\", () => {\n    it(\"returns pool stats and uptime\", async () => {\n      const res = await request(PORT, \"GET\", \"/status\");\n      expect(res.status).toBe(200);\n      expect(res.body.data.running).toBe(true);\n      expect(res.body.data.poolSize).toBe(5);\n      expect(typeof res.body.data.uptime).toBe(\"number\");\n      expect(typeof res.body.data.activeRequests).toBe(\"number\");\n    });\n  });\n\n  describe(\"unknown routes\", () => {\n    it(\"returns 404 for GET /unknown\", async () => {\n      const res = await request(PORT, \"GET\", \"/unknown\");\n      expect(res.status).toBe(404);\n    });\n\n    it(\"returns 404 for POST /scrape\", async () => {\n      const res = await request(PORT, \"POST\", \"/scrape\");\n      expect(res.status).toBe(404);\n    });\n  });\n});\n\ndescribe(\"DaemonServer auth\", () => {\n  let server: http.Server;\n  const PORT = 18848;\n  const AUTH_TOKEN = \"test-secret-token\";\n\n  beforeAll(async () => {\n    server = http.createServer((req, res) => {\n      const url = req.url ?? \"/\";\n      const method = req.method ?? \"GET\";\n\n      // Health — no auth\n      if (method === \"GET\" && url === \"/health\") {\n        res.writeHead(200, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({ success: true, data: { status: \"ok\" } }));\n        return;\n      }\n\n      // Everything else requires auth\n      const authHeader = req.headers.authorization;\n      if (authHeader !== `Bearer ${AUTH_TOKEN}`) {\n        res.writeHead(401, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({ success: false, error: \"Unauthorized\" }));\n        return;\n      }\n\n      if (method === \"GET\" && url === \"/ready\") {\n        res.writeHead(200, { \"Content-Type\": \"application/json\" });\n        res.end(JSON.stringify({ success: true, data: { ready: true } }));\n        return;\n      }\n\n      res.writeHead(404, { \"Content-Type\": \"application/json\" });\n      res.end(JSON.stringify({ success: false, error: \"Not found\" }));\n    });\n\n    await new Promise<void>((resolve) => server.listen(PORT, \"127.0.0.1\", resolve));\n  });\n\n  afterAll(async () => {\n    await new Promise<void>((resolve) => server.close(() => resolve()));\n  });\n\n  it(\"allows /health without auth\", async () => {\n    const res = await request(PORT, \"GET\", \"/health\");\n    expect(res.status).toBe(200);\n  });\n\n  it(\"rejects /ready without auth token\", async () => {\n    const res = await request(PORT, \"GET\", \"/ready\");\n    expect(res.status).toBe(401);\n    expect(res.body.error).toBe(\"Unauthorized\");\n  });\n\n  it(\"rejects /ready with wrong token\", async () => {\n    const res = await request(PORT, \"GET\", \"/ready\", undefined, {\n      Authorization: \"Bearer wrong-token\",\n    });\n    expect(res.status).toBe(401);\n  });\n\n  it(\"allows /ready with correct token\", async () => {\n    const res = await request(PORT, \"GET\", \"/ready\", undefined, {\n      Authorization: `Bearer ${AUTH_TOKEN}`,\n    });\n    expect(res.status).toBe(200);\n    expect(res.body.data.ready).toBe(true);\n  });\n});\n"
  },
  {
    "path": "tests/unit/block-detector-cloudflare.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { readFileSync } from \"fs\";\nimport { join } from \"path\";\nimport { detectBotPage, detectBotTitle, type BlockDetectionConfig } from \"../../src/utils/block-detector\";\n\nconst FIXTURES_DIR = join(__dirname, \"..\", \"fixtures\");\n\nfunction loadFixture(name: string): string {\n  return readFileSync(join(FIXTURES_DIR, name), \"utf-8\");\n}\n\nconst CF_CONFIG: BlockDetectionConfig = {\n  patterns: [\n    /just a moment/i,\n    /enable javascript and cookies to continue/i,\n    /checking your browser before accessing/i,\n    /this process is automatic/i,\n  ],\n  titlePatterns: [/just a moment/i],\n  shortContentThreshold: 500,\n  longContentSignalThreshold: 3,\n};\n\ndescribe(\"detectBotPage with Cloudflare fixture\", () => {\n  it(\"detects real Cloudflare challenge page when config provided\", () => {\n    const html = loadFixture(\"cloudflare-challenge.html\");\n    expect(detectBotPage(html, CF_CONFIG)).toBe(true);\n  });\n\n  it(\"does NOT detect without config (unopinionated)\", () => {\n    const html = loadFixture(\"cloudflare-challenge.html\");\n    expect(detectBotPage(html)).toBe(false);\n  });\n});\n\ndescribe(\"detectBotTitle with Cloudflare fixture\", () => {\n  it(\"detects 'Just a moment...' title with config\", () => {\n    expect(detectBotTitle(\"Just a moment...\", CF_CONFIG)).toBe(true);\n  });\n});\n\ndescribe(\"detectBotPage with simple static page\", () => {\n  it(\"does NOT flag a normal static page\", () => {\n    const html = loadFixture(\"simple-static.html\");\n    expect(detectBotPage(html, CF_CONFIG)).toBe(false);\n  });\n});\n\ndescribe(\"detectBotPage with empty page\", () => {\n  it(\"does NOT flag an empty page\", () => {\n    const html = loadFixture(\"empty-page.html\");\n    expect(detectBotPage(html, CF_CONFIG)).toBe(false);\n  });\n});\n"
  },
  {
    "path": "tests/unit/block-detector-fixtures.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { readFileSync } from \"fs\";\nimport { join } from \"path\";\nimport { detectBotPage, type BlockDetectionConfig } from \"../../src/utils/block-detector\";\n\nconst FIXTURES_DIR = join(__dirname, \"..\", \"fixtures\");\n\nfunction loadFixture(name: string): string {\n  return readFileSync(join(FIXTURES_DIR, name), \"utf-8\");\n}\n\nconst AMAZON_CONFIG: BlockDetectionConfig = {\n  patterns: [\n    /click the button below to continue shopping/i,\n    /to discuss automated access/i,\n  ],\n  shortContentThreshold: 500,\n  longContentSignalThreshold: 3,\n};\n\ndescribe(\"detectBotPage with real HTML fixtures\", () => {\n  it(\"detects real Amazon bot page with config\", () => {\n    const html = loadFixture(\"amazon-bot-page.html\");\n    expect(detectBotPage(html, AMAZON_CONFIG)).toBe(true);\n  });\n\n  it(\"does NOT detect without config (unopinionated)\", () => {\n    const html = loadFixture(\"amazon-bot-page.html\");\n    expect(detectBotPage(html)).toBe(false);\n  });\n});\n"
  },
  {
    "path": "tests/unit/block-detector.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { detectBotPage, detectBotTitle, isBlockedResponse, type BlockDetectionConfig } from \"../../src/utils/block-detector\";\n\n// Test config — mimics what reader-api would provide\nconst TEST_CONFIG: BlockDetectionConfig = {\n  patterns: [\n    /robot check/i,\n    /access denied/i,\n    /attention required/i,\n    /just a moment/i,\n    /verify you are a human/i,\n    /click the button below to continue shopping/i,\n    /to discuss automated access/i,\n    /unusual traffic from your computer/i,\n    /enable javascript and cookies to continue/i,\n    /checking your browser before accessing/i,\n    /this process is automatic/i,\n    /complete the captcha/i,\n  ],\n  titlePatterns: [\n    /robot check/i,\n    /access denied/i,\n    /attention required/i,\n    /just a moment/i,\n    /blocked/i,\n    /captcha/i,\n  ],\n  shortContentThreshold: 500,\n  longContentSignalThreshold: 3,\n};\n\ndescribe(\"detectBotPage\", () => {\n  it(\"returns false when no config provided (unopinionated)\", () => {\n    const html = `<html><body>Click the button below to continue shopping</body></html>`;\n    expect(detectBotPage(html)).toBe(false);\n    expect(detectBotPage(html, undefined)).toBe(false);\n    expect(detectBotPage(html, {})).toBe(false);\n  });\n\n  describe(\"with config: Amazon bot pages\", () => {\n    it(\"detects Amazon 'click the button' block page\", () => {\n      const html = `\n        <html><head><title>Amazon.com</title></head>\n        <body>\n          <div class=\"a-container\">\n            <h4>Click the button below to continue shopping</h4>\n            © 1996-2025, Amazon.com, Inc.\n          </div>\n        </body></html>\n      `;\n      expect(detectBotPage(html, TEST_CONFIG)).toBe(true);\n    });\n\n    it(\"detects Amazon 'automated access' block page\", () => {\n      const html = `<html><body>To discuss automated access to Amazon data please contact us.</body></html>`;\n      expect(detectBotPage(html, TEST_CONFIG)).toBe(true);\n    });\n  });\n\n  describe(\"with config: Cloudflare pages\", () => {\n    it(\"detects Cloudflare JS challenge\", () => {\n      const html = `\n        <html><head><title>Just a moment...</title></head>\n        <body>\n          <div>Enable JavaScript and cookies to continue</div>\n          <div>Checking your browser before accessing</div>\n          <div>This process is automatic.</div>\n        </body></html>\n      `;\n      expect(detectBotPage(html, TEST_CONFIG)).toBe(true);\n    });\n  });\n\n  describe(\"legitimate pages (no false positives)\", () => {\n    it(\"does not flag a normal news article\", () => {\n      const html = `\n        <html><body>\n          <h1>Tech News Today</h1>\n          <p>${\"Lorem ipsum dolor sit amet. \".repeat(20)}</p>\n        </body></html>\n      `;\n      expect(detectBotPage(html, TEST_CONFIG)).toBe(false);\n    });\n\n    it(\"does not flag an article about bots (needs 3+ signals for long content)\", () => {\n      const html = `\n        <html><body>\n          <h1>How Bot Detection Works</h1>\n          <p>Modern systems verify you are a human using various challenge mechanisms.\n          Understanding these systems is important for web security. ${\"Regular content. \".repeat(30)}</p>\n        </body></html>\n      `;\n      expect(detectBotPage(html, TEST_CONFIG)).toBe(false);\n    });\n  });\n\n  describe(\"edge cases\", () => {\n    it(\"handles empty HTML\", () => {\n      expect(detectBotPage(\"\", TEST_CONFIG)).toBe(false);\n    });\n\n    it(\"handles whitespace-only HTML\", () => {\n      expect(detectBotPage(\"   \\n\\t  \", TEST_CONFIG)).toBe(false);\n    });\n  });\n});\n\ndescribe(\"detectBotTitle\", () => {\n  it(\"returns false when no config provided\", () => {\n    expect(detectBotTitle(\"Robot Check\")).toBe(false);\n  });\n\n  it(\"detects 'Robot Check' title with config\", () => {\n    expect(detectBotTitle(\"Robot Check\", TEST_CONFIG)).toBe(true);\n  });\n\n  it(\"detects 'Access Denied' title\", () => {\n    expect(detectBotTitle(\"Access Denied\", TEST_CONFIG)).toBe(true);\n  });\n\n  it(\"does not flag normal titles\", () => {\n    expect(detectBotTitle(\"Amazon.com: Best Products\", TEST_CONFIG)).toBe(false);\n    expect(detectBotTitle(\"Wikipedia\", TEST_CONFIG)).toBe(false);\n  });\n\n  it(\"handles empty title\", () => {\n    expect(detectBotTitle(\"\", TEST_CONFIG)).toBe(false);\n  });\n});\n\ndescribe(\"isBlockedResponse\", () => {\n  it(\"detects HTTP 401/403/429/503 without config (always)\", () => {\n    expect(isBlockedResponse(401).blocked).toBe(true);\n    expect(isBlockedResponse(403).blocked).toBe(true);\n    expect(isBlockedResponse(429).blocked).toBe(true);\n    expect(isBlockedResponse(503).blocked).toBe(true);\n  });\n\n  it(\"does NOT detect bot page without config\", () => {\n    const html = `<html><body>Click the button below to continue shopping</body></html>`;\n    expect(isBlockedResponse(200, html).blocked).toBe(false);\n  });\n\n  it(\"detects 200 + bot page WITH config\", () => {\n    const html = `<html><body><h4>Click the button below to continue shopping</h4></body></html>`;\n    expect(isBlockedResponse(200, html, TEST_CONFIG).blocked).toBe(true);\n    expect(isBlockedResponse(200, html, TEST_CONFIG).reason).toBe(\"bot_page_detected\");\n  });\n\n  it(\"allows 200 with real content\", () => {\n    const html = `<html><body><h1>Real Page</h1><p>${\"Lorem ipsum \".repeat(100)}</p></body></html>`;\n    expect(isBlockedResponse(200, html, TEST_CONFIG).blocked).toBe(false);\n  });\n\n  it(\"allows 200 without HTML\", () => {\n    expect(isBlockedResponse(200).blocked).toBe(false);\n  });\n\n  it(\"allows redirects\", () => {\n    expect(isBlockedResponse(301).blocked).toBe(false);\n    expect(isBlockedResponse(302).blocked).toBe(false);\n  });\n});\n"
  },
  {
    "path": "tests/unit/browser-session.test.ts",
    "content": "/**\n * Browser Session Unit Tests\n *\n * Tests the findChromePath logic and session structure.\n * Full integration is tested in the E2E suite (reader-e2e).\n */\nimport { describe, it, expect, vi } from \"vitest\";\n\n// Since browser-session.ts spawns real Chrome processes,\n// unit tests focus on the exported types and utilities.\n// The heavy lifting is tested in E2E (suites/browser-session/run.ts).\n\ndescribe(\"browser-session module\", () => {\n  it(\"exports createBrowserSession function\", async () => {\n    const mod = await import(\"../../src/browser-session\");\n    expect(typeof mod.createBrowserSession).toBe(\"function\");\n  });\n\n  it(\"BrowserSession type has required fields\", async () => {\n    // Type-level check — if this compiles, the types are correct\n    const session: import(\"../../src/browser-types\").BrowserSession = {\n      sessionId: \"test-id\",\n      wsEndpoint: \"ws://localhost:9222/devtools/browser/uuid\",\n      createdAt: new Date().toISOString(),\n      close: async () => {},\n    };\n    expect(session.sessionId).toBe(\"test-id\");\n    expect(session.wsEndpoint).toContain(\"ws://\");\n    expect(typeof session.close).toBe(\"function\");\n  });\n\n  it(\"BrowserOptions accepts all expected fields\", async () => {\n    const opts: import(\"../../src/browser-types\").BrowserOptions = {\n      proxy: { host: \"proxy.example.com\", port: 8080 },\n      proxyTier: \"residential\",\n      showChrome: true,\n      timeoutMs: 60_000,\n      verbose: true,\n    };\n    expect(opts.proxyTier).toBe(\"residential\");\n    expect(opts.timeoutMs).toBe(60_000);\n  });\n});\n"
  },
  {
    "path": "tests/unit/content-cleaner.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { cleanContent } from \"../../src/utils/content-cleaner\";\n\ndescribe(\"cleanContent\", () => {\n  describe(\"script and style removal\", () => {\n    it(\"removes script tags\", () => {\n      const html = `<html><body><script>alert('xss')</script><p>Content</p></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      expect(result).not.toContain(\"<script\");\n      expect(result).toContain(\"Content\");\n    });\n\n    it(\"removes style tags\", () => {\n      const html = `<html><body><style>.x { color: red }</style><p>Content</p></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      expect(result).not.toContain(\"<style\");\n      expect(result).toContain(\"Content\");\n    });\n\n    it(\"removes noscript tags\", () => {\n      const html = `<html><body><noscript>Enable JS</noscript><p>Content</p></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      expect(result).not.toContain(\"Enable JS\");\n    });\n  });\n\n  describe(\"onlyMainContent navigation removal\", () => {\n    it(\"removes nav, header, footer when onlyMainContent=true\", () => {\n      const html = `\n        <html><body>\n          <nav>Navigation links</nav>\n          <header>Site header</header>\n          <main><p>Main article content here that is long enough to not be filtered</p></main>\n          <footer>Footer info</footer>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", { onlyMainContent: true });\n      expect(result).toContain(\"Main article content\");\n      expect(result).not.toContain(\"Navigation links\");\n      expect(result).not.toContain(\"Footer info\");\n    });\n\n    it(\"keeps nav, header, footer when onlyMainContent=false\", () => {\n      const html = `\n        <html><body>\n          <nav>Navigation links</nav>\n          <p>Main content</p>\n          <footer>Footer info</footer>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", { onlyMainContent: false });\n      expect(result).toContain(\"Navigation links\");\n      expect(result).toContain(\"Main content\");\n      expect(result).toContain(\"Footer info\");\n    });\n\n    it(\"protects #content from removal even if it's inside a removable element\", () => {\n      const html = `\n        <html><body>\n          <header>\n            <div id=\"content\"><p>This is the real content</p></div>\n          </header>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", { onlyMainContent: true });\n      expect(result).toContain(\"This is the real content\");\n    });\n  });\n\n  describe(\"does NOT strip legitimate content\", () => {\n    it(\"preserves body with class containing 'dialog' substring\", () => {\n      // Regression test: Wikipedia's <body class=\"...uls-dialog-sticky-hide...\">\n      // was being nuked by the old [class*=\"dialog\"] wildcard selector.\n      const html = `\n        <html><body class=\"skin uls-dialog-sticky-hide action-view\">\n          <div id=\"content\">\n            <p>This is the real article content that should survive cleaning.</p>\n          </div>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://en.wikipedia.org/wiki/Test\", { onlyMainContent: true });\n      expect(result).toContain(\"real article content\");\n    });\n\n    it(\"preserves forms and inputs (they may contain visible text)\", () => {\n      const html = `\n        <html><body>\n          <form><label>Search: <input type=\"text\" value=\"query\"></label></form>\n          <p>Content</p>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", { onlyMainContent: false });\n      expect(result).toContain(\"Search:\");\n    });\n\n    it(\"preserves aria-hidden elements (may be re-shown by JS)\", () => {\n      const html = `\n        <html><body>\n          <div aria-hidden=\"true\"><p>Hidden but potentially real content</p></div>\n          <p>Visible</p>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", { onlyMainContent: false });\n      expect(result).toContain(\"Hidden but potentially real content\");\n    });\n  });\n\n  describe(\"Wikipedia content extraction\", () => {\n    it(\"preserves Wikipedia article body through #mw-content-text protection\", () => {\n      const html = `\n        <html><body class=\"mediawiki uls-dialog-sticky-hide\">\n          <div id=\"mw-page-base\"></div>\n          <nav id=\"p-personal\"><a href=\"/login\">Log in</a></nav>\n          <div id=\"content\">\n            <h1 id=\"firstHeading\">Web scraping</h1>\n            <div id=\"bodyContent\">\n              <div id=\"mw-content-text\">\n                <p>Web scraping is the process of extracting data from websites. ${\"More body text. \".repeat(20)}</p>\n                <p>It involves making HTTP requests, parsing HTML, and extracting the content of interest.</p>\n              </div>\n            </div>\n          </div>\n          <footer>Wikipedia footer</footer>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://en.wikipedia.org/wiki/Web_scraping\", {\n        onlyMainContent: true,\n      });\n      expect(result).toContain(\"Web scraping is the process\");\n      expect(result).toContain(\"HTTP requests\");\n      expect(result).not.toContain(\"Wikipedia footer\");\n      expect(result).not.toContain(\"Log in\");\n    });\n  });\n\n  describe(\"docs.anthropic.com content extraction\", () => {\n    it(\"preserves Mintlify-style main.relative content\", () => {\n      const html = `\n        <html><body>\n          <nav>Sidebar nav</nav>\n          <main class=\"relative max-w-4xl\">\n            <h1>Welcome to Claude</h1>\n            <p>Claude is an AI assistant. ${\"Documentation body text. \".repeat(15)}</p>\n            <p>Get started by reading the API reference.</p>\n          </main>\n          <footer>Doc footer</footer>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://docs.anthropic.com/en/docs/welcome\", {\n        onlyMainContent: true,\n      });\n      expect(result).toContain(\"Welcome to Claude\");\n      expect(result).toContain(\"Documentation body text\");\n      expect(result).not.toContain(\"Doc footer\");\n    });\n  });\n\n  describe(\"selector filtering\", () => {\n    it(\"applies excludeTags correctly\", () => {\n      const html = `\n        <html><body>\n          <div class=\"comments\">User comments here</div>\n          <p>Main content paragraph</p>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", {\n        excludeTags: [\".comments\"],\n      });\n      expect(result).not.toContain(\"User comments\");\n      expect(result).toContain(\"Main content\");\n    });\n\n    it(\"applies includeTags correctly\", () => {\n      const html = `\n        <html><body>\n          <div class=\"sidebar\">Sidebar</div>\n          <div class=\"article-content\">Article text</div>\n          <div class=\"footer\">Footer</div>\n        </body></html>\n      `;\n      const result = cleanContent(html, \"https://example.com\", {\n        includeTags: [\".article-content\"],\n      });\n      expect(result).toContain(\"Article text\");\n    });\n  });\n\n  describe(\"edge cases\", () => {\n    it(\"handles empty HTML without crashing\", () => {\n      // linkedom may throw on truly empty input\n      expect(() => cleanContent(\"\", \"https://example.com\")).toThrow();\n    });\n\n    it(\"handles HTML with only whitespace without crashing\", () => {\n      expect(() => cleanContent(\"   \\n\\t   \", \"https://example.com\")).toThrow();\n    });\n\n    it(\"handles minimal HTML structure\", () => {\n      const result = cleanContent(\"<html><body></body></html>\", \"https://example.com\");\n      expect(result).toBeDefined();\n    });\n\n    it(\"preserves text content through cleaning\", () => {\n      const html = `<html><body><h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      expect(result).toContain(\"Title\");\n      expect(result).toContain(\"bold\");\n    });\n  });\n\n  describe(\"URL handling\", () => {\n    it(\"absolutifies relative URLs\", () => {\n      const html = `<html><body><a href=\"/page\">Link</a><img src=\"/img.png\"></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      expect(result).toContain(\"https://example.com/page\");\n      expect(result).toContain(\"https://example.com/img.png\");\n    });\n\n    it(\"resolves srcset to largest image\", () => {\n      const html = `<html><body><img srcset=\"small.jpg 200w, large.jpg 800w\" src=\"tiny.jpg\"></body></html>`;\n      const result = cleanContent(html, \"https://example.com\");\n      // srcset resolves to large.jpg, then URL absolutifier makes it https://example.com/large.jpg\n      expect(result).toContain(\"large.jpg\");\n      expect(result).not.toContain('src=\"tiny.jpg\"');\n    });\n  });\n\n  describe(\"base64 image removal\", () => {\n    it(\"removes base64 img elements when removeBase64Images=true\", () => {\n      const html = `<html><body><img src=\"data:image/png;base64,abc123\"><p>Content</p></body></html>`;\n      const result = cleanContent(html, \"https://example.com\", { removeBase64Images: true });\n      expect(result).not.toContain(\"data:image\");\n      expect(result).toContain(\"Content\");\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/crawler.test.ts",
    "content": "/**\n * Crawler Tests\n *\n * Tests link extraction, depth limiting, maxPages cap, URL dedup,\n * same-domain filtering, and robots.txt compliance. We mock fetchPage\n * and fetchRobotsTxt to avoid needing a live browser or network.\n */\n\nimport { describe, it, expect, vi, beforeEach } from \"vitest\";\nimport { Crawler } from \"../../src/crawler\";\nimport type { IBrowserPool } from \"../../src/browser/types\";\nimport type { CrawlResult } from \"../../src/crawl-types\";\n\n// ── Mock robots parser (no network) ──────────────────────────────────────────\n\nvi.mock(\"../../src/utils/robots-parser\", () => ({\n  fetchRobotsTxt: vi.fn().mockResolvedValue(null), // no robots.txt by default\n  isUrlAllowed: vi.fn().mockReturnValue(true),\n}));\n\nvi.mock(\"../../src/utils/rate-limiter\", () => ({\n  rateLimit: vi.fn().mockResolvedValue(undefined), // skip delays in tests\n}));\n\n// ── Helpers ──────────────────────────────────────────────────────────────────\n\n/** Minimal mock pool that satisfies the constructor check */\nfunction mockPool(): IBrowserPool {\n  return {\n    withBrowser: vi.fn(),\n    shutdown: vi.fn().mockResolvedValue(undefined),\n    getStats: vi.fn().mockReturnValue({ size: 1, active: 0, idle: 1, pending: 0 }),\n    isReady: vi.fn().mockReturnValue(true),\n  } as unknown as IBrowserPool;\n}\n\n/**\n * Create a Crawler with mocked fetchPage. Returns the crawler and the\n * fetchPage mock so tests can control what each page returns.\n */\nfunction createTestCrawler(options: {\n  url: string;\n  depth?: number;\n  maxPages?: number;\n  includePatterns?: string[];\n  excludePatterns?: string[];\n}) {\n  const crawler = new Crawler({\n    url: options.url,\n    depth: options.depth ?? 1,\n    maxPages: options.maxPages ?? 20,\n    delayMs: 0, // no delay in tests\n    pool: mockPool(),\n    includePatterns: options.includePatterns,\n    excludePatterns: options.excludePatterns,\n  });\n\n  // Suppress log noise\n  (crawler as any).logger = {\n    info: vi.fn(), warn: vi.fn(), error: vi.fn(), debug: vi.fn(),\n  };\n\n  const fetchPageMock = vi.fn<[string], Promise<{ crawlUrl: { url: string; title: string; description: string | null }; html: string } | null>>();\n  (crawler as any).fetchPage = fetchPageMock;\n\n  return { crawler, fetchPageMock };\n}\n\n/** Build a simple HTML page with links */\nfunction makeHtml(links: string[], title = \"Test Page\"): string {\n  const anchors = links.map((href) => `<a href=\"${href}\">Link</a>`).join(\"\\n\");\n  return `<html><head><title>${title}</title></head><body>${anchors}</body></html>`;\n}\n\n/** Build a fetchPage result */\nfunction pageResult(url: string, html: string, title = \"Test Page\") {\n  return {\n    crawlUrl: { url, title, description: null },\n    html,\n  };\n}\n\n// ── Tests ────────────────────────────────────────────────────────────────────\n\ndescribe(\"Crawler\", () => {\n  beforeEach(() => {\n    vi.clearAllMocks();\n  });\n\n  describe(\"constructor\", () => {\n    it(\"defaults depth=1, maxPages=20\", () => {\n      const crawler = new Crawler({ url: \"https://example.com\" });\n      expect((crawler as any).options.depth).toBe(1);\n      expect((crawler as any).options.maxPages).toBe(20);\n    });\n  });\n\n  describe(\"link extraction\", () => {\n    it(\"extracts same-domain absolute links\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com\",\n          makeHtml([\n            \"https://example.com/page1\",\n            \"https://example.com/page2\",\n            \"https://other.com/external\", // different domain\n          ]),\n        ))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/page1\", makeHtml([])))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/page2\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      // Seed + 2 same-domain links (external filtered)\n      expect(result.urls).toHaveLength(3);\n      expect(result.urls.map((u) => u.url)).toContain(\"https://example.com/page1\");\n      expect(result.urls.map((u) => u.url)).toContain(\"https://example.com/page2\");\n    });\n\n    it(\"resolves relative URLs against the page base URL\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com\",\n          makeHtml([\"/about\", \"./contact\", \"blog/post1\"]),\n        ))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/about\", makeHtml([])))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/contact\", makeHtml([])))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/blog/post1\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      const urls = result.urls.map((u) => u.url);\n      expect(urls).toContain(\"https://example.com/about\");\n      expect(urls).toContain(\"https://example.com/contact\");\n    });\n\n    it(\"skips fragment-only links\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\"#section1\", \"#top\", \"https://example.com/real-page\"]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/real-page\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(2); // seed + real-page, not fragments\n    });\n\n    it(\"skips non-HTTP schemes (mailto, javascript, tel, etc.)\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\n          \"mailto:test@example.com\",\n          \"javascript:void(0)\",\n          \"tel:+1234567890\",\n          \"data:text/html,hello\",\n          \"ftp://files.example.com/file\",\n          \"https://example.com/valid\",\n        ]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/valid\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(2); // seed + valid\n    });\n\n    it(\"strips hash fragments from discovered URLs\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com\",\n          makeHtml([\"https://example.com/page#section1\"]),\n        ))\n        .mockResolvedValueOnce(pageResult(\"https://example.com/page\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.urls[1].url).toBe(\"https://example.com/page\");\n    });\n  });\n\n  describe(\"depth limiting\", () => {\n    it(\"does not extract links when at max depth\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      // depth=0 (seed) → links extracted at depth=1\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\"https://example.com/level1\"]),\n      ));\n      // depth=1 → at max depth, links NOT extracted (even though page has them)\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com/level1\",\n        makeHtml([\"https://example.com/level2\"]),\n      ));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(2); // seed + level1, NOT level2\n      expect(result.urls.map((u) => u.url)).not.toContain(\"https://example.com/level2\");\n    });\n\n    it(\"crawls deeper with depth=2\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 2,\n      });\n\n      fetchPageMock\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com\",\n          makeHtml([\"https://example.com/a\"]),\n        ))\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com/a\",\n          makeHtml([\"https://example.com/a/b\"]),\n        ))\n        .mockResolvedValueOnce(pageResult(\n          \"https://example.com/a/b\",\n          makeHtml([\"https://example.com/a/b/c\"]), // depth=2, at max, won't extract\n        ));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(3); // seed + a + a/b\n      expect(result.urls.map((u) => u.url)).not.toContain(\"https://example.com/a/b/c\");\n    });\n  });\n\n  describe(\"maxPages cap\", () => {\n    it(\"stops after reaching maxPages\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n        maxPages: 3,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\n          \"https://example.com/p1\",\n          \"https://example.com/p2\",\n          \"https://example.com/p3\",\n          \"https://example.com/p4\",\n          \"https://example.com/p5\",\n        ]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/p1\", makeHtml([])));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/p2\", makeHtml([])));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/p3\", makeHtml([])));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/p4\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(3); // capped at maxPages\n    });\n  });\n\n  describe(\"URL deduplication\", () => {\n    it(\"does not visit the same URL twice\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\n          \"https://example.com/page\",\n          \"https://example.com/page\", // duplicate\n          \"https://example.com/page\", // duplicate\n        ]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/page\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.urls).toHaveLength(2); // seed + page (not 4)\n      expect(fetchPageMock).toHaveBeenCalledTimes(2); // only fetched twice\n    });\n  });\n\n  describe(\"failed pages\", () => {\n    it(\"continues crawling when fetchPage returns null\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\"https://example.com/broken\", \"https://example.com/ok\"]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(null); // broken page\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com/ok\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      // seed + ok (broken didn't add to urls)\n      expect(result.urls).toHaveLength(2);\n      expect(result.urls.map((u) => u.url)).toContain(\"https://example.com/ok\");\n    });\n  });\n\n  describe(\"metadata\", () => {\n    it(\"returns correct metadata with seed URL and duration\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n        maxPages: 5,\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\"https://example.com\", makeHtml([])));\n\n      const result = await crawler.crawl();\n      expect(result.metadata.seedUrl).toBe(\"https://example.com\");\n      expect(result.metadata.maxDepth).toBe(1);\n      expect(result.metadata.totalUrls).toBe(1);\n      expect(result.metadata.totalDuration).toBeGreaterThanOrEqual(0);\n    });\n  });\n\n  describe(\"include/exclude patterns\", () => {\n    it(\"respects includePatterns filter\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n        includePatterns: [\"/blog/\"],\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\n          \"https://example.com/blog/post1\",\n          \"https://example.com/about\", // excluded by include pattern\n        ]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(\n        pageResult(\"https://example.com/blog/post1\", makeHtml([])),\n      );\n\n      const result = await crawler.crawl();\n      const urls = result.urls.map((u) => u.url);\n      expect(urls).toContain(\"https://example.com/blog/post1\");\n      expect(urls).not.toContain(\"https://example.com/about\");\n    });\n\n    it(\"respects excludePatterns filter\", async () => {\n      const { crawler, fetchPageMock } = createTestCrawler({\n        url: \"https://example.com\",\n        depth: 1,\n        excludePatterns: [\"/admin\"],\n      });\n\n      fetchPageMock.mockResolvedValueOnce(pageResult(\n        \"https://example.com\",\n        makeHtml([\n          \"https://example.com/page1\",\n          \"https://example.com/admin/dashboard\", // excluded\n        ]),\n      ));\n      fetchPageMock.mockResolvedValueOnce(\n        pageResult(\"https://example.com/page1\", makeHtml([])),\n      );\n\n      const result = await crawler.crawl();\n      const urls = result.urls.map((u) => u.url);\n      expect(urls).toContain(\"https://example.com/page1\");\n      expect(urls).not.toContain(\"https://example.com/admin/dashboard\");\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/daemon-dispatch.test.ts",
    "content": "import { describe, it, expect, beforeEach, vi } from \"vitest\";\nimport { Readable } from \"stream\";\nimport http from \"http\";\nimport { DaemonServer } from \"../../src/daemon/server\";\n\n/**\n * Unit tests for DaemonServer POST / request dispatch.\n *\n * These test the handleRequest method directly (via `as any`) with mock\n * IncomingMessage and ServerResponse objects, avoiding the need to start\n * a real server or browser pool.\n */\n\n// ---- Helpers ----\n\n/** Create a mock IncomingMessage from method, url, body string, and optional headers. */\nfunction mockReq(\n  method: string,\n  url: string,\n  body: string = \"\",\n  headers: Record<string, string> = {},\n): http.IncomingMessage {\n  const readable = new Readable({\n    read() {\n      this.push(body);\n      this.push(null);\n    },\n  });\n\n  // Overlay the HTTP-specific properties onto the Readable stream.\n  Object.assign(readable, {\n    method,\n    url,\n    headers: {\n      \"content-type\": \"application/json\",\n      ...headers,\n    },\n  });\n\n  return readable as unknown as http.IncomingMessage;\n}\n\n/** Captured response data from a mock ServerResponse. */\ninterface CapturedResponse {\n  statusCode: number;\n  headers: Record<string, string>;\n  body: any;\n}\n\n/** Create a mock ServerResponse that captures writeHead/end calls. */\nfunction mockRes(): { res: http.ServerResponse; captured: () => CapturedResponse } {\n  let statusCode = 200;\n  let responseHeaders: Record<string, string> = {};\n  let bodyChunks: string[] = [];\n\n  const fake = {\n    writeHead(code: number, headers?: Record<string, string>) {\n      statusCode = code;\n      if (headers) responseHeaders = headers;\n    },\n    end(data?: string) {\n      if (data) bodyChunks.push(data);\n    },\n  };\n\n  return {\n    res: fake as unknown as http.ServerResponse,\n    captured: () => ({\n      statusCode,\n      headers: responseHeaders,\n      body: (() => {\n        const raw = bodyChunks.join(\"\");\n        try {\n          return JSON.parse(raw);\n        } catch {\n          return raw;\n        }\n      })(),\n    }),\n  };\n}\n\n// ---- Tests ----\n\ndescribe(\"DaemonServer POST / dispatch\", () => {\n  let daemon: DaemonServer;\n  let handleRequest: (req: http.IncomingMessage, res: http.ServerResponse) => Promise<void>;\n\n  // Mock client with scrape, crawl, isReady\n  const mockClient = {\n    scrape: vi.fn(),\n    crawl: vi.fn(),\n    isReady: vi.fn(() => true),\n  };\n\n  beforeEach(() => {\n    vi.clearAllMocks();\n\n    daemon = new DaemonServer({ port: 0 });\n    // Inject mock client without starting the server\n    (daemon as any).client = mockClient;\n    // Set startTime so status uptime works\n    (daemon as any).startTime = Date.now();\n    // Bind handleRequest\n    handleRequest = (daemon as any).handleRequest.bind(daemon);\n  });\n\n  // 1. action=scrape calls client.scrape and returns result\n  it(\"dispatches action=scrape to client.scrape and returns 200\", async () => {\n    const scrapeResult = { data: [{ url: \"https://example.com\", markdown: \"# Hello\" }] };\n    mockClient.scrape.mockResolvedValue(scrapeResult);\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"scrape\",\n      options: { urls: [\"https://example.com\"] },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(200);\n    expect(out.body.success).toBe(true);\n    expect(out.body.data).toEqual(scrapeResult);\n    expect(mockClient.scrape).toHaveBeenCalledWith({ urls: [\"https://example.com\"] });\n  });\n\n  // 2. action=crawl calls client.crawl and returns result\n  it(\"dispatches action=crawl to client.crawl and returns 200\", async () => {\n    const crawlResult = { urls: [\"https://example.com\", \"https://example.com/about\"] };\n    mockClient.crawl.mockResolvedValue(crawlResult);\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"crawl\",\n      options: { url: \"https://example.com\", depth: 2 },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(200);\n    expect(out.body.success).toBe(true);\n    expect(out.body.data).toEqual(crawlResult);\n    expect(mockClient.crawl).toHaveBeenCalledWith({ url: \"https://example.com\", depth: 2 });\n  });\n\n  // 3. action=status returns pool stats\n  it(\"dispatches action=status and returns daemon status\", async () => {\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({ action: \"status\" }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(200);\n    expect(out.body.success).toBe(true);\n    expect(out.body.data.running).toBe(true);\n    expect(out.body.data.ready).toBe(true);\n    expect(typeof out.body.data.uptime).toBe(\"number\");\n    expect(typeof out.body.data.pid).toBe(\"number\");\n    expect(typeof out.body.data.activeRequests).toBe(\"number\");\n  });\n\n  // 4. action=unknown returns 400\n  it(\"returns 400 for unknown action\", async () => {\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({ action: \"bogus\" }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(400);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Unknown action\");\n  });\n\n  // 5. Invalid JSON returns 400\n  it(\"returns 400 for invalid JSON body\", async () => {\n    const req = mockReq(\"POST\", \"/\", \"not-json{{{\");\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(400);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Invalid JSON\");\n  });\n\n  // 6. During shutdown returns 503\n  it(\"returns 503 when server is shutting down\", async () => {\n    (daemon as any).shuttingDown = true;\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({ action: \"scrape\", options: { urls: [\"https://example.com\"] } }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(503);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Server is shutting down\");\n  });\n\n  // 7. Client is null returns 500\n  it(\"returns 500 when client is not initialized (scrape)\", async () => {\n    (daemon as any).client = null;\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"scrape\",\n      options: { urls: [\"https://example.com\"] },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(500);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Client not initialized\");\n  });\n\n  it(\"returns 500 when client is not initialized (crawl)\", async () => {\n    (daemon as any).client = null;\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"crawl\",\n      options: { url: \"https://example.com\" },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(500);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Client not initialized\");\n  });\n\n  // 8. Scrape that throws returns 500 with error message\n  it(\"returns 500 when client.scrape throws\", async () => {\n    mockClient.scrape.mockRejectedValue(new Error(\"Browser crashed\"));\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"scrape\",\n      options: { urls: [\"https://example.com\"] },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(500);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Browser crashed\");\n  });\n\n  it(\"returns 500 when client.crawl throws\", async () => {\n    mockClient.crawl.mockRejectedValue(new Error(\"Timeout exceeded\"));\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"crawl\",\n      options: { url: \"https://example.com\" },\n    }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(500);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Timeout exceeded\");\n  });\n\n  // 9. GET /health returns 200 (no auth needed)\n  it(\"GET /health returns 200 without auth\", async () => {\n    // Re-create daemon with auth token to prove /health skips auth\n    daemon = new DaemonServer({ port: 0, authToken: \"secret\" });\n    (daemon as any).client = mockClient;\n    handleRequest = (daemon as any).handleRequest.bind(daemon);\n\n    const req = mockReq(\"GET\", \"/health\");\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(200);\n    expect(out.body.success).toBe(true);\n    expect(out.body.data.status).toBe(\"ok\");\n  });\n\n  // 10. POST / without auth token returns 401\n  it(\"returns 401 when auth is required but missing\", async () => {\n    daemon = new DaemonServer({ port: 0, authToken: \"secret\" });\n    (daemon as any).client = mockClient;\n    (daemon as any).startTime = Date.now();\n    handleRequest = (daemon as any).handleRequest.bind(daemon);\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({ action: \"status\" }));\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(401);\n    expect(out.body.success).toBe(false);\n    expect(out.body.error).toBe(\"Unauthorized\");\n  });\n\n  it(\"allows POST / with correct auth token\", async () => {\n    daemon = new DaemonServer({ port: 0, authToken: \"secret\" });\n    (daemon as any).client = mockClient;\n    (daemon as any).startTime = Date.now();\n    handleRequest = (daemon as any).handleRequest.bind(daemon);\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({ action: \"status\" }), {\n      authorization: \"Bearer secret\",\n    });\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(200);\n    expect(out.body.success).toBe(true);\n    expect(out.body.data.running).toBe(true);\n  });\n\n  // Edge case: 404 for non-POST non-GET routes\n  it(\"returns 404 for unsupported method/path\", async () => {\n    const req = mockReq(\"PUT\", \"/\");\n    const { res, captured } = mockRes();\n\n    await handleRequest(req, res);\n    const out = captured();\n\n    expect(out.statusCode).toBe(404);\n    expect(out.body.error).toBe(\"Not found\");\n  });\n\n  // Edge case: activeRequests counter is decremented even on error\n  it(\"decrements activeRequests after scrape error\", async () => {\n    mockClient.scrape.mockRejectedValue(new Error(\"fail\"));\n    expect((daemon as any).activeRequests).toBe(0);\n\n    const req = mockReq(\"POST\", \"/\", JSON.stringify({\n      action: \"scrape\",\n      options: { urls: [\"https://example.com\"] },\n    }));\n    const { res } = mockRes();\n\n    await handleRequest(req, res);\n\n    expect((daemon as any).activeRequests).toBe(0);\n  });\n});\n"
  },
  {
    "path": "tests/unit/domain-profiles.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { getDomainProfile, applyDomainProfile } from \"../../src/config/domain-profiles\";\n\n// Test profiles — reader has no built-in profiles, so we provide our own\nconst TEST_PROFILES = {\n  \"amazon.com\": { proxyTier: \"residential\" as const, timeoutMs: 60000, batchConcurrency: 2 },\n  \"amazon.co.uk\": { proxyTier: \"residential\" as const, timeoutMs: 60000 },\n  \"amazon.de\": { proxyTier: \"residential\" as const, timeoutMs: 60000 },\n  \"amazon.co.jp\": { proxyTier: \"residential\" as const, timeoutMs: 60000 },\n  \"linkedin.com\": { proxyTier: \"residential\" as const, timeoutMs: 60000 },\n  \"google.com\": { batchConcurrency: 1 },\n};\n\ndescribe(\"getDomainProfile\", () => {\n  describe(\"exact domain match\", () => {\n    it(\"returns profile for amazon.com\", () => {\n      const profile = getDomainProfile(\"amazon.com\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n      expect(profile!.proxyTier).toBe(\"residential\");\n      expect(profile!.timeoutMs).toBe(60000);\n    });\n\n    it(\"returns profile for linkedin.com\", () => {\n      const profile = getDomainProfile(\"linkedin.com\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n      expect(profile!.proxyTier).toBe(\"residential\");\n    });\n\n    it(\"returns undefined for unknown domain\", () => {\n      expect(getDomainProfile(\"example.com\", TEST_PROFILES)).toBeUndefined();\n    });\n\n    it(\"returns undefined when no profiles provided\", () => {\n      expect(getDomainProfile(\"amazon.com\")).toBeUndefined();\n      expect(getDomainProfile(\"amazon.com\", undefined)).toBeUndefined();\n      expect(getDomainProfile(\"amazon.com\", {})).toBeUndefined();\n    });\n  });\n\n  describe(\"www stripping\", () => {\n    it(\"strips www. prefix before lookup\", () => {\n      const profile = getDomainProfile(\"www.amazon.com\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n      expect(profile!.proxyTier).toBe(\"residential\");\n    });\n  });\n\n  describe(\"subdomain matching\", () => {\n    it(\"matches shop.amazon.com to amazon.com profile\", () => {\n      const profile = getDomainProfile(\"shop.amazon.com\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n      expect(profile!.proxyTier).toBe(\"residential\");\n    });\n\n    it(\"matches smile.amazon.com to amazon.com profile\", () => {\n      const profile = getDomainProfile(\"smile.amazon.com\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n    });\n\n    it(\"does not match amazonclone.com to amazon.com\", () => {\n      expect(getDomainProfile(\"amazonclone.com\", TEST_PROFILES)).toBeUndefined();\n    });\n  });\n\n  describe(\"full URL input\", () => {\n    it(\"extracts hostname from full URL\", () => {\n      const profile = getDomainProfile(\"https://www.amazon.com/dp/B08N5WRWNW\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n      expect(profile!.proxyTier).toBe(\"residential\");\n    });\n\n    it(\"handles URL with port\", () => {\n      const profile = getDomainProfile(\"https://amazon.com:443/dp/B08N5WRWNW\", TEST_PROFILES);\n      expect(profile).toBeDefined();\n    });\n\n    it(\"returns undefined for invalid URL\", () => {\n      expect(getDomainProfile(\"not a url at all\", TEST_PROFILES)).toBeUndefined();\n    });\n  });\n\n  describe(\"international Amazon domains\", () => {\n    it(\"matches amazon.co.uk\", () => {\n      expect(getDomainProfile(\"amazon.co.uk\", TEST_PROFILES)).toBeDefined();\n    });\n\n    it(\"matches amazon.de\", () => {\n      expect(getDomainProfile(\"amazon.de\", TEST_PROFILES)).toBeDefined();\n    });\n\n    it(\"matches amazon.co.jp\", () => {\n      expect(getDomainProfile(\"amazon.co.jp\", TEST_PROFILES)).toBeDefined();\n    });\n  });\n});\n\ndescribe(\"applyDomainProfile\", () => {\n  it(\"applies profile values when user has not set them\", () => {\n    const options = { urls: [\"https://amazon.com\"], formats: [\"markdown\" as const] };\n    const profile = { proxyTier: \"residential\" as const, timeoutMs: 60000 };\n    const merged = applyDomainProfile(options, profile);\n\n    expect(merged.timeoutMs).toBe(60000);\n    expect(merged.proxyTier).toBe(\"residential\");\n  });\n\n  it(\"does not override user-provided values\", () => {\n    const options = { urls: [\"https://amazon.com\"], timeoutMs: 15000, proxyTier: \"datacenter\" as const };\n    const profile = { proxyTier: \"residential\" as const, timeoutMs: 60000 };\n    const merged = applyDomainProfile(options, profile);\n\n    expect(merged.timeoutMs).toBe(15000);\n    expect(merged.proxyTier).toBe(\"datacenter\");\n  });\n\n  it(\"preserves all original options\", () => {\n    const options = {\n      urls: [\"https://amazon.com\"],\n      formats: [\"markdown\" as const],\n      onlyMainContent: true,\n      verbose: true,\n    };\n    const profile = { proxyTier: \"residential\" as const };\n    const merged = applyDomainProfile(options, profile);\n\n    expect(merged.urls).toEqual([\"https://amazon.com\"]);\n    expect(merged.formats).toEqual([\"markdown\"]);\n    expect(merged.onlyMainContent).toBe(true);\n    expect(merged.verbose).toBe(true);\n  });\n});\n"
  },
  {
    "path": "tests/unit/errors.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport {\n  ReaderError,\n  ReaderErrorCode,\n  NetworkError,\n  TimeoutError,\n  CloudflareError,\n  AccessDeniedError,\n  DNSError,\n  TLSError,\n  BotDetectedError,\n  ProxyConnectionError,\n  ProxyExhaustedError,\n  ContentTooLargeError,\n  MarkdownConversionError,\n  EmptyContentError,\n  BrowserPoolError,\n  ClientClosedError,\n  NotInitializedError,\n  RobotsBlockedError,\n  InvalidUrlError,\n  wrapError,\n} from \"../../src/errors\";\nimport { ScrapeFailedError } from \"../../src/engines/errors\";\n\ndescribe(\"Error types\", () => {\n  describe(\"error codes\", () => {\n    it(\"NetworkError has NETWORK_ERROR code\", () => {\n      const err = new NetworkError(\"Connection failed\", { url: \"https://example.com\" });\n      expect(err.code).toBe(ReaderErrorCode.NETWORK_ERROR);\n    });\n\n    it(\"TimeoutError has TIMEOUT code\", () => {\n      const err = new TimeoutError(\"Timed out\", 30000);\n      expect(err.code).toBe(ReaderErrorCode.TIMEOUT);\n      expect(err.timeoutMs).toBe(30000);\n    });\n\n    it(\"DNSError has DNS_ERROR code\", () => {\n      const err = new DNSError(\"nonexistent.example.com\");\n      expect(err.code).toBe(ReaderErrorCode.DNS_ERROR);\n      expect(err.hostname).toBe(\"nonexistent.example.com\");\n    });\n\n    it(\"TLSError has TLS_ERROR code\", () => {\n      const err = new TLSError(\"Certificate expired\");\n      expect(err.code).toBe(ReaderErrorCode.TLS_ERROR);\n    });\n\n    it(\"BotDetectedError has BOT_DETECTED code\", () => {\n      const err = new BotDetectedError(\"Amazon block page\");\n      expect(err.code).toBe(ReaderErrorCode.BOT_DETECTED);\n      expect(err.signal).toBe(\"Amazon block page\");\n    });\n\n    it(\"ProxyConnectionError has PROXY_CONNECTION_ERROR code\", () => {\n      const err = new ProxyConnectionError(\"datacenter\");\n      expect(err.code).toBe(ReaderErrorCode.PROXY_CONNECTION_ERROR);\n      expect(err.proxyTier).toBe(\"datacenter\");\n    });\n\n    it(\"ProxyExhaustedError has PROXY_EXHAUSTED code\", () => {\n      const err = new ProxyExhaustedError();\n      expect(err.code).toBe(ReaderErrorCode.PROXY_EXHAUSTED);\n    });\n\n    it(\"ContentTooLargeError has CONTENT_TOO_LARGE code\", () => {\n      const err = new ContentTooLargeError(500000, 300000);\n      expect(err.code).toBe(ReaderErrorCode.CONTENT_TOO_LARGE);\n      expect(err.sizeBytes).toBe(500000);\n      expect(err.limitBytes).toBe(300000);\n    });\n\n    it(\"MarkdownConversionError has MARKDOWN_CONVERSION_FAILED code\", () => {\n      const err = new MarkdownConversionError(\"Formatting argument out of range\");\n      expect(err.code).toBe(ReaderErrorCode.MARKDOWN_CONVERSION_FAILED);\n    });\n\n    it(\"EmptyContentError has EMPTY_CONTENT code\", () => {\n      const err = new EmptyContentError(10);\n      expect(err.code).toBe(ReaderErrorCode.EMPTY_CONTENT);\n      expect(err.contentLength).toBe(10);\n    });\n\n    it(\"ScrapeFailedError wraps underlying error with proxyBlock flag\", () => {\n      const inner = new Error(\"timeout\");\n      const err = new ScrapeFailedError(inner, { proxyBlock: true });\n      expect(err.name).toBe(\"ScrapeFailedError\");\n      expect(err.proxyBlock).toBe(true);\n      expect(err.cause).toBe(inner);\n    });\n\n  });\n\n  describe(\"retryable flags\", () => {\n    it(\"NetworkError is retryable\", () => {\n      expect(new NetworkError(\"fail\").retryable).toBe(true);\n    });\n\n    it(\"TimeoutError is retryable\", () => {\n      expect(new TimeoutError(\"timeout\", 1000).retryable).toBe(true);\n    });\n\n    it(\"CloudflareError is retryable\", () => {\n      expect(new CloudflareError(\"turnstile\").retryable).toBe(true);\n    });\n\n    it(\"BotDetectedError is retryable\", () => {\n      expect(new BotDetectedError(\"amazon\").retryable).toBe(true);\n    });\n\n    it(\"ProxyConnectionError is retryable\", () => {\n      expect(new ProxyConnectionError(\"datacenter\").retryable).toBe(true);\n    });\n\n    it(\"TLSError is retryable\", () => {\n      expect(new TLSError(\"cert expired\").retryable).toBe(true);\n    });\n\n    it(\"EmptyContentError is retryable\", () => {\n      expect(new EmptyContentError(0).retryable).toBe(true);\n    });\n\n    it(\"BrowserPoolError is retryable\", () => {\n      expect(new BrowserPoolError(\"pool full\").retryable).toBe(true);\n    });\n\n    it(\"AccessDeniedError is NOT retryable\", () => {\n      expect(new AccessDeniedError(\"403\").retryable).toBe(false);\n    });\n\n    it(\"DNSError is NOT retryable\", () => {\n      expect(new DNSError(\"bad.host\").retryable).toBe(false);\n    });\n\n    it(\"ProxyExhaustedError is NOT retryable\", () => {\n      expect(new ProxyExhaustedError().retryable).toBe(false);\n    });\n\n    it(\"ContentTooLargeError is NOT retryable\", () => {\n      expect(new ContentTooLargeError(1, 1).retryable).toBe(false);\n    });\n\n    it(\"ScrapeFailedError extends Error\", () => {\n      const err = new ScrapeFailedError(new Error(\"test\"));\n      expect(err).toBeInstanceOf(Error);\n      expect(err.name).toBe(\"ScrapeFailedError\");\n    });\n\n    it(\"ClientClosedError is NOT retryable\", () => {\n      expect(new ClientClosedError().retryable).toBe(false);\n    });\n\n    it(\"InvalidUrlError is NOT retryable\", () => {\n      expect(new InvalidUrlError(\"bad-url\").retryable).toBe(false);\n    });\n\n    it(\"RobotsBlockedError is NOT retryable\", () => {\n      expect(new RobotsBlockedError(\"https://example.com/secret\").retryable).toBe(false);\n    });\n  });\n\n  describe(\"toJSON serialization\", () => {\n    it(\"serializes base ReaderError correctly\", () => {\n      const err = new NetworkError(\"Connection lost\", { url: \"https://example.com\" });\n      const json = err.toJSON();\n\n      expect(json.name).toBe(\"NetworkError\");\n      expect(json.code).toBe(\"NETWORK_ERROR\");\n      expect(json.message).toBe(\"Connection lost\");\n      expect(json.url).toBe(\"https://example.com\");\n      expect(json.retryable).toBe(true);\n      expect(json.timestamp).toBeDefined();\n      expect(typeof json.timestamp).toBe(\"string\");\n      expect(json.stack).toBeDefined();\n    });\n\n    it(\"serializes DNSError with hostname\", () => {\n      const json = new DNSError(\"bad.host\", { url: \"https://bad.host\" }).toJSON();\n      expect(json.hostname).toBe(\"bad.host\");\n    });\n\n    it(\"serializes ContentTooLargeError with sizes\", () => {\n      const json = new ContentTooLargeError(500000, 300000).toJSON();\n      expect(json.sizeBytes).toBe(500000);\n      expect(json.limitBytes).toBe(300000);\n    });\n\n    it(\"ScrapeFailedError preserves underlying error message\", () => {\n      const inner = new Error(\"Hero timed out after 10s\");\n      const err = new ScrapeFailedError(inner);\n      expect(err.message).toContain(\"timed out\");\n    });\n\n    it(\"serializes cause message\", () => {\n      const cause = new Error(\"root cause\");\n      const err = new NetworkError(\"wrapped\", { cause });\n      expect(err.toJSON().cause).toBe(\"root cause\");\n    });\n  });\n});\n\ndescribe(\"wrapError\", () => {\n  it(\"passes through ReaderError unchanged\", () => {\n    const err = new NetworkError(\"test\");\n    expect(wrapError(err)).toBe(err);\n  });\n\n  it(\"wraps timeout errors\", () => {\n    const err = new Error(\"Request timed out after 30s\");\n    const wrapped = wrapError(err, \"https://example.com\");\n    expect(wrapped.code).toBe(ReaderErrorCode.TIMEOUT);\n    expect(wrapped.url).toBe(\"https://example.com\");\n  });\n\n  it(\"wraps DNS errors (ENOTFOUND)\", () => {\n    const err = new Error(\"getaddrinfo ENOTFOUND nonexistent.example.com\");\n    const wrapped = wrapError(err, \"https://nonexistent.example.com/page\");\n    expect(wrapped.code).toBe(ReaderErrorCode.DNS_ERROR);\n  });\n\n  it(\"wraps TLS/SSL errors\", () => {\n    const err = new Error(\"unable to verify the first certificate\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.TLS_ERROR);\n  });\n\n  it(\"wraps connection refused errors\", () => {\n    const err = new Error(\"connect ECONNREFUSED 127.0.0.1:443\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.NETWORK_ERROR);\n  });\n\n  it(\"wraps connection reset errors\", () => {\n    const err = new Error(\"read ECONNRESET\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.NETWORK_ERROR);\n  });\n\n  it(\"wraps proxy errors\", () => {\n    const err = new Error(\"proxy connection failed: tunnel timeout\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.PROXY_CONNECTION_ERROR);\n  });\n\n  it(\"wraps cloudflare errors\", () => {\n    const err = new Error(\"Cloudflare challenge detected\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.CLOUDFLARE_CHALLENGE);\n  });\n\n  it(\"wraps supermarkdown conversion errors\", () => {\n    const err = new Error(\"Supermarkdown conversion failed: Formatting argument out of range\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.MARKDOWN_CONVERSION_FAILED);\n  });\n\n  it(\"wraps unknown errors as UNKNOWN\", () => {\n    const err = new Error(\"something completely unexpected\");\n    const wrapped = wrapError(err);\n    expect(wrapped.code).toBe(ReaderErrorCode.UNKNOWN);\n  });\n\n  it(\"wraps non-Error objects\", () => {\n    const wrapped = wrapError(\"string error\");\n    expect(wrapped.code).toBe(ReaderErrorCode.UNKNOWN);\n    expect(wrapped.message).toBe(\"string error\");\n  });\n\n  it(\"preserves cause chain\", () => {\n    const cause = new Error(\"root\");\n    const err = new Error(\"surface: root\");\n    const wrapped = wrapError(err, \"https://example.com\");\n    expect(wrapped.cause).toBeDefined();\n  });\n});\n"
  },
  {
    "path": "tests/unit/health-tracker.test.ts",
    "content": "import { describe, it, expect, vi, beforeEach } from \"vitest\";\nimport { ProxyHealthTracker } from \"../../src/proxy/health-tracker\";\n\n/**\n * Fake clock that the tracker reads via the injected `now` option.\n */\nfunction fakeClock(start = 1_000_000_000_000) {\n  let current = start;\n  return {\n    now: () => current,\n    advance: (ms: number) => {\n      current += ms;\n    },\n  };\n}\n\ndescribe(\"ProxyHealthTracker\", () => {\n  describe(\"defaults and validation\", () => {\n    it(\"unknown proxy is healthy by default\", () => {\n      const t = new ProxyHealthTracker();\n      expect(t.isHealthy(\"http://unknown\")).toBe(true);\n      expect(t.snapshot(\"http://unknown\")).toBeNull();\n    });\n\n    it(\"rejects invalid failureThreshold\", () => {\n      expect(() => new ProxyHealthTracker({ failureThreshold: 0 })).toThrow();\n      expect(() => new ProxyHealthTracker({ failureThreshold: -1 })).toThrow();\n      expect(() => new ProxyHealthTracker({ failureThreshold: 1.5 })).toThrow();\n    });\n\n    it(\"rejects negative cooldownMs\", () => {\n      expect(() => new ProxyHealthTracker({ cooldownMs: -1 })).toThrow();\n    });\n  });\n\n  describe(\"bench + cooldown (default thresholds)\", () => {\n    it(\"benches after 10 consecutive failures and emits event\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now });\n      const onBench = vi.fn();\n      t.on(\"proxy-benched\", onBench);\n\n      for (let i = 0; i < 9; i++) {\n        t.recordFailure(\"http://dc1\");\n      }\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n      expect(onBench).not.toHaveBeenCalled();\n\n      t.recordFailure(\"http://dc1\"); // 10th\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n      expect(onBench).toHaveBeenCalledTimes(1);\n      expect(onBench.mock.calls[0][0]).toMatchObject({\n        proxyUrl: \"http://dc1\",\n        consecutiveFailures: 10,\n      });\n    });\n\n    it(\"bench event fires exactly once, not on every subsequent failure\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now });\n      const onBench = vi.fn();\n      t.on(\"proxy-benched\", onBench);\n\n      for (let i = 0; i < 15; i++) {\n        t.recordFailure(\"http://dc1\");\n      }\n      expect(onBench).toHaveBeenCalledTimes(1);\n    });\n\n    it(\"success decays failure counter by 3 (not full reset)\", () => {\n      const t = new ProxyHealthTracker();\n      for (let i = 0; i < 9; i++) t.recordFailure(\"http://dc1\");\n      // 9 failures → recordSuccess → decay by 3 → 6 remaining\n      t.recordSuccess(\"http://dc1\");\n      expect(t.snapshot(\"http://dc1\")?.consecutiveFailures).toBe(6);\n      // 4 more failures → 6 + 4 = 10 → benched\n      for (let i = 0; i < 3; i++) t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n      t.recordFailure(\"http://dc1\"); // 10th total\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n    });\n  });\n\n  describe(\"cooldown auto-revive\", () => {\n    it(\"isHealthy returns false until cooldown expires, then true with revive event\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 60_000 });\n      const onRevive = vi.fn();\n      t.on(\"proxy-revived\", onRevive);\n\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n\n      clock.advance(30_000);\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n      expect(onRevive).not.toHaveBeenCalled();\n\n      clock.advance(30_001);\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n      expect(onRevive).toHaveBeenCalledTimes(1);\n    });\n\n    it(\"revive event fires exactly once\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 10 });\n      const onRevive = vi.fn();\n      t.on(\"proxy-revived\", onRevive);\n\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      clock.advance(11);\n      t.isHealthy(\"http://dc1\"); // revives\n      t.isHealthy(\"http://dc1\");\n      t.isHealthy(\"http://dc1\");\n      expect(onRevive).toHaveBeenCalledTimes(1);\n    });\n  });\n\n  describe(\"probationary failure re-benches immediately\", () => {\n    it(\"a single failure after revive re-bumps to benched on the next strike\", () => {\n      // After revive, the counter is still at 10. One more failure *does*\n      // re-bench because it crosses the threshold again on a non-benched\n      // state.\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 1000 });\n      const onBench = vi.fn();\n      t.on(\"proxy-benched\", onBench);\n\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      expect(onBench).toHaveBeenCalledTimes(1);\n\n      clock.advance(1001);\n      expect(t.isHealthy(\"http://dc1\")).toBe(true); // revived\n\n      t.recordFailure(\"http://dc1\"); // probationary failure\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n      expect(onBench).toHaveBeenCalledTimes(2);\n    });\n\n    it(\"a success during probation clears the counter and unbenches\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 1000 });\n      const onRevive = vi.fn();\n      t.on(\"proxy-revived\", onRevive);\n\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      clock.advance(1001);\n      t.isHealthy(\"http://dc1\"); // revives, +1 onRevive\n      t.recordSuccess(\"http://dc1\");\n\n      // After success: counter decrements by 3 (decay model) from 10 → 7.\n      // Not benched because benchedUntil was cleared by isHealthy. No second\n      // revive event from recordSuccess because benchedUntil was already null.\n      expect(onRevive).toHaveBeenCalledTimes(1);\n      expect(t.snapshot(\"http://dc1\")?.consecutiveFailures).toBe(7);\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n    });\n  });\n\n  describe(\"per-proxy isolation\", () => {\n    it(\"benching dc1 does not affect dc2\", () => {\n      const t = new ProxyHealthTracker();\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n      expect(t.isHealthy(\"http://dc2\")).toBe(true);\n    });\n  });\n\n  describe(\"snapshot\", () => {\n    it(\"tracks total successes and failures over time\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({ now: clock.now });\n\n      t.recordFailure(\"http://dc1\");\n      clock.advance(1000);\n      t.recordSuccess(\"http://dc1\");\n      clock.advance(1000);\n      t.recordFailure(\"http://dc1\");\n      clock.advance(1000);\n      t.recordFailure(\"http://dc1\");\n\n      const s = t.snapshot(\"http://dc1\")!;\n      expect(s.totalFailures).toBe(3);\n      expect(s.totalSuccesses).toBe(1);\n      expect(s.consecutiveFailures).toBe(2); // reset by the success\n      expect(s.lastSuccessAt).not.toBeNull();\n      expect(s.lastFailureAt).not.toBeNull();\n      expect(s.healthy).toBe(true);\n    });\n\n    it(\"allSnapshots lists every tracked proxy\", () => {\n      const t = new ProxyHealthTracker();\n      t.recordFailure(\"http://dc1\");\n      t.recordSuccess(\"http://dc2\");\n      t.recordFailure(\"http://dc3\");\n\n      const all = t.allSnapshots();\n      expect(all.map((s) => s.proxyUrl).sort()).toEqual([\n        \"http://dc1\",\n        \"http://dc2\",\n        \"http://dc3\",\n      ]);\n    });\n  });\n\n  describe(\"reset\", () => {\n    it(\"reset drops all state for a proxy\", () => {\n      const t = new ProxyHealthTracker();\n      for (let i = 0; i < 10; i++) t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n\n      t.reset(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n      expect(t.snapshot(\"http://dc1\")).toBeNull();\n    });\n  });\n\n  describe(\"custom thresholds\", () => {\n    it(\"respects custom failureThreshold=3 and cooldownMs=100\", () => {\n      const clock = fakeClock();\n      const t = new ProxyHealthTracker({\n        failureThreshold: 3,\n        cooldownMs: 100,\n        now: clock.now,\n      });\n\n      t.recordFailure(\"http://dc1\");\n      t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n      t.recordFailure(\"http://dc1\");\n      expect(t.isHealthy(\"http://dc1\")).toBe(false);\n\n      clock.advance(101);\n      expect(t.isHealthy(\"http://dc1\")).toBe(true);\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/html-size-guard.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\n\n/**\n * HTML Size Guard tests.\n *\n * The scraper truncates HTML > MAX_HTML_BYTES before markdown conversion.\n * We test the logic in isolation (the guard is inline in scraper.ts).\n */\n\nconst DEFAULT_MAX = 307200; // 300KB\n\nfunction applyGuard(html: string, maxBytes: number = DEFAULT_MAX): { truncated: boolean; output: string } {\n  if (html.length > maxBytes) {\n    return { truncated: true, output: html.slice(0, maxBytes) };\n  }\n  return { truncated: false, output: html };\n}\n\ndescribe(\"HTML size guard\", () => {\n  it(\"passes through HTML under limit unchanged\", () => {\n    const html = \"<p>Short content</p>\";\n    const result = applyGuard(html);\n    expect(result.truncated).toBe(false);\n    expect(result.output).toBe(html);\n  });\n\n  it(\"truncates HTML over limit\", () => {\n    const html = \"x\".repeat(400000);\n    const result = applyGuard(html);\n    expect(result.truncated).toBe(true);\n    expect(result.output.length).toBe(DEFAULT_MAX);\n  });\n\n  it(\"handles exactly-at-limit HTML\", () => {\n    const html = \"x\".repeat(DEFAULT_MAX);\n    const result = applyGuard(html);\n    expect(result.truncated).toBe(false);\n    expect(result.output.length).toBe(DEFAULT_MAX);\n  });\n\n  it(\"handles empty HTML\", () => {\n    const result = applyGuard(\"\");\n    expect(result.truncated).toBe(false);\n    expect(result.output).toBe(\"\");\n  });\n\n  it(\"respects custom limit\", () => {\n    const html = \"x\".repeat(1000);\n    const result = applyGuard(html, 500);\n    expect(result.truncated).toBe(true);\n    expect(result.output.length).toBe(500);\n  });\n\n  it(\"default limit is 300KB\", () => {\n    expect(DEFAULT_MAX).toBe(300 * 1024);\n  });\n});\n"
  },
  {
    "path": "tests/unit/markdown-formatter.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { htmlToMarkdown, formatToMarkdown } from \"../../src/formatters/markdown\";\n\ndescribe(\"htmlToMarkdown\", () => {\n  describe(\"with real supermarkdown\", () => {\n    it(\"converts heading to atx-style markdown\", () => {\n      const result = htmlToMarkdown(\"<h1>Hello World</h1>\");\n      expect(result).toContain(\"# Hello World\");\n    });\n\n    it(\"converts paragraph to plain text\", () => {\n      const result = htmlToMarkdown(\"<p>This is a paragraph.</p>\");\n      expect(result).toContain(\"This is a paragraph.\");\n      // Should not contain any HTML tags\n      expect(result).not.toContain(\"<p>\");\n    });\n\n    it(\"converts links to inline markdown\", () => {\n      const result = htmlToMarkdown(\n        '<p><a href=\"https://example.com\">Click here</a></p>'\n      );\n      expect(result).toContain(\"[Click here](https://example.com)\");\n    });\n\n    it(\"converts unordered lists with - bullet marker\", () => {\n      const result = htmlToMarkdown(\n        \"<ul><li>First</li><li>Second</li><li>Third</li></ul>\"\n      );\n      expect(result).toContain(\"- First\");\n      expect(result).toContain(\"- Second\");\n      expect(result).toContain(\"- Third\");\n    });\n\n    it(\"converts bold and italic text\", () => {\n      const result = htmlToMarkdown(\n        \"<p><strong>bold</strong> and <em>italic</em></p>\"\n      );\n      expect(result).toContain(\"**bold**\");\n      expect(result).toContain(\"*italic*\");\n    });\n\n    it(\"converts code blocks with backtick fence\", () => {\n      const result = htmlToMarkdown(\n        \"<pre><code>const x = 1;</code></pre>\"\n      );\n      expect(result).toContain(\"`\");\n      expect(result).toContain(\"const x = 1;\");\n    });\n\n    it(\"returns empty string for empty input\", () => {\n      const result = htmlToMarkdown(\"\");\n      expect(result).toBe(\"\");\n    });\n\n    it(\"handles whitespace-only HTML\", () => {\n      const result = htmlToMarkdown(\"   \\n\\t  \");\n      // Should return empty or whitespace-only (short input, no fallback triggered)\n      expect(result.trim()).toBe(\"\");\n    });\n\n    it(\"converts tables to GFM format\", () => {\n      const result = htmlToMarkdown(\n        \"<table><thead><tr><th>Name</th><th>Age</th></tr></thead>\" +\n          \"<tbody><tr><td>Alice</td><td>30</td></tr></tbody></table>\"\n      );\n      expect(result).toContain(\"Name\");\n      expect(result).toContain(\"Age\");\n      expect(result).toContain(\"Alice\");\n      expect(result).toContain(\"30\");\n      // GFM tables use pipes\n      expect(result).toContain(\"|\");\n    });\n\n    it(\"converts images to markdown syntax\", () => {\n      const result = htmlToMarkdown(\n        '<img src=\"https://example.com/image.png\" alt=\"A photo\">'\n      );\n      expect(result).toContain(\"![A photo](https://example.com/image.png)\");\n    });\n\n    it(\"handles nested HTML structures\", () => {\n      const result = htmlToMarkdown(\n        '<p>This has <strong>bold</strong>, <em>italic</em>, and <a href=\"https://example.com\">a link</a>.</p>'\n      );\n      expect(result).toContain(\"**bold**\");\n      expect(result).toContain(\"*italic*\");\n      expect(result).toContain(\"[a link](https://example.com)\");\n    });\n  });\n\n  describe(\"fallback behavior\", () => {\n    it(\"falls back to text extraction when convert returns empty on large input\", () => {\n      // Build HTML > 100 chars that would normally convert fine,\n      // but if supermarkdown returned empty, fallback strips tags.\n      // We can't easily mock the Rust module, so we test the fallback\n      // path indirectly: pass in HTML with only script/style tags and\n      // enough length to trigger the fallback threshold check.\n      // The real convert handles this fine, so this test validates\n      // that normal large input does NOT trigger fallback.\n      const largeHtml =\n        \"<p>\" + \"Hello world. \".repeat(20) + \"</p>\";\n      const result = htmlToMarkdown(largeHtml);\n      // Should contain the text (real convert works, no fallback)\n      expect(result).toContain(\"Hello world.\");\n      expect(result.length).toBeGreaterThan(0);\n    });\n  });\n\n  describe(\"formatToMarkdown alias\", () => {\n    it(\"is the same function as htmlToMarkdown\", () => {\n      expect(formatToMarkdown).toBe(htmlToMarkdown);\n    });\n\n    it(\"produces identical output\", () => {\n      const html = \"<h2>Test</h2><p>Content here</p>\";\n      expect(formatToMarkdown(html)).toBe(htmlToMarkdown(html));\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/metadata-extractor.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { extractMetadata } from \"../../src/utils/metadata-extractor\";\n\ndescribe(\"extractMetadata\", () => {\n  describe(\"basic meta tags\", () => {\n    it(\"extracts title from <title> tag\", () => {\n      const html = \"<html><head><title>My Page</title></head><body></body></html>\";\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.title).toBe(\"My Page\");\n    });\n\n    it(\"extracts description from meta tag\", () => {\n      const html = '<html><head><meta name=\"description\" content=\"A great page\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.description).toBe(\"A great page\");\n    });\n\n    it(\"extracts language from html lang attribute\", () => {\n      const html = '<html lang=\"en\"><head></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.language).toBe(\"en\");\n    });\n\n    it(\"extracts author from meta tag\", () => {\n      const html = '<html><head><meta name=\"author\" content=\"John Doe\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.author).toBe(\"John Doe\");\n    });\n\n    it(\"extracts canonical URL\", () => {\n      const html = '<html><head><link rel=\"canonical\" href=\"https://example.com/canonical\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.canonical).toBe(\"https://example.com/canonical\");\n    });\n\n    it(\"extracts favicon\", () => {\n      const html = '<html><head><link rel=\"icon\" href=\"/favicon.ico\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.favicon).toContain(\"favicon.ico\");\n    });\n  });\n\n  describe(\"Open Graph tags\", () => {\n    it(\"extracts og:title\", () => {\n      const html = '<html><head><meta property=\"og:title\" content=\"OG Title\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.openGraph?.title).toBe(\"OG Title\");\n    });\n\n    it(\"extracts og:description\", () => {\n      const html = '<html><head><meta property=\"og:description\" content=\"OG Desc\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.openGraph?.description).toBe(\"OG Desc\");\n    });\n\n    it(\"extracts og:image\", () => {\n      const html = '<html><head><meta property=\"og:image\" content=\"https://example.com/image.jpg\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.openGraph?.image).toBe(\"https://example.com/image.jpg\");\n    });\n  });\n\n  describe(\"Twitter card tags\", () => {\n    it(\"extracts twitter:card\", () => {\n      const html = '<html><head><meta name=\"twitter:card\" content=\"summary_large_image\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.twitter?.card).toBe(\"summary_large_image\");\n    });\n\n    it(\"extracts twitter:title\", () => {\n      const html = '<html><head><meta name=\"twitter:title\" content=\"Tweet Title\"></head><body></body></html>';\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.twitter?.title).toBe(\"Tweet Title\");\n    });\n  });\n\n  describe(\"edge cases\", () => {\n    it(\"handles HTML with no metadata\", () => {\n      const html = \"<html><body><p>Just content</p></body></html>\";\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.title).toBeNull();\n      expect(meta.description).toBeNull();\n    });\n\n    it(\"handles empty HTML\", () => {\n      const meta = extractMetadata(\"\", \"https://example.com\");\n      expect(meta).toBeDefined();\n      expect(meta.title).toBeNull();\n    });\n\n    it(\"handles malformed HTML\", () => {\n      const html = \"<html><head><title>Unclosed\";\n      const meta = extractMetadata(html, \"https://example.com\");\n      expect(meta.title).toBe(\"Unclosed\");\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/postprocess.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { postprocessMarkdown } from \"../../src/formatters/postprocess\";\n\ndescribe(\"postprocessMarkdown\", () => {\n  // ── Skip/Jump to Content removal ──────────────────────────────────\n\n  describe(\"skip to content removal\", () => {\n    it(\"removes [Skip to Content](#main)\", () => {\n      const input = \"[Skip to Content](#main)\\n\\nHello world\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello world\");\n    });\n\n    it(\"removes [Jump to Content](#content)\", () => {\n      const input = \"[Jump to Content](#content)\\n\\nHello world\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello world\");\n    });\n\n    it(\"is case insensitive\", () => {\n      const input = \"[skip to content](#nav)\\n\\nHello world\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello world\");\n    });\n\n    it(\"removes [Skip to main Content](#main-content)\", () => {\n      const input = \"[Skip to main Content](#main-content)\\n\\nBody text\";\n      expect(postprocessMarkdown(input)).toBe(\"Body text\");\n    });\n\n    it(\"removes [JUMP TO MAIN CONTENT](#top)\", () => {\n      const input = \"[JUMP TO MAIN CONTENT](#top)\\n\\nBody text\";\n      expect(postprocessMarkdown(input)).toBe(\"Body text\");\n    });\n\n    it(\"handles various fragment anchors\", () => {\n      const input = \"[Skip to Content](#skip-nav)\\n\\nContent here\";\n      expect(postprocessMarkdown(input)).toBe(\"Content here\");\n    });\n\n    it(\"does NOT remove when linking to a real URL (not a fragment)\", () => {\n      const input = \"[Skip to Content](https://example.com/content)\\n\\nHello\";\n      expect(postprocessMarkdown(input)).toBe(\n        \"[Skip to Content](https://example.com/content)\\n\\nHello\",\n      );\n    });\n  });\n\n  // ── Image link deduplication ──────────────────────────────────────\n\n  describe(\"image link deduplication\", () => {\n    it(\"deduplicates when image URL and link URL match\", () => {\n      const input = \"[![alt text](https://img.com/photo.jpg)](https://img.com/photo.jpg)\";\n      expect(postprocessMarkdown(input)).toBe(\"![alt text](https://img.com/photo.jpg)\");\n    });\n\n    it(\"does NOT deduplicate when URLs differ\", () => {\n      const input =\n        \"[![alt text](https://img.com/photo.jpg)](https://example.com/page)\";\n      expect(postprocessMarkdown(input)).toBe(\n        \"[![alt text](https://img.com/photo.jpg)](https://example.com/page)\",\n      );\n    });\n\n    it(\"deduplicates multiple image links in one document\", () => {\n      const input = [\n        \"[![a](https://x.com/1.png)](https://x.com/1.png)\",\n        \"[![b](https://x.com/2.png)](https://x.com/2.png)\",\n      ].join(\"\\n\\n\");\n      const expected = [\n        \"![a](https://x.com/1.png)\",\n        \"![b](https://x.com/2.png)\",\n      ].join(\"\\n\\n\");\n      expect(postprocessMarkdown(input)).toBe(expected);\n    });\n  });\n\n  // ── Blank line collapsing ─────────────────────────────────────────\n\n  describe(\"blank line collapsing\", () => {\n    it(\"collapses 3 consecutive blank lines to 2\", () => {\n      const input = \"Hello\\n\\n\\nWorld\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello\\n\\nWorld\");\n    });\n\n    it(\"collapses 5 consecutive blank lines to 2\", () => {\n      const input = \"Hello\\n\\n\\n\\n\\nWorld\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello\\n\\nWorld\");\n    });\n\n    it(\"keeps 2 consecutive newlines as-is\", () => {\n      const input = \"Hello\\n\\nWorld\";\n      expect(postprocessMarkdown(input)).toBe(\"Hello\\n\\nWorld\");\n    });\n  });\n\n  // ── Trim ──────────────────────────────────────────────────────────\n\n  describe(\"trim\", () => {\n    it(\"trims leading and trailing whitespace\", () => {\n      const input = \"   \\n\\nHello world\\n\\n   \";\n      expect(postprocessMarkdown(input)).toBe(\"Hello world\");\n    });\n  });\n\n  // ── Edge cases ────────────────────────────────────────────────────\n\n  describe(\"edge cases\", () => {\n    it(\"handles empty input\", () => {\n      expect(postprocessMarkdown(\"\")).toBe(\"\");\n    });\n  });\n\n  // ── Combined ──────────────────────────────────────────────────────\n\n  describe(\"combined patterns\", () => {\n    it(\"applies all transformations in one document\", () => {\n      const input = [\n        \"  \",\n        \"[Skip to Content](#main)\",\n        \"\",\n        \"\",\n        \"\",\n        \"\",\n        \"# Title\",\n        \"\",\n        \"[![hero](https://img.com/hero.jpg)](https://img.com/hero.jpg)\",\n        \"\",\n        \"Some content here.\",\n        \"\",\n        \"\",\n        \"\",\n        \"Footer text\",\n        \"  \",\n      ].join(\"\\n\");\n\n      const expected = [\n        \"# Title\",\n        \"\",\n        \"![hero](https://img.com/hero.jpg)\",\n        \"\",\n        \"Some content here.\",\n        \"\",\n        \"Footer text\",\n      ].join(\"\\n\");\n\n      expect(postprocessMarkdown(input)).toBe(expected);\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/proxy-bound-browser.test.ts",
    "content": "import { describe, it, expect, vi } from \"vitest\";\nimport pino from \"pino\";\nimport {\n  ProxyBoundBrowser,\n  redactProxyUrl,\n  type HeroFactory,\n  type HeroLike,\n  type TabLike,\n} from \"../../src/browser/proxy-bound-browser\";\n\n/**\n * Silent logger so tests don't spam stdout.\n */\nconst silentLogger = pino({ level: \"silent\" });\n\n/**\n * Fake Tab returned by fake Hero's newTab().\n */\ninterface FakeTab extends TabLike {\n  tabClosed: boolean;\n}\n\nfunction makeFakeTab(): FakeTab {\n  return {\n    tabClosed: false,\n    async goto() { return undefined; },\n    get url() { return Promise.resolve(\"about:blank\"); },\n    get document() { return {} as unknown; },\n    async waitForLoad() {},\n    async waitForPaintingStable() {},\n    async waitForElement() { return undefined as unknown; },\n    async close() { this.tabClosed = true; },\n  };\n}\n\n/**\n * Fake Hero that records the config it was launched with and optionally\n * delays/throws on close. Good enough for exercising ProxyBoundBrowser\n * without importing @ulixee/hero.\n */\ninterface FakeHero extends HeroLike {\n  config: Record<string, unknown>;\n  closed: boolean;\n  tabs: FakeTab[];\n}\n\nfunction makeFakeFactory(opts: {\n  failOnCreate?: Error;\n  slowClose?: number;\n  failOnClose?: Error;\n} = {}): { factory: HeroFactory; instances: FakeHero[]; createCount: number } {\n  const instances: FakeHero[] = [];\n  let createCount = 0;\n  const factory: HeroFactory = {\n    create(config: Record<string, unknown>) {\n      createCount++;\n      if (opts.failOnCreate) throw opts.failOnCreate;\n      const hero: FakeHero = {\n        config,\n        closed: false,\n        tabs: [],\n        async newTab() {\n          const tab = makeFakeTab();\n          this.tabs.push(tab);\n          return tab;\n        },\n        async closeTab(tab: TabLike) {\n          await tab.close();\n        },\n        async close() {\n          if (opts.slowClose) {\n            await new Promise((r) => setTimeout(r, opts.slowClose));\n          }\n          if (opts.failOnClose) throw opts.failOnClose;\n          this.closed = true;\n        },\n      };\n      instances.push(hero);\n      return hero;\n    },\n  };\n  return {\n    factory,\n    instances,\n    get createCount() {\n      return createCount;\n    },\n  };\n}\n\n/**\n * Helper: let microtasks run so pLimit can move its queue forward.\n */\nasync function tick(n = 1) {\n  for (let i = 0; i < n; i++) await new Promise((r) => setImmediate(r));\n}\n\ndescribe(\"ProxyBoundBrowser\", () => {\n  describe(\"construction\", () => {\n    it(\"throws on invalid maxTabs\", () => {\n      const { factory } = makeFakeFactory();\n      expect(\n        () =>\n          new ProxyBoundBrowser({\n            proxyUrl: \"http://p\",\n            maxTabs: 0,\n            heroFactory: factory,\n            logger: silentLogger,\n          }),\n      ).toThrow();\n    });\n\n    it(\"throws on invalid retireAfterPages\", () => {\n      const { factory } = makeFakeFactory();\n      expect(\n        () =>\n          new ProxyBoundBrowser({\n            proxyUrl: \"http://p\",\n            retireAfterPages: 0,\n            heroFactory: factory,\n            logger: silentLogger,\n          }),\n      ).toThrow();\n    });\n\n    it(\"defaults maxTabs=2 and retireAfterPages=100\", () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      expect(b.maxTabs).toBe(2);\n      expect(b.retireAfterPages).toBe(100);\n    });\n  });\n\n  describe(\"ready gate\", () => {\n    it(\"resolves once Hero is launched\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      expect(b.getState()).toBe(\"active\");\n      expect(instances).toHaveLength(1);\n    });\n\n    it(\"rejects if Hero construction throws\", async () => {\n      const err = new Error(\"launch boom\");\n      const { factory } = makeFakeFactory({ failOnCreate: err });\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await expect(b.ready).rejects.toThrow(\"launch boom\");\n      expect(b.getState()).toBe(\"closed\");\n    });\n  });\n\n  describe(\"proxy binding\", () => {\n    it(\"burns the proxy URL into the Hero config\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const url = \"http://user:pass@dc1.example.com:8080\";\n      const b = new ProxyBoundBrowser({\n        proxyUrl: url,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      expect(instances[0].config.upstreamProxyUrl).toBe(url);\n    });\n\n    it(\"sets no upstream proxy for the direct lane\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: null,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      expect(instances[0].config.upstreamProxyUrl).toBeUndefined();\n    });\n\n    it(\"stable UA across browsers with the same proxy URL\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const url = \"http://x:y@host:1\";\n      const a = new ProxyBoundBrowser({\n        proxyUrl: url,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      const b = new ProxyBoundBrowser({\n        proxyUrl: url,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await Promise.all([a.ready, b.ready]);\n      expect(instances[0].config.userAgent).toBe(instances[1].config.userAgent);\n    });\n  });\n\n  describe(\"withPage tab limiting\", () => {\n    it(\"serializes beyond maxTabs\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        maxTabs: 2,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n\n      let active = 0;\n      let peak = 0;\n      const observe = async () => {\n        active++;\n        peak = Math.max(peak, active);\n        await new Promise((r) => setTimeout(r, 5));\n        active--;\n      };\n\n      await Promise.all([\n        b.withPage(async () => { await observe(); }),\n        b.withPage(async () => { await observe(); }),\n        b.withPage(async () => { await observe(); }),\n        b.withPage(async () => { await observe(); }),\n        b.withPage(async () => { await observe(); }),\n      ]);\n\n      expect(peak).toBeLessThanOrEqual(2);\n    });\n\n    it(\"increments totalPages on every withPage completion\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      await b.withPage(async () => 1);\n      await b.withPage(async () => 2);\n      await b.withPage(async () => 3);\n      expect(b.getStats().totalPages).toBe(3);\n    });\n\n    it(\"increments totalPages even on error\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      await expect(\n        b.withPage(async () => {\n          throw new Error(\"nope\");\n        }),\n      ).rejects.toThrow(\"nope\");\n      expect(b.getStats().totalPages).toBe(1);\n    });\n  });\n\n  describe(\"retirement draining\", () => {\n    it(\"waits for in-flight tabs to finish before closing\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        maxTabs: 2,\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n\n      let inFlightResolve!: () => void;\n      const inFlight = new Promise<void>((r) => (inFlightResolve = r));\n\n      const page = b.withPage(async () => {\n        await inFlight;\n        return \"done\";\n      });\n\n      await tick(2);\n      // Retire while a tab is in flight. Should not close the Hero yet.\n      const retirePromise = b.retire();\n      await tick(2);\n      expect(instances[0].closed).toBe(false);\n      expect(b.getState()).toBe(\"retired\");\n\n      inFlightResolve();\n      await page;\n      await retirePromise;\n\n      expect(instances[0].closed).toBe(true);\n      expect(b.getState()).toBe(\"closed\");\n    });\n\n    it(\"rejects new withPage calls once retired\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      await b.retire();\n      await expect(b.withPage(async () => 1)).rejects.toThrow(/retired|closed/);\n    });\n\n    it(\"is safe to call retire multiple times\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      await Promise.all([b.retire(), b.retire(), b.retire()]);\n      expect(instances[0].closed).toBe(true);\n    });\n\n    it(\"swallows close errors during retire\", async () => {\n      const { factory } = makeFakeFactory({\n        failOnClose: new Error(\"close boom\"),\n      });\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      // Should not throw\n      await b.retire();\n      expect(b.getState()).toBe(\"closed\");\n    });\n  });\n\n  describe(\"relaunch\", () => {\n    it(\"closes current Hero and launches a fresh one with the same proxy\", async () => {\n      const fakeFactory = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: fakeFactory.factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      expect(fakeFactory.createCount).toBe(1);\n\n      await b.relaunch();\n      expect(fakeFactory.createCount).toBe(2);\n      expect(fakeFactory.instances[0].closed).toBe(true);\n      expect(b.getState()).toBe(\"active\");\n      expect(b.getStats().totalPages).toBe(0);\n    });\n\n    it(\"accepts withPage after relaunch\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      await b.relaunch();\n      const result = await b.withPage(async () => \"ok\");\n      expect(result).toBe(\"ok\");\n    });\n  });\n\n  describe(\"auto-recycle after retireAfterPages\", () => {\n    it(\"relaunches after hitting the threshold\", async () => {\n      const fakeFactory = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        retireAfterPages: 3,\n        heroFactory: fakeFactory.factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n\n      await b.withPage(async () => 1);\n      await b.withPage(async () => 2);\n      await b.withPage(async () => 3);\n\n      // Recycle is scheduled via setImmediate inside the 3rd withPage's\n      // finally. Poll briefly for the state machine to settle into the new\n      // `active` state with a freshly-launched Hero.\n      for (let i = 0; i < 50 && fakeFactory.createCount < 2; i++) {\n        await tick(1);\n      }\n      await b.ready;\n\n      expect(fakeFactory.createCount).toBe(2);\n      expect(b.getState()).toBe(\"active\");\n      expect(b.getStats().totalPages).toBe(0);\n    });\n  });\n\n  describe(\"stats\", () => {\n    it(\"reports state, activeTabs, totalPages, fingerprintIndex\", async () => {\n      const { factory } = makeFakeFactory();\n      const b = new ProxyBoundBrowser({\n        proxyUrl: \"http://p\",\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await b.ready;\n      const s = b.getStats();\n      expect(s.state).toBe(\"active\");\n      expect(s.activeTabs).toBe(0);\n      expect(s.totalPages).toBe(0);\n      expect(s.fingerprintIndex).toBeGreaterThanOrEqual(0);\n    });\n  });\n});\n\ndescribe(\"redactProxyUrl\", () => {\n  it(\"strips credentials but keeps host\", () => {\n    expect(redactProxyUrl(\"http://user:pass@host:8080\")).toBe(\"http://***@host:8080\");\n  });\n\n  it(\"returns 'direct' for null\", () => {\n    expect(redactProxyUrl(null)).toBe(\"direct\");\n  });\n\n  it(\"handles URLs without credentials\", () => {\n    expect(redactProxyUrl(\"http://host:8080\")).toBe(\"http://host:8080\");\n  });\n\n  it(\"returns a safe placeholder for malformed URLs\", () => {\n    expect(redactProxyUrl(\"not a url\")).toBe(\"<invalid-proxy-url>\");\n  });\n});\n"
  },
  {
    "path": "tests/unit/proxy-config.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { createProxyUrl, parseProxyUrl } from \"../../src/proxy/config\";\n\ndescribe(\"createProxyUrl\", () => {\n  it(\"creates URL containing host and port\", () => {\n    const url = createProxyUrl({ host: \"proxy.example.com\", port: 8080 });\n    expect(url).toContain(\"proxy.example.com\");\n    expect(url).toContain(\"8080\");\n  });\n\n  it(\"includes auth credentials when provided\", () => {\n    const url = createProxyUrl({ host: \"proxy.example.com\", port: 8080, username: \"user\", password: \"pass\" });\n    expect(url).toContain(\"user\");\n    expect(url).toContain(\"pass\");\n    expect(url).toContain(\"proxy.example.com\");\n  });\n\n  it(\"returns direct URL if provided\", () => {\n    const url = createProxyUrl({ url: \"http://custom-proxy:9999\" });\n    expect(url).toBe(\"http://custom-proxy:9999\");\n  });\n});\n\ndescribe(\"parseProxyUrl\", () => {\n  it(\"parses simple proxy URL\", () => {\n    const result = parseProxyUrl(\"http://proxy.example.com:8080\");\n    expect(result.host).toBe(\"proxy.example.com\");\n    expect(result.port).toBe(8080);\n  });\n\n  it(\"parses proxy URL with auth\", () => {\n    const result = parseProxyUrl(\"http://user:pass@proxy.example.com:8080\");\n    expect(result.host).toBe(\"proxy.example.com\");\n    expect(result.port).toBe(8080);\n    expect(result.username).toBe(\"user\");\n    expect(result.password).toBe(\"pass\");\n  });\n\n  it(\"handles https proxy URLs\", () => {\n    const result = parseProxyUrl(\"https://proxy.example.com:443\");\n    expect(result.host).toBe(\"proxy.example.com\");\n    // Port may be number or undefined depending on implementation\n    expect(result.port === 443 || result.port === undefined).toBe(true);\n  });\n});\n"
  },
  {
    "path": "tests/unit/proxy-gate.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { PerProxyGate } from \"../../src/proxy/proxy-gate\";\n\n/**\n * Helper: a deferred that you can resolve from outside. Tests use this to\n * hold slots for as long as they want.\n */\nfunction defer<T = void>() {\n  let resolve!: (v: T) => void;\n  const promise = new Promise<T>((r) => (resolve = r));\n  return { promise, resolve };\n}\n\n/**\n * Helper: let microtasks and timers flush before the next assertion. Gives\n * pLimit a chance to move its queue forward.\n */\nasync function tick(n = 1) {\n  for (let i = 0; i < n; i++) {\n    await new Promise((r) => setImmediate(r));\n  }\n}\n\ndescribe(\"PerProxyGate\", () => {\n  describe(\"constructor\", () => {\n    it(\"defaults to maxConcurrentPerProxy=2\", async () => {\n      const gate = new PerProxyGate();\n      const d1 = defer();\n      const d2 = defer();\n      const d3 = defer();\n\n      // Hold 2 slots\n      const acquired: Array<Promise<void>> = [];\n      const releases: Array<() => void> = [];\n      for (const d of [d1, d2]) {\n        const p = gate.acquire(\"http://dc1\").then((r) => {\n          releases.push(r);\n          return d.promise;\n        });\n        acquired.push(p);\n      }\n      await tick(2);\n\n      // Both should be running\n      expect(gate.stats(\"http://dc1\")?.active).toBe(2);\n\n      // A third should be queued\n      let thirdAcquired = false;\n      const third = gate.acquire(\"http://dc1\").then((r) => {\n        thirdAcquired = true;\n        releases.push(r);\n        return d3.promise;\n      });\n      await tick(2);\n      expect(thirdAcquired).toBe(false);\n      expect(gate.stats(\"http://dc1\")?.queued).toBe(1);\n\n      // Release one — third should run\n      d1.resolve();\n      releases[0]!();\n      await tick(2);\n      expect(thirdAcquired).toBe(true);\n\n      // Cleanup\n      d2.resolve();\n      d3.resolve();\n      releases.forEach((r) => r());\n      await Promise.all([...acquired, third]);\n    });\n\n    it(\"rejects non-integer or <1 max\", () => {\n      expect(() => new PerProxyGate({ maxConcurrentPerProxy: 0 })).toThrow();\n      expect(() => new PerProxyGate({ maxConcurrentPerProxy: -1 })).toThrow();\n      expect(() => new PerProxyGate({ maxConcurrentPerProxy: 1.5 })).toThrow();\n    });\n\n    it(\"accepts custom maxConcurrentPerProxy\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const d1 = defer();\n      const d2 = defer();\n\n      let secondAcquired = false;\n      const r1p = gate.acquire(\"http://p\").then((r) => d1.promise.then(() => r));\n      await tick(2);\n      const r2p = gate.acquire(\"http://p\").then((r) => {\n        secondAcquired = true;\n        return d2.promise.then(() => r);\n      });\n      await tick(2);\n\n      expect(secondAcquired).toBe(false);\n      expect(gate.stats(\"http://p\")?.active).toBe(1);\n\n      // Release first\n      d1.resolve();\n      const r1 = await r1p;\n      r1();\n      await tick(2);\n\n      expect(secondAcquired).toBe(true);\n      d2.resolve();\n      const r2 = await r2p;\n      r2();\n    });\n  });\n\n  describe(\"per-proxy isolation\", () => {\n    it(\"does not cross-gate different proxy URLs\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const d1 = defer();\n      const d2 = defer();\n\n      // Hold dc1's slot\n      const r1p = gate.acquire(\"http://dc1\").then((r) => d1.promise.then(() => r));\n      await tick(2);\n\n      // dc2 should NOT be blocked by dc1\n      let dc2Ok = false;\n      const r2p = gate.acquire(\"http://dc2\").then((r) => {\n        dc2Ok = true;\n        return d2.promise.then(() => r);\n      });\n      await tick(2);\n\n      expect(dc2Ok).toBe(true);\n      expect(gate.stats(\"http://dc1\")?.active).toBe(1);\n      expect(gate.stats(\"http://dc2\")?.active).toBe(1);\n\n      d1.resolve();\n      d2.resolve();\n      (await r1p)();\n      (await r2p)();\n    });\n\n    it(\"direct lane (null proxyUrl) never blocks\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n\n      // Acquire 5 direct slots all at once\n      const releases = await Promise.all([\n        gate.acquire(null),\n        gate.acquire(undefined),\n        gate.acquire(null),\n        gate.acquire(null),\n        gate.acquire(null),\n      ]);\n\n      expect(releases).toHaveLength(5);\n      releases.forEach((r) => r());\n    });\n\n    it(\"direct lane does not appear in stats (no gate is created)\", async () => {\n      const gate = new PerProxyGate();\n      const release = await gate.acquire(null);\n      expect(gate.allStats()).toEqual([]);\n      release();\n    });\n  });\n\n  describe(\"withSlot\", () => {\n    it(\"releases on success\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const result = await gate.withSlot(\"http://p\", async () => 42);\n      expect(result).toBe(42);\n      await tick(2);\n      expect(gate.stats(\"http://p\")?.active).toBe(0);\n    });\n\n    it(\"releases on error\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      await expect(\n        gate.withSlot(\"http://p\", async () => {\n          throw new Error(\"boom\");\n        }),\n      ).rejects.toThrow(\"boom\");\n      await tick(2);\n      expect(gate.stats(\"http://p\")?.active).toBe(0);\n\n      // Must be usable again after the failure\n      const ok = await gate.withSlot(\"http://p\", async () => \"ok\");\n      expect(ok).toBe(\"ok\");\n    });\n\n    it(\"serializes withSlot calls on the same proxy\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const order: string[] = [];\n      const a = gate.withSlot(\"http://p\", async () => {\n        order.push(\"a-start\");\n        await tick(1);\n        order.push(\"a-end\");\n      });\n      const b = gate.withSlot(\"http://p\", async () => {\n        order.push(\"b-start\");\n        order.push(\"b-end\");\n      });\n      await Promise.all([a, b]);\n      expect(order).toEqual([\"a-start\", \"a-end\", \"b-start\", \"b-end\"]);\n    });\n  });\n\n  describe(\"release idempotency\", () => {\n    it(\"release function is safe to call multiple times\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const r = await gate.acquire(\"http://p\");\n      r();\n      r();\n      r();\n      // Next acquire should succeed immediately\n      const r2 = await gate.acquire(\"http://p\");\n      expect(gate.stats(\"http://p\")?.active).toBe(1);\n      r2();\n    });\n  });\n\n  describe(\"per-proxy override\", () => {\n    it(\"setOverride tightens the cap for a specific URL\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 2 });\n      gate.setOverride(\"http://amazon\", 1);\n\n      const d1 = defer();\n      let secondAcquired = false;\n\n      const r1p = gate.acquire(\"http://amazon\").then((r) => d1.promise.then(() => r));\n      await tick(2);\n\n      const r2p = gate.acquire(\"http://amazon\").then((r) => {\n        secondAcquired = true;\n        return r;\n      });\n      await tick(2);\n\n      expect(secondAcquired).toBe(false);\n      d1.resolve();\n      (await r1p)();\n      await tick(2);\n      expect(secondAcquired).toBe(true);\n      (await r2p)();\n    });\n\n    it(\"override only affects the named URL\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 2 });\n      gate.setOverride(\"http://amazon\", 1);\n\n      // Other proxies still get the default of 2\n      const d1 = defer();\n      const d2 = defer();\n      const r1p = gate.acquire(\"http://other\").then((r) => d1.promise.then(() => r));\n      const r2p = gate.acquire(\"http://other\").then((r) => d2.promise.then(() => r));\n      await tick(2);\n\n      expect(gate.stats(\"http://other\")?.active).toBe(2);\n\n      d1.resolve();\n      d2.resolve();\n      (await r1p)();\n      (await r2p)();\n    });\n\n    it(\"rejects invalid override values\", () => {\n      const gate = new PerProxyGate();\n      expect(() => gate.setOverride(\"http://p\", 0)).toThrow();\n      expect(() => gate.setOverride(\"http://p\", -1)).toThrow();\n      expect(() => gate.setOverride(\"http://p\", 1.5)).toThrow();\n    });\n  });\n\n  describe(\"stats\", () => {\n    it(\"returns null for unknown URL\", () => {\n      const gate = new PerProxyGate();\n      expect(gate.stats(\"http://unknown\")).toBeNull();\n    });\n\n    it(\"reports active + queued counts\", async () => {\n      const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 });\n      const d1 = defer();\n      const r1p = gate.acquire(\"http://p\").then((r) => d1.promise.then(() => r));\n      await tick(2);\n      // Queue 2 more\n      const r2p = gate.acquire(\"http://p\");\n      const r3p = gate.acquire(\"http://p\");\n      await tick(2);\n\n      const s = gate.stats(\"http://p\");\n      expect(s).toEqual({\n        proxyUrl: \"http://p\",\n        max: 1,\n        active: 1,\n        queued: 2,\n      });\n\n      d1.resolve();\n      (await r1p)();\n      (await r2p)();\n      (await r3p)();\n    });\n\n    it(\"allStats lists every known gate\", async () => {\n      const gate = new PerProxyGate();\n      await (await gate.acquire(\"http://a\"))();\n      await (await gate.acquire(\"http://b\"))();\n      const all = gate.allStats();\n      expect(all.map((s) => s.proxyUrl).sort()).toEqual([\"http://a\", \"http://b\"]);\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/proxy-verify.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { verifyProxies, verifyProxiesOrThrow } from \"../../src/proxy/verify\";\nimport type { EgressIpFetcher } from \"../../src/proxy/verify\";\n\n/**\n * Build an injected fetcher that maps proxy URLs -> mocked egress behaviour.\n * Each entry is either a string (the egress IP to return) or an Error\n * (the failure to throw).\n */\nfunction makeFakeFetcher(\n  routes: Record<string, string | Error>,\n): EgressIpFetcher {\n  return async (proxyUrl) => {\n    const v = routes[proxyUrl];\n    if (v === undefined) {\n      throw new Error(`fake fetcher: no route for ${proxyUrl}`);\n    }\n    if (v instanceof Error) throw v;\n    return v;\n  };\n}\n\ndescribe(\"verifyProxies\", () => {\n  it(\"returns empty result for undefined pools\", async () => {\n    const result = await verifyProxies(undefined);\n    expect(result).toEqual({ verified: [], failed: [] });\n  });\n\n  it(\"returns empty result for empty pools\", async () => {\n    const result = await verifyProxies({});\n    expect(result.verified).toEqual([]);\n    expect(result.failed).toEqual([]);\n  });\n\n  it(\"verifies a single datacenter proxy and returns its egress IP\", async () => {\n    const fetcher = makeFakeFetcher({ \"http://dc1\": \"1.2.3.4\" });\n    const result = await verifyProxies(\n      { datacenter: [{ url: \"http://dc1\" }] },\n      { fetcher },\n    );\n    expect(result.failed).toEqual([]);\n    expect(result.verified).toEqual([\n      { proxyUrl: \"http://dc1\", egressIp: \"1.2.3.4\", tier: \"datacenter\" },\n    ]);\n  });\n\n  it(\"tags residential proxies with the right tier\", async () => {\n    const fetcher = makeFakeFetcher({ \"http://res1\": \"5.6.7.8\" });\n    const result = await verifyProxies(\n      { residential: [{ url: \"http://res1\" }] },\n      { fetcher },\n    );\n    expect(result.verified[0]).toMatchObject({ tier: \"residential\" });\n  });\n\n  it(\"verifies datacenter and residential pools together\", async () => {\n    const fetcher = makeFakeFetcher({\n      \"http://dc1\": \"1.1.1.1\",\n      \"http://dc2\": \"2.2.2.2\",\n      \"http://res1\": \"9.9.9.9\",\n    });\n    const result = await verifyProxies(\n      {\n        datacenter: [{ url: \"http://dc1\" }, { url: \"http://dc2\" }],\n        residential: [{ url: \"http://res1\" }],\n      },\n      { fetcher },\n    );\n    expect(result.failed).toEqual([]);\n    expect(result.verified).toHaveLength(3);\n    const tiers = result.verified.map((v) => v.tier).sort();\n    expect(tiers).toEqual([\"datacenter\", \"datacenter\", \"residential\"]);\n  });\n\n  it(\"collects failures alongside successes\", async () => {\n    const fetcher = makeFakeFetcher({\n      \"http://dc1\": \"1.1.1.1\",\n      \"http://dc2\": new Error(\"connection refused\"),\n      \"http://res1\": \"9.9.9.9\",\n    });\n    const result = await verifyProxies(\n      {\n        datacenter: [{ url: \"http://dc1\" }, { url: \"http://dc2\" }],\n        residential: [{ url: \"http://res1\" }],\n      },\n      { fetcher },\n    );\n    expect(result.verified).toHaveLength(2);\n    expect(result.failed).toEqual([\n      { proxyUrl: \"http://dc2\", tier: \"datacenter\", error: \"connection refused\" },\n    ]);\n  });\n\n  it(\"ignores entries without a URL\", async () => {\n    const fetcher = makeFakeFetcher({ \"http://dc1\": \"1.1.1.1\" });\n    const result = await verifyProxies(\n      { datacenter: [{ url: \"http://dc1\" }, {}, { url: \"\" }] },\n      { fetcher },\n    );\n    expect(result.verified).toHaveLength(1);\n    expect(result.failed).toEqual([]);\n  });\n});\n\ndescribe(\"verifyProxiesOrThrow\", () => {\n  it(\"returns the verified list when everything succeeds\", async () => {\n    const fetcher = makeFakeFetcher({ \"http://dc1\": \"1.1.1.1\" });\n    const verified = await verifyProxiesOrThrow(\n      { datacenter: [{ url: \"http://dc1\" }] },\n      { fetcher },\n    );\n    expect(verified).toHaveLength(1);\n    expect(verified[0].egressIp).toBe(\"1.1.1.1\");\n  });\n\n  it(\"throws a multi-line error listing every failed proxy\", async () => {\n    const fetcher = makeFakeFetcher({\n      \"http://dc1\": new Error(\"EHOSTUNREACH\"),\n      \"http://res1\": new Error(\"HTTP 407 from api.ipify.org\"),\n    });\n    await expect(\n      verifyProxiesOrThrow(\n        {\n          datacenter: [{ url: \"http://dc1\" }],\n          residential: [{ url: \"http://res1\" }],\n        },\n        { fetcher },\n      ),\n    ).rejects.toThrow(/Proxy verification failed for 2 proxy/);\n  });\n\n  it(\"redacts proxy credentials in the error message\", async () => {\n    const fetcher = makeFakeFetcher({\n      \"http://user:secret@dc1.example.com:8080\": new Error(\"nope\"),\n    });\n    let captured: string = \"\";\n    try {\n      await verifyProxiesOrThrow(\n        { datacenter: [{ url: \"http://user:secret@dc1.example.com:8080\" }] },\n        { fetcher },\n      );\n    } catch (e: unknown) {\n      captured = e instanceof Error ? e.message : String(e);\n    }\n    expect(captured).toMatch(/dc1\\.example\\.com/);\n    expect(captured).not.toContain(\"secret\");\n    expect(captured).not.toContain(\"user:secret\");\n  });\n\n  it(\"does not throw when there are zero proxies\", async () => {\n    const verified = await verifyProxiesOrThrow(undefined);\n    expect(verified).toEqual([]);\n  });\n});\n"
  },
  {
    "path": "tests/unit/robots-parser.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport {\n  parseRobotsTxt,\n  isPathAllowed,\n  isUrlAllowed,\n  type RobotsRules,\n} from \"../../src/utils/robots-parser\";\n\ndescribe(\"parseRobotsTxt\", () => {\n  it(\"should parse a basic disallow rule\", () => {\n    const content = `User-agent: *\\nDisallow: /private`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n    expect(rules.allowedPaths).toEqual([]);\n    expect(rules.crawlDelay).toBeNull();\n  });\n\n  it(\"should parse multiple disallow rules\", () => {\n    const content = `User-agent: *\\nDisallow: /private\\nDisallow: /admin\\nDisallow: /secret`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\", \"/admin\", \"/secret\"]);\n  });\n\n  it(\"should parse allow rules alongside disallow rules\", () => {\n    const content = `User-agent: *\\nDisallow: /private\\nAllow: /private/public`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n    expect(rules.allowedPaths).toEqual([\"/private/public\"]);\n  });\n\n  it(\"should parse crawl-delay and convert to milliseconds\", () => {\n    const content = `User-agent: *\\nCrawl-delay: 2`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.crawlDelay).toBe(2000);\n  });\n\n  it(\"should parse fractional crawl-delay\", () => {\n    const content = `User-agent: *\\nCrawl-delay: 0.5`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.crawlDelay).toBe(500);\n  });\n\n  it(\"should match a specific user agent\", () => {\n    const content = `User-agent: Googlebot\\nDisallow: /no-google\\n\\nUser-agent: *\\nDisallow: /no-all`;\n    const rules = parseRobotsTxt(content, \"Googlebot\");\n    expect(rules.disallowedPaths).toContain(\"/no-google\");\n    expect(rules.disallowedPaths).toContain(\"/no-all\");\n  });\n\n  it(\"should match user agent case-insensitively\", () => {\n    const content = `User-agent: MyBot\\nDisallow: /blocked`;\n    const rules = parseRobotsTxt(content, \"mybot\");\n    expect(rules.disallowedPaths).toEqual([\"/blocked\"]);\n  });\n\n  it(\"should only collect rules under matching user agent sections\", () => {\n    const content = `User-agent: OtherBot\\nDisallow: /other-only\\n\\nUser-agent: *\\nDisallow: /all`;\n    const rules = parseRobotsTxt(content, \"MyBot\");\n    expect(rules.disallowedPaths).not.toContain(\"/other-only\");\n    expect(rules.disallowedPaths).toContain(\"/all\");\n  });\n\n  it(\"should use wildcard agent by default\", () => {\n    const content = `User-agent: *\\nDisallow: /blocked`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/blocked\"]);\n  });\n\n  it(\"should ignore comments\", () => {\n    const content = `# This is a comment\\nUser-agent: *\\n# Another comment\\nDisallow: /private`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n  });\n\n  it(\"should ignore empty lines\", () => {\n    const content = `\\nUser-agent: *\\n\\n\\nDisallow: /private\\n\\n`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n  });\n\n  it(\"should return empty rules for empty content\", () => {\n    const rules = parseRobotsTxt(\"\");\n    expect(rules.disallowedPaths).toEqual([]);\n    expect(rules.allowedPaths).toEqual([]);\n    expect(rules.crawlDelay).toBeNull();\n  });\n\n  it(\"should ignore lines without a colon\", () => {\n    const content = `User-agent: *\\nThis is not a directive\\nDisallow: /private`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n  });\n\n  it(\"should skip empty Disallow values\", () => {\n    const content = `User-agent: *\\nDisallow:\\nDisallow: /private`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.disallowedPaths).toEqual([\"/private\"]);\n  });\n\n  it(\"should ignore non-numeric crawl-delay\", () => {\n    const content = `User-agent: *\\nCrawl-delay: abc`;\n    const rules = parseRobotsTxt(content);\n    expect(rules.crawlDelay).toBeNull();\n  });\n});\n\ndescribe(\"isPathAllowed\", () => {\n  it(\"should disallow an exact path match\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/private\", rules)).toBe(false);\n  });\n\n  it(\"should disallow a prefix match\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/private/secret\", rules)).toBe(false);\n  });\n\n  it(\"should allow paths that do not match any disallow rule\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/public\", rules)).toBe(true);\n  });\n\n  it(\"should handle wildcard patterns\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private/*\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/private/foo\", rules)).toBe(false);\n    expect(isPathAllowed(\"/private/bar/baz\", rules)).toBe(false);\n  });\n\n  it(\"should handle $ end anchor\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/*.pdf$\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/document.pdf\", rules)).toBe(false);\n    expect(isPathAllowed(\"/document.pdf?id=1\", rules)).toBe(true);\n  });\n\n  it(\"should give allow precedence over disallow\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [\"/private/public\"],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/private/public\", rules)).toBe(true);\n    expect(isPathAllowed(\"/private/secret\", rules)).toBe(false);\n  });\n\n  it(\"should default to allowed when no rules match\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/anything\", rules)).toBe(true);\n  });\n\n  it(\"should normalize paths without leading slash\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"private\", rules)).toBe(false);\n  });\n\n  it(\"should handle wildcard in the middle of a pattern\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/api/*/internal\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isPathAllowed(\"/api/v1/internal\", rules)).toBe(false);\n    expect(isPathAllowed(\"/api/v2/internal\", rules)).toBe(false);\n    expect(isPathAllowed(\"/api/v1/public\", rules)).toBe(true);\n  });\n});\n\ndescribe(\"isUrlAllowed\", () => {\n  it(\"should return true when rules are null\", () => {\n    expect(isUrlAllowed(\"https://example.com/anything\", null)).toBe(true);\n  });\n\n  it(\"should check the pathname of a full URL\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isUrlAllowed(\"https://example.com/private\", rules)).toBe(false);\n    expect(isUrlAllowed(\"https://example.com/public\", rules)).toBe(true);\n  });\n\n  it(\"should include query string in path matching\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/search?q=blocked\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isUrlAllowed(\"https://example.com/search?q=blocked\", rules)).toBe(false);\n    expect(isUrlAllowed(\"https://example.com/search?q=allowed\", rules)).toBe(true);\n  });\n\n  it(\"should return true for an invalid URL\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    expect(isUrlAllowed(\"not-a-valid-url\", rules)).toBe(true);\n  });\n\n  it(\"should handle URLs with paths and fragments\", () => {\n    const rules: RobotsRules = {\n      disallowedPaths: [\"/private\"],\n      allowedPaths: [],\n      crawlDelay: null,\n    };\n    // Fragments are not sent to the server, URL constructor excludes them from pathname+search\n    expect(isUrlAllowed(\"https://example.com/private#section\", rules)).toBe(false);\n  });\n});\n"
  },
  {
    "path": "tests/unit/scraper-pipeline.test.ts",
    "content": "/**\n * Scraper Content Pipeline Tests\n *\n * Tests the end-to-end content pipeline: raw HTML → metadata extraction →\n * content cleaning → markdown conversion → postprocessing. We mock the\n * orchestrator to return controlled HTML and test everything downstream.\n */\n\nimport { describe, it, expect, vi } from \"vitest\";\nimport { Scraper } from \"../../src/scraper\";\nimport type { WebsiteScrapeResult } from \"../../src/types\";\n\n// ── Helpers ──────────────────────────────────────────────────────────────────\n\nfunction makeScraper(options?: Record<string, unknown>): Scraper {\n  return new Scraper({\n    urls: [\"https://example.com\"],\n    formats: [\"markdown\"],\n    ...options,\n  });\n}\n\n/**\n * Mock scrapeSingleUrl to simulate the orchestrator returning raw HTML.\n * This lets us test the content pipeline (metadata → clean → convert →\n * postprocess) without hitting real engines.\n */\nfunction mockPipeline(scraper: Scraper, html: string, url = \"https://example.com\") {\n  // We need to mock at a level that still exercises the pipeline.\n  // The pipeline runs inside scrapeSingleUrl after the orchestrator returns.\n  // Since scrapeSingleUrl is private and tightly coupled, we mock it to\n  // exercise the pipeline by calling the real functions directly.\n  //\n  // Instead, let's test the pipeline functions in isolation:\n  // extractMetadata + cleanContent + htmlToMarkdown + postprocessMarkdown\n  (scraper as any).logger = {\n    info: vi.fn(), warn: vi.fn(), error: vi.fn(), debug: vi.fn(),\n  };\n}\n\n// ── Direct pipeline function tests ───────────────────────────────────────────\n\nimport { extractMetadata } from \"../../src/utils/metadata-extractor\";\nimport { cleanContent } from \"../../src/utils/content-cleaner\";\nimport { htmlToMarkdown } from \"../../src/formatters/markdown\";\nimport { postprocessMarkdown } from \"../../src/formatters/postprocess\";\n\ndescribe(\"Scraper content pipeline\", () => {\n  describe(\"end-to-end: HTML → metadata + markdown\", () => {\n    const SAMPLE_HTML = `\n      <html>\n      <head>\n        <title>Example Page Title</title>\n        <meta name=\"description\" content=\"A test page for the content pipeline\">\n        <meta property=\"og:title\" content=\"OG Title\">\n        <meta property=\"og:image\" content=\"https://example.com/og.png\">\n        <meta name=\"twitter:card\" content=\"summary_large_image\">\n      </head>\n      <body>\n        <nav><a href=\"/\">Home</a><a href=\"/about\">About</a></nav>\n        <main>\n          <h1>Welcome to Example</h1>\n          <p>This is a real page with meaningful content that should pass quality checks.</p>\n          <p>It has multiple paragraphs to ensure the content pipeline works correctly.</p>\n          <a href=\"https://example.com/link\">A useful link</a>\n        </main>\n        <footer>© 2026 Example Corp</footer>\n      </body>\n      </html>\n    `;\n\n    it(\"extracts metadata from raw HTML before cleaning\", () => {\n      const metadata = extractMetadata(SAMPLE_HTML, \"https://example.com\");\n      expect(metadata.title).toBe(\"Example Page Title\");\n      expect(metadata.description).toBe(\"A test page for the content pipeline\");\n      expect(metadata.openGraph?.title).toBe(\"OG Title\");\n      expect(metadata.openGraph?.image).toBe(\"https://example.com/og.png\");\n      expect(metadata.twitter?.card).toBe(\"summary_large_image\");\n    });\n\n    it(\"metadata is NOT available after cleaning (head stripped)\", () => {\n      const cleaned = cleanContent(SAMPLE_HTML, \"https://example.com\", {\n        onlyMainContent: false,\n      });\n      const metadata = extractMetadata(cleaned, \"https://example.com\");\n      // Title should be null because <head> was stripped\n      expect(metadata.title).toBeNull();\n    });\n\n    it(\"produces markdown from cleaned HTML\", () => {\n      const cleaned = cleanContent(SAMPLE_HTML, \"https://example.com\", {\n        onlyMainContent: false,\n      });\n      const markdown = htmlToMarkdown(cleaned);\n      expect(markdown).toContain(\"Welcome to Example\");\n      expect(markdown).toContain(\"meaningful content\");\n      expect(markdown.length).toBeGreaterThan(50);\n    });\n\n    it(\"onlyMainContent extracts main content and removes nav/footer\", () => {\n      const cleaned = cleanContent(SAMPLE_HTML, \"https://example.com\", {\n        onlyMainContent: true,\n      });\n      const markdown = htmlToMarkdown(cleaned);\n      expect(markdown).toContain(\"Welcome to Example\");\n      // Nav and footer should be stripped\n      expect(markdown).not.toContain(\"© 2026 Example Corp\");\n    });\n\n    it(\"postprocessing cleans up the output\", () => {\n      const raw = \"[Skip to Content](#main)\\n\\n\\n\\n\\n# Title\\n\\nContent\";\n      const processed = postprocessMarkdown(raw);\n      expect(processed).not.toContain(\"Skip to Content\");\n      expect(processed).not.toContain(\"\\n\\n\\n\"); // collapsed to 2\n      expect(processed).toContain(\"# Title\");\n    });\n\n    it(\"full pipeline: raw HTML → metadata + clean markdown\", () => {\n      // Step 1: Extract metadata from raw HTML\n      const metadata = extractMetadata(SAMPLE_HTML, \"https://example.com\");\n\n      // Step 2: Clean HTML\n      const cleaned = cleanContent(SAMPLE_HTML, \"https://example.com\", {\n        onlyMainContent: true,\n      });\n\n      // Step 3: Convert to markdown\n      const markdown = htmlToMarkdown(cleaned);\n\n      // Step 4: Postprocess\n      const final = postprocessMarkdown(markdown);\n\n      // Verify the full pipeline\n      expect(metadata.title).toBe(\"Example Page Title\");\n      expect(final).toContain(\"Welcome to Example\");\n      expect(final).toContain(\"meaningful content\");\n      expect(final.length).toBeGreaterThan(50);\n    });\n  });\n\n  describe(\"JSON payload detection\", () => {\n    it(\"wraps JSON responses in code fences\", () => {\n      // The Scraper detects JSON payloads and wraps them.\n      // Test the detection logic directly.\n      const jsonBody = '{\"key\": \"value\", \"items\": [1, 2, 3]}';\n      // detectJsonPayload is not exported, but we can verify the behavior\n      // by checking that valid JSON with 200 status would be detected\n      const trimmed = jsonBody.trim();\n      const firstChar = trimmed[0];\n      const lastChar = trimmed[trimmed.length - 1];\n      const looksJson = (firstChar === \"{\" && lastChar === \"}\");\n      expect(looksJson).toBe(true);\n      expect(() => JSON.parse(trimmed)).not.toThrow();\n    });\n  });\n\n  describe(\"conversion fallback\", () => {\n    it(\"htmlToMarkdown falls back to text extraction on empty result from large input\", () => {\n      // When supermarkdown returns \"\" for a large input, the formatter\n      // falls back to tag stripping. We can't easily trigger this without\n      // mocking supermarkdown, but we can verify the fallback behavior\n      // by testing with input that works normally.\n      const html = \"<html><body><p>Simple content</p></body></html>\";\n      const result = htmlToMarkdown(html);\n      expect(result).toContain(\"Simple content\");\n    });\n  });\n\n  describe(\"Wikipedia-like content\", () => {\n    const WIKIPEDIA_HTML = `\n      <html>\n      <head><title>Web scraping - Wikipedia</title></head>\n      <body class=\"mediawiki ltr sitedir-ltr\">\n        <nav id=\"mw-navigation\">\n          <a href=\"/\">Main Page</a>\n        </nav>\n        <main id=\"content\">\n          <div id=\"bodyContent\">\n            <div id=\"mw-content-text\">\n              <h1>Web scraping</h1>\n              <p><b>Web scraping</b> is data scraping used for extracting data from websites.\n              Web scraping software may directly access the World Wide Web using the\n              Hypertext Transfer Protocol or a web browser.</p>\n              <h2>Techniques</h2>\n              <p>Human copy-and-paste is the simplest form of web scraping.</p>\n              <table class=\"wikitable\">\n                <tr><th>Method</th><th>Description</th></tr>\n                <tr><td>HTTP</td><td>Direct request</td></tr>\n                <tr><td>Browser</td><td>DOM parsing</td></tr>\n              </table>\n            </div>\n          </div>\n        </main>\n      </body>\n      </html>\n    `;\n\n    it(\"extracts title from Wikipedia HTML\", () => {\n      const metadata = extractMetadata(WIKIPEDIA_HTML, \"https://en.wikipedia.org/wiki/Web_scraping\");\n      expect(metadata.title).toBe(\"Web scraping - Wikipedia\");\n    });\n\n    it(\"produces substantial markdown from Wikipedia content\", () => {\n      const cleaned = cleanContent(WIKIPEDIA_HTML, \"https://en.wikipedia.org/wiki/Web_scraping\", {\n        onlyMainContent: true,\n      });\n      const markdown = postprocessMarkdown(htmlToMarkdown(cleaned));\n\n      expect(markdown).toContain(\"Web scraping\");\n      expect(markdown).toContain(\"Techniques\");\n      expect(markdown).toContain(\"HTTP\");\n      // Table should be present as GFM\n      expect(markdown).toContain(\"|\");\n      expect(markdown.length).toBeGreaterThan(200);\n    });\n\n    it(\"does not include navigation in onlyMainContent mode\", () => {\n      const cleaned = cleanContent(WIKIPEDIA_HTML, \"https://en.wikipedia.org/wiki/Web_scraping\", {\n        onlyMainContent: true,\n      });\n      const markdown = postprocessMarkdown(htmlToMarkdown(cleaned));\n      expect(markdown).not.toContain(\"Main Page\");\n    });\n  });\n\n  describe(\"SaaS landing page content\", () => {\n    const SAAS_HTML = `\n      <html>\n      <head>\n        <title>Acme - Build faster</title>\n        <meta name=\"description\" content=\"The modern platform for developers\">\n        <meta property=\"og:image\" content=\"https://acme.com/og.png\">\n      </head>\n      <body>\n        <header>\n          <nav><a href=\"/pricing\">Pricing</a><a href=\"/docs\">Docs</a></nav>\n        </header>\n        <main>\n          <h1>Build faster with Acme</h1>\n          <p>Acme helps developers ship products 10x faster with our modern platform.</p>\n          <section>\n            <h2>Features</h2>\n            <ul>\n              <li>Instant deployments</li>\n              <li>Edge functions</li>\n              <li>Database included</li>\n            </ul>\n          </section>\n        </main>\n        <footer>\n          <a href=\"/privacy\">Privacy</a>\n          <a href=\"/terms\">Terms</a>\n        </footer>\n      </body>\n      </html>\n    `;\n\n    it(\"extracts title and OG image from SaaS page\", () => {\n      const metadata = extractMetadata(SAAS_HTML, \"https://acme.com\");\n      expect(metadata.title).toBe(\"Acme - Build faster\");\n      expect(metadata.description).toBe(\"The modern platform for developers\");\n      expect(metadata.openGraph?.image).toBe(\"https://acme.com/og.png\");\n    });\n\n    it(\"produces markdown with heading and list\", () => {\n      const cleaned = cleanContent(SAAS_HTML, \"https://acme.com\", { onlyMainContent: true });\n      const markdown = postprocessMarkdown(htmlToMarkdown(cleaned));\n\n      expect(markdown).toContain(\"Build faster with Acme\");\n      expect(markdown).toContain(\"Features\");\n      expect(markdown).toContain(\"Instant deployments\");\n      expect(markdown).toContain(\"- \"); // list items\n    });\n  });\n\n  describe(\"edge cases\", () => {\n    it(\"handles empty HTML\", () => {\n      const metadata = extractMetadata(\"\", \"https://example.com\");\n      expect(metadata.title).toBeNull();\n\n      const markdown = htmlToMarkdown(\"\");\n      expect(markdown).toBe(\"\");\n    });\n\n    it(\"handles HTML with only scripts and styles\", () => {\n      const html = \"<html><head><script>alert(1)</script><style>body{}</style></head><body><script>x()</script></body></html>\";\n      const cleaned = cleanContent(html, \"https://example.com\", { onlyMainContent: false });\n      const markdown = htmlToMarkdown(cleaned);\n      // Scripts and styles should be stripped\n      expect(markdown).not.toContain(\"alert\");\n      expect(markdown).not.toContain(\"body{}\");\n    });\n\n    it(\"handles includeTags filter\", () => {\n      const html = `\n        <html><body>\n          <div class=\"content\"><p>Keep this</p></div>\n          <div class=\"sidebar\"><p>Remove this</p></div>\n        </body></html>\n      `;\n      const cleaned = cleanContent(html, \"https://example.com\", {\n        onlyMainContent: false,\n        includeTags: [\".content\"],\n      });\n      const markdown = htmlToMarkdown(cleaned);\n      expect(markdown).toContain(\"Keep this\");\n      expect(markdown).not.toContain(\"Remove this\");\n    });\n\n    it(\"handles excludeTags filter\", () => {\n      const html = `\n        <html><body>\n          <div class=\"content\"><p>Keep this</p></div>\n          <div class=\"ads\"><p>Remove this ad</p></div>\n        </body></html>\n      `;\n      const cleaned = cleanContent(html, \"https://example.com\", {\n        onlyMainContent: false,\n        excludeTags: [\".ads\"],\n      });\n      const markdown = htmlToMarkdown(cleaned);\n      expect(markdown).toContain(\"Keep this\");\n      expect(markdown).not.toContain(\"Remove this ad\");\n    });\n  });\n});\n"
  },
  {
    "path": "tests/unit/scraper-retry.test.ts",
    "content": "/**\n * Scraper Retry & Escalation Tests\n *\n * Tests the retry loop in Scraper.scrapeSingleUrlWithRetry:\n *   1. Datacenter attempt with 10s timeout\n *   2. Any failure → residential attempt with remaining time (up to 30s total)\n *   3. Any failure → done\n *\n * We mock `scrapeSingleUrl` on the Scraper prototype so the retry logic\n * is tested in isolation without hitting real engines.\n */\n\nimport { describe, it, expect, vi, beforeEach, afterEach } from \"vitest\";\nimport { Scraper } from \"../../src/scraper\";\nimport { ScrapeFailedError } from \"../../src/engines/errors\";\nimport { ProxyConnectionError, DNSError } from \"../../src/errors\";\nimport type { WebsiteScrapeResult } from \"../../src/types\";\n\n// ── Helpers ──────────────────────────────────────────────────────────────────\n\nfunction makeResult(overrides?: Partial<WebsiteScrapeResult>): WebsiteScrapeResult {\n  return {\n    rawHtml: \"<html><body><h1>Hello World</h1><p>This is real content.</p></body></html>\",\n    markdown: \"# Hello World\\n\\nThis is real content with enough text.\",\n    metadata: {\n      baseUrl: \"https://example.com\",\n      statusCode: 200,\n      engine: \"hero\",\n      totalPages: 1,\n      scrapedAt: new Date().toISOString(),\n      duration: 100,\n      website: { title: \"Example\", description: null } as any,\n    },\n    ...overrides,\n  };\n}\n\nfunction makeScraper(overrides?: Record<string, unknown>): Scraper {\n  return new Scraper({ urls: [\"https://example.com\"], formats: [\"markdown\"], ...overrides });\n}\n\nfunction spySingleUrl(scraper: Scraper) {\n  const spy = vi.fn() as any;\n  (scraper as any).scrapeSingleUrl = spy;\n  (scraper as any).logger = {\n    info: vi.fn(), warn: vi.fn(), error: vi.fn(), debug: vi.fn(),\n  };\n  return spy;\n}\n\n// ── Tests ────────────────────────────────────────────────────────────────────\n\ndescribe(\"Scraper retry & escalation\", () => {\n  beforeEach(() => {\n    vi.useFakeTimers({ shouldAdvanceTime: true });\n  });\n\n  afterEach(() => {\n    vi.useRealTimers();\n  });\n\n  // ── Happy path ──\n\n  it(\"returns result on first success without escalation\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data).toHaveLength(1);\n    expect(data[0].markdown).toContain(\"Hello World\");\n    expect(spy).toHaveBeenCalledTimes(1);\n  });\n\n  // ── Non-retryable errors ──\n\n  it(\"fast-fails on non-retryable errors without escalating\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n    spy.mockRejectedValueOnce(new DNSError(\"example.com\"));\n\n    const { data, batchMetadata } = await scraper.scrape();\n    expect(data).toHaveLength(0);\n    expect(batchMetadata.failedUrls).toBe(1);\n    expect(spy).toHaveBeenCalledTimes(1); // No second attempt\n  });\n\n  // ── Escalation on failure ──\n\n  it(\"escalates to residential on datacenter failure\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockRejectedValueOnce(\n      new ScrapeFailedError(new Error(\"timeout\"), { proxyBlock: true }),\n    );\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data).toHaveLength(1);\n    expect(spy).toHaveBeenCalledTimes(2);\n    // Second call should have proxyOverride = \"residential\"\n    expect(spy.mock.calls[1][2]).toBe(\"residential\");\n  });\n\n  // ── Escalation on proxy connection error ──\n\n  it(\"escalates to residential on ProxyConnectionError\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockRejectedValueOnce(new ProxyConnectionError(\"datacenter\"));\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data).toHaveLength(1);\n    expect(spy).toHaveBeenCalledTimes(2);\n    expect(spy.mock.calls[1][2]).toBe(\"residential\");\n  });\n\n  // ── Escalation on empty result ──\n\n  it(\"escalates to residential when datacenter returns null\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockResolvedValueOnce(null);\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data).toHaveLength(1);\n    expect(spy).toHaveBeenCalledTimes(2);\n    expect(spy.mock.calls[1][2]).toBe(\"residential\");\n  });\n\n  // ── Escalation on blocked content ──\n\n  it(\"escalates when result looks blocked (200 + bot page content)\", async () => {\n    const scraper = makeScraper({\n      blockDetection: {\n        patterns: [/click the button below to continue shopping/i],\n        shortContentThreshold: 500,\n      },\n    });\n    const spy = spySingleUrl(scraper);\n\n    spy.mockResolvedValueOnce(makeResult({\n      rawHtml: '<html><body><h4>Click the button below to continue shopping</h4><p>© Amazon.com</p></body></html>',\n      markdown: \"Click the button below to continue shopping\",\n      metadata: {\n        baseUrl: \"https://amazon.com/dp/123\",\n        statusCode: 200,\n        engine: \"hero\",\n        totalPages: 1,\n        scrapedAt: new Date().toISOString(),\n        duration: 50,\n        website: { title: null, description: null } as any,\n      },\n    }));\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data).toHaveLength(1);\n    expect(data[0].markdown).toContain(\"Hello World\");\n    expect(spy).toHaveBeenCalledTimes(2);\n  });\n\n  // ── Both attempts fail ──\n\n  it(\"reports error when both datacenter and residential fail\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockRejectedValueOnce(new ScrapeFailedError(new Error(\"dc timeout\")));\n    spy.mockRejectedValueOnce(new ScrapeFailedError(new Error(\"res timeout\")));\n\n    const { data, batchMetadata } = await scraper.scrape();\n    expect(data).toHaveLength(0);\n    expect(batchMetadata.failedUrls).toBe(1);\n    expect(spy).toHaveBeenCalledTimes(2);\n  });\n\n  // ── No third attempt ──\n\n  it(\"does NOT retry a third time — max 2 attempts (dc + residential)\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockRejectedValueOnce(new ScrapeFailedError(new Error(\"fail 1\")));\n    spy.mockRejectedValueOnce(new ScrapeFailedError(new Error(\"fail 2\")));\n\n    await scraper.scrape();\n    expect(spy).toHaveBeenCalledTimes(2);\n  });\n\n  // ── Timeout passed to attempts ──\n\n  it(\"passes 10s timeout to datacenter attempt\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n    spy.mockResolvedValueOnce(makeResult());\n\n    await scraper.scrape();\n\n    // 4th arg is timeoutMs\n    expect(spy.mock.calls[0][3]).toBe(10_000);\n  });\n\n  it(\"passes remaining time to residential attempt\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n\n    spy.mockRejectedValueOnce(new ScrapeFailedError(new Error(\"dc fail\")));\n    spy.mockResolvedValueOnce(makeResult());\n\n    await scraper.scrape();\n\n    // Residential timeout should be <= 30s and > 0\n    const residentialTimeout = spy.mock.calls[1][3];\n    expect(residentialTimeout).toBeGreaterThan(0);\n    expect(residentialTimeout).toBeLessThanOrEqual(30_000);\n  });\n\n  // ── rawHtml is always present ──\n\n  it(\"includes rawHtml in successful result\", async () => {\n    const scraper = makeScraper();\n    const spy = spySingleUrl(scraper);\n    spy.mockResolvedValueOnce(makeResult());\n\n    const { data } = await scraper.scrape();\n    expect(data[0].rawHtml).toContain(\"<html>\");\n  });\n});\n"
  },
  {
    "path": "tests/unit/tiered-pool.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport pino from \"pino\";\nimport {\n  TieredBrowserPool,\n  buildTierConfigsFromPools,\n} from \"../../src/browser/tiered-pool\";\nimport type { HeroFactory, HeroLike, TabLike } from \"../../src/browser/proxy-bound-browser\";\nimport { ProxyHealthTracker } from \"../../src/proxy/health-tracker\";\n\nconst silentLogger = pino({ level: \"silent\" });\n\ninterface FakeHero extends HeroLike {\n  config: Record<string, unknown>;\n  closed: boolean;\n}\n\nfunction makeFakeTab(): TabLike {\n  return {\n    async goto() { return undefined; },\n    get url() { return Promise.resolve(\"about:blank\"); },\n    get document() { return {} as unknown; },\n    async waitForLoad() {},\n    async waitForPaintingStable() {},\n    async waitForElement() { return undefined as unknown; },\n    async close() {},\n  };\n}\n\nfunction makeFakeFactory(opts: { failFor?: Set<string> } = {}): {\n  factory: HeroFactory;\n  instances: FakeHero[];\n} {\n  const instances: FakeHero[] = [];\n  const factory: HeroFactory = {\n    create(config: Record<string, unknown>) {\n      const url = (config.upstreamProxyUrl as string | undefined) ?? null;\n      if (url && opts.failFor?.has(url)) {\n        throw new Error(`launch failed for ${url}`);\n      }\n      const hero: FakeHero = {\n        config,\n        closed: false,\n        async newTab() { return makeFakeTab(); },\n        async closeTab(tab: TabLike) { await tab.close(); },\n        async close() {\n          this.closed = true;\n        },\n      };\n      instances.push(hero);\n      return hero;\n    },\n  };\n  return { factory, instances };\n}\n\nasync function tick(n = 1) {\n  for (let i = 0; i < n; i++) await new Promise((r) => setImmediate(r));\n}\n\ndescribe(\"TieredBrowserPool\", () => {\n  describe(\"construction + pre-warm\", () => {\n    it(\"launches one browser per proxy URL at startup\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc2\", \"http://dc3\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      expect(instances).toHaveLength(3);\n      expect(pool.getStats().tiers[0].browsers).toHaveLength(3);\n      await pool.close();\n    });\n\n    it(\"skips duplicate proxy URLs within a tier\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc1\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      expect(instances).toHaveLength(2);\n      await pool.close();\n    });\n\n    it(\"tolerates a per-browser launch failure and resolves ready anyway\", async () => {\n      const { factory } = makeFakeFactory({ failFor: new Set([\"http://bad\"]) });\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://bad\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready; // should not throw\n      const stats = pool.getStats();\n      const dcBrowsers = stats.tiers.find((t) => t.tier === \"datacenter\")!.browsers;\n      expect(dcBrowsers).toHaveLength(3);\n      const closedCount = dcBrowsers.filter((b) => b.state === \"closed\").length;\n      expect(closedCount).toBe(1);\n      await pool.close();\n    });\n  });\n\n  describe(\"acquire\", () => {\n    it(\"returns least-loaded browser from the tier\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n\n      // Hold dc1 with an in-flight page\n      const dc1 = pool.acquire(\"datacenter\").browser;\n      let releaseDc1!: () => void;\n      const heldDc1 = new Promise<void>((r) => (releaseDc1 = r));\n      const dc1Page = dc1.withPage(async () => {\n        await heldDc1;\n      });\n      await tick(2);\n\n      // The next acquire should prefer the OTHER browser (dc2)\n      const lease = pool.acquire(\"datacenter\");\n      expect(lease.browser).not.toBe(dc1);\n\n      releaseDc1();\n      await dc1Page;\n      await pool.close();\n    });\n\n    it(\"throws when tier is unknown\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      expect(() => pool.acquire(\"residential\")).toThrow(/no browsers configured for tier/);\n      await pool.close();\n    });\n\n    it(\"throws when all browsers in the tier are unavailable\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      // Retire both\n      const lease1 = pool.acquire(\"datacenter\");\n      const lease2 = pool.acquire(\"datacenter\");\n      // They might be the same browser (least-loaded) — force retire via stats map\n      for (const tierStats of pool.getStats().tiers) {\n        for (const _ of tierStats.browsers) {\n          /* retirement below */\n        }\n      }\n      // Actually retire both via pool.close? No, we want the pool open but\n      // browsers unavailable. Grab them via getBrowserByProxy.\n      const b1 = pool.getBrowserByProxy(\"http://dc1\")!;\n      const b2 = pool.getBrowserByProxy(\"http://dc2\")!;\n      await Promise.all([b1.retire(), b2.retire()]);\n\n      expect(() => pool.acquire(\"datacenter\")).toThrow(/no available browsers/);\n      await pool.close();\n      void lease1;\n      void lease2;\n    });\n  });\n\n  describe(\"hasTier\", () => {\n    it(\"returns true for configured tiers\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      expect(pool.hasTier(\"datacenter\")).toBe(true);\n      expect(pool.hasTier(\"residential\")).toBe(false);\n      expect(pool.hasTier(\"direct\")).toBe(false);\n      await pool.close();\n    });\n  });\n\n  describe(\"getBrowserByProxy\", () => {\n    it(\"returns the browser bound to a proxy URL\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      const b1 = pool.getBrowserByProxy(\"http://dc1\")!;\n      const b2 = pool.getBrowserByProxy(\"http://dc2\")!;\n      expect(b1.proxyUrl).toBe(\"http://dc1\");\n      expect(b2.proxyUrl).toBe(\"http://dc2\");\n      expect(pool.getBrowserByProxy(\"http://dc3\")).toBeNull();\n      await pool.close();\n    });\n\n    it(\"resolves null for the direct lane\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"direct\", proxyUrls: [null] }],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      const direct = pool.getBrowserByProxy(null);\n      expect(direct).not.toBeNull();\n      expect(direct!.proxyUrl).toBeNull();\n      await pool.close();\n    });\n  });\n\n  describe(\"health tracker integration\", () => {\n    it(\"retires browser when its proxy is benched\", async () => {\n      const { factory } = makeFakeFactory();\n      const tracker = new ProxyHealthTracker({ failureThreshold: 3, cooldownMs: 1000 });\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        healthTracker: tracker,\n        logger: silentLogger,\n      });\n      await pool.ready;\n\n      for (let i = 0; i < 3; i++) tracker.recordFailure(\"http://dc1\");\n\n      // Event handler schedules retire asynchronously\n      await tick(5);\n\n      const browser = pool.getBrowserByProxy(\"http://dc1\")!;\n      // retire is fire-and-forget; wait for it to settle\n      for (let i = 0; i < 50 && browser.getState() !== \"closed\"; i++) {\n        await tick(1);\n      }\n      expect(browser.getState()).toBe(\"closed\");\n\n      await pool.close();\n    });\n\n    it(\"relaunches browser when its proxy is revived\", async () => {\n      const clock = { t: 1_000_000 };\n      const { factory } = makeFakeFactory();\n      const tracker = new ProxyHealthTracker({\n        failureThreshold: 3,\n        cooldownMs: 1000,\n        now: () => clock.t,\n      });\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        healthTracker: tracker,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      const browser = pool.getBrowserByProxy(\"http://dc1\")!;\n\n      // Bench\n      for (let i = 0; i < 3; i++) tracker.recordFailure(\"http://dc1\");\n      await tick(5);\n      for (let i = 0; i < 50 && browser.getState() !== \"closed\"; i++) {\n        await tick(1);\n      }\n      expect(browser.getState()).toBe(\"closed\");\n\n      // Advance the fake clock past the cooldown, then trigger a health\n      // check which will emit the revive event.\n      clock.t += 2000;\n      expect(tracker.isHealthy(\"http://dc1\")).toBe(true);\n\n      // Relaunch happens asynchronously via the event listener\n      for (let i = 0; i < 50 && browser.getState() !== \"active\"; i++) {\n        await tick(1);\n      }\n      expect(browser.getState()).toBe(\"active\");\n\n      await pool.close();\n    });\n\n    it(\"acquire skips benched browsers\", async () => {\n      const { factory } = makeFakeFactory();\n      const tracker = new ProxyHealthTracker({ failureThreshold: 3, cooldownMs: 10000 });\n      const pool = new TieredBrowserPool({\n        tiers: [\n          {\n            tier: \"datacenter\",\n            proxyUrls: [\"http://dc1\", \"http://dc2\"],\n          },\n        ],\n        heroFactory: factory,\n        healthTracker: tracker,\n        logger: silentLogger,\n      });\n      await pool.ready;\n\n      for (let i = 0; i < 3; i++) tracker.recordFailure(\"http://dc1\");\n      // Wait for dc1 retirement to settle\n      for (let i = 0; i < 50; i++) {\n        await tick(1);\n        if (pool.getBrowserByProxy(\"http://dc1\")!.getState() === \"closed\") break;\n      }\n\n      // Acquire should now always return dc2\n      for (let i = 0; i < 5; i++) {\n        const lease = pool.acquire(\"datacenter\");\n        expect(lease.browser.proxyUrl).toBe(\"http://dc2\");\n      }\n\n      await pool.close();\n    });\n  });\n\n  describe(\"close\", () => {\n    it(\"retires every browser across every tier\", async () => {\n      const { factory, instances } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [\n          { tier: \"datacenter\", proxyUrls: [\"http://dc1\", \"http://dc2\"] },\n          { tier: \"residential\", proxyUrls: [\"http://res1\"] },\n        ],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      await pool.close();\n      expect(instances.every((i) => i.closed)).toBe(true);\n    });\n\n    it(\"is safe to call close() twice\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      await pool.close();\n      await pool.close();\n    });\n\n    it(\"acquire throws after close\", async () => {\n      const { factory } = makeFakeFactory();\n      const pool = new TieredBrowserPool({\n        tiers: [{ tier: \"datacenter\", proxyUrls: [\"http://dc1\"] }],\n        heroFactory: factory,\n        logger: silentLogger,\n      });\n      await pool.ready;\n      await pool.close();\n      expect(() => pool.acquire(\"datacenter\")).toThrow(/closed/);\n    });\n  });\n});\n\ndescribe(\"buildTierConfigsFromPools\", () => {\n  it(\"returns datacenter + residential when both configured, no direct\", () => {\n    const tiers = buildTierConfigsFromPools({\n      datacenter: [{ url: \"http://dc1\" }, { url: \"http://dc2\" }],\n      residential: [{ url: \"http://res1\" }],\n    });\n    expect(tiers).toHaveLength(2);\n    expect(tiers[0]).toEqual({ tier: \"datacenter\", proxyUrls: [\"http://dc1\", \"http://dc2\"] });\n    expect(tiers[1]).toEqual({ tier: \"residential\", proxyUrls: [\"http://res1\"] });\n  });\n\n  it(\"returns only datacenter when residential is empty\", () => {\n    const tiers = buildTierConfigsFromPools({\n      datacenter: [{ url: \"http://dc1\" }],\n    });\n    expect(tiers).toHaveLength(1);\n    expect(tiers[0].tier).toBe(\"datacenter\");\n  });\n\n  it(\"returns direct when no proxies configured (default size 1)\", () => {\n    const tiers = buildTierConfigsFromPools({});\n    expect(tiers).toHaveLength(1);\n    expect(tiers[0]).toEqual({ tier: \"direct\", proxyUrls: [null] });\n  });\n\n  it(\"respects directPoolSize when creating direct tier\", () => {\n    const tiers = buildTierConfigsFromPools({}, { directPoolSize: 3 });\n    expect(tiers[0].proxyUrls).toEqual([null, null, null]);\n  });\n\n  it(\"does NOT add a direct tier when any proxy is configured\", () => {\n    const tiers = buildTierConfigsFromPools({\n      datacenter: [{ url: \"http://dc1\" }],\n    });\n    expect(tiers.find((t) => t.tier === \"direct\")).toBeUndefined();\n  });\n\n  it(\"treats undefined pools as empty\", () => {\n    const tiers = buildTierConfigsFromPools(undefined);\n    expect(tiers).toHaveLength(1);\n    expect(tiers[0].tier).toBe(\"direct\");\n  });\n\n  it(\"filters out proxies with no URL\", () => {\n    const tiers = buildTierConfigsFromPools({\n      datacenter: [{ url: \"http://dc1\" }, {}, { url: \"\" }],\n    });\n    expect(tiers[0].proxyUrls).toEqual([\"http://dc1\"]);\n  });\n});\n"
  },
  {
    "path": "tests/unit/url-helpers.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { isValidUrl, getUrlKey, isSameDomain, resolveUrl } from \"../../src/utils/url-helpers\";\n\ndescribe(\"isValidUrl\", () => {\n  it(\"accepts valid http URLs\", () => {\n    expect(isValidUrl(\"http://example.com\")).toBe(true);\n  });\n\n  it(\"accepts valid https URLs\", () => {\n    expect(isValidUrl(\"https://example.com\")).toBe(true);\n  });\n\n  it(\"accepts URLs with paths\", () => {\n    expect(isValidUrl(\"https://example.com/path/to/page\")).toBe(true);\n  });\n\n  it(\"accepts URLs with query strings\", () => {\n    expect(isValidUrl(\"https://example.com?q=test&page=1\")).toBe(true);\n  });\n\n  it(\"rejects empty string\", () => {\n    expect(isValidUrl(\"\")).toBe(false);\n  });\n\n  it(\"rejects plain text\", () => {\n    expect(isValidUrl(\"not a url\")).toBe(false);\n  });\n\n  it(\"handles javascript: URLs (implementation-dependent)\", () => {\n    // isValidUrl uses URL constructor which may accept javascript: protocol\n    const result = isValidUrl(\"javascript:alert(1)\");\n    expect(typeof result).toBe(\"boolean\");\n  });\n});\n\ndescribe(\"getUrlKey\", () => {\n  it(\"normalizes www prefix\", () => {\n    expect(getUrlKey(\"https://www.example.com\")).toBe(getUrlKey(\"https://example.com\"));\n  });\n\n  it(\"removes hash fragments\", () => {\n    expect(getUrlKey(\"https://example.com#section\")).toBe(getUrlKey(\"https://example.com\"));\n  });\n\n  it(\"removes trailing slash\", () => {\n    expect(getUrlKey(\"https://example.com/\")).toBe(getUrlKey(\"https://example.com\"));\n  });\n\n  it(\"normalizes index files\", () => {\n    expect(getUrlKey(\"https://example.com/index.html\")).toBe(getUrlKey(\"https://example.com/\"));\n  });\n\n  it(\"preserves path differences\", () => {\n    expect(getUrlKey(\"https://example.com/a\")).not.toBe(getUrlKey(\"https://example.com/b\"));\n  });\n\n  it(\"lowercases the result\", () => {\n    const key = getUrlKey(\"https://EXAMPLE.COM/Path\");\n    expect(key).toBe(key.toLowerCase());\n  });\n});\n\ndescribe(\"isSameDomain\", () => {\n  it(\"matches same domain\", () => {\n    expect(isSameDomain(\"https://example.com/a\", \"https://example.com/b\")).toBe(true);\n  });\n\n  it(\"matches with www difference\", () => {\n    expect(isSameDomain(\"https://www.example.com\", \"https://example.com\")).toBe(true);\n  });\n\n  it(\"rejects different domains\", () => {\n    expect(isSameDomain(\"https://example.com\", \"https://other.com\")).toBe(false);\n  });\n\n  it(\"rejects subdomains (strict hostname match)\", () => {\n    expect(isSameDomain(\"https://blog.example.com\", \"https://example.com\")).toBe(false);\n    expect(isSameDomain(\"https://dashboard.stripe.com\", \"https://docs.stripe.com\")).toBe(false);\n  });\n});\n\ndescribe(\"resolveUrl\", () => {\n  it(\"resolves relative path against base\", () => {\n    const resolved = resolveUrl(\"/about\", \"https://example.com/page\");\n    expect(resolved).toBe(\"https://example.com/about\");\n  });\n\n  it(\"returns absolute URL (may normalize trailing slash)\", () => {\n    const resolved = resolveUrl(\"https://other.com\", \"https://example.com\");\n    expect(resolved).toContain(\"other.com\");\n  });\n\n  it(\"handles fragment-only URLs\", () => {\n    const resolved = resolveUrl(\"#section\", \"https://example.com/page\");\n    expect(resolved).toContain(\"example.com\");\n  });\n});\n"
  },
  {
    "path": "tests/unit/url-rewriter.test.ts",
    "content": "import { describe, it, expect } from \"vitest\";\nimport { rewriteUrl, type UrlRewriteRule } from \"../../src/utils/url-rewriter\";\n\n// Google rewrite rules — mimics what reader-api would provide\nfunction extractGoogleDocId(pathname: string): string | null {\n  const match = pathname.match(/\\/d\\/([a-zA-Z0-9_-]+)/);\n  return match ? match[1] : null;\n}\n\nconst GOOGLE_RULES: UrlRewriteRule[] = [\n  {\n    name: \"google-docs\",\n    match: (url) => url.hostname === \"docs.google.com\" && url.pathname.startsWith(\"/document/\"),\n    rewrite: (url) => {\n      const id = extractGoogleDocId(url.pathname);\n      return `https://docs.google.com/document/d/${id}/export?format=html`;\n    },\n  },\n  {\n    name: \"google-sheets\",\n    match: (url) => url.hostname === \"docs.google.com\" && url.pathname.startsWith(\"/spreadsheets/\"),\n    rewrite: (url) => {\n      const id = extractGoogleDocId(url.pathname);\n      return `https://docs.google.com/spreadsheets/d/${id}/export?format=html`;\n    },\n  },\n  {\n    name: \"google-slides\",\n    match: (url) => url.hostname === \"docs.google.com\" && url.pathname.startsWith(\"/presentation/\"),\n    rewrite: (url) => {\n      const id = extractGoogleDocId(url.pathname);\n      return `https://docs.google.com/presentation/d/${id}/export/html`;\n    },\n  },\n  {\n    name: \"google-drive\",\n    match: (url) => url.hostname === \"drive.google.com\" && url.pathname.startsWith(\"/file/\"),\n    rewrite: (url) => {\n      const id = extractGoogleDocId(url.pathname);\n      return `https://drive.google.com/uc?id=${id}&export=download`;\n    },\n  },\n];\n\ndescribe(\"rewriteUrl\", () => {\n  it(\"returns unchanged when no rules provided (unopinionated)\", () => {\n    const result = rewriteUrl(\"https://docs.google.com/document/d/abc123/edit\");\n    expect(result.rewritten).toBe(false);\n    expect(result.url).toBe(\"https://docs.google.com/document/d/abc123/edit\");\n  });\n\n  describe(\"Google Docs\", () => {\n    it(\"rewrites a Google Docs /edit URL to HTML export\", () => {\n      const result = rewriteUrl(\n        \"https://docs.google.com/document/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit\",\n        GOOGLE_RULES,\n      );\n      expect(result).toEqual({\n        url: \"https://docs.google.com/document/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/export?format=html\",\n        rewritten: true,\n        reason: \"google-docs\",\n      });\n    });\n\n    it(\"handles document IDs with hyphens and underscores\", () => {\n      const result = rewriteUrl(\n        \"https://docs.google.com/document/d/abc-123_DEF-456_ghi/edit\",\n        GOOGLE_RULES,\n      );\n      expect(result.rewritten).toBe(true);\n      expect(result.reason).toBe(\"google-docs\");\n    });\n  });\n\n  describe(\"Google Sheets\", () => {\n    it(\"rewrites a Google Sheets URL to HTML export\", () => {\n      const result = rewriteUrl(\n        \"https://docs.google.com/spreadsheets/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit\",\n        GOOGLE_RULES,\n      );\n      expect(result.rewritten).toBe(true);\n      expect(result.reason).toBe(\"google-sheets\");\n    });\n  });\n\n  describe(\"Google Slides\", () => {\n    it(\"rewrites a Google Slides URL to HTML export\", () => {\n      const result = rewriteUrl(\n        \"https://docs.google.com/presentation/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit\",\n        GOOGLE_RULES,\n      );\n      expect(result.rewritten).toBe(true);\n      expect(result.reason).toBe(\"google-slides\");\n    });\n  });\n\n  describe(\"Google Drive\", () => {\n    it(\"rewrites a Google Drive file URL to direct download\", () => {\n      const result = rewriteUrl(\n        \"https://drive.google.com/file/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/view\",\n        GOOGLE_RULES,\n      );\n      expect(result.rewritten).toBe(true);\n      expect(result.reason).toBe(\"google-drive\");\n    });\n  });\n\n  describe(\"non-matching URLs\", () => {\n    it(\"returns non-Google URLs unchanged\", () => {\n      const result = rewriteUrl(\"https://example.com/some-page\", GOOGLE_RULES);\n      expect(result.rewritten).toBe(false);\n    });\n\n    it(\"returns invalid URLs unchanged\", () => {\n      const result = rewriteUrl(\"not-a-valid-url\", GOOGLE_RULES);\n      expect(result.rewritten).toBe(false);\n    });\n\n    it(\"does not rewrite Google Docs non-document paths like /forms/\", () => {\n      const result = rewriteUrl(\n        \"https://docs.google.com/forms/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit\",\n        GOOGLE_RULES,\n      );\n      expect(result.rewritten).toBe(false);\n    });\n  });\n});\n"
  },
  {
    "path": "tsconfig.json",
    "content": "{\n  \"compilerOptions\": {\n    \"target\": \"ESNext\",\n    \"module\": \"ESNext\",\n    \"moduleResolution\": \"bundler\",\n    \"lib\": [\"ESNext\", \"DOM\"],\n    \"outDir\": \"./dist\",\n    \"strict\": true,\n    \"esModuleInterop\": true,\n    \"allowSyntheticDefaultImports\": true,\n    \"skipLibCheck\": true,\n    \"forceConsistentCasingInFileNames\": true,\n    \"declaration\": true,\n    \"declarationMap\": true,\n    \"sourceMap\": true,\n    \"removeComments\": false,\n    \"noImplicitAny\": true,\n    \"noImplicitReturns\": false,\n    \"noImplicitThis\": true,\n    \"noUnusedLocals\": true,\n    \"noUnusedParameters\": false,\n    \"exactOptionalPropertyTypes\": false,\n    \"resolveJsonModule\": true,\n    \"types\": [\"node\"]\n  },\n  \"include\": [\"src/**/*\"],\n  \"exclude\": [\"node_modules\", \"dist\", \"**/*.test.ts\"]\n}\n"
  },
  {
    "path": "tsup.config.ts",
    "content": "import { defineConfig } from \"tsup\";\n\n// Packages that should not be bundled (native modules, CommonJS deps)\n// Packages that must NOT be bundled — they contain native modules,\n// use require() internally, or need to be resolved from node_modules\n// at runtime. Every entry here MUST also be in package.json dependencies.\nconst external = [\n  \"@ulixee/hero\",\n  \"@ulixee/hero-core\",\n  \"@ulixee/net\",\n  \"re2\",\n  \"pino\",\n  \"pino-pretty\",\n];\n\nexport default defineConfig([\n  // Main library\n  {\n    entry: [\"src/index.ts\"],\n    format: [\"esm\"],\n    dts: true,\n    clean: true,\n    outDir: \"dist\",\n    splitting: false,\n    sourcemap: true,\n    target: \"node18\",\n    external,\n  },\n  // CLI (shebang preserved from source)\n  {\n    entry: [\"src/cli/index.ts\"],\n    format: [\"esm\"],\n    dts: false,\n    outDir: \"dist/cli\",\n    splitting: false,\n    sourcemap: true,\n    target: \"node18\",\n    external,\n  },\n]);\n"
  },
  {
    "path": "vitest.config.ts",
    "content": "import { defineConfig } from \"vitest/config\";\n\nexport default defineConfig({\n  test: {\n    globals: true,\n    environment: \"node\",\n    include: [\"tests/**/*.test.ts\"],\n    testTimeout: 30_000,\n    hookTimeout: 15_000,\n  },\n});\n"
  }
]