Repository: vakra-dev/reader Branch: main Commit: fbf5a54bff96 Files: 147 Total size: 751.3 KB Directory structure: gitextract_cms0mrdu/ ├── .eslintrc.json ├── .github/ │ └── workflows/ │ ├── ci.yml │ └── publish.yml ├── .gitignore ├── .leasotrc ├── .nvmrc ├── .prettierrc ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs/ │ ├── api-reference.md │ ├── architecture.md │ ├── assets/ │ │ ├── .gitkeep │ │ └── demo.tape │ ├── deployment/ │ │ ├── docker.md │ │ ├── job-queues.md │ │ └── production-server.md │ ├── getting-started.md │ ├── guides/ │ │ ├── browser-pool.md │ │ ├── browser-sessions.md │ │ ├── cloudflare-bypass.md │ │ ├── output-formats.md │ │ └── proxy-configuration.md │ └── troubleshooting.md ├── ecosystem.config.cjs ├── examples/ │ ├── .gitignore │ ├── .nvmrc │ ├── README.md │ ├── ai-tools/ │ │ ├── README.md │ │ ├── anthropic-summary.ts │ │ ├── langchain-loader.ts │ │ ├── llamaindex-loader.ts │ │ ├── openai-summary.ts │ │ ├── pinecone-ingest.ts │ │ ├── qdrant-ingest.ts │ │ └── vercel-ai-stream.ts │ ├── basic/ │ │ ├── README.md │ │ ├── all-formats.ts │ │ ├── basic-scrape.ts │ │ ├── batch-scrape.ts │ │ ├── browser-pool-config.ts │ │ ├── browser-session-actions.ts │ │ ├── browser-session-puppeteer.ts │ │ ├── browser-session-selenium.ts │ │ ├── browser-session.ts │ │ ├── cloudflare-bypass.ts │ │ ├── crawl-website.ts │ │ ├── large-batch-scrape.ts │ │ ├── proxy-pool.ts │ │ └── with-proxy.ts │ ├── package.json │ ├── production/ │ │ ├── README.md │ │ ├── browser-pool-scaling/ │ │ │ ├── README.md │ │ │ ├── package.json │ │ │ └── src/ │ │ │ └── index.ts │ │ ├── express-server/ │ │ │ ├── README.md │ │ │ ├── package.json │ │ │ └── src/ │ │ │ └── index.ts │ │ └── job-queue-bullmq/ │ │ ├── README.md │ │ ├── package.json │ │ └── src/ │ │ ├── index.ts │ │ ├── queue.ts │ │ └── worker.ts │ └── tsconfig.json ├── package.json ├── result.md ├── scripts/ │ └── release.sh ├── src/ │ ├── browser/ │ │ ├── hero-config.ts │ │ 
├── pool.ts │ │ ├── proxy-bound-browser.ts │ │ ├── tiered-pool.ts │ │ └── types.ts │ ├── browser-session.ts │ ├── browser-types.ts │ ├── cli/ │ │ └── index.ts │ ├── client.ts │ ├── cloudflare/ │ │ ├── detector.ts │ │ ├── handler.ts │ │ └── types.ts │ ├── config/ │ │ └── domain-profiles.ts │ ├── crawl-types.ts │ ├── crawler.ts │ ├── daemon/ │ │ ├── client.ts │ │ ├── index.ts │ │ └── server.ts │ ├── engines/ │ │ ├── errors.ts │ │ ├── hero/ │ │ │ └── index.ts │ │ ├── index.ts │ │ ├── orchestrator.ts │ │ └── types.ts │ ├── errors.ts │ ├── formatters/ │ │ ├── html.ts │ │ ├── index.ts │ │ ├── markdown.ts │ │ └── postprocess.ts │ ├── index.ts │ ├── proxy/ │ │ ├── config.ts │ │ ├── env.ts │ │ ├── health-tracker.ts │ │ ├── proxy-gate.ts │ │ └── verify.ts │ ├── scraper.ts │ ├── types.ts │ └── utils/ │ ├── block-detector.ts │ ├── content-cleaner.ts │ ├── logger.ts │ ├── metadata-extractor.ts │ ├── rate-limiter.ts │ ├── robots-parser.ts │ ├── url-helpers.ts │ └── url-rewriter.ts ├── tests/ │ ├── engines/ │ │ └── orchestrator.test.ts │ ├── fixtures/ │ │ ├── amazon-bot-page.html │ │ ├── cloudflare-challenge.html │ │ ├── empty-page.html │ │ └── simple-static.html │ ├── integration/ │ │ └── daemon.test.ts │ └── unit/ │ ├── block-detector-cloudflare.test.ts │ ├── block-detector-fixtures.test.ts │ ├── block-detector.test.ts │ ├── browser-session.test.ts │ ├── content-cleaner.test.ts │ ├── crawler.test.ts │ ├── daemon-dispatch.test.ts │ ├── domain-profiles.test.ts │ ├── errors.test.ts │ ├── health-tracker.test.ts │ ├── html-size-guard.test.ts │ ├── markdown-formatter.test.ts │ ├── metadata-extractor.test.ts │ ├── postprocess.test.ts │ ├── proxy-bound-browser.test.ts │ ├── proxy-config.test.ts │ ├── proxy-gate.test.ts │ ├── proxy-verify.test.ts │ ├── robots-parser.test.ts │ ├── scraper-pipeline.test.ts │ ├── scraper-retry.test.ts │ ├── tiered-pool.test.ts │ ├── url-helpers.test.ts │ └── url-rewriter.test.ts ├── tsconfig.json ├── tsup.config.ts └── vitest.config.ts 
================================================ FILE CONTENTS ================================================ ================================================ FILE: .eslintrc.json ================================================ { "root": true, "parser": "@typescript-eslint/parser", "parserOptions": { "ecmaVersion": "latest", "sourceType": "module", "project": true }, "plugins": ["@typescript-eslint"], "extends": [ "eslint:recommended", "plugin:@typescript-eslint/recommended" ], "env": { "node": true, "es2022": true }, "rules": { "@typescript-eslint/no-explicit-any": "warn", "@typescript-eslint/no-unused-vars": ["error", { "argsIgnorePattern": "^_" }], "@typescript-eslint/explicit-function-return-type": "off", "@typescript-eslint/explicit-module-boundary-types": "off", "@typescript-eslint/no-non-null-assertion": "warn", "no-console": ["warn", { "allow": ["warn", "error"] }] }, "ignorePatterns": ["dist/", "node_modules/", "*.js", "*.config.ts"] } ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [main] pull_request: branches: [main] jobs: test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: node-version: "22" cache: "npm" - run: npm ci - name: Typecheck run: npx tsc --noEmit - name: Lint run: npm run lint - name: Format check run: npm run format:check - name: Test run: npm test - name: Build run: npm run build ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish to npm on: release: types: [published] jobs: publish: runs-on: ubuntu-latest permissions: contents: read steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: node-version: "22" registry-url: "https://registry.npmjs.org" - run: npm ci - name: Verify version matches tag run: | TAG_VERSION="${GITHUB_REF_NAME#v}" PKG_VERSION=$(node -p 
"require('./package.json').version") if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then echo "Error: Tag $TAG_VERSION does not match package.json $PKG_VERSION" exit 1 fi echo "Version verified: $PKG_VERSION" - name: Build run: npm run build - name: Publish run: npm publish --access public env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} ================================================ FILE: .gitignore ================================================ # Dependencies node_modules/ # Build output dist/ # Environment files .env .env.local .env.*.local # Logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* # OS files .DS_Store Thumbs.db # IDE .idea/ .vscode/ *.swp *.swo # Coverage coverage/ .nyc_output/ # Package manager locks # Note: package-lock.json is tracked for reproducible builds yarn.lock # Bun bun.lockb # Temporary files tmp/ temp/ *.tmp # Hero/Ulixee session data .ulixee/ # Claude Code context CLAUDE.md # Deployment configs (contain sensitive data) deploy/ ================================================ FILE: .leasotrc ================================================ { "tags": ["TODO", "FIXME", "HACK", "XXX", "BUG", "OPTIMIZE", "REVIEW"], "ignore": ["node_modules/**", "dist/**"] } ================================================ FILE: .nvmrc ================================================ v22.12.0 ================================================ FILE: .prettierrc ================================================ { "semi": true, "singleQuote": false, "tabWidth": 2, "trailingComma": "es5", "printWidth": 100, "useTabs": false, "bracketSpacing": true, "arrowParens": "always", "endOfLine": "lf" } ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "If you use Reader in your research or project, please cite it." 
title: "Reader: Open-source, production-grade web scraping engine built for LLMs" type: software authors: - family-names: Kaul given-names: Nihal license: Apache-2.0 url: "https://github.com/vakra-dev/reader" repository-code: "https://github.com/vakra-dev/reader" ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a welcoming experience for everyone, regardless of background or identity. ## Our Standards Examples of behavior that contributes to a positive environment: - Using welcoming and inclusive language - Being respectful of differing viewpoints and experiences - Gracefully accepting constructive criticism - Focusing on what is best for the community - Showing empathy towards other community members Examples of unacceptable behavior: - Trolling, insulting or derogatory comments, and personal attacks - Public or private harassment - Publishing others' private information without explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Project maintainers are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate or harmful. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. ## Enforcement Instances of unacceptable behavior may be reported to the project maintainers at **nihal.codes@gmail.com**. All complaints will be reviewed and investigated promptly and fairly. 
## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact:** Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence:** A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact:** A violation through a single incident or series of actions. **Consequence:** A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact:** A serious violation of community standards, including sustained inappropriate behavior. **Consequence:** A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact:** Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence:** A permanent ban from any sort of public interaction within the community. 
## Attribution This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Reader Thank you for your interest in contributing to Reader! This document provides guidelines and instructions for contributing. ## Development Setup ### Prerequisites - **Node.js** >= 18 (v22 recommended) - **npm** for package management - **Git** > **Note:** Always run scripts with Node.js (`npx tsx` or `node`) as Hero has ESM compatibility issues with other runtimes. ### Getting Started 1. **Fork the repository** on GitHub 2. **Clone your fork:** ```bash git clone https://github.com/YOUR_USERNAME/reader.git cd reader ``` 3. **Install dependencies:** ```bash npm install ``` 4. **Verify setup:** ```bash npm run typecheck npm run build ``` 5. 
**Test the CLI:** ```bash npx tsx src/cli/index.ts scrape https://example.com ``` ## Project Structure ``` src/ ├── index.ts # Public API exports ├── client.ts # ReaderClient - main API entry point ├── scraper.ts # Scraper class - main scraping logic ├── crawler.ts # Crawler class - link discovery ├── types.ts # TypeScript types for scraping ├── crawl-types.ts # TypeScript types for crawling │ ├── browser/ │ ├── pool.ts # BrowserPool - manages Hero instances │ ├── hero-config.ts # Hero configuration │ └── types.ts # Pool types │ ├── cloudflare/ │ ├── detector.ts # Challenge detection │ ├── handler.ts # Challenge resolution │ └── types.ts # Cloudflare types │ ├── formatters/ │ ├── markdown.ts # Markdown formatter │ ├── html.ts # HTML formatter │ ├── json.ts # JSON formatter │ ├── text.ts # Text formatter │ └── index.ts # Re-exports │ ├── utils/ │ ├── content-cleaner.ts # HTML content cleaning │ ├── metadata-extractor.ts # Metadata extraction │ ├── url-helpers.ts # URL utilities │ ├── rate-limiter.ts # Rate limiting │ └── logger.ts # Logging │ ├── proxy/ │ └── config.ts # Proxy configuration │ ├── daemon/ │ ├── index.ts # Module exports │ ├── server.ts # DaemonServer - HTTP server with browser pool │ └── client.ts # DaemonClient - connects CLI to daemon │ └── cli/ └── index.ts # CLI implementation ``` ## Development Workflow ### Running the CLI ```bash # Run CLI directly npx tsx src/cli/index.ts scrape https://example.com # With verbose output npx tsx src/cli/index.ts scrape https://example.com -v # Show browser window npx tsx src/cli/index.ts scrape https://example.com --show-chrome ``` ### Daemon Mode ```bash # Start daemon with browser pool npx tsx src/cli/index.ts start --pool-size 5 # Check daemon status npx tsx src/cli/index.ts status # Run commands (auto-connects to daemon) npx tsx src/cli/index.ts scrape https://example.com # Force standalone mode (bypass daemon) npx tsx src/cli/index.ts scrape https://example.com --standalone # Stop daemon npx tsx 
src/cli/index.ts stop ``` ### Code Quality Run these commands before submitting a PR: ```bash # Type checking npm run typecheck # Linting npm run lint # Auto-fix lint issues npm run lint:fix # Format code npm run format # Check formatting npm run format:check # Tests npm test # Build npm run build ``` ### Finding TODOs Track outstanding work: ```bash npm run todo ``` ## Making Changes ### Branch Naming - `feature/description` - New features - `fix/description` - Bug fixes - `docs/description` - Documentation updates - `refactor/description` - Code refactoring ### Commit Messages Write clear, concise commit messages: ``` type: short description Longer description if needed. ``` Types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore` Examples: ``` feat: add support for custom user agents fix: resolve timeout issue with Cloudflare challenges docs: update proxy configuration guide refactor: simplify browser pool recycling logic ``` ### Pull Request Process 1. Create a new branch from `main` 2. Make your changes 3. Run all checks (the same ones CI runs): ```bash npm run lint npm run format:check npm run typecheck npm test npm run build ``` 4. Push your branch and create a PR 5. Fill out the PR template 6. Wait for review ## Common Tasks ### Adding a New Output Format 1. Create `src/formatters/newformat.ts`: ```typescript export function formatToNewFormat( pages: Page[], baseUrl: string, scrapedAt: string, duration: number, metadata?: WebsiteMetadata ): string { // Implementation } ``` 2. Export from `src/formatters/index.ts` 3. Add to format type in `src/types.ts` 4. Call formatter in `src/scraper.ts` 5. Update CLI validation in `src/cli/index.ts` ### Adding a New ScrapeOption 1. Add to `ScrapeOptions` interface in `src/types.ts` 2. Add default in `DEFAULT_OPTIONS` 3. Use in `Scraper` class via `this.options.newOption` 4. Add CLI flag in `src/cli/index.ts` if applicable 5. Update documentation ### Modifying Cloudflare Detection 1. Detection patterns: `src/cloudflare/detector.ts` 2. 
Resolution logic: `src/cloudflare/handler.ts` 3. Test with known Cloudflare-protected sites ### Adjusting Browser Pool 1. Default config: `src/browser/types.ts` 2. Pool logic: `src/browser/pool.ts` ## Testing Run the automated test suite (Vitest, located under `tests/`) with `npm test`; CI runs it on every push and pull request to `main`. When adding new features, also verify the behavior manually: 1. **Test basic functionality:** ```bash npx tsx src/cli/index.ts scrape https://example.com ``` 2. **Test Cloudflare-protected sites:** ```bash npx tsx src/cli/index.ts scrape https://cloudflare-protected-site.com -v ``` 3. **Test different output formats:** ```bash npx tsx src/cli/index.ts scrape https://example.com -f markdown,html,json,text ``` 4. **Test crawling:** ```bash npx tsx src/cli/index.ts crawl https://example.com -d 2 -m 10 ``` 5. **Test batch scraping:** ```bash npx tsx src/cli/index.ts scrape url1 url2 url3 -c 3 -v ``` 6. **Test daemon mode:** ```bash # Start daemon npx tsx src/cli/index.ts start --pool-size 3 # Test scraping via daemon npx tsx src/cli/index.ts scrape https://example.com # Check status npx tsx src/cli/index.ts status # Stop daemon npx tsx src/cli/index.ts stop ``` ## Running Examples The `examples/` folder contains working examples: ```bash cd examples npm install # Basic examples npx tsx basic/basic-scrape.ts npx tsx basic/batch-scrape.ts npx tsx basic/crawl-website.ts # AI integration examples (requires API keys) export OPENAI_API_KEY="sk-..." npx tsx ai-tools/openai-summary.ts https://example.com # Production server npx tsx production/express-server/src/index.ts ``` ## Code Style - Use TypeScript for all new code - Follow existing patterns in the codebase - Use async/await instead of callbacks - Prefer explicit types over `any` - Use meaningful variable and function names - Add JSDoc comments for public APIs ## Documentation When making changes: 1. Update relevant markdown files in `docs/` 2. Update README.md if adding new features 3. Add JSDoc comments to new public functions 4. 
Update CLAUDE.md for AI context if architecture changes ### Documentation Files | File | Purpose | | ------------------------- | ------------------------------- | | `README.md` | Main documentation, quick start | | `CONTRIBUTING.md` | This file | | `docs/getting-started.md` | Detailed setup guide | | `docs/api-reference.md` | Complete API docs | | `docs/architecture.md` | System design | | `docs/troubleshooting.md` | Common issues | | `docs/guides/` | Feature guides | | `docs/deployment/` | Deployment guides | ## Reporting Issues When reporting bugs, please include: - Operating system and version - Node.js version (`node --version`) - Reader version - Steps to reproduce - Expected vs actual behavior - Error messages and stack traces - Verbose output (`-v` flag) ## Code of Conduct - Be respectful and inclusive - Focus on constructive feedback - Help others learn and grow - Follow project guidelines ## License By contributing, you agree that your contributions will be licensed under the Apache 2.0 License. ## Disclaimer By using Reader, you agree to the following: - You are solely responsible for respecting websites' policies when scraping and crawling - You will adhere to applicable privacy policies and terms of use before initiating scraping activities - Reader respects robots.txt directives by default, but ultimate compliance is your responsibility ## Questions? - Check the [documentation](https://docs.reader.dev) - Search [GitHub Issues](https://github.com/vakra-dev/reader/issues) - Ask in [Discord](https://discord.gg/6tjkq7J5WV) - Open a new issue or discussion Thank you for contributing! ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to the Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS Copyright (c) 2026 vakra-dev Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

Reader Logo

Reader

Open source web infrastructure for AI.

Access the web without the complexity.

License: Apache 2.0 npm version GitHub stars

Docs · Examples · Discord

Reader demo - scrape any URL to clean markdown

## The Problem Building agents that need web access is frustrating. You piece together Puppeteer, add stealth plugins, fight Cloudflare, manage proxies, and it still breaks in production. That's because production-grade web scraping isn't about rendering a page and converting HTML to markdown. It's about everything underneath: | Layer | What it actually takes | | ------------------------ | ------------------------------------------------------------------- | | **Browser architecture** | Managing browser instances at scale, not one-off scripts | | **Anti-bot bypass** | Cloudflare, Turnstile, JS challenges, they all block naive scrapers | | **TLS fingerprinting** | Real browsers have fingerprints. Puppeteer doesn't. Sites know. | | **Proxy infrastructure** | Datacenter vs residential, rotation strategies, sticky sessions | | **Resource management** | Browser pooling, memory limits, graceful recycling | | **Reliability** | Rate limiting, retries, timeouts, caching, graceful degradation | I built **Reader**, a production-grade web scraping engine on top of [Ulixee Hero](https://ulixee.org/), a headless browser designed for exactly this. ## The Solution Three primitives. That's it. ```typescript import { ReaderClient } from "@vakra-dev/reader"; import { chromium } from "playwright-core"; const reader = new ReaderClient(); // 1. Scrape URLs → clean markdown const result = await reader.scrape({ urls: ["https://example.com"] }); console.log(result.data[0].markdown); // 2. Crawl a site → discover + scrape pages const pages = await reader.crawl({ url: "https://example.com", depth: 2, scrape: true, }); console.log(`Found ${pages.urls.length} pages`); // 3. 
Browser session → full Playwright/Puppeteer control with stealth const session = await reader.browser(); const browser = await chromium.connectOverCDP(session.wsEndpoint); const page = browser.contexts()[0].pages()[0]; await page.goto("https://example.com"); console.log(await page.title()); await session.close(); ``` All the hard stuff (browser pooling, anti-bot bypass, proxy rotation, retries) happens under the hood. You get clean markdown. Your agents get the web. And when you need full browser control, `browser()` gives you a stealthed Chrome that Playwright or Puppeteer can drive. > [!TIP] > If Reader is useful to you, a [star on GitHub](https://github.com/vakra-dev/reader) helps others discover the project. ## Features - **Browser Sessions** - Launch stealthed Chrome, connect Playwright/Puppeteer via CDP - **Anti-Bot Bypass** - TLS fingerprinting, navigator spoofing, WebRTC masking, `webdriver=false` - **Clean Output** - Markdown and HTML with automatic main content extraction - **Smart Content Cleaning** - Removes nav, headers, footers, popups, cookie banners - **CLI & API** - Use from command line or programmatically - **Browser Pool** - Auto-recycling, health monitoring, tiered proxy pools - **Concurrent Scraping** - Parallel URL processing with progress tracking - **Website Crawling** - BFS link discovery with depth/page limits - **Tiered Proxies** - Datacenter and residential pools with auto-escalation and health tracking ## Installation ```bash npm install @vakra-dev/reader ``` **Requirements:** Node.js >= 18 > **Apple Silicon (M1/M2/M3):** Hero's bundled Chrome binary isn't available for arm64. 
Point to your system Chrome: > > ```bash > export CHROME_139_BIN="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" > ``` ## Quick Start ### Cloud (Fastest) Get an API key at [app.reader.dev](https://app.reader.dev) and start scraping immediately: ```typescript import { ReaderClient } from "@vakra-dev/reader-js"; const reader = new ReaderClient({ apiKey: process.env.READER_API_KEY }); const result = await reader.read({ url: "https://example.com" }); if (result.kind === "scrape") { console.log(result.data.markdown); } ``` ```bash npm install @vakra-dev/reader-js ``` See the [cloud docs](https://docs.reader.dev) for the full API reference. ### Self-Hosted Install the reader engine and run scraping on your own infrastructure: ### Basic Scrape ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient(); const result = await reader.scrape({ urls: ["https://example.com"], formats: ["markdown", "html"], }); console.log(result.data[0].markdown); console.log(result.data[0].html); await reader.close(); ``` ### Batch Scraping with Concurrency ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient(); const result = await reader.scrape({ urls: ["https://example.com", "https://example.org", "https://example.net"], formats: ["markdown"], batchConcurrency: 3, onProgress: (progress) => { console.log(`${progress.completed}/${progress.total}: ${progress.currentUrl}`); }, }); console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`); await reader.close(); ``` ### Crawling ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient(); const result = await reader.crawl({ url: "https://example.com", depth: 2, maxPages: 20, scrape: true, }); console.log(`Discovered ${result.urls.length} URLs`); console.log(`Scraped ${result.scraped?.batchMetadata.successfulUrls} pages`); await reader.close(); ``` ### Browser Session Launch a stealthed Chrome and control 
it with Playwright or Puppeteer. The browser has anti-bot stealth active (`webdriver=false`, navigator spoofing, WebRTC masking). Your existing scripts just work. ```typescript import { ReaderClient } from "@vakra-dev/reader"; import { chromium } from "playwright-core"; const reader = new ReaderClient(); // Create a browser session - returns a CDP WebSocket URL const session = await reader.browser(); // Connect Playwright (one-line change from a local script) const browser = await chromium.connectOverCDP(session.wsEndpoint); const context = await browser.newContext(); const page = await context.newPage(); // Use Playwright normally - full stealth active await page.goto("https://news.ycombinator.com/"); console.log(await page.title()); await browser.close(); await session.close(); await reader.close(); ``` Also works with Puppeteer: ```typescript import { connect } from "puppeteer-core"; const browser = await connect({ browserWSEndpoint: session.wsEndpoint }); ``` ### With Proxy ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient(); const result = await reader.scrape({ urls: ["https://example.com"], formats: ["markdown"], proxy: { type: "residential", host: "proxy.example.com", port: 8080, username: "username", password: "password", country: "us", }, }); await reader.close(); ``` ### With Tiered Proxy Pools Configure datacenter (fast, cheap) and residential (anti-bot) proxy tiers. 
Reader auto-escalates from datacenter to residential when sites block requests: ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient({ proxyPools: { datacenter: [ { url: "http://user:pass@dc-proxy1:8080" }, { url: "http://user:pass@dc-proxy2:8080" }, ], residential: [{ url: "http://user:pass@res-proxy1:8080" }], }, }); const result = await reader.scrape({ urls: ["https://example.com"], proxyTier: "auto", // datacenter first, escalate to residential on block }); await reader.close(); ``` Or via environment variables: ```bash PROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080 PROXY_RESIDENTIAL=http://user:pass@res1:8080 ``` ### With Browser Pool Configuration ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient({ browserPool: { size: 5, // 5 browser instances retireAfterPages: 50, // Recycle after 50 pages retireAfterMinutes: 15, // Recycle after 15 minutes }, verbose: true, }); const result = await reader.scrape({ urls: manyUrls, batchConcurrency: 5, }); await reader.close(); ``` ## CLI Reference ### Daemon Mode For multiple requests, start a daemon to keep the browser pool warm: ```bash # Start daemon with browser pool npx reader start --direct-pool-size 5 # All subsequent commands auto-connect to daemon npx reader scrape https://example.com npx reader crawl https://example.com -d 2 # Check daemon status npx reader status # Stop daemon npx reader stop # Force standalone mode (bypass daemon) npx reader scrape https://example.com --standalone ``` ### `reader scrape <urls...>` Scrape one or more URLs.
```bash # Scrape a single URL npx reader scrape https://example.com # Scrape with multiple formats npx reader scrape https://example.com -f markdown,html # Scrape multiple URLs concurrently npx reader scrape https://example.com https://example.org -c 2 # Save to file npx reader scrape https://example.com -o output.md ``` | Option | Type | Default | Description | | ------------------------ | ------ | ------------ | ------------------------------------------------------- | | `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html) | | `-o, --output <file>` | string | stdout | Output file path | | `-c, --concurrency <n>` | number | `1` | Parallel requests | | `-t, --timeout <ms>` | number | `30000` | Request timeout in milliseconds | | `--batch-timeout <ms>` | number | `300000` | Total timeout for the entire batch operation, in milliseconds | | `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) | | `--user-agent <string>` | string | - | Custom user agent string | | `--show-chrome` | flag | - | Show browser window for debugging | | `--no-main-content` | flag | - | Disable main content extraction (include full page) | | `--include-tags <selectors>` | string | - | CSS selectors for elements to include (comma-separated) | | `--exclude-tags <selectors>` | string | - | CSS selectors for elements to exclude (comma-separated) | | `-v, --verbose` | flag | - | Enable verbose logging | ### `reader crawl <url>` Crawl a website to discover pages.
```bash # Crawl with default settings npx reader crawl https://example.com # Crawl deeper with more pages npx reader crawl https://example.com -d 3 -m 50 # Crawl and scrape content npx reader crawl https://example.com -d 2 --scrape # Filter URLs with patterns npx reader crawl https://example.com --include "blog/*" --exclude "admin/*" ``` | Option | Type | Default | Description | | ------------------------ | ------ | ------------ | ----------------------------------------------- | | `-d, --depth <n>` | number | `1` | Maximum crawl depth | | `-m, --max-pages <n>` | number | `20` | Maximum pages to discover | | `-s, --scrape` | flag | - | Also scrape content of discovered pages | | `-f, --format <formats>` | string | `"markdown"` | Output formats when scraping (comma-separated) | | `-o, --output <file>` | string | stdout | Output file path | | `--delay <ms>` | number | `1000` | Delay between requests in milliseconds | | `-t, --timeout <ms>` | number | - | Total timeout for the crawl operation, in milliseconds | | `--include <patterns>` | string | - | URL patterns to include (comma-separated regex) | | `--exclude <patterns>` | string | - | URL patterns to exclude (comma-separated regex) | | `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) | | `--user-agent <string>` | string | - | Custom user agent string | | `--show-chrome` | flag | - | Show browser window for debugging | | `-v, --verbose` | flag | - | Enable verbose logging | ### `reader browser` Launch a browser session with a CDP WebSocket endpoint.
```bash # Create a session (prints wsEndpoint, blocks until Ctrl+C) npx reader browser create # Create with options npx reader browser create --timeout 60000 --show-chrome # List active sessions (daemon mode) npx reader browser list # Stop a session npx reader browser stop ``` | Option | Type | Default | Description | | -------------------- | ------ | -------- | -------------------------------- | | `--proxy ` | string | - | Proxy URL | | `-t, --timeout ` | number | `300000` | Session lifetime in milliseconds | | `--show-chrome` | flag | - | Show browser window | | `--standalone` | flag | - | Force standalone mode | | `-v, --verbose` | flag | - | Enable verbose logging | ## API Reference ### `ReaderClient` The recommended way to use Reader. Manages HeroCore lifecycle automatically. ```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient({ verbose: true }); // Scrape const result = await reader.scrape({ urls: ["https://example.com"] }); // Crawl const crawlResult = await reader.crawl({ url: "https://example.com", depth: 2 }); // Browser session const session = await reader.browser(); // → session.wsEndpoint for Playwright/Puppeteer // Close when done (optional - auto-closes on exit) await reader.close(); ``` #### Constructor Options | Option | Type | Default | Description | | --------------- | ------------------- | --------------- | ------------------------------------------------ | | `verbose` | `boolean` | `false` | Enable verbose logging | | `showChrome` | `boolean` | `false` | Show browser window for debugging | | `browserPool` | `BrowserPoolConfig` | `undefined` | Browser pool configuration (size, recycling) | | `proxyPools` | `ProxyPoolConfig` | `undefined` | Tiered proxy pools (datacenter + residential) | | `proxies` | `ProxyConfig[]` | `undefined` | Array of proxies for rotation (legacy) | | `proxyRotation` | `string` | `"round-robin"` | Rotation strategy: `"round-robin"` or `"random"` | #### BrowserPoolConfig | Option 
| Type | Default | Description | | -------------------- | -------- | ------- | ----------------------------------- | | `size` | `number` | `2` | Number of browser instances in pool | | `retireAfterPages` | `number` | `100` | Recycle browser after N page loads | | `retireAfterMinutes` | `number` | `30` | Recycle browser after N minutes | | `maxQueueSize` | `number` | `100` | Max pending requests in queue | #### Methods | Method | Description | | ------------------- | -------------------------------------------------- | | `scrape(options)` | Scrape one or more URLs | | `crawl(options)` | Crawl a website to discover pages | | `browser(options?)` | Launch a stealthed browser session (CDP WebSocket) | | `start()` | Pre-initialize HeroCore (optional) | | `isReady()` | Check if client is initialized | | `close()` | Close client and release resources | ### `scrape(options): Promise` Scrape one or more URLs. Can be used directly or via `ReaderClient`. | Option | Type | Required | Default | Description | | ------------------ | ----------------------------- | -------- | -------------- | --------------------------------------------------------------- | | `urls` | `string[]` | Yes | - | Array of URLs to scrape | | `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats | | `onlyMainContent` | `boolean` | No | `true` | Extract only main content (removes nav/header/footer) | | `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep | | `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove | | `waitForSelector` | `string` | No | - | CSS selector to wait for before page is loaded | | `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds | | `batchConcurrency` | `number` | No | `1` | Number of URLs to process in parallel | | `batchTimeoutMs` | `number` | No | `300000` | Total timeout for entire batch operation | | `proxy` | `ProxyConfig` | No | - | Proxy configuration object | | `proxyTier` | 
`ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` | | `onProgress` | `function` | No | - | Progress callback: `({ completed, total, currentUrl }) => void` | | `verbose` | `boolean` | No | `false` | Enable verbose logging | | `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging | **Returns:** `Promise<ScrapeResult>` ```typescript interface ScrapeResult { data: WebsiteScrapeResult[]; batchMetadata: BatchMetadata; } interface WebsiteScrapeResult { markdown?: string; html?: string; metadata: { baseUrl: string; finalUrl?: string; // Present if URL redirected totalPages: number; scrapedAt: string; duration: number; website: WebsiteMetadata; }; } interface BatchMetadata { totalUrls: number; successfulUrls: number; failedUrls: number; scrapedAt: string; totalDuration: number; errors?: Array<{ url: string; error: string }>; } ``` ### `crawl(options): Promise<CrawlResult>` Crawl a website to discover pages. | Option | Type | Required | Default | Description | | ------------------- | ----------------------------- | -------- | -------------- | ----------------------------------------------- | | `url` | `string` | Yes | - | Single seed URL to start crawling from | | `depth` | `number` | No | `1` | Maximum depth to crawl | | `maxPages` | `number` | No | `20` | Maximum pages to discover | | `scrape` | `boolean` | No | `false` | Also scrape full content of discovered pages | | `delayMs` | `number` | No | `1000` | Delay between requests in milliseconds | | `timeoutMs` | `number` | No | - | Total timeout for entire crawl operation | | `includePatterns` | `string[]` | No | - | URL patterns to include (regex strings) | | `excludePatterns` | `string[]` | No | - | URL patterns to exclude (regex strings) | | `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats for scraped content | | `scrapeConcurrency` | `number` | No | `2` | Number of URLs to scrape in parallel | | `proxy` | `ProxyConfig` | No | - | Proxy configuration object | |
`userAgent` | `string` | No | - | Custom user agent string | | `verbose` | `boolean` | No | `false` | Enable verbose logging | | `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging | | `connectionToCore` | `any` | No | - | Connection to shared Hero Core (for production) | **Returns:** `Promise<CrawlResult>` ```typescript interface CrawlResult { urls: CrawlUrl[]; scraped?: ScrapeResult; metadata: CrawlMetadata; } interface CrawlUrl { url: string; title: string; description: string | null; } interface CrawlMetadata { totalUrls: number; maxDepth: number; totalDuration: number; seedUrl: string; } ``` ### `browser(options?): Promise<BrowserSession>` Launch a stealthed Chrome and return a CDP WebSocket URL for Playwright/Puppeteer. | Option | Type | Required | Default | Description | | ------------ | ------------- | -------- | -------- | ----------------------------------------------------- | | `proxy` | `ProxyConfig` | No | - | Proxy configuration | | `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` | | `showChrome` | `boolean` | No | `false` | Show browser window | | `timeoutMs` | `number` | No | `300000` | Session lifetime (auto-closes after) | | `verbose` | `boolean` | No | `false` | Enable verbose logging | **Returns:** `Promise<BrowserSession>` ```typescript interface BrowserSession { sessionId: string; // Unique session identifier wsEndpoint: string; // CDP WebSocket URL for Playwright/Puppeteer createdAt: string; // ISO timestamp close(): Promise<void>; // Close session and release resources } ``` **Stealth features active on all sessions:** - `navigator.webdriver = false` (via `--disable-blink-features=AutomationControlled`) - Proxy routing through authenticated proxy forwarder (if configured) - Isolated user profile per session (no cookie/state leaks) ### ProxyConfig | Option | Type | Required | Default | Description | | ---------- | ------------------------------- | -------- | ------- | ------------------------------------------------------- | |
`url` | `string` | No | - | Full proxy URL (takes precedence over other fields) | | `type` | `"datacenter" \| "residential"` | No | - | Proxy type | | `host` | `string` | No | - | Proxy host | | `port` | `number` | No | - | Proxy port | | `username` | `string` | No | - | Proxy username | | `password` | `string` | No | - | Proxy password | | `country` | `string` | No | - | Country code for residential proxies (e.g., 'us', 'uk') | ## Daemon Mode (Production) For production servers, start the daemon once and all scrape/crawl/browser requests share the warm browser pool: ```typescript import { ReaderClient } from "@vakra-dev/reader"; // Create once at startup const reader = new ReaderClient({ proxyPools: { datacenter: [{ url: "http://user:pass@dc-proxy:8080" }], residential: [{ url: "http://user:pass@res-proxy:8080" }], }, }); // Reuse for all requests const result = await reader.scrape({ urls: ["https://example.com"] }); // Graceful shutdown process.on("SIGTERM", () => reader.close()); ``` ## How It Works ### Anti-Bot Bypass Reader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced anti-detection: 1. **TLS Fingerprinting** - Emulates real Chrome browser fingerprints via MITM proxy 2. **Navigator Spoofing** - `webdriver=false`, device memory, hardware concurrency 3. **DNS over TLS** - Uses Cloudflare DNS (1.1.1.1) to mimic Chrome behavior 4. **WebRTC IP Masking** - Prevents IP leaks through WebRTC connections 5. 
**WebGL/Canvas Fingerprinting** - Randomized rendering signatures ### Browser Pool - **Tiered Proxy Pools** - Separate datacenter and residential pools with auto-escalation - **Auto-Recycling** - Browsers recycled after 100 requests or 30 minutes - **Health Tracking** - Auto-benches failed proxies for 5 minutes, revives on recovery - **Per-Proxy Concurrency** - Limits concurrent requests per proxy URL (default: 2) ### HTML to Markdown: supermarkdown Reader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines. **Why we built it:** When you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown. **What supermarkdown offers:** | Feature | Benefit | | -------------------- | ---------------------------------------------------- | | **Written in Rust** | Native performance with Node.js bindings via napi-rs | | **Full GFM support** | Tables, task lists, strikethrough, autolinks | | **LLM-optimized** | Clean output designed for AI consumption | | **Battle-tested** | Handles malformed HTML from real web pages | | **CSS selectors** | Include/exclude elements during conversion | supermarkdown is open source and available as both a Rust crate and npm package: ```bash # npm npm install @vakra-dev/supermarkdown # Rust cargo add supermarkdown ``` Check out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation. ## Server Deployment Reader uses a real Chromium browser under the hood. 
On headless Linux servers (VPS, EC2, etc.), you need to install Chrome's system dependencies: ```bash # Debian/Ubuntu sudo apt-get install -y libnspr4 libnss3 libatk1.0-0 libatk-bridge2.0-0 \ libcups2 libxcb1 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \ libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 libasound2 ``` This is the same requirement that Puppeteer and Playwright have on headless Linux. macOS, Windows, and Linux desktops already have these libraries. For Docker and production deployment guides, see the [deployment documentation](https://docs.reader.dev/documentation/guides/deployment). ## Documentation Full documentation is available at **[docs.reader.dev](https://docs.reader.dev)**, including guides for scraping, crawling, proxy configuration, browser pool management, and deployment. ### Examples | Example | Description | | -------------------------------------------------------------------------- | ---------------------------------------------- | | [Basic Scraping](examples/basic/basic-scrape.ts) | Simple single-URL scraping | | [Batch Scraping](examples/basic/batch-scrape.ts) | Concurrent multi-URL scraping | | [Crawl Website](examples/basic/crawl-website.ts) | Crawl and discover pages | | [Browser Session (Playwright)](examples/basic/browser-session.ts) | Navigate, extract data, screenshot | | [Browser Session (Actions)](examples/basic/browser-session-actions.ts) | Click, type, search, wait for elements | | [Browser Session (Puppeteer)](examples/basic/browser-session-puppeteer.ts) | Puppeteer via `connect({ browserWSEndpoint })` | | [Browser Session (Raw CDP)](examples/basic/browser-session-selenium.ts) | Direct CDP WebSocket commands | | [Browser Pool Config](examples/basic/browser-pool-config.ts) | Configure browser pool for high throughput | | [Proxy Pool](examples/basic/proxy-pool.ts) | Proxy rotation with multiple proxies | | [Cloudflare Bypass](examples/basic/cloudflare-bypass.ts) | Scrape Cloudflare-protected sites | | 
[All Formats](examples/basic/all-formats.ts) | Output in markdown and html | | [AI Tools](examples/ai-tools/) | OpenAI, Anthropic, LangChain integrations | ## Development ```bash # Install dependencies npm install # Run linting npm run lint # Format code npm run format # Type check npm run typecheck # Find TODOs npm run todo ``` ## Contributing Contributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. ## License [Apache 2.0](LICENSE) - See LICENSE for details. ## Citation If you use Reader in your research or project, please cite it: ```bibtex @software{reader.dev, author = {Kaul, Nihal}, title = {Reader: Open-source, production-grade web scraping engine built for LLMs}, year = {2026}, publisher = {GitHub}, url = {https://github.com/vakra-dev/reader} } ``` ## Support - [GitHub Issues](https://github.com/vakra-dev/reader/issues) - [Documentation](https://docs.reader.dev) - [Discord](https://discord.gg/6tjkq7J5WV) ================================================ FILE: SECURITY.md ================================================ # Security Policy ## Supported Versions | Version | Supported | | ------- | --------- | | Latest | Yes | We only provide security fixes for the latest release. ## Reporting a Vulnerability If you discover a security vulnerability in Reader, please report it responsibly. 
**Do not open a public GitHub issue for security vulnerabilities.** Instead, email **nihal.codes@gmail.com** with: - A description of the vulnerability - Steps to reproduce the issue - The potential impact - Any suggested fixes (optional) ## What to Expect - **Acknowledgment** within 48 hours of your report - **Status update** within 7 days with an assessment and timeline - **Credit** in the release notes (unless you prefer to remain anonymous) ## Scope The following are in scope: - The `@vakra-dev/reader` npm package - The Reader CLI tool - The Reader Cloud API (`cloud.reader.dev`) The following are out of scope: - Vulnerabilities in upstream dependencies (report these to the respective projects) - Issues related to websites blocking scraping (this is expected behavior, not a vulnerability) ## Responsible Use Reader is a web scraping tool. Users are responsible for complying with applicable laws and website terms of service. The project maintainers are not responsible for how the tool is used. ================================================ FILE: docs/api-reference.md ================================================ # API Reference Complete API documentation for Reader. ## ReaderClient (Recommended) The recommended way to use Reader. Manages HeroCore lifecycle automatically, reuses connections efficiently, and auto-closes on process exit. 
```typescript import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient({ verbose: true }); // Scrape URLs const result = await reader.scrape({ urls: ["https://example.com"], formats: ["markdown"], }); // Crawl a website const crawlResult = await reader.crawl({ url: "https://example.com", depth: 2, }); // Launch a stealthed browser session const session = await reader.browser(); // → session.wsEndpoint for Playwright/Puppeteer // Close when done (optional - auto-closes on exit) await reader.close(); ``` ### Constructor ```typescript new ReaderClient(options?: ReaderClientOptions) ``` | Option | Type | Default | Description | |--------|------|---------|-------------| | `verbose` | `boolean` | `false` | Enable verbose logging | | `showChrome` | `boolean` | `false` | Show browser window for debugging | | `browserPool` | `BrowserPoolConfig` | - | Browser pool configuration | | `proxyPools` | `ProxyPoolConfig` | - | Tiered proxy pools (datacenter + residential) | | `proxies` | `ProxyConfig[]` | - | List of proxies to rotate through (legacy) | | `proxyRotation` | `"round-robin" \| "random"` | `"round-robin"` | Proxy rotation strategy | #### ProxyPoolConfig ```typescript interface ProxyPoolConfig { datacenter?: ProxyConfig[]; // Fast, cheap - works for most sites residential?: ProxyConfig[]; // Slower, anti-bot sites (Amazon, LinkedIn) } ``` #### BrowserPoolConfig | Option | Type | Default | Description | |--------|------|---------|-------------| | `size` | `number` | `2` | Number of browser instances | | `retireAfterPages` | `number` | `100` | Retire browser after N page loads | | `retireAfterMinutes` | `number` | `30` | Retire browser after N minutes | | `maxQueueSize` | `number` | `100` | Maximum pending requests in queue | ### Methods #### start() Pre-initialize HeroCore. Called automatically on first scrape/crawl. ```typescript await reader.start(): Promise ``` #### scrape(options) Scrape one or more URLs. 
```typescript const result = await reader.scrape(options): Promise<ScrapeResult> ``` See [ScrapeOptions](#scrapeoptions) for available options. #### crawl(options) Crawl a website to discover pages. ```typescript const result = await reader.crawl(options): Promise<CrawlResult> ``` See [CrawlOptions](#crawloptions) for available options. #### browser(options?) Launch a stealthed browser session and return a CDP WebSocket URL for Playwright/Puppeteer. ```typescript const session = await reader.browser(options?): Promise<BrowserSession> ``` | Option | Type | Default | Description | |--------|------|---------|-------------| | `proxy` | `ProxyConfig` | - | Proxy configuration | | `proxyTier` | `ProxyTier` | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` | | `showChrome` | `boolean` | `false` | Show browser window | | `timeoutMs` | `number` | `300000` | Session lifetime (auto-closes after) | | `verbose` | `boolean` | `false` | Enable verbose logging | Returns: ```typescript interface BrowserSession { sessionId: string; // Unique session identifier wsEndpoint: string; // CDP WebSocket URL createdAt: string; // ISO timestamp close(): Promise<void>; // Close session and release resources } ``` See the [Browser Sessions guide](guides/browser-sessions.md) for full examples. #### isReady() Check if the client is initialized and ready. ```typescript reader.isReady(): boolean ``` #### close() Close the client and release resources. ```typescript await reader.close(): Promise<void> ``` --- ## Direct Functions (Advanced) For advanced use cases where you need custom HeroCore management, you can use the direct functions. Note that without `connectionToCore`, each call spawns a new HeroCore instance, which is less efficient. ### scrape(options) Scrape one or more URLs and return content in the specified formats.
```typescript import { scrape } from "@vakra-dev/reader"; const result = await scrape({ urls: ["https://example.com"], formats: ["markdown"], }); ``` #### Parameters | Name | Type | Required | Default | Description | |------|------|----------|---------|-------------| | `urls` | `string[]` | Yes | - | Array of URLs to scrape | | `formats` | `FormatType[]` | No | `["markdown"]` | Output formats | | `onlyMainContent` | `boolean` | No | `true` | Extract only main content | | `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep | | `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove | | `userAgent` | `string` | No | - | Custom user agent string | | `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds | | `batchConcurrency` | `number` | No | `1` | URLs to process in parallel | | `batchTimeoutMs` | `number` | No | `300000` | Total batch timeout | | `onProgress` | `ProgressCallback` | No | - | Progress callback function | | `proxy` | `ProxyConfig` | No | - | Proxy configuration | | `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` | | `waitForSelector` | `string` | No | - | CSS selector to wait for | | `verbose` | `boolean` | No | `false` | Enable verbose logging | | `showChrome` | `boolean` | No | `false` | Show browser window | | `connectionToCore` | `any` | No | - | Shared Hero Core connection | #### Returns `Promise` ```typescript interface ScrapeResult { data: WebsiteScrapeResult[]; batchMetadata: BatchMetadata; } ``` #### Example ```typescript // Using ReaderClient (recommended) const reader = new ReaderClient(); const result = await reader.scrape({ urls: ["https://example.com", "https://example.org"], formats: ["markdown", "html"], batchConcurrency: 2, onProgress: ({ completed, total, currentUrl }) => { console.log(`[${completed}/${total}] ${currentUrl}`); }, }); for (const site of result.data) { console.log("URL:", site.metadata.baseUrl); 
console.log("Markdown:", site.markdown?.substring(0, 200)); } await reader.close(); ``` --- ### crawl(options) Crawl a website to discover pages, optionally scraping their content. ```typescript // Using ReaderClient (recommended) import { ReaderClient } from "@vakra-dev/reader"; const reader = new ReaderClient(); const result = await reader.crawl({ url: "https://example.com", depth: 2, maxPages: 20, scrape: true, }); await reader.close(); ``` #### Parameters | Name | Type | Required | Default | Description | |------|------|----------|---------|-------------| | `url` | `string` | Yes | - | Seed URL to start crawling | | `depth` | `number` | No | `1` | Maximum crawl depth | | `maxPages` | `number` | No | `20` | Maximum pages to discover | | `scrape` | `boolean` | No | `false` | Also scrape discovered pages | | `delayMs` | `number` | No | `1000` | Delay between requests | | `timeoutMs` | `number` | No | - | Total crawl timeout | | `includePatterns` | `string[]` | No | - | URL patterns to include | | `excludePatterns` | `string[]` | No | - | URL patterns to exclude | | `formats` | `FormatType[]` | No | `["markdown", "html"]` | Output formats when scraping | | `scrapeConcurrency` | `number` | No | `2` | Scraping parallelism | | `proxy` | `ProxyConfig` | No | - | Proxy configuration | | `userAgent` | `string` | No | - | Custom user agent | | `verbose` | `boolean` | No | `false` | Enable verbose logging | | `showChrome` | `boolean` | No | `false` | Show browser window | | `connectionToCore` | `any` | No | - | Shared Hero Core connection | #### Returns `Promise` ```typescript interface CrawlResult { urls: CrawlUrl[]; scraped?: ScrapeResult; metadata: CrawlMetadata; } ``` #### Example ```typescript const reader = new ReaderClient(); const result = await reader.crawl({ url: "https://docs.example.com", depth: 3, maxPages: 50, includePatterns: ["docs/*"], excludePatterns: ["docs/archive/*"], scrape: true, }); console.log(`Discovered ${result.urls.length} pages`); 
result.urls.forEach((page) => { console.log(`- ${page.title}: ${page.url}`); }); if (result.scraped) { console.log(`Scraped ${result.scraped.batchMetadata.successfulUrls} pages`); } await reader.close(); ``` --- ## Type Definitions ### ScrapeOptions ```typescript interface ScrapeOptions { urls: string[]; formats?: Array<"markdown" | "html">; onlyMainContent?: boolean; includeTags?: string[]; excludeTags?: string[]; userAgent?: string; timeoutMs?: number; batchConcurrency?: number; batchTimeoutMs?: number; onProgress?: (progress: ProgressInfo) => void; proxy?: ProxyConfig; proxyTier?: "datacenter" | "residential" | "auto"; waitForSelector?: string; verbose?: boolean; showChrome?: boolean; connectionToCore?: any; } ``` ### CrawlOptions ```typescript interface CrawlOptions { url: string; depth?: number; maxPages?: number; scrape?: boolean; delayMs?: number; timeoutMs?: number; includePatterns?: string[]; excludePatterns?: string[]; formats?: Array<"markdown" | "html">; scrapeConcurrency?: number; proxy?: ProxyConfig; userAgent?: string; verbose?: boolean; showChrome?: boolean; connectionToCore?: any; } ``` ### ProxyConfig ```typescript interface ProxyConfig { url?: string; type?: "datacenter" | "residential"; host?: string; port?: number; username?: string; password?: string; country?: string; } ``` ### ScrapeResult ```typescript interface ScrapeResult { data: WebsiteScrapeResult[]; batchMetadata: BatchMetadata; } ``` ### WebsiteScrapeResult ```typescript interface WebsiteScrapeResult { markdown?: string; html?: string; metadata: { baseUrl: string; finalUrl?: string; // Present if URL redirected totalPages: number; scrapedAt: string; duration: number; website: WebsiteMetadata; proxy?: ProxyMetadata; // Included when proxy pooling is used }; } ``` ### ProxyMetadata ```typescript interface ProxyMetadata { host: string; port: number; country?: string; // If geo-targeting was used } ``` ### BatchMetadata ```typescript interface BatchMetadata { totalUrls: number; 
successfulUrls: number; failedUrls: number; scrapedAt: string; totalDuration: number; errors?: Array<{ url: string; error: string }>; } ``` ### CrawlResult ```typescript interface CrawlResult { urls: CrawlUrl[]; scraped?: ScrapeResult; metadata: CrawlMetadata; } ``` ### CrawlUrl ```typescript interface CrawlUrl { url: string; title: string; description: string | null; } ``` ### CrawlMetadata ```typescript interface CrawlMetadata { totalUrls: number; maxDepth: number; totalDuration: number; seedUrl: string; } ``` ### WebsiteMetadata ```typescript interface WebsiteMetadata { title: string | null; description: string | null; author: string | null; language: string | null; charset: string | null; favicon: string | null; image: string | null; canonical: string | null; keywords: string[] | null; robots: string | null; themeColor: string | null; openGraph: { title: string | null; description: string | null; type: string | null; url: string | null; image: string | null; siteName: string | null; locale: string | null; } | null; twitter: { card: string | null; site: string | null; creator: string | null; title: string | null; description: string | null; image: string | null; } | null; } ``` ### ProgressInfo ```typescript interface ProgressInfo { completed: number; total: number; currentUrl: string; } ``` --- ## Classes ### BrowserPool Manages a pool of Hero browser instances for efficient scraping. 
```typescript import { BrowserPool } from "@vakra-dev/reader"; const pool = new BrowserPool({ size: 5 }); await pool.initialize(); const result = await pool.withBrowser(async (hero) => { await hero.goto("https://example.com"); return await hero.document.title; }); await pool.shutdown(); ``` #### Constructor ```typescript new BrowserPool(config?: PoolConfig) ``` | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `size` | `number` | `2` | Number of browser instances | | `retireAfterPages` | `number` | `100` | Recycle after N pages | | `retireAfterMinutes` | `number` | `30` | Recycle after N minutes | | `maxQueueSize` | `number` | `100` | Maximum pending requests | | `healthCheckIntervalMs` | `number` | `300000` | Health check interval | #### Methods ##### initialize() Initialize the browser pool. ```typescript await pool.initialize(): Promise<void> ``` ##### withBrowser(fn) Execute a function with an acquired browser, automatically releasing it after. ```typescript await pool.withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T> ``` ##### acquire() Manually acquire a browser instance. Must be paired with `release()`. ```typescript const hero = await pool.acquire(): Promise<Hero> ``` ##### release(hero) Release a browser instance back to the pool. ```typescript await pool.release(hero: Hero): Promise<void> ``` ##### healthCheck() Check the health of all pool instances. ```typescript const health = await pool.healthCheck(): Promise ``` ##### getStats() Get current pool statistics. ```typescript const stats = pool.getStats(): PoolStats ``` ##### shutdown() Shut down all browser instances. ```typescript await pool.shutdown(): Promise<void> ``` --- ## Formatter Functions ### formatToMarkdown(pages, baseUrl, scrapedAt, duration, metadata?) Convert scraped pages to Markdown format.
```typescript import { formatToMarkdown } from "@vakra-dev/reader"; const markdown = formatToMarkdown( pages, "https://example.com", new Date().toISOString(), 1500, metadata ); ``` --- ### formatToHTML(pages, baseUrl, scrapedAt, duration, metadata?) Convert scraped pages to a complete HTML document. ```typescript import { formatToHTML } from "@vakra-dev/reader"; const html = formatToHTML( pages, "https://example.com", new Date().toISOString(), 1500, metadata ); ``` --- ## Utility Functions ### cleanContent(html) Remove navigation, ads, scripts, and other non-content elements from HTML. ```typescript import { cleanContent } from "@vakra-dev/reader"; const cleanHtml = cleanContent(rawHtml); ``` --- ### extractMetadata(html) Extract metadata from HTML including Open Graph and Twitter cards. ```typescript import { extractMetadata } from "@vakra-dev/reader"; const metadata = extractMetadata(html); console.log(metadata.title); console.log(metadata.openGraph?.image); ``` --- ## Default Values ```typescript const DEFAULT_OPTIONS = { formats: ["markdown"], onlyMainContent: true, timeoutMs: 30000, batchConcurrency: 1, batchTimeoutMs: 300000, verbose: false, showChrome: false, }; const DEFAULT_CRAWL_OPTIONS = { depth: 1, maxPages: 20, scrape: false, delayMs: 1000, formats: ["markdown", "html"], scrapeConcurrency: 2, verbose: false, showChrome: false, }; const DEFAULT_POOL_CONFIG = { size: 2, retireAfterPages: 100, retireAfterMinutes: 30, maxQueueSize: 100, healthCheckIntervalMs: 300000, }; ``` --- ## See Also - [Getting Started](getting-started.md) - Quick start guide - [Architecture](architecture.md) - System design - [Browser Pool Guide](guides/browser-pool.md) - Pool management - [Cloudflare Bypass Guide](guides/cloudflare-bypass.md) - Challenge handling ================================================ FILE: docs/architecture.md ================================================ # Architecture This document describes the internal architecture of Reader, helping contributors 
understand how the system works. ## High-Level Overview ``` ┌─────────────────────────────────────────────────────────────────┐ │ Public API │ │ scrape() / crawl() / browser() │ └──────────┬─────────────────┬────────────────┬───────────────────┘ │ │ │ ┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼──────────┐ │ Scraper │ │ Crawler │ │ BrowserSession │ │ Class │ │ Class │ │ (CDP WebSocket)│ └─────┬─────┘ └─────┬─────┘ └─────┬──────────┘ │ │ │ └────────┬───────┘ │ own HeroCore │ │ ┌─────────▼─────────┐ ┌─────────▼─────────┐ │ TieredBrowserPool │ │ Dedicated Chrome │ │ (shared, pooled) │ │ (per-session) │ └─────────┬─────────┘ └───────────────────┘ │ ┌───────────────┼───────────────┐ │ │ │ ┌───▼──────────┐ ┌──▼──────────┐ ┌──▼────────────┐ │ Hero Config │ │ Orchestrator│ │ Formatters │ │ (TLS, DNS, etc.) │ │ Detection │ │ (MD, HTML, etc) │ └──────────────────┘ └─────────────────┘ └─────────────────┘ ``` ## Directory Structure ``` src/ ├── index.ts # Public API exports ├── scraper.ts # Scraper class - main scraping logic ├── crawler.ts # Crawler class - link discovery + scraping ├── types.ts # ScrapeOptions, ScrapeResult, etc. ├── crawl-types.ts # CrawlOptions, CrawlResult, etc. │ ├── browser/ │ ├── pool.ts # BrowserPool - manages Hero instances │ ├── hero-config.ts # Hero configuration (TLS, DNS, viewport) │ └── types.ts # IBrowserPool, PoolConfig, PoolStats │ ├── cloudflare/ │ ├── detector.ts # detectChallenge() - DOM/text matching │ ├── handler.ts # waitForChallengeResolution() - polling │ └── types.ts # ChallengeDetection, ResolutionResult │ ├── formatters/ │ ├── markdown.ts # formatToMarkdown() - uses supermarkdown │ ├── html.ts # formatToHTML() - full HTML document │ ├── postprocess.ts # Post-processing utilities │ └── index.ts # Re-exports all formatters │ ├── utils/ │ ├── content-cleaner.ts # cleanContent() - removes nav, ads │ ├── metadata-extractor.ts # extractMetadata() - OG tags, etc. 
│ ├── url-helpers.ts # URL validation, normalization │ ├── rate-limiter.ts # Simple delay-based rate limiting │ └── logger.ts # Pino logger with pretty print │ ├── proxy/ │ └── config.ts # createProxyUrl(), parseProxyUrl() │ └── cli/ └── index.ts # CLI using Commander.js ``` ## Core Components ### Scraper The `Scraper` class (`src/scraper.ts`) handles URL scraping: ```typescript class Scraper { constructor(options: ScrapeOptions) { ... } async scrape(): Promise<ScrapeResult> { // 1. Initialize browser pool // 2. Process URLs with concurrency control (p-limit) // 3. For each URL: fetch, detect challenges, extract content // 4. Format to requested output formats // 5. Aggregate results and metadata } private async scrapeSingleUrl(url: string): Promise { // 1. Acquire browser from pool // 2. Navigate to URL // 3. Detect Cloudflare challenge // 4. Wait for resolution if needed // 5. Extract HTML and metadata // 6. Clean content // 7. Format to outputs // 8. Release browser to pool } } ``` **Key design decisions:** - Uses `p-limit` for concurrency control - Each URL gets its own browser instance from the pool - Cloudflare detection runs before content extraction - All formatters run in parallel for each URL ### Crawler The `Crawler` class (`src/crawler.ts`) discovers links: ```typescript class Crawler { async crawl(): Promise<CrawlResult> { // BFS (Breadth-First Search) algorithm // 1. Start with seed URL at depth 0 // 2. Fetch page, extract links // 3. Filter links (same domain, patterns) // 4. Add to queue with depth + 1 // 5. Repeat until maxPages or maxDepth // 6.
Optionally scrape discovered URLs } } ``` **Key design decisions:** - BFS ensures shallow pages are discovered first - Respects `maxPages` and `depth` limits - Optional scraping reuses the Scraper class - Delay between requests for rate limiting ### Browser Pool The `BrowserPool` class (`src/browser/pool.ts`) manages Hero instances: ```typescript class BrowserPool { private instances: HeroInstance[]; private available: HeroInstance[]; private queue: PendingRequest[]; async initialize(): Promise<void> { ... } async acquire(): Promise<Hero> { ... } async release(hero: Hero): Promise<void> { ... } async withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T> { const hero = await this.acquire(); try { return await fn(hero); } finally { await this.release(hero); } } } ``` **Pool lifecycle:** 1. **Initialize** - Create `size` Hero instances 2. **Acquire** - Get available instance or queue the request 3. **Use** - Execute scraping logic 4. **Release** - Return to pool or recycle if stale 5. **Recycle** - Close old instance, create new one 6. **Shutdown** - Close all instances **Recycling triggers:** - After N pages (default: 100) - After N minutes (default: 30) - On health check failure ### Cloudflare Detection Detection happens in two phases: **1. Challenge Detection** (`src/cloudflare/detector.ts`): ```typescript async function detectChallenge(hero: Hero): Promise<ChallengeDetection> { // Check DOM for challenge elements const signals = []; // CSS selectors that indicate challenges if (await hero.document.querySelector("#challenge-form")) { signals.push({ type: "dom", selector: "#challenge-form" }); } // Text patterns that indicate challenges const bodyText = await hero.document.body.textContent; if (bodyText.includes("checking your browser")) { signals.push({ type: "text", pattern: "checking your browser" }); } return { isChallenge: signals.length > 0, type: determineType(signals), signals, }; } ``` **2.
Challenge Resolution** (`src/cloudflare/handler.ts`): ```typescript async function waitForChallengeResolution( hero: Hero, options: ResolutionOptions ): Promise<ResolutionResult> { const startTime = Date.now(); while (Date.now() - startTime < options.maxWaitMs) { // Check if URL changed (redirect after challenge) if ((await hero.url) !== options.initialUrl) { return { resolved: true, method: "redirect" }; } // Check if challenge elements disappeared const detection = await detectChallenge(hero); if (!detection.isChallenge) { return { resolved: true, method: "element_removal" }; } await sleep(options.pollIntervalMs); } return { resolved: false }; } ``` ### Formatters Each formatter transforms scraped pages into a specific format: | Formatter | Input | Output | |-----------|-------|--------| | `formatToMarkdown` | Pages, metadata | Markdown document with frontmatter | | `formatToHTML` | Pages, metadata | Complete HTML document with CSS | **Markdown formatter** uses [supermarkdown](https://github.com/vakra-dev/supermarkdown) - a high-performance Rust-based HTML-to-Markdown converter with full GFM support. ## Data Flow ### Scrape Request Flow ``` scrape({ urls: ["https://example.com"], formats: ["markdown"] }) │ ├─► Scraper.scrape() │ │ │ ├─► BrowserPool.initialize(size=concurrency) │ │ │ ├─► For each URL (controlled by p-limit): │ │ │ │ │ ├─► pool.withBrowser(async hero => { │ │ │ │ │ │ │ ├─► hero.goto(url) │ │ │ │ │ │ │ ├─► detectChallenge(hero) │ │ │ │ └─► Returns { isChallenge, type, signals } │ │ │ │ │ │ │ ├─► if (isChallenge): │ │ │ │ └─► waitForChallengeResolution(hero) │ │ │ │ │ │ │ ├─► Extract title, HTML │ │ │ │ │ │ │ ├─► cleanContent(html) │ │ │ │ └─► Remove nav, ads, scripts │ │ │ │ │ │ │ ├─► extractMetadata(html) │ │ │ │ └─► OG tags, Twitter cards, etc.
│ │ │ │ │ │ │ └─► Format to requested formats │ │ │ }) │ │ │ │ │ └─► Add to results array │ │ │ ├─► pool.shutdown() │ │ │ └─► Return ScrapeResult { data[], batchMetadata } │ └─► Result returned to caller ``` ### Crawl Request Flow ``` crawl({ url: "https://example.com", depth: 2, scrape: true }) │ ├─► Crawler.crawl() │ │ │ ├─► Initialize queue with seed URL at depth 0 │ │ │ ├─► BFS loop (while queue not empty && pages < maxPages): │ │ │ │ │ ├─► Dequeue next URL │ │ │ │ │ ├─► Fetch page with Hero │ │ │ │ │ ├─► Extract links via regex │ │ │ │ │ ├─► Filter links: │ │ │ ├─► Same domain only │ │ │ ├─► Match includePatterns │ │ │ └─► Exclude excludePatterns │ │ │ │ │ ├─► Add new links to queue with depth + 1 │ │ │ │ │ ├─► Rate limit (delay between requests) │ │ │ │ │ └─► Add to discovered URLs │ │ │ ├─► If scrape=true: │ │ └─► scrape({ urls: discoveredUrls }) │ │ │ └─► Return CrawlResult { urls[], scraped?, metadata } │ └─► Result returned to caller ``` ## Design Decisions ### Why Hero? [Ulixee Hero](https://ulixee.org/) was chosen for: 1. **Stealth** - Advanced TLS fingerprinting and anti-detection 2. **Speed** - Optimized for headless automation 3. **API** - Clean async/await interface 4. **Stability** - Production-tested at scale ### Pool vs Per-Request Browsers We use a pool because: - Browser startup is slow (~2-3 seconds) - Memory overhead per browser is high - Connection reuse improves performance Trade-off: Stale browsers can accumulate state, so we recycle them periodically. 
### Cloudflare Detection Strategy Multi-signal approach because: - No single indicator is 100% reliable - Cloudflare changes their challenge pages - Different challenge types have different signatures Detection signals include: - DOM elements (`#challenge-form`, `.cf-browser-verification`) - Text patterns ("checking your browser", "ray id") - URL patterns (`/cdn-cgi/challenge-platform/`) - HTTP status codes ### Content Cleaning We clean HTML before formatting because: - Navigation, ads, scripts bloat output - LLMs perform better with focused content - Reduces token usage Cleaning removes: - Navigation, ads, and scripts - Other non-content elements

Content

`; const result = cleanContent(html, "https://example.com"); expect(result).not.toContain(" { const html = `

Content

`; const result = cleanContent(html, "https://example.com"); expect(result).not.toContain(" { const html = `

Content

`; const result = cleanContent(html, "https://example.com"); expect(result).not.toContain("Enable JS"); }); }); describe("onlyMainContent navigation removal", () => { it("removes nav, header, footer when onlyMainContent=true", () => { const html = `
Site header

Main article content here that is long enough to not be filtered

Footer info
`; const result = cleanContent(html, "https://example.com", { onlyMainContent: true }); expect(result).toContain("Main article content"); expect(result).not.toContain("Navigation links"); expect(result).not.toContain("Footer info"); }); it("keeps nav, header, footer when onlyMainContent=false", () => { const html = `

Main content

Footer info
`; const result = cleanContent(html, "https://example.com", { onlyMainContent: false }); expect(result).toContain("Navigation links"); expect(result).toContain("Main content"); expect(result).toContain("Footer info"); }); it("protects #content from removal even if it's inside a removable element", () => { const html = `

This is the real content

`; const result = cleanContent(html, "https://example.com", { onlyMainContent: true }); expect(result).toContain("This is the real content"); }); }); describe("does NOT strip legitimate content", () => { it("preserves body with class containing 'dialog' substring", () => { // Regression test: Wikipedia's // was being nuked by the old [class*="dialog"] wildcard selector. const html = `

This is the real article content that should survive cleaning.

`; const result = cleanContent(html, "https://en.wikipedia.org/wiki/Test", { onlyMainContent: true }); expect(result).toContain("real article content"); }); it("preserves forms and inputs (they may contain visible text)", () => { const html = `

Content

`; const result = cleanContent(html, "https://example.com", { onlyMainContent: false }); expect(result).toContain("Search:"); }); it("preserves aria-hidden elements (may be re-shown by JS)", () => { const html = `

Visible

`; const result = cleanContent(html, "https://example.com", { onlyMainContent: false }); expect(result).toContain("Hidden but potentially real content"); }); }); describe("Wikipedia content extraction", () => { it("preserves Wikipedia article body through #mw-content-text protection", () => { const html = `

Web scraping

Web scraping is the process of extracting data from websites. ${"More body text. ".repeat(20)}

It involves making HTTP requests, parsing HTML, and extracting the content of interest.

Wikipedia footer
`; const result = cleanContent(html, "https://en.wikipedia.org/wiki/Web_scraping", { onlyMainContent: true, }); expect(result).toContain("Web scraping is the process"); expect(result).toContain("HTTP requests"); expect(result).not.toContain("Wikipedia footer"); expect(result).not.toContain("Log in"); }); }); describe("docs.anthropic.com content extraction", () => { it("preserves Mintlify-style main.relative content", () => { const html = `

Welcome to Claude

Claude is an AI assistant. ${"Documentation body text. ".repeat(15)}

Get started by reading the API reference.

Doc footer
`; const result = cleanContent(html, "https://docs.anthropic.com/en/docs/welcome", { onlyMainContent: true, }); expect(result).toContain("Welcome to Claude"); expect(result).toContain("Documentation body text"); expect(result).not.toContain("Doc footer"); }); }); describe("selector filtering", () => { it("applies excludeTags correctly", () => { const html = `
User comments here

Main content paragraph

`; const result = cleanContent(html, "https://example.com", { excludeTags: [".comments"], }); expect(result).not.toContain("User comments"); expect(result).toContain("Main content"); }); it("applies includeTags correctly", () => { const html = `
Article text
`; const result = cleanContent(html, "https://example.com", { includeTags: [".article-content"], }); expect(result).toContain("Article text"); }); }); describe("edge cases", () => { it("handles empty HTML without crashing", () => { // linkedom may throw on truly empty input expect(() => cleanContent("", "https://example.com")).toThrow(); }); it("handles HTML with only whitespace without crashing", () => { expect(() => cleanContent(" \n\t ", "https://example.com")).toThrow(); }); it("handles minimal HTML structure", () => { const result = cleanContent("", "https://example.com"); expect(result).toBeDefined(); }); it("preserves text content through cleaning", () => { const html = `

Title

Paragraph with bold text.

`; const result = cleanContent(html, "https://example.com"); expect(result).toContain("Title"); expect(result).toContain("bold"); }); }); describe("URL handling", () => { it("absolutifies relative URLs", () => { const html = `Link`; const result = cleanContent(html, "https://example.com"); expect(result).toContain("https://example.com/page"); expect(result).toContain("https://example.com/img.png"); }); it("resolves srcset to largest image", () => { const html = ``; const result = cleanContent(html, "https://example.com"); // srcset resolves to large.jpg, then URL absolutifier makes it https://example.com/large.jpg expect(result).toContain("large.jpg"); expect(result).not.toContain('src="tiny.jpg"'); }); }); describe("base64 image removal", () => { it("removes base64 img elements when removeBase64Images=true", () => { const html = `

Content

`; const result = cleanContent(html, "https://example.com", { removeBase64Images: true }); expect(result).not.toContain("data:image"); expect(result).toContain("Content"); }); }); }); ================================================ FILE: tests/unit/crawler.test.ts ================================================ /** * Crawler Tests * * Tests link extraction, depth limiting, maxPages cap, URL dedup, * same-domain filtering, and robots.txt compliance. We mock fetchPage * and fetchRobotsTxt to avoid needing a live browser or network. */ import { describe, it, expect, vi, beforeEach } from "vitest"; import { Crawler } from "../../src/crawler"; import type { IBrowserPool } from "../../src/browser/types"; import type { CrawlResult } from "../../src/crawl-types"; // ── Mock robots parser (no network) ────────────────────────────────────────── vi.mock("../../src/utils/robots-parser", () => ({ fetchRobotsTxt: vi.fn().mockResolvedValue(null), // no robots.txt by default isUrlAllowed: vi.fn().mockReturnValue(true), })); vi.mock("../../src/utils/rate-limiter", () => ({ rateLimit: vi.fn().mockResolvedValue(undefined), // skip delays in tests })); // ── Helpers ────────────────────────────────────────────────────────────────── /** Minimal mock pool that satisfies the constructor check */ function mockPool(): IBrowserPool { return { withBrowser: vi.fn(), shutdown: vi.fn().mockResolvedValue(undefined), getStats: vi.fn().mockReturnValue({ size: 1, active: 0, idle: 1, pending: 0 }), isReady: vi.fn().mockReturnValue(true), } as unknown as IBrowserPool; } /** * Create a Crawler with mocked fetchPage. Returns the crawler and the * fetchPage mock so tests can control what each page returns. */ function createTestCrawler(options: { url: string; depth?: number; maxPages?: number; includePatterns?: string[]; excludePatterns?: string[]; }) { const crawler = new Crawler({ url: options.url, depth: options.depth ?? 1, maxPages: options.maxPages ?? 
20, delayMs: 0, // no delay in tests pool: mockPool(), includePatterns: options.includePatterns, excludePatterns: options.excludePatterns, }); // Suppress log noise (crawler as any).logger = { info: vi.fn(), warn: vi.fn(), error: vi.fn(), debug: vi.fn(), }; const fetchPageMock = vi.fn<[string], Promise<{ crawlUrl: { url: string; title: string; description: string | null }; html: string } | null>>(); (crawler as any).fetchPage = fetchPageMock; return { crawler, fetchPageMock }; } /** Build a simple HTML page with links */ function makeHtml(links: string[], title = "Test Page"): string { const anchors = links.map((href) => `Link`).join("\n"); return `${title}${anchors}`; } /** Build a fetchPage result */ function pageResult(url: string, html: string, title = "Test Page") { return { crawlUrl: { url, title, description: null }, html, }; } // ── Tests ──────────────────────────────────────────────────────────────────── describe("Crawler", () => { beforeEach(() => { vi.clearAllMocks(); }); describe("constructor", () => { it("defaults depth=1, maxPages=20", () => { const crawler = new Crawler({ url: "https://example.com" }); expect((crawler as any).options.depth).toBe(1); expect((crawler as any).options.maxPages).toBe(20); }); }); describe("link extraction", () => { it("extracts same-domain absolute links", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock .mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "https://example.com/page1", "https://example.com/page2", "https://other.com/external", // different domain ]), )) .mockResolvedValueOnce(pageResult("https://example.com/page1", makeHtml([]))) .mockResolvedValueOnce(pageResult("https://example.com/page2", makeHtml([]))); const result = await crawler.crawl(); // Seed + 2 same-domain links (external filtered) expect(result.urls).toHaveLength(3); expect(result.urls.map((u) => u.url)).toContain("https://example.com/page1"); 
expect(result.urls.map((u) => u.url)).toContain("https://example.com/page2"); }); it("resolves relative URLs against the page base URL", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock .mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["/about", "./contact", "blog/post1"]), )) .mockResolvedValueOnce(pageResult("https://example.com/about", makeHtml([]))) .mockResolvedValueOnce(pageResult("https://example.com/contact", makeHtml([]))) .mockResolvedValueOnce(pageResult("https://example.com/blog/post1", makeHtml([]))); const result = await crawler.crawl(); const urls = result.urls.map((u) => u.url); expect(urls).toContain("https://example.com/about"); expect(urls).toContain("https://example.com/contact"); }); it("skips fragment-only links", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["#section1", "#top", "https://example.com/real-page"]), )); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/real-page", makeHtml([]))); const result = await crawler.crawl(); expect(result.urls).toHaveLength(2); // seed + real-page, not fragments }); it("skips non-HTTP schemes (mailto, javascript, tel, etc.)", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "mailto:test@example.com", "javascript:void(0)", "tel:+1234567890", "data:text/html,hello", "ftp://files.example.com/file", "https://example.com/valid", ]), )); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/valid", makeHtml([]))); const result = await crawler.crawl(); expect(result.urls).toHaveLength(2); // seed + valid }); it("strips hash fragments from discovered URLs", async () => { const { crawler, fetchPageMock } 
= createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock .mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["https://example.com/page#section1"]), )) .mockResolvedValueOnce(pageResult("https://example.com/page", makeHtml([]))); const result = await crawler.crawl(); expect(result.urls[1].url).toBe("https://example.com/page"); }); }); describe("depth limiting", () => { it("does not extract links when at max depth", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); // depth=0 (seed) → links extracted at depth=1 fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["https://example.com/level1"]), )); // depth=1 → at max depth, links NOT extracted (even though page has them) fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com/level1", makeHtml(["https://example.com/level2"]), )); const result = await crawler.crawl(); expect(result.urls).toHaveLength(2); // seed + level1, NOT level2 expect(result.urls.map((u) => u.url)).not.toContain("https://example.com/level2"); }); it("crawls deeper with depth=2", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 2, }); fetchPageMock .mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["https://example.com/a"]), )) .mockResolvedValueOnce(pageResult( "https://example.com/a", makeHtml(["https://example.com/a/b"]), )) .mockResolvedValueOnce(pageResult( "https://example.com/a/b", makeHtml(["https://example.com/a/b/c"]), // depth=2, at max, won't extract )); const result = await crawler.crawl(); expect(result.urls).toHaveLength(3); // seed + a + a/b expect(result.urls.map((u) => u.url)).not.toContain("https://example.com/a/b/c"); }); }); describe("maxPages cap", () => { it("stops after reaching maxPages", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, maxPages: 3, }); 
fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "https://example.com/p1", "https://example.com/p2", "https://example.com/p3", "https://example.com/p4", "https://example.com/p5", ]), )); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/p1", makeHtml([]))); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/p2", makeHtml([]))); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/p3", makeHtml([]))); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/p4", makeHtml([]))); const result = await crawler.crawl(); expect(result.urls).toHaveLength(3); // capped at maxPages }); }); describe("URL deduplication", () => { it("does not visit the same URL twice", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "https://example.com/page", "https://example.com/page", // duplicate "https://example.com/page", // duplicate ]), )); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/page", makeHtml([]))); const result = await crawler.crawl(); expect(result.urls).toHaveLength(2); // seed + page (not 4) expect(fetchPageMock).toHaveBeenCalledTimes(2); // only fetched twice }); }); describe("failed pages", () => { it("continues crawling when fetchPage returns null", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml(["https://example.com/broken", "https://example.com/ok"]), )); fetchPageMock.mockResolvedValueOnce(null); // broken page fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com/ok", makeHtml([]))); const result = await crawler.crawl(); // seed + ok (broken didn't add to urls) expect(result.urls).toHaveLength(2); expect(result.urls.map((u) => 
u.url)).toContain("https://example.com/ok"); }); }); describe("metadata", () => { it("returns correct metadata with seed URL and duration", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, maxPages: 5, }); fetchPageMock.mockResolvedValueOnce(pageResult("https://example.com", makeHtml([]))); const result = await crawler.crawl(); expect(result.metadata.seedUrl).toBe("https://example.com"); expect(result.metadata.maxDepth).toBe(1); expect(result.metadata.totalUrls).toBe(1); expect(result.metadata.totalDuration).toBeGreaterThanOrEqual(0); }); }); describe("include/exclude patterns", () => { it("respects includePatterns filter", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, includePatterns: ["/blog/"], }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "https://example.com/blog/post1", "https://example.com/about", // excluded by include pattern ]), )); fetchPageMock.mockResolvedValueOnce( pageResult("https://example.com/blog/post1", makeHtml([])), ); const result = await crawler.crawl(); const urls = result.urls.map((u) => u.url); expect(urls).toContain("https://example.com/blog/post1"); expect(urls).not.toContain("https://example.com/about"); }); it("respects excludePatterns filter", async () => { const { crawler, fetchPageMock } = createTestCrawler({ url: "https://example.com", depth: 1, excludePatterns: ["/admin"], }); fetchPageMock.mockResolvedValueOnce(pageResult( "https://example.com", makeHtml([ "https://example.com/page1", "https://example.com/admin/dashboard", // excluded ]), )); fetchPageMock.mockResolvedValueOnce( pageResult("https://example.com/page1", makeHtml([])), ); const result = await crawler.crawl(); const urls = result.urls.map((u) => u.url); expect(urls).toContain("https://example.com/page1"); expect(urls).not.toContain("https://example.com/admin/dashboard"); }); }); }); 
================================================ FILE: tests/unit/daemon-dispatch.test.ts ================================================ import { describe, it, expect, beforeEach, vi } from "vitest"; import { Readable } from "stream"; import http from "http"; import { DaemonServer } from "../../src/daemon/server"; /** * Unit tests for DaemonServer POST / request dispatch. * * These test the handleRequest method directly (via `as any`) with mock * IncomingMessage and ServerResponse objects, avoiding the need to start * a real server or browser pool. */ // ---- Helpers ---- /** Create a mock IncomingMessage from method, url, body string, and optional headers. */ function mockReq( method: string, url: string, body: string = "", headers: Record = {}, ): http.IncomingMessage { const readable = new Readable({ read() { this.push(body); this.push(null); }, }); // Overlay the HTTP-specific properties onto the Readable stream. Object.assign(readable, { method, url, headers: { "content-type": "application/json", ...headers, }, }); return readable as unknown as http.IncomingMessage; } /** Captured response data from a mock ServerResponse. */ interface CapturedResponse { statusCode: number; headers: Record; body: any; } /** Create a mock ServerResponse that captures writeHead/end calls. 
*
* `captured()` reports the last status code and headers passed to `writeHead`
* (status defaults to 200) plus everything passed to `end()`, parsed as JSON
* when possible and returned as the raw string otherwise.
*/
function mockRes(): { res: http.ServerResponse; captured: () => CapturedResponse } {
  let statusCode = 200;
  let responseHeaders: Record<string, string> = {};
  const bodyChunks: string[] = [];

  const fake = {
    writeHead(code: number, headers?: Record<string, string>) {
      statusCode = code;
      if (headers) responseHeaders = headers;
    },
    end(data?: string) {
      if (data) bodyChunks.push(data);
    },
  };

  return {
    // Only writeHead/end are implemented; the cast is deliberate for test use.
    res: fake as unknown as http.ServerResponse,
    captured: () => ({
      statusCode,
      headers: responseHeaders,
      body: (() => {
        const raw = bodyChunks.join("");
        try {
          return JSON.parse(raw);
        } catch {
          // Not JSON (or empty): hand back the raw payload for inspection.
          return raw;
        }
      })(),
    }),
  };
}

// ---- Tests ----

describe("DaemonServer POST / dispatch", () => {
  let daemon: DaemonServer;
  let handleRequest: (req: http.IncomingMessage, res: http.ServerResponse) => Promise<void>;

  // Mock client with scrape, crawl, isReady
  const mockClient = {
    scrape: vi.fn(),
    crawl: vi.fn(),
    isReady: vi.fn(() => true),
  };

  beforeEach(() => {
    vi.clearAllMocks();
    daemon = new DaemonServer({ port: 0 });
    // Inject mock client without starting the server
    (daemon as any).client = mockClient;
    // Set startTime so status uptime works
    (daemon as any).startTime = Date.now();
    // Bind handleRequest
    handleRequest = (daemon as any).handleRequest.bind(daemon);
  });

  // 1. action=scrape calls client.scrape and returns result
  it("dispatches action=scrape to client.scrape and returns 200", async () => {
    const scrapeResult = { data: [{ url: "https://example.com", markdown: "# Hello" }] };
    mockClient.scrape.mockResolvedValue(scrapeResult);

    const req = mockReq("POST", "/", JSON.stringify({
      action: "scrape",
      options: { urls: ["https://example.com"] },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(200);
    expect(out.body.success).toBe(true);
    expect(out.body.data).toEqual(scrapeResult);
    expect(mockClient.scrape).toHaveBeenCalledWith({ urls: ["https://example.com"] });
  });

  // 2. action=crawl calls client.crawl and returns result
  it("dispatches action=crawl to client.crawl and returns 200", async () => {
    const crawlResult = { urls: ["https://example.com", "https://example.com/about"] };
    mockClient.crawl.mockResolvedValue(crawlResult);

    const req = mockReq("POST", "/", JSON.stringify({
      action: "crawl",
      options: { url: "https://example.com", depth: 2 },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(200);
    expect(out.body.success).toBe(true);
    expect(out.body.data).toEqual(crawlResult);
    expect(mockClient.crawl).toHaveBeenCalledWith({ url: "https://example.com", depth: 2 });
  });

  // 3. action=status returns pool stats
  it("dispatches action=status and returns daemon status", async () => {
    const req = mockReq("POST", "/", JSON.stringify({ action: "status" }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(200);
    expect(out.body.success).toBe(true);
    expect(out.body.data.running).toBe(true);
    expect(out.body.data.ready).toBe(true);
    expect(typeof out.body.data.uptime).toBe("number");
    expect(typeof out.body.data.pid).toBe("number");
    expect(typeof out.body.data.activeRequests).toBe("number");
  });

  // 4. action=unknown returns 400
  it("returns 400 for unknown action", async () => {
    const req = mockReq("POST", "/", JSON.stringify({ action: "bogus" }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(400);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Unknown action");
  });

  // 5. Invalid JSON returns 400
  it("returns 400 for invalid JSON body", async () => {
    const req = mockReq("POST", "/", "not-json{{{");
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(400);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Invalid JSON");
  });

  // 6. During shutdown returns 503
  it("returns 503 when server is shutting down", async () => {
    (daemon as any).shuttingDown = true;

    const req = mockReq("POST", "/", JSON.stringify({ action: "scrape", options: { urls: ["https://example.com"] } }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(503);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Server is shutting down");
  });

  // 7. Client is null returns 500
  it("returns 500 when client is not initialized (scrape)", async () => {
    (daemon as any).client = null;

    const req = mockReq("POST", "/", JSON.stringify({
      action: "scrape",
      options: { urls: ["https://example.com"] },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(500);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Client not initialized");
  });

  it("returns 500 when client is not initialized (crawl)", async () => {
    (daemon as any).client = null;

    const req = mockReq("POST", "/", JSON.stringify({
      action: "crawl",
      options: { url: "https://example.com" },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(500);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Client not initialized");
  });

  // 8. Scrape that throws returns 500 with error message
  it("returns 500 when client.scrape throws", async () => {
    mockClient.scrape.mockRejectedValue(new Error("Browser crashed"));

    const req = mockReq("POST", "/", JSON.stringify({
      action: "scrape",
      options: { urls: ["https://example.com"] },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(500);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Browser crashed");
  });

  it("returns 500 when client.crawl throws", async () => {
    mockClient.crawl.mockRejectedValue(new Error("Timeout exceeded"));

    const req = mockReq("POST", "/", JSON.stringify({
      action: "crawl",
      options: { url: "https://example.com" },
    }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(500);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Timeout exceeded");
  });

  // 9. GET /health returns 200 (no auth needed)
  it("GET /health returns 200 without auth", async () => {
    // Re-create daemon with auth token to prove /health skips auth
    daemon = new DaemonServer({ port: 0, authToken: "secret" });
    (daemon as any).client = mockClient;
    handleRequest = (daemon as any).handleRequest.bind(daemon);

    const req = mockReq("GET", "/health");
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(200);
    expect(out.body.success).toBe(true);
    expect(out.body.data.status).toBe("ok");
  });

  // 10. POST / without auth token returns 401
  it("returns 401 when auth is required but missing", async () => {
    daemon = new DaemonServer({ port: 0, authToken: "secret" });
    (daemon as any).client = mockClient;
    (daemon as any).startTime = Date.now();
    handleRequest = (daemon as any).handleRequest.bind(daemon);

    const req = mockReq("POST", "/", JSON.stringify({ action: "status" }));
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(401);
    expect(out.body.success).toBe(false);
    expect(out.body.error).toBe("Unauthorized");
  });

  it("allows POST / with correct auth token", async () => {
    daemon = new DaemonServer({ port: 0, authToken: "secret" });
    (daemon as any).client = mockClient;
    (daemon as any).startTime = Date.now();
    handleRequest = (daemon as any).handleRequest.bind(daemon);

    const req = mockReq("POST", "/", JSON.stringify({ action: "status" }), {
      authorization: "Bearer secret",
    });
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(200);
    expect(out.body.success).toBe(true);
    expect(out.body.data.running).toBe(true);
  });

  // Edge case: 404 for non-POST non-GET routes
  it("returns 404 for unsupported method/path", async () => {
    const req = mockReq("PUT", "/");
    const { res, captured } = mockRes();

    await handleRequest(req, res);

    const out = captured();
    expect(out.statusCode).toBe(404);
    expect(out.body.error).toBe("Not found");
  });

  // Edge case: activeRequests counter is decremented even on error
  it("decrements activeRequests after scrape error", async () => {
    mockClient.scrape.mockRejectedValue(new Error("fail"));
    expect((daemon as any).activeRequests).toBe(0);

    const req = mockReq("POST", "/", JSON.stringify({
      action: "scrape",
      options: { urls: ["https://example.com"] },
    }));
    const { res } = mockRes();

    await handleRequest(req, res);

    expect((daemon as any).activeRequests).toBe(0);
  });
});

================================================
FILE:
tests/unit/domain-profiles.test.ts
================================================
import { describe, it, expect } from "vitest";
import { getDomainProfile, applyDomainProfile } from "../../src/config/domain-profiles";

// Test profiles — reader has no built-in profiles, so we provide our own
const TEST_PROFILES = {
  "amazon.com": { proxyTier: "residential" as const, timeoutMs: 60000, batchConcurrency: 2 },
  "amazon.co.uk": { proxyTier: "residential" as const, timeoutMs: 60000 },
  "amazon.de": { proxyTier: "residential" as const, timeoutMs: 60000 },
  "amazon.co.jp": { proxyTier: "residential" as const, timeoutMs: 60000 },
  "linkedin.com": { proxyTier: "residential" as const, timeoutMs: 60000 },
  "google.com": { batchConcurrency: 1 },
};

// Lookup behaviour: exact hosts, www-prefixed hosts, subdomains and full URLs.
describe("getDomainProfile", () => {
  describe("exact domain match", () => {
    it("returns profile for amazon.com", () => {
      const matched = getDomainProfile("amazon.com", TEST_PROFILES);

      expect(matched).toBeDefined();
      expect(matched?.proxyTier).toBe("residential");
      expect(matched?.timeoutMs).toBe(60000);
    });

    it("returns profile for linkedin.com", () => {
      const matched = getDomainProfile("linkedin.com", TEST_PROFILES);

      expect(matched).toBeDefined();
      expect(matched?.proxyTier).toBe("residential");
    });

    it("returns undefined for unknown domain", () => {
      const matched = getDomainProfile("example.com", TEST_PROFILES);

      expect(matched).toBeUndefined();
    });

    it("returns undefined when no profiles provided", () => {
      // Omitted, explicitly undefined, and empty profile maps all yield no match.
      expect(getDomainProfile("amazon.com")).toBeUndefined();
      expect(getDomainProfile("amazon.com", undefined)).toBeUndefined();
      expect(getDomainProfile("amazon.com", {})).toBeUndefined();
    });
  });

  describe("www stripping", () => {
    it("strips www. prefix before lookup", () => {
      const matched = getDomainProfile("www.amazon.com", TEST_PROFILES);

      expect(matched).toBeDefined();
      expect(matched?.proxyTier).toBe("residential");
    });
  });

  describe("subdomain matching", () => {
    it("matches shop.amazon.com to amazon.com profile", () => {
      const matched = getDomainProfile("shop.amazon.com", TEST_PROFILES);

      expect(matched).toBeDefined();
      expect(matched?.proxyTier).toBe("residential");
    });

    it("matches smile.amazon.com to amazon.com profile", () => {
      expect(getDomainProfile("smile.amazon.com", TEST_PROFILES)).toBeDefined();
    });

    it("does not match amazonclone.com to amazon.com", () => {
      const matched = getDomainProfile("amazonclone.com", TEST_PROFILES);

      expect(matched).toBeUndefined();
    });
  });

  describe("full URL input", () => {
    it("extracts hostname from full URL", () => {
      const matched = getDomainProfile("https://www.amazon.com/dp/B08N5WRWNW", TEST_PROFILES);

      expect(matched).toBeDefined();
      expect(matched?.proxyTier).toBe("residential");
    });

    it("handles URL with port", () => {
      expect(getDomainProfile("https://amazon.com:443/dp/B08N5WRWNW", TEST_PROFILES)).toBeDefined();
    });

    it("returns undefined for invalid URL", () => {
      const matched = getDomainProfile("not a url at all", TEST_PROFILES);

      expect(matched).toBeUndefined();
    });
  });

  describe("international Amazon domains", () => {
    // One case per regional domain; generated names are "matches <domain>".
    it.each(["amazon.co.uk", "amazon.de", "amazon.co.jp"])("matches %s", (domain) => {
      expect(getDomainProfile(domain, TEST_PROFILES)).toBeDefined();
    });
  });
});

// Merging a matched profile into caller-supplied scrape options.
describe("applyDomainProfile", () => {
  it("applies profile values when user has not set them", () => {
    const options = { urls: ["https://amazon.com"], formats: ["markdown" as const] };
    const profile = { proxyTier: "residential" as const, timeoutMs: 60000 };
    const merged = applyDomainProfile(options, profile);
expect(merged.timeoutMs).toBe(60000);
    expect(merged.proxyTier).toBe("residential");
  });

  it("does not override user-provided values", () => {
    const options = { urls: ["https://amazon.com"], timeoutMs: 15000, proxyTier: "datacenter" as const };
    const profile = { proxyTier: "residential" as const, timeoutMs: 60000 };
    const merged = applyDomainProfile(options, profile);
    expect(merged.timeoutMs).toBe(15000);
    expect(merged.proxyTier).toBe("datacenter");
  });

  it("preserves all original options", () => {
    const options = {
      urls: ["https://amazon.com"],
      formats: ["markdown" as const],
      onlyMainContent: true,
      verbose: true,
    };
    const profile = { proxyTier: "residential" as const };
    const merged = applyDomainProfile(options, profile);
    expect(merged.urls).toEqual(["https://amazon.com"]);
    expect(merged.formats).toEqual(["markdown"]);
    expect(merged.onlyMainContent).toBe(true);
    expect(merged.verbose).toBe(true);
  });
});

================================================
FILE: tests/unit/errors.test.ts
================================================
import { describe, it, expect } from "vitest";
import {
  ReaderError,
  ReaderErrorCode,
  NetworkError,
  TimeoutError,
  CloudflareError,
  AccessDeniedError,
  DNSError,
  TLSError,
  BotDetectedError,
  ProxyConnectionError,
  ProxyExhaustedError,
  ContentTooLargeError,
  MarkdownConversionError,
  EmptyContentError,
  BrowserPoolError,
  ClientClosedError,
  NotInitializedError,
  RobotsBlockedError,
  InvalidUrlError,
  wrapError,
} from "../../src/errors";
import { ScrapeFailedError } from "../../src/engines/errors";

// Each error class is checked for its code, its extra fields, its retryable
// flag, and its JSON serialization.
describe("Error types", () => {
  describe("error codes", () => {
    it("NetworkError has NETWORK_ERROR code", () => {
      const err = new NetworkError("Connection failed", { url: "https://example.com" });
      expect(err.code).toBe(ReaderErrorCode.NETWORK_ERROR);
    });

    it("TimeoutError has TIMEOUT code", () => {
      const err = new TimeoutError("Timed out", 30000);
      expect(err.code).toBe(ReaderErrorCode.TIMEOUT);
      expect(err.timeoutMs).toBe(30000);
    });

    it("DNSError has DNS_ERROR code", () => {
      const err = new DNSError("nonexistent.example.com");
      expect(err.code).toBe(ReaderErrorCode.DNS_ERROR);
      expect(err.hostname).toBe("nonexistent.example.com");
    });

    it("TLSError has TLS_ERROR code", () => {
      const err = new TLSError("Certificate expired");
      expect(err.code).toBe(ReaderErrorCode.TLS_ERROR);
    });

    it("BotDetectedError has BOT_DETECTED code", () => {
      const err = new BotDetectedError("Amazon block page");
      expect(err.code).toBe(ReaderErrorCode.BOT_DETECTED);
      expect(err.signal).toBe("Amazon block page");
    });

    it("ProxyConnectionError has PROXY_CONNECTION_ERROR code", () => {
      const err = new ProxyConnectionError("datacenter");
      expect(err.code).toBe(ReaderErrorCode.PROXY_CONNECTION_ERROR);
      expect(err.proxyTier).toBe("datacenter");
    });

    it("ProxyExhaustedError has PROXY_EXHAUSTED code", () => {
      const err = new ProxyExhaustedError();
      expect(err.code).toBe(ReaderErrorCode.PROXY_EXHAUSTED);
    });

    it("ContentTooLargeError has CONTENT_TOO_LARGE code", () => {
      const err = new ContentTooLargeError(500000, 300000);
      expect(err.code).toBe(ReaderErrorCode.CONTENT_TOO_LARGE);
      expect(err.sizeBytes).toBe(500000);
      expect(err.limitBytes).toBe(300000);
    });

    it("MarkdownConversionError has MARKDOWN_CONVERSION_FAILED code", () => {
      const err = new MarkdownConversionError("Formatting argument out of range");
      expect(err.code).toBe(ReaderErrorCode.MARKDOWN_CONVERSION_FAILED);
    });

    it("EmptyContentError has EMPTY_CONTENT code", () => {
      const err = new EmptyContentError(10);
      expect(err.code).toBe(ReaderErrorCode.EMPTY_CONTENT);
      expect(err.contentLength).toBe(10);
    });

    it("ScrapeFailedError wraps underlying error with proxyBlock flag", () => {
      const inner = new Error("timeout");
      const err = new ScrapeFailedError(inner, { proxyBlock: true });
      expect(err.name).toBe("ScrapeFailedError");
      expect(err.proxyBlock).toBe(true);
      expect(err.cause).toBe(inner);
    });
  });

  describe("retryable flags", () => {
    it("NetworkError is retryable", () => {
      expect(new NetworkError("fail").retryable).toBe(true);
    });

    it("TimeoutError is retryable", () => {
      expect(new TimeoutError("timeout", 1000).retryable).toBe(true);
    });

    it("CloudflareError is retryable", () => {
      expect(new CloudflareError("turnstile").retryable).toBe(true);
    });

    it("BotDetectedError is retryable", () => {
      expect(new BotDetectedError("amazon").retryable).toBe(true);
    });

    it("ProxyConnectionError is retryable", () => {
      expect(new ProxyConnectionError("datacenter").retryable).toBe(true);
    });

    it("TLSError is retryable", () => {
      expect(new TLSError("cert expired").retryable).toBe(true);
    });

    it("EmptyContentError is retryable", () => {
      expect(new EmptyContentError(0).retryable).toBe(true);
    });

    it("BrowserPoolError is retryable", () => {
      expect(new BrowserPoolError("pool full").retryable).toBe(true);
    });

    it("AccessDeniedError is NOT retryable", () => {
      expect(new AccessDeniedError("403").retryable).toBe(false);
    });

    it("DNSError is NOT retryable", () => {
      expect(new DNSError("bad.host").retryable).toBe(false);
    });

    it("ProxyExhaustedError is NOT retryable", () => {
      expect(new ProxyExhaustedError().retryable).toBe(false);
    });

    it("ContentTooLargeError is NOT retryable", () => {
      expect(new ContentTooLargeError(1, 1).retryable).toBe(false);
    });

    it("ScrapeFailedError extends Error", () => {
      const err = new ScrapeFailedError(new Error("test"));
      expect(err).toBeInstanceOf(Error);
      expect(err.name).toBe("ScrapeFailedError");
    });

    it("ClientClosedError is NOT retryable", () => {
      expect(new ClientClosedError().retryable).toBe(false);
    });

    it("InvalidUrlError is NOT retryable", () => {
      expect(new InvalidUrlError("bad-url").retryable).toBe(false);
    });

    it("RobotsBlockedError is NOT retryable", () => {
      expect(new RobotsBlockedError("https://example.com/secret").retryable).toBe(false);
    });
  });

  describe("toJSON serialization", () => {
    it("serializes base ReaderError correctly", () => {
      const err = new NetworkError("Connection lost", { url: "https://example.com" });
      const json = err.toJSON();
      expect(json.name).toBe("NetworkError");
      expect(json.code).toBe("NETWORK_ERROR");
      expect(json.message).toBe("Connection lost");
      expect(json.url).toBe("https://example.com");
      expect(json.retryable).toBe(true);
      expect(json.timestamp).toBeDefined();
      expect(typeof json.timestamp).toBe("string");
      expect(json.stack).toBeDefined();
    });

    it("serializes DNSError with hostname", () => {
      const json = new DNSError("bad.host", { url: "https://bad.host" }).toJSON();
      expect(json.hostname).toBe("bad.host");
    });

    it("serializes ContentTooLargeError with sizes", () => {
      const json = new ContentTooLargeError(500000, 300000).toJSON();
      expect(json.sizeBytes).toBe(500000);
      expect(json.limitBytes).toBe(300000);
    });

    it("ScrapeFailedError preserves underlying error message", () => {
      const inner = new Error("Hero timed out after 10s");
      const err = new ScrapeFailedError(inner);
      expect(err.message).toContain("timed out");
    });

    it("serializes cause message", () => {
      const cause = new Error("root cause");
      const err = new NetworkError("wrapped", { cause });
      expect(err.toJSON().cause).toBe("root cause");
    });
  });
});

// wrapError maps arbitrary thrown values to ReaderError codes based on the message.
describe("wrapError", () => {
  it("passes through ReaderError unchanged", () => {
    const err = new NetworkError("test");
    expect(wrapError(err)).toBe(err);
  });

  it("wraps timeout errors", () => {
    const err = new Error("Request timed out after 30s");
    const wrapped = wrapError(err, "https://example.com");
    expect(wrapped.code).toBe(ReaderErrorCode.TIMEOUT);
    expect(wrapped.url).toBe("https://example.com");
  });

  it("wraps DNS errors (ENOTFOUND)", () => {
    const err = new Error("getaddrinfo ENOTFOUND nonexistent.example.com");
    const wrapped = wrapError(err, "https://nonexistent.example.com/page");
    expect(wrapped.code).toBe(ReaderErrorCode.DNS_ERROR);
  });

  it("wraps TLS/SSL errors", () => {
    const err = new Error("unable to verify the first certificate");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.TLS_ERROR);
  });

  it("wraps connection refused errors", () => {
    const err = new Error("connect ECONNREFUSED 127.0.0.1:443");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.NETWORK_ERROR);
  });

  it("wraps connection reset errors", () => {
    const err = new Error("read ECONNRESET");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.NETWORK_ERROR);
  });

  it("wraps proxy errors", () => {
    const err = new Error("proxy connection failed: tunnel timeout");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.PROXY_CONNECTION_ERROR);
  });

  it("wraps cloudflare errors", () => {
    const err = new Error("Cloudflare challenge detected");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.CLOUDFLARE_CHALLENGE);
  });

  it("wraps supermarkdown conversion errors", () => {
    const err = new Error("Supermarkdown conversion failed: Formatting argument out of range");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.MARKDOWN_CONVERSION_FAILED);
  });

  it("wraps unknown errors as UNKNOWN", () => {
    const err = new Error("something completely unexpected");
    const wrapped = wrapError(err);
    expect(wrapped.code).toBe(ReaderErrorCode.UNKNOWN);
  });

  it("wraps non-Error objects", () => {
    const wrapped = wrapError("string error");
    expect(wrapped.code).toBe(ReaderErrorCode.UNKNOWN);
    expect(wrapped.message).toBe("string error");
  });

  it("preserves cause chain", () => {
    // Only checks that the wrapper exposes some cause for a plain Error input.
    const err = new Error("surface: root");
    const wrapped = wrapError(err, "https://example.com");
    expect(wrapped.cause).toBeDefined();
  });
});

================================================
FILE: tests/unit/health-tracker.test.ts
================================================
import { describe, it, expect, vi, beforeEach } from "vitest";
import { ProxyHealthTracker } from "../../src/proxy/health-tracker";

/**
 * Fake clock that the tracker reads via the injected `now` option.
*
* `now()` returns the current fake time in milliseconds and `advance(ms)`
* moves it forward, so tests control cooldown expiry without real timers.
*
* @param start Initial timestamp in milliseconds.
*/
function fakeClock(start = 1_000_000_000_000) {
  let current = start;
  return {
    now: () => current,
    advance: (ms: number) => {
      current += ms;
    },
  };
}

describe("ProxyHealthTracker", () => {
  describe("defaults and validation", () => {
    it("unknown proxy is healthy by default", () => {
      const t = new ProxyHealthTracker();
      expect(t.isHealthy("http://unknown")).toBe(true);
      expect(t.snapshot("http://unknown")).toBeNull();
    });

    it("rejects invalid failureThreshold", () => {
      expect(() => new ProxyHealthTracker({ failureThreshold: 0 })).toThrow();
      expect(() => new ProxyHealthTracker({ failureThreshold: -1 })).toThrow();
      expect(() => new ProxyHealthTracker({ failureThreshold: 1.5 })).toThrow();
    });

    it("rejects negative cooldownMs", () => {
      expect(() => new ProxyHealthTracker({ cooldownMs: -1 })).toThrow();
    });
  });

  describe("bench + cooldown (default thresholds)", () => {
    it("benches after 10 consecutive failures and emits event", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now });
      const onBench = vi.fn();
      t.on("proxy-benched", onBench);

      for (let i = 0; i < 9; i++) {
        t.recordFailure("http://dc1");
      }
      expect(t.isHealthy("http://dc1")).toBe(true);
      expect(onBench).not.toHaveBeenCalled();

      t.recordFailure("http://dc1"); // 10th
      expect(t.isHealthy("http://dc1")).toBe(false);
      expect(onBench).toHaveBeenCalledTimes(1);
      expect(onBench.mock.calls[0][0]).toMatchObject({
        proxyUrl: "http://dc1",
        consecutiveFailures: 10,
      });
    });

    it("bench event fires exactly once, not on every subsequent failure", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now });
      const onBench = vi.fn();
      t.on("proxy-benched", onBench);

      for (let i = 0; i < 15; i++) {
        t.recordFailure("http://dc1");
      }
      expect(onBench).toHaveBeenCalledTimes(1);
    });

    it("success decays failure counter by 3 (not full reset)", () => {
      const t = new ProxyHealthTracker();
      for (let i = 0; i < 9; i++) t.recordFailure("http://dc1");

      // 9 failures → recordSuccess → decay by 3 → 6 remaining
      t.recordSuccess("http://dc1");
      expect(t.snapshot("http://dc1")?.consecutiveFailures).toBe(6);

      // 4 more failures → 6 + 4 = 10 → benched
      for (let i = 0; i < 3; i++) t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(true);
      t.recordFailure("http://dc1"); // 10th total
      expect(t.isHealthy("http://dc1")).toBe(false);
    });
  });

  describe("cooldown auto-revive", () => {
    it("isHealthy returns false until cooldown expires, then true with revive event", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 60_000 });
      const onRevive = vi.fn();
      t.on("proxy-revived", onRevive);

      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(false);

      clock.advance(30_000);
      expect(t.isHealthy("http://dc1")).toBe(false);
      expect(onRevive).not.toHaveBeenCalled();

      clock.advance(30_001);
      expect(t.isHealthy("http://dc1")).toBe(true);
      expect(onRevive).toHaveBeenCalledTimes(1);
    });

    it("revive event fires exactly once", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 10 });
      const onRevive = vi.fn();
      t.on("proxy-revived", onRevive);

      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      clock.advance(11);

      t.isHealthy("http://dc1"); // revives
      t.isHealthy("http://dc1");
      t.isHealthy("http://dc1");
      expect(onRevive).toHaveBeenCalledTimes(1);
    });
  });

  describe("probationary failure re-benches immediately", () => {
    it("a single failure after revive re-bumps to benched on the next strike", () => {
      // After revive, the counter is still at 10. One more failure *does*
      // re-bench because it crosses the threshold again on a non-benched
      // state.
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 1000 });
      const onBench = vi.fn();
      t.on("proxy-benched", onBench);

      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      expect(onBench).toHaveBeenCalledTimes(1);

      clock.advance(1001);
      expect(t.isHealthy("http://dc1")).toBe(true); // revived

      t.recordFailure("http://dc1"); // probationary failure
      expect(t.isHealthy("http://dc1")).toBe(false);
      expect(onBench).toHaveBeenCalledTimes(2);
    });

    // Title matches the assertions below: the counter decays (10 → 7), it is not cleared.
    it("a success during probation decays the counter and keeps the proxy healthy", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now, cooldownMs: 1000 });
      const onRevive = vi.fn();
      t.on("proxy-revived", onRevive);

      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      clock.advance(1001);
      t.isHealthy("http://dc1"); // revives, +1 onRevive

      t.recordSuccess("http://dc1");
      // After success: counter decrements by 3 (decay model) from 10 → 7.
      // Not benched because benchedUntil was cleared by isHealthy. No second
      // revive event from recordSuccess because benchedUntil was already null.
      expect(onRevive).toHaveBeenCalledTimes(1);
      expect(t.snapshot("http://dc1")?.consecutiveFailures).toBe(7);
      expect(t.isHealthy("http://dc1")).toBe(true);
    });
  });

  describe("per-proxy isolation", () => {
    it("benching dc1 does not affect dc2", () => {
      const t = new ProxyHealthTracker();
      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(false);
      expect(t.isHealthy("http://dc2")).toBe(true);
    });
  });

  describe("snapshot", () => {
    it("tracks total successes and failures over time", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({ now: clock.now });

      t.recordFailure("http://dc1");
      clock.advance(1000);
      t.recordSuccess("http://dc1");
      clock.advance(1000);
      t.recordFailure("http://dc1");
      clock.advance(1000);
      t.recordFailure("http://dc1");

      const s = t.snapshot("http://dc1")!;
      expect(s.totalFailures).toBe(3);
      expect(s.totalSuccesses).toBe(1);
      // The success wiped out the single earlier failure; two failures followed it.
      expect(s.consecutiveFailures).toBe(2);
      expect(s.lastSuccessAt).not.toBeNull();
      expect(s.lastFailureAt).not.toBeNull();
      expect(s.healthy).toBe(true);
    });

    it("allSnapshots lists every tracked proxy", () => {
      const t = new ProxyHealthTracker();
      t.recordFailure("http://dc1");
      t.recordSuccess("http://dc2");
      t.recordFailure("http://dc3");

      const all = t.allSnapshots();
      expect(all.map((s) => s.proxyUrl).sort()).toEqual([
        "http://dc1",
        "http://dc2",
        "http://dc3",
      ]);
    });
  });

  describe("reset", () => {
    it("reset drops all state for a proxy", () => {
      const t = new ProxyHealthTracker();
      for (let i = 0; i < 10; i++) t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(false);

      t.reset("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(true);
      expect(t.snapshot("http://dc1")).toBeNull();
    });
  });

  describe("custom thresholds", () => {
    it("respects custom failureThreshold=3 and cooldownMs=100", () => {
      const clock = fakeClock();
      const t = new ProxyHealthTracker({
        failureThreshold: 3,
        cooldownMs: 100,
        now: clock.now,
      });

      t.recordFailure("http://dc1");
t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(true);
      t.recordFailure("http://dc1");
      expect(t.isHealthy("http://dc1")).toBe(false);

      clock.advance(101);
      expect(t.isHealthy("http://dc1")).toBe(true);
    });
  });
});

================================================
FILE: tests/unit/html-size-guard.test.ts
================================================
import { describe, it, expect } from "vitest";

/**
 * HTML Size Guard tests.
 *
 * The scraper truncates HTML > MAX_HTML_BYTES before markdown conversion.
 * We test the logic in isolation (the guard is inline in scraper.ts).
 */

const DEFAULT_MAX = 307200; // 300KB

/**
 * Local copy of the size guard: truncates `html` to `maxBytes` when it is longer.
 *
 * The comparison and the slice both use `String.length` / `String.slice`, i.e.
 * UTF-16 code units rather than encoded bytes.
 * NOTE(review): presumably this mirrors the inline guard in scraper.ts — confirm
 * the two stay in sync.
 *
 * @param html     Raw HTML string.
 * @param maxBytes Maximum allowed length; defaults to DEFAULT_MAX.
 * @returns `truncated` flag plus the (possibly shortened) HTML.
 */
function applyGuard(html: string, maxBytes: number = DEFAULT_MAX): { truncated: boolean; output: string } {
  if (html.length > maxBytes) {
    return { truncated: true, output: html.slice(0, maxBytes) };
  }
  return { truncated: false, output: html };
}

describe("HTML size guard", () => {
  it("passes through HTML under limit unchanged", () => {
    const html = "<html><body><p>Short content</p></body></html>";
    const result = applyGuard(html);
    expect(result.truncated).toBe(false);
    expect(result.output).toBe(html);
  });

  it("truncates HTML over limit", () => {
    const html = "x".repeat(400000);
    const result = applyGuard(html);
    expect(result.truncated).toBe(true);
    expect(result.output.length).toBe(DEFAULT_MAX);
  });

  it("handles exactly-at-limit HTML", () => {
    const html = "x".repeat(DEFAULT_MAX);
    const result = applyGuard(html);
    expect(result.truncated).toBe(false);
    expect(result.output.length).toBe(DEFAULT_MAX);
  });

  it("handles empty HTML", () => {
    const result = applyGuard("");
    expect(result.truncated).toBe(false);
    expect(result.output).toBe("");
  });

  it("respects custom limit", () => {
    const html = "x".repeat(1000);
    const result = applyGuard(html, 500);
    expect(result.truncated).toBe(true);
    expect(result.output.length).toBe(500);
  });

  it("default limit is 300KB", () => {
    expect(DEFAULT_MAX).toBe(300 * 1024);
  });
});

================================================
FILE: tests/unit/markdown-formatter.test.ts
================================================
import { describe, it, expect } from "vitest";
import { htmlToMarkdown, formatToMarkdown } from "../../src/formatters/markdown";

// NOTE(review): the HTML fixtures in this file were reconstructed from the test
// names and assertions after their tags had been stripped — confirm the exact
// markup against the upstream source.
describe("htmlToMarkdown", () => {
  describe("with real supermarkdown", () => {
    it("converts heading to atx-style markdown", () => {
      const result = htmlToMarkdown("<h1>Hello World</h1>");
      expect(result).toContain("# Hello World");
    });

    it("converts paragraph to plain text", () => {
      const result = htmlToMarkdown("<p>This is a paragraph.</p>");
      expect(result).toContain("This is a paragraph.");
      // Should not contain any HTML tags
      expect(result).not.toContain("<p>");
    });

    it("converts links to inline markdown", () => {
      const result = htmlToMarkdown(
        '<p><a href="https://example.com">Click here</a></p>'
      );
      expect(result).toContain("[Click here](https://example.com)");
    });

    it("converts unordered lists with - bullet marker", () => {
      const result = htmlToMarkdown(
        "<ul><li>First</li><li>Second</li><li>Third</li></ul>"
      );
      expect(result).toContain("- First");
      expect(result).toContain("- Second");
      expect(result).toContain("- Third");
    });

    it("converts bold and italic text", () => {
      const result = htmlToMarkdown(
        "<p><strong>bold</strong> and <em>italic</em></p>"
      );
      expect(result).toContain("**bold**");
      expect(result).toContain("*italic*");
    });

    it("converts code blocks with backtick fence", () => {
      const result = htmlToMarkdown(
        "<pre><code>const x = 1;</code></pre>"
      );
      expect(result).toContain("`");
      expect(result).toContain("const x = 1;");
    });

    it("returns empty string for empty input", () => {
      const result = htmlToMarkdown("");
      expect(result).toBe("");
    });

    it("handles whitespace-only HTML", () => {
      const result = htmlToMarkdown(" \n\t ");
      // Should return empty or whitespace-only (short input, no fallback triggered)
      expect(result.trim()).toBe("");
    });

    it("converts tables to GFM format", () => {
      const result = htmlToMarkdown(
        "<table>" +
          "<thead><tr><th>Name</th><th>Age</th></tr></thead>" +
          "<tbody><tr><td>Alice</td><td>30</td></tr></tbody></table>"
      );
      expect(result).toContain("Name");
      expect(result).toContain("Age");
      expect(result).toContain("Alice");
      expect(result).toContain("30");
      // GFM tables use pipes
      expect(result).toContain("|");
    });

    it("converts images to markdown syntax", () => {
      const result = htmlToMarkdown(
        '<img src="https://example.com/image.png" alt="A photo" />'
      );
      expect(result).toContain("![A photo](https://example.com/image.png)");
    });

    it("handles nested HTML structures", () => {
      const result = htmlToMarkdown(
        '<div><p>This has <strong>bold</strong>, <em>italic</em>, and <a href="https://example.com">a link</a>.</p></div>'
      );
      expect(result).toContain("**bold**");
      expect(result).toContain("*italic*");
      expect(result).toContain("[a link](https://example.com)");
    });
  });

  describe("fallback behavior", () => {
    it("falls back to text extraction when convert returns empty on large input", () => {
      // Build HTML > 100 chars that would normally convert fine,
      // but if supermarkdown returned empty, fallback strips tags.
      // We can't easily mock the Rust module, so we test the fallback
      // path indirectly: pass in HTML with only script/style tags and
      // enough length to trigger the fallback threshold check.
      // The real convert handles this fine, so this test validates
      // that normal large input does NOT trigger fallback.
      const largeHtml = "<div><p>" + "Hello world. ".repeat(20) + "</p></div>";
      const result = htmlToMarkdown(largeHtml);
      // Should contain the text (real convert works, no fallback)
      expect(result).toContain("Hello world.");
      expect(result.length).toBeGreaterThan(0);
    });
  });

  describe("formatToMarkdown alias", () => {
    it("is the same function as htmlToMarkdown", () => {
      expect(formatToMarkdown).toBe(htmlToMarkdown);
    });

    it("produces identical output", () => {
      const html = "

Test

Content here

"; expect(formatToMarkdown(html)).toBe(htmlToMarkdown(html)); }); }); }); ================================================ FILE: tests/unit/metadata-extractor.test.ts ================================================ import { describe, it, expect } from "vitest"; import { extractMetadata } from "../../src/utils/metadata-extractor"; describe("extractMetadata", () => { describe("basic meta tags", () => { it("extracts title from tag", () => { const html = "<html><head><title>My Page"; const meta = extractMetadata(html, "https://example.com"); expect(meta.title).toBe("My Page"); }); it("extracts description from meta tag", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.description).toBe("A great page"); }); it("extracts language from html lang attribute", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.language).toBe("en"); }); it("extracts author from meta tag", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.author).toBe("John Doe"); }); it("extracts canonical URL", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.canonical).toBe("https://example.com/canonical"); }); it("extracts favicon", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.favicon).toContain("favicon.ico"); }); }); describe("Open Graph tags", () => { it("extracts og:title", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.openGraph?.title).toBe("OG Title"); }); it("extracts og:description", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.openGraph?.description).toBe("OG Desc"); }); it("extracts og:image", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.openGraph?.image).toBe("https://example.com/image.jpg"); }); }); 
describe("Twitter card tags", () => { it("extracts twitter:card", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.twitter?.card).toBe("summary_large_image"); }); it("extracts twitter:title", () => { const html = ''; const meta = extractMetadata(html, "https://example.com"); expect(meta.twitter?.title).toBe("Tweet Title"); }); }); describe("edge cases", () => { it("handles HTML with no metadata", () => { const html = "

Just content

"; const meta = extractMetadata(html, "https://example.com"); expect(meta.title).toBeNull(); expect(meta.description).toBeNull(); }); it("handles empty HTML", () => { const meta = extractMetadata("", "https://example.com"); expect(meta).toBeDefined(); expect(meta.title).toBeNull(); }); it("handles malformed HTML", () => { const html = "Unclosed"; const meta = extractMetadata(html, "https://example.com"); expect(meta.title).toBe("Unclosed"); }); }); }); ================================================ FILE: tests/unit/postprocess.test.ts ================================================ import { describe, it, expect } from "vitest"; import { postprocessMarkdown } from "../../src/formatters/postprocess"; describe("postprocessMarkdown", () => { // ── Skip/Jump to Content removal ────────────────────────────────── describe("skip to content removal", () => { it("removes [Skip to Content](#main)", () => { const input = "[Skip to Content](#main)\n\nHello world"; expect(postprocessMarkdown(input)).toBe("Hello world"); }); it("removes [Jump to Content](#content)", () => { const input = "[Jump to Content](#content)\n\nHello world"; expect(postprocessMarkdown(input)).toBe("Hello world"); }); it("is case insensitive", () => { const input = "[skip to content](#nav)\n\nHello world"; expect(postprocessMarkdown(input)).toBe("Hello world"); }); it("removes [Skip to main Content](#main-content)", () => { const input = "[Skip to main Content](#main-content)\n\nBody text"; expect(postprocessMarkdown(input)).toBe("Body text"); }); it("removes [JUMP TO MAIN CONTENT](#top)", () => { const input = "[JUMP TO MAIN CONTENT](#top)\n\nBody text"; expect(postprocessMarkdown(input)).toBe("Body text"); }); it("handles various fragment anchors", () => { const input = "[Skip to Content](#skip-nav)\n\nContent here"; expect(postprocessMarkdown(input)).toBe("Content here"); }); it("does NOT remove when linking to a real URL (not a fragment)", () => { const input = "[Skip to 
Content](https://example.com/content)\n\nHello"; expect(postprocessMarkdown(input)).toBe( "[Skip to Content](https://example.com/content)\n\nHello", ); }); }); // ── Image link deduplication ────────────────────────────────────── describe("image link deduplication", () => { it("deduplicates when image URL and link URL match", () => { const input = "[![alt text](https://img.com/photo.jpg)](https://img.com/photo.jpg)"; expect(postprocessMarkdown(input)).toBe("![alt text](https://img.com/photo.jpg)"); }); it("does NOT deduplicate when URLs differ", () => { const input = "[![alt text](https://img.com/photo.jpg)](https://example.com/page)"; expect(postprocessMarkdown(input)).toBe( "[![alt text](https://img.com/photo.jpg)](https://example.com/page)", ); }); it("deduplicates multiple image links in one document", () => { const input = [ "[![a](https://x.com/1.png)](https://x.com/1.png)", "[![b](https://x.com/2.png)](https://x.com/2.png)", ].join("\n\n"); const expected = [ "![a](https://x.com/1.png)", "![b](https://x.com/2.png)", ].join("\n\n"); expect(postprocessMarkdown(input)).toBe(expected); }); }); // ── Blank line collapsing ───────────────────────────────────────── describe("blank line collapsing", () => { it("collapses 3 consecutive blank lines to 2", () => { const input = "Hello\n\n\nWorld"; expect(postprocessMarkdown(input)).toBe("Hello\n\nWorld"); }); it("collapses 5 consecutive blank lines to 2", () => { const input = "Hello\n\n\n\n\nWorld"; expect(postprocessMarkdown(input)).toBe("Hello\n\nWorld"); }); it("keeps 2 consecutive newlines as-is", () => { const input = "Hello\n\nWorld"; expect(postprocessMarkdown(input)).toBe("Hello\n\nWorld"); }); }); // ── Trim ────────────────────────────────────────────────────────── describe("trim", () => { it("trims leading and trailing whitespace", () => { const input = " \n\nHello world\n\n "; expect(postprocessMarkdown(input)).toBe("Hello world"); }); }); // ── Edge cases 
──────────────────────────────────────────────────── describe("edge cases", () => { it("handles empty input", () => { expect(postprocessMarkdown("")).toBe(""); }); }); // ── Combined ────────────────────────────────────────────────────── describe("combined patterns", () => { it("applies all transformations in one document", () => { const input = [ " ", "[Skip to Content](#main)", "", "", "", "", "# Title", "", "[![hero](https://img.com/hero.jpg)](https://img.com/hero.jpg)", "", "Some content here.", "", "", "", "Footer text", " ", ].join("\n"); const expected = [ "# Title", "", "![hero](https://img.com/hero.jpg)", "", "Some content here.", "", "Footer text", ].join("\n"); expect(postprocessMarkdown(input)).toBe(expected); }); }); }); ================================================ FILE: tests/unit/proxy-bound-browser.test.ts ================================================ import { describe, it, expect, vi } from "vitest"; import pino from "pino"; import { ProxyBoundBrowser, redactProxyUrl, type HeroFactory, type HeroLike, type TabLike, } from "../../src/browser/proxy-bound-browser"; /** * Silent logger so tests don't spam stdout. */ const silentLogger = pino({ level: "silent" }); /** * Fake Tab returned by fake Hero's newTab(). */ interface FakeTab extends TabLike { tabClosed: boolean; } function makeFakeTab(): FakeTab { return { tabClosed: false, async goto() { return undefined; }, get url() { return Promise.resolve("about:blank"); }, get document() { return {} as unknown; }, async waitForLoad() {}, async waitForPaintingStable() {}, async waitForElement() { return undefined as unknown; }, async close() { this.tabClosed = true; }, }; } /** * Fake Hero that records the config it was launched with and optionally * delays/throws on close. Good enough for exercising ProxyBoundBrowser * without importing @ulixee/hero. 
*/ interface FakeHero extends HeroLike { config: Record<string, unknown>; closed: boolean; tabs: FakeTab[]; } function makeFakeFactory(opts: { failOnCreate?: Error; slowClose?: number; failOnClose?: Error; } = {}): { factory: HeroFactory; instances: FakeHero[]; createCount: number } { const instances: FakeHero[] = []; let createCount = 0; const factory: HeroFactory = { create(config: Record<string, unknown>) { createCount++; if (opts.failOnCreate) throw opts.failOnCreate; const hero: FakeHero = { config, closed: false, tabs: [], async newTab() { const tab = makeFakeTab(); this.tabs.push(tab); return tab; }, async closeTab(tab: TabLike) { await tab.close(); }, async close() { if (opts.slowClose) { await new Promise((r) => setTimeout(r, opts.slowClose)); } if (opts.failOnClose) throw opts.failOnClose; this.closed = true; }, }; instances.push(hero); return hero; }, }; return { factory, instances, get createCount() { return createCount; }, }; } /** * Helper: let microtasks run so pLimit can move its queue forward. 
*/ async function tick(n = 1) { for (let i = 0; i < n; i++) await new Promise((r) => setImmediate(r)); } describe("ProxyBoundBrowser", () => { describe("construction", () => { it("throws on invalid maxTabs", () => { const { factory } = makeFakeFactory(); expect( () => new ProxyBoundBrowser({ proxyUrl: "http://p", maxTabs: 0, heroFactory: factory, logger: silentLogger, }), ).toThrow(); }); it("throws on invalid retireAfterPages", () => { const { factory } = makeFakeFactory(); expect( () => new ProxyBoundBrowser({ proxyUrl: "http://p", retireAfterPages: 0, heroFactory: factory, logger: silentLogger, }), ).toThrow(); }); it("defaults maxTabs=2 and retireAfterPages=100", () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); expect(b.maxTabs).toBe(2); expect(b.retireAfterPages).toBe(100); }); }); describe("ready gate", () => { it("resolves once Hero is launched", async () => { const { factory, instances } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; expect(b.getState()).toBe("active"); expect(instances).toHaveLength(1); }); it("rejects if Hero construction throws", async () => { const err = new Error("launch boom"); const { factory } = makeFakeFactory({ failOnCreate: err }); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await expect(b.ready).rejects.toThrow("launch boom"); expect(b.getState()).toBe("closed"); }); }); describe("proxy binding", () => { it("burns the proxy URL into the Hero config", async () => { const { factory, instances } = makeFakeFactory(); const url = "http://user:pass@dc1.example.com:8080"; const b = new ProxyBoundBrowser({ proxyUrl: url, heroFactory: factory, logger: silentLogger, }); await b.ready; expect(instances[0].config.upstreamProxyUrl).toBe(url); }); it("sets no upstream proxy for the direct 
lane", async () => { const { factory, instances } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: null, heroFactory: factory, logger: silentLogger, }); await b.ready; expect(instances[0].config.upstreamProxyUrl).toBeUndefined(); }); it("stable UA across browsers with the same proxy URL", async () => { const { factory, instances } = makeFakeFactory(); const url = "http://x:y@host:1"; const a = new ProxyBoundBrowser({ proxyUrl: url, heroFactory: factory, logger: silentLogger, }); const b = new ProxyBoundBrowser({ proxyUrl: url, heroFactory: factory, logger: silentLogger, }); await Promise.all([a.ready, b.ready]); expect(instances[0].config.userAgent).toBe(instances[1].config.userAgent); }); }); describe("withPage tab limiting", () => { it("serializes beyond maxTabs", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", maxTabs: 2, heroFactory: factory, logger: silentLogger, }); await b.ready; let active = 0; let peak = 0; const observe = async () => { active++; peak = Math.max(peak, active); await new Promise((r) => setTimeout(r, 5)); active--; }; await Promise.all([ b.withPage(async () => { await observe(); }), b.withPage(async () => { await observe(); }), b.withPage(async () => { await observe(); }), b.withPage(async () => { await observe(); }), b.withPage(async () => { await observe(); }), ]); expect(peak).toBeLessThanOrEqual(2); }); it("increments totalPages on every withPage completion", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; await b.withPage(async () => 1); await b.withPage(async () => 2); await b.withPage(async () => 3); expect(b.getStats().totalPages).toBe(3); }); it("increments totalPages even on error", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, 
}); await b.ready; await expect( b.withPage(async () => { throw new Error("nope"); }), ).rejects.toThrow("nope"); expect(b.getStats().totalPages).toBe(1); }); }); describe("retirement draining", () => { it("waits for in-flight tabs to finish before closing", async () => { const { factory, instances } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", maxTabs: 2, heroFactory: factory, logger: silentLogger, }); await b.ready; let inFlightResolve!: () => void; const inFlight = new Promise<void>((r) => (inFlightResolve = r)); const page = b.withPage(async () => { await inFlight; return "done"; }); await tick(2); // Retire while a tab is in flight. Should not close the Hero yet. const retirePromise = b.retire(); await tick(2); expect(instances[0].closed).toBe(false); expect(b.getState()).toBe("retired"); inFlightResolve(); await page; await retirePromise; expect(instances[0].closed).toBe(true); expect(b.getState()).toBe("closed"); }); it("rejects new withPage calls once retired", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; await b.retire(); await expect(b.withPage(async () => 1)).rejects.toThrow(/retired|closed/); }); it("is safe to call retire multiple times", async () => { const { factory, instances } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; await Promise.all([b.retire(), b.retire(), b.retire()]); expect(instances[0].closed).toBe(true); }); it("swallows close errors during retire", async () => { const { factory } = makeFakeFactory({ failOnClose: new Error("close boom"), }); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; // Should not throw await b.retire(); expect(b.getState()).toBe("closed"); }); }); describe("relaunch", () => { it("closes current 
Hero and launches a fresh one with the same proxy", async () => { const fakeFactory = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: fakeFactory.factory, logger: silentLogger, }); await b.ready; expect(fakeFactory.createCount).toBe(1); await b.relaunch(); expect(fakeFactory.createCount).toBe(2); expect(fakeFactory.instances[0].closed).toBe(true); expect(b.getState()).toBe("active"); expect(b.getStats().totalPages).toBe(0); }); it("accepts withPage after relaunch", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; await b.relaunch(); const result = await b.withPage(async () => "ok"); expect(result).toBe("ok"); }); }); describe("auto-recycle after retireAfterPages", () => { it("relaunches after hitting the threshold", async () => { const fakeFactory = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", retireAfterPages: 3, heroFactory: fakeFactory.factory, logger: silentLogger, }); await b.ready; await b.withPage(async () => 1); await b.withPage(async () => 2); await b.withPage(async () => 3); // Recycle is scheduled via setImmediate inside the 3rd withPage's // finally. Poll briefly for the state machine to settle into the new // `active` state with a freshly-launched Hero. 
for (let i = 0; i < 50 && fakeFactory.createCount < 2; i++) { await tick(1); } await b.ready; expect(fakeFactory.createCount).toBe(2); expect(b.getState()).toBe("active"); expect(b.getStats().totalPages).toBe(0); }); }); describe("stats", () => { it("reports state, activeTabs, totalPages, fingerprintIndex", async () => { const { factory } = makeFakeFactory(); const b = new ProxyBoundBrowser({ proxyUrl: "http://p", heroFactory: factory, logger: silentLogger, }); await b.ready; const s = b.getStats(); expect(s.state).toBe("active"); expect(s.activeTabs).toBe(0); expect(s.totalPages).toBe(0); expect(s.fingerprintIndex).toBeGreaterThanOrEqual(0); }); }); }); describe("redactProxyUrl", () => { it("strips credentials but keeps host", () => { expect(redactProxyUrl("http://user:pass@host:8080")).toBe("http://***@host:8080"); }); it("returns 'direct' for null", () => { expect(redactProxyUrl(null)).toBe("direct"); }); it("handles URLs without credentials", () => { expect(redactProxyUrl("http://host:8080")).toBe("http://host:8080"); }); it("returns a safe placeholder for malformed URLs", () => { expect(redactProxyUrl("not a url")).toBe("<invalid-proxy-url>"); }); }); ================================================ FILE: tests/unit/proxy-config.test.ts ================================================ import { describe, it, expect } from "vitest"; import { createProxyUrl, parseProxyUrl } from "../../src/proxy/config"; describe("createProxyUrl", () => { it("creates URL containing host and port", () => { const url = createProxyUrl({ host: "proxy.example.com", port: 8080 }); expect(url).toContain("proxy.example.com"); expect(url).toContain("8080"); }); it("includes auth credentials when provided", () => { const url = createProxyUrl({ host: "proxy.example.com", port: 8080, username: "user", password: "pass" }); expect(url).toContain("user"); expect(url).toContain("pass"); expect(url).toContain("proxy.example.com"); }); it("returns direct URL if provided", () => { const url = 
createProxyUrl({ url: "http://custom-proxy:9999" }); expect(url).toBe("http://custom-proxy:9999"); }); }); describe("parseProxyUrl", () => { it("parses simple proxy URL", () => { const result = parseProxyUrl("http://proxy.example.com:8080"); expect(result.host).toBe("proxy.example.com"); expect(result.port).toBe(8080); }); it("parses proxy URL with auth", () => { const result = parseProxyUrl("http://user:pass@proxy.example.com:8080"); expect(result.host).toBe("proxy.example.com"); expect(result.port).toBe(8080); expect(result.username).toBe("user"); expect(result.password).toBe("pass"); }); it("handles https proxy URLs", () => { const result = parseProxyUrl("https://proxy.example.com:443"); expect(result.host).toBe("proxy.example.com"); // Port may be number or undefined depending on implementation expect(result.port === 443 || result.port === undefined).toBe(true); }); }); ================================================ FILE: tests/unit/proxy-gate.test.ts ================================================ import { describe, it, expect } from "vitest"; import { PerProxyGate } from "../../src/proxy/proxy-gate"; /** * Helper: a deferred that you can resolve from outside. Tests use this to * hold slots for as long as they want. */ function defer<T = void>() { let resolve!: (v: T) => void; const promise = new Promise<T>((r) => (resolve = r)); return { promise, resolve }; } /** * Helper: let microtasks and timers flush before the next assertion. Gives * pLimit a chance to move its queue forward. 
*/ async function tick(n = 1) { for (let i = 0; i < n; i++) { await new Promise((r) => setImmediate(r)); } } describe("PerProxyGate", () => { describe("constructor", () => { it("defaults to maxConcurrentPerProxy=2", async () => { const gate = new PerProxyGate(); const d1 = defer(); const d2 = defer(); const d3 = defer(); // Hold 2 slots const acquired: Array<Promise<void>> = []; const releases: Array<() => void> = []; for (const d of [d1, d2]) { const p = gate.acquire("http://dc1").then((r) => { releases.push(r); return d.promise; }); acquired.push(p); } await tick(2); // Both should be running expect(gate.stats("http://dc1")?.active).toBe(2); // A third should be queued let thirdAcquired = false; const third = gate.acquire("http://dc1").then((r) => { thirdAcquired = true; releases.push(r); return d3.promise; }); await tick(2); expect(thirdAcquired).toBe(false); expect(gate.stats("http://dc1")?.queued).toBe(1); // Release one — third should run d1.resolve(); releases[0]!(); await tick(2); expect(thirdAcquired).toBe(true); // Cleanup d2.resolve(); d3.resolve(); releases.forEach((r) => r()); await Promise.all([...acquired, third]); }); it("rejects non-integer or <1 max", () => { expect(() => new PerProxyGate({ maxConcurrentPerProxy: 0 })).toThrow(); expect(() => new PerProxyGate({ maxConcurrentPerProxy: -1 })).toThrow(); expect(() => new PerProxyGate({ maxConcurrentPerProxy: 1.5 })).toThrow(); }); it("accepts custom maxConcurrentPerProxy", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const d1 = defer(); const d2 = defer(); let secondAcquired = false; const r1p = gate.acquire("http://p").then((r) => d1.promise.then(() => r)); await tick(2); const r2p = gate.acquire("http://p").then((r) => { secondAcquired = true; return d2.promise.then(() => r); }); await tick(2); expect(secondAcquired).toBe(false); expect(gate.stats("http://p")?.active).toBe(1); // Release first d1.resolve(); const r1 = await r1p; r1(); await tick(2); 
expect(secondAcquired).toBe(true); d2.resolve(); const r2 = await r2p; r2(); }); }); describe("per-proxy isolation", () => { it("does not cross-gate different proxy URLs", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const d1 = defer(); const d2 = defer(); // Hold dc1's slot const r1p = gate.acquire("http://dc1").then((r) => d1.promise.then(() => r)); await tick(2); // dc2 should NOT be blocked by dc1 let dc2Ok = false; const r2p = gate.acquire("http://dc2").then((r) => { dc2Ok = true; return d2.promise.then(() => r); }); await tick(2); expect(dc2Ok).toBe(true); expect(gate.stats("http://dc1")?.active).toBe(1); expect(gate.stats("http://dc2")?.active).toBe(1); d1.resolve(); d2.resolve(); (await r1p)(); (await r2p)(); }); it("direct lane (null proxyUrl) never blocks", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); // Acquire 5 direct slots all at once const releases = await Promise.all([ gate.acquire(null), gate.acquire(undefined), gate.acquire(null), gate.acquire(null), gate.acquire(null), ]); expect(releases).toHaveLength(5); releases.forEach((r) => r()); }); it("direct lane does not appear in stats (no gate is created)", async () => { const gate = new PerProxyGate(); const release = await gate.acquire(null); expect(gate.allStats()).toEqual([]); release(); }); }); describe("withSlot", () => { it("releases on success", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const result = await gate.withSlot("http://p", async () => 42); expect(result).toBe(42); await tick(2); expect(gate.stats("http://p")?.active).toBe(0); }); it("releases on error", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); await expect( gate.withSlot("http://p", async () => { throw new Error("boom"); }), ).rejects.toThrow("boom"); await tick(2); expect(gate.stats("http://p")?.active).toBe(0); // Must be usable again after the failure const ok = await gate.withSlot("http://p", async () => 
"ok"); expect(ok).toBe("ok"); }); it("serializes withSlot calls on the same proxy", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const order: string[] = []; const a = gate.withSlot("http://p", async () => { order.push("a-start"); await tick(1); order.push("a-end"); }); const b = gate.withSlot("http://p", async () => { order.push("b-start"); order.push("b-end"); }); await Promise.all([a, b]); expect(order).toEqual(["a-start", "a-end", "b-start", "b-end"]); }); }); describe("release idempotency", () => { it("release function is safe to call multiple times", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const r = await gate.acquire("http://p"); r(); r(); r(); // Next acquire should succeed immediately const r2 = await gate.acquire("http://p"); expect(gate.stats("http://p")?.active).toBe(1); r2(); }); }); describe("per-proxy override", () => { it("setOverride tightens the cap for a specific URL", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 2 }); gate.setOverride("http://amazon", 1); const d1 = defer(); let secondAcquired = false; const r1p = gate.acquire("http://amazon").then((r) => d1.promise.then(() => r)); await tick(2); const r2p = gate.acquire("http://amazon").then((r) => { secondAcquired = true; return r; }); await tick(2); expect(secondAcquired).toBe(false); d1.resolve(); (await r1p)(); await tick(2); expect(secondAcquired).toBe(true); (await r2p)(); }); it("override only affects the named URL", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 2 }); gate.setOverride("http://amazon", 1); // Other proxies still get the default of 2 const d1 = defer(); const d2 = defer(); const r1p = gate.acquire("http://other").then((r) => d1.promise.then(() => r)); const r2p = gate.acquire("http://other").then((r) => d2.promise.then(() => r)); await tick(2); expect(gate.stats("http://other")?.active).toBe(2); d1.resolve(); d2.resolve(); (await r1p)(); (await r2p)(); }); 
it("rejects invalid override values", () => { const gate = new PerProxyGate(); expect(() => gate.setOverride("http://p", 0)).toThrow(); expect(() => gate.setOverride("http://p", -1)).toThrow(); expect(() => gate.setOverride("http://p", 1.5)).toThrow(); }); }); describe("stats", () => { it("returns null for unknown URL", () => { const gate = new PerProxyGate(); expect(gate.stats("http://unknown")).toBeNull(); }); it("reports active + queued counts", async () => { const gate = new PerProxyGate({ maxConcurrentPerProxy: 1 }); const d1 = defer(); const r1p = gate.acquire("http://p").then((r) => d1.promise.then(() => r)); await tick(2); // Queue 2 more const r2p = gate.acquire("http://p"); const r3p = gate.acquire("http://p"); await tick(2); const s = gate.stats("http://p"); expect(s).toEqual({ proxyUrl: "http://p", max: 1, active: 1, queued: 2, }); d1.resolve(); (await r1p)(); (await r2p)(); (await r3p)(); }); it("allStats lists every known gate", async () => { const gate = new PerProxyGate(); await (await gate.acquire("http://a"))(); await (await gate.acquire("http://b"))(); const all = gate.allStats(); expect(all.map((s) => s.proxyUrl).sort()).toEqual(["http://a", "http://b"]); }); }); }); ================================================ FILE: tests/unit/proxy-verify.test.ts ================================================ import { describe, it, expect } from "vitest"; import { verifyProxies, verifyProxiesOrThrow } from "../../src/proxy/verify"; import type { EgressIpFetcher } from "../../src/proxy/verify"; /** * Build an injected fetcher that maps proxy URLs -> mocked egress behaviour. * Each entry is either a string (the egress IP to return) or an Error * (the failure to throw). 
*/ function makeFakeFetcher( routes: Record<string, string | Error>, ): EgressIpFetcher { return async (proxyUrl) => { const v = routes[proxyUrl]; if (v === undefined) { throw new Error(`fake fetcher: no route for ${proxyUrl}`); } if (v instanceof Error) throw v; return v; }; } describe("verifyProxies", () => { it("returns empty result for undefined pools", async () => { const result = await verifyProxies(undefined); expect(result).toEqual({ verified: [], failed: [] }); }); it("returns empty result for empty pools", async () => { const result = await verifyProxies({}); expect(result.verified).toEqual([]); expect(result.failed).toEqual([]); }); it("verifies a single datacenter proxy and returns its egress IP", async () => { const fetcher = makeFakeFetcher({ "http://dc1": "1.2.3.4" }); const result = await verifyProxies( { datacenter: [{ url: "http://dc1" }] }, { fetcher }, ); expect(result.failed).toEqual([]); expect(result.verified).toEqual([ { proxyUrl: "http://dc1", egressIp: "1.2.3.4", tier: "datacenter" }, ]); }); it("tags residential proxies with the right tier", async () => { const fetcher = makeFakeFetcher({ "http://res1": "5.6.7.8" }); const result = await verifyProxies( { residential: [{ url: "http://res1" }] }, { fetcher }, ); expect(result.verified[0]).toMatchObject({ tier: "residential" }); }); it("verifies datacenter and residential pools together", async () => { const fetcher = makeFakeFetcher({ "http://dc1": "1.1.1.1", "http://dc2": "2.2.2.2", "http://res1": "9.9.9.9", }); const result = await verifyProxies( { datacenter: [{ url: "http://dc1" }, { url: "http://dc2" }], residential: [{ url: "http://res1" }], }, { fetcher }, ); expect(result.failed).toEqual([]); expect(result.verified).toHaveLength(3); const tiers = result.verified.map((v) => v.tier).sort(); expect(tiers).toEqual(["datacenter", "datacenter", "residential"]); }); it("collects failures alongside successes", async () => { const fetcher = makeFakeFetcher({ "http://dc1": "1.1.1.1", 
"http://dc2": new Error("connection refused"), "http://res1": "9.9.9.9", }); const result = await verifyProxies( { datacenter: [{ url: "http://dc1" }, { url: "http://dc2" }], residential: [{ url: "http://res1" }], }, { fetcher }, ); expect(result.verified).toHaveLength(2); expect(result.failed).toEqual([ { proxyUrl: "http://dc2", tier: "datacenter", error: "connection refused" }, ]); }); it("ignores entries without a URL", async () => { const fetcher = makeFakeFetcher({ "http://dc1": "1.1.1.1" }); const result = await verifyProxies( { datacenter: [{ url: "http://dc1" }, {}, { url: "" }] }, { fetcher }, ); expect(result.verified).toHaveLength(1); expect(result.failed).toEqual([]); }); }); describe("verifyProxiesOrThrow", () => { it("returns the verified list when everything succeeds", async () => { const fetcher = makeFakeFetcher({ "http://dc1": "1.1.1.1" }); const verified = await verifyProxiesOrThrow( { datacenter: [{ url: "http://dc1" }] }, { fetcher }, ); expect(verified).toHaveLength(1); expect(verified[0].egressIp).toBe("1.1.1.1"); }); it("throws a multi-line error listing every failed proxy", async () => { const fetcher = makeFakeFetcher({ "http://dc1": new Error("EHOSTUNREACH"), "http://res1": new Error("HTTP 407 from api.ipify.org"), }); await expect( verifyProxiesOrThrow( { datacenter: [{ url: "http://dc1" }], residential: [{ url: "http://res1" }], }, { fetcher }, ), ).rejects.toThrow(/Proxy verification failed for 2 proxy/); }); it("redacts proxy credentials in the error message", async () => { const fetcher = makeFakeFetcher({ "http://user:secret@dc1.example.com:8080": new Error("nope"), }); let captured: string = ""; try { await verifyProxiesOrThrow( { datacenter: [{ url: "http://user:secret@dc1.example.com:8080" }] }, { fetcher }, ); } catch (e: unknown) { captured = e instanceof Error ? 
e.message : String(e); } expect(captured).toMatch(/dc1\.example\.com/); expect(captured).not.toContain("secret"); expect(captured).not.toContain("user:secret"); }); it("does not throw when there are zero proxies", async () => { const verified = await verifyProxiesOrThrow(undefined); expect(verified).toEqual([]); }); }); ================================================ FILE: tests/unit/robots-parser.test.ts ================================================ import { describe, it, expect } from "vitest"; import { parseRobotsTxt, isPathAllowed, isUrlAllowed, type RobotsRules, } from "../../src/utils/robots-parser"; describe("parseRobotsTxt", () => { it("should parse a basic disallow rule", () => { const content = `User-agent: *\nDisallow: /private`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); expect(rules.allowedPaths).toEqual([]); expect(rules.crawlDelay).toBeNull(); }); it("should parse multiple disallow rules", () => { const content = `User-agent: *\nDisallow: /private\nDisallow: /admin\nDisallow: /secret`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private", "/admin", "/secret"]); }); it("should parse allow rules alongside disallow rules", () => { const content = `User-agent: *\nDisallow: /private\nAllow: /private/public`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); expect(rules.allowedPaths).toEqual(["/private/public"]); }); it("should parse crawl-delay and convert to milliseconds", () => { const content = `User-agent: *\nCrawl-delay: 2`; const rules = parseRobotsTxt(content); expect(rules.crawlDelay).toBe(2000); }); it("should parse fractional crawl-delay", () => { const content = `User-agent: *\nCrawl-delay: 0.5`; const rules = parseRobotsTxt(content); expect(rules.crawlDelay).toBe(500); }); it("should match a specific user agent", () => { const content = `User-agent: Googlebot\nDisallow: /no-google\n\nUser-agent: *\nDisallow: 
/no-all`; const rules = parseRobotsTxt(content, "Googlebot"); expect(rules.disallowedPaths).toContain("/no-google"); expect(rules.disallowedPaths).toContain("/no-all"); }); it("should match user agent case-insensitively", () => { const content = `User-agent: MyBot\nDisallow: /blocked`; const rules = parseRobotsTxt(content, "mybot"); expect(rules.disallowedPaths).toEqual(["/blocked"]); }); it("should only collect rules under matching user agent sections", () => { const content = `User-agent: OtherBot\nDisallow: /other-only\n\nUser-agent: *\nDisallow: /all`; const rules = parseRobotsTxt(content, "MyBot"); expect(rules.disallowedPaths).not.toContain("/other-only"); expect(rules.disallowedPaths).toContain("/all"); }); it("should use wildcard agent by default", () => { const content = `User-agent: *\nDisallow: /blocked`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/blocked"]); }); it("should ignore comments", () => { const content = `# This is a comment\nUser-agent: *\n# Another comment\nDisallow: /private`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); }); it("should ignore empty lines", () => { const content = `\nUser-agent: *\n\n\nDisallow: /private\n\n`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); }); it("should return empty rules for empty content", () => { const rules = parseRobotsTxt(""); expect(rules.disallowedPaths).toEqual([]); expect(rules.allowedPaths).toEqual([]); expect(rules.crawlDelay).toBeNull(); }); it("should ignore lines without a colon", () => { const content = `User-agent: *\nThis is not a directive\nDisallow: /private`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); }); it("should skip empty Disallow values", () => { const content = `User-agent: *\nDisallow:\nDisallow: /private`; const rules = parseRobotsTxt(content); expect(rules.disallowedPaths).toEqual(["/private"]); }); 
it("should ignore non-numeric crawl-delay", () => { const content = `User-agent: *\nCrawl-delay: abc`; const rules = parseRobotsTxt(content); expect(rules.crawlDelay).toBeNull(); }); }); describe("isPathAllowed", () => { it("should disallow an exact path match", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/private", rules)).toBe(false); }); it("should disallow a prefix match", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/private/secret", rules)).toBe(false); }); it("should allow paths that do not match any disallow rule", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/public", rules)).toBe(true); }); it("should handle wildcard patterns", () => { const rules: RobotsRules = { disallowedPaths: ["/private/*"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/private/foo", rules)).toBe(false); expect(isPathAllowed("/private/bar/baz", rules)).toBe(false); }); it("should handle $ end anchor", () => { const rules: RobotsRules = { disallowedPaths: ["/*.pdf$"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/document.pdf", rules)).toBe(false); expect(isPathAllowed("/document.pdf?id=1", rules)).toBe(true); }); it("should give allow precedence over disallow", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: ["/private/public"], crawlDelay: null, }; expect(isPathAllowed("/private/public", rules)).toBe(true); expect(isPathAllowed("/private/secret", rules)).toBe(false); }); it("should default to allowed when no rules match", () => { const rules: RobotsRules = { disallowedPaths: [], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/anything", rules)).toBe(true); }); it("should normalize paths without leading slash", () => { const rules: RobotsRules = { 
disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("private", rules)).toBe(false); }); it("should handle wildcard in the middle of a pattern", () => { const rules: RobotsRules = { disallowedPaths: ["/api/*/internal"], allowedPaths: [], crawlDelay: null, }; expect(isPathAllowed("/api/v1/internal", rules)).toBe(false); expect(isPathAllowed("/api/v2/internal", rules)).toBe(false); expect(isPathAllowed("/api/v1/public", rules)).toBe(true); }); }); describe("isUrlAllowed", () => { it("should return true when rules are null", () => { expect(isUrlAllowed("https://example.com/anything", null)).toBe(true); }); it("should check the pathname of a full URL", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isUrlAllowed("https://example.com/private", rules)).toBe(false); expect(isUrlAllowed("https://example.com/public", rules)).toBe(true); }); it("should include query string in path matching", () => { const rules: RobotsRules = { disallowedPaths: ["/search?q=blocked"], allowedPaths: [], crawlDelay: null, }; expect(isUrlAllowed("https://example.com/search?q=blocked", rules)).toBe(false); expect(isUrlAllowed("https://example.com/search?q=allowed", rules)).toBe(true); }); it("should return true for an invalid URL", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; expect(isUrlAllowed("not-a-valid-url", rules)).toBe(true); }); it("should handle URLs with paths and fragments", () => { const rules: RobotsRules = { disallowedPaths: ["/private"], allowedPaths: [], crawlDelay: null, }; // Fragments are not sent to the server, URL constructor excludes them from pathname+search expect(isUrlAllowed("https://example.com/private#section", rules)).toBe(false); }); }); ================================================ FILE: tests/unit/scraper-pipeline.test.ts ================================================ /** * Scraper Content 
Pipeline Tests
 *
 * Tests the end-to-end content pipeline: raw HTML → metadata extraction →
 * content cleaning → markdown conversion → postprocessing. The pipeline
 * functions are called directly on controlled HTML fixtures, so no engine or
 * orchestrator is involved.
 */
import { describe, it, expect, vi } from "vitest";
import { Scraper } from "../../src/scraper";
import type { WebsiteScrapeResult } from "../../src/types";

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Builds a Scraper for https://example.com with markdown output.
 * Keys in `options` are spread last, so they override these defaults.
 */
function makeScraper(options?: Record<string, unknown>): Scraper {
  return new Scraper({
    urls: ["https://example.com"],
    formats: ["markdown"],
    ...options,
  });
}

/**
 * Replaces the scraper's logger with `vi.fn()` stubs.
 *
 * NOTE(review): despite its name, this helper does not mock `scrapeSingleUrl`
 * or feed any HTML into the scraper — `html` and `url` are accepted but never
 * read. The tests in this file call the pipeline functions (extractMetadata,
 * cleanContent, htmlToMarkdown, postprocessMarkdown) directly instead.
 */
function mockPipeline(scraper: Scraper, html: string, url = "https://example.com") {
  // Only the logger is swapped out; nothing else on the scraper is touched.
  (scraper as any).logger = {
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
    debug: vi.fn(),
  };
}

// ── Direct pipeline function tests ───────────────────────────────────────────

import { extractMetadata } from "../../src/utils/metadata-extractor";
import { cleanContent } from "../../src/utils/content-cleaner";
import { htmlToMarkdown } from "../../src/formatters/markdown";
import { postprocessMarkdown } from "../../src/formatters/postprocess";

describe("Scraper content pipeline", () => {
  describe("end-to-end: HTML → metadata + markdown", () => {
    const SAMPLE_HTML = ` <html> <head> <title>Example Page Title

Welcome to Example

This is a real page with meaningful content that should pass quality checks.

It has multiple paragraphs to ensure the content pipeline works correctly.

A useful link
© 2026 Example Corp
`; it("extracts metadata from raw HTML before cleaning", () => { const metadata = extractMetadata(SAMPLE_HTML, "https://example.com"); expect(metadata.title).toBe("Example Page Title"); expect(metadata.description).toBe("A test page for the content pipeline"); expect(metadata.openGraph?.title).toBe("OG Title"); expect(metadata.openGraph?.image).toBe("https://example.com/og.png"); expect(metadata.twitter?.card).toBe("summary_large_image"); }); it("metadata is NOT available after cleaning (head stripped)", () => { const cleaned = cleanContent(SAMPLE_HTML, "https://example.com", { onlyMainContent: false, }); const metadata = extractMetadata(cleaned, "https://example.com"); // Title should be null because was stripped expect(metadata.title).toBeNull(); }); it("produces markdown from cleaned HTML", () => { const cleaned = cleanContent(SAMPLE_HTML, "https://example.com", { onlyMainContent: false, }); const markdown = htmlToMarkdown(cleaned); expect(markdown).toContain("Welcome to Example"); expect(markdown).toContain("meaningful content"); expect(markdown.length).toBeGreaterThan(50); }); it("onlyMainContent extracts main content and removes nav/footer", () => { const cleaned = cleanContent(SAMPLE_HTML, "https://example.com", { onlyMainContent: true, }); const markdown = htmlToMarkdown(cleaned); expect(markdown).toContain("Welcome to Example"); // Nav and footer should be stripped expect(markdown).not.toContain("© 2026 Example Corp"); }); it("postprocessing cleans up the output", () => { const raw = "[Skip to Content](#main)\n\n\n\n\n# Title\n\nContent"; const processed = postprocessMarkdown(raw); expect(processed).not.toContain("Skip to Content"); expect(processed).not.toContain("\n\n\n"); // collapsed to 2 expect(processed).toContain("# Title"); }); it("full pipeline: raw HTML → metadata + clean markdown", () => { // Step 1: Extract metadata from raw HTML const metadata = extractMetadata(SAMPLE_HTML, "https://example.com"); // Step 2: Clean HTML const cleaned = 
cleanContent(SAMPLE_HTML, "https://example.com", { onlyMainContent: true, }); // Step 3: Convert to markdown const markdown = htmlToMarkdown(cleaned); // Step 4: Postprocess const final = postprocessMarkdown(markdown); // Verify the full pipeline expect(metadata.title).toBe("Example Page Title"); expect(final).toContain("Welcome to Example"); expect(final).toContain("meaningful content"); expect(final.length).toBeGreaterThan(50); }); }); describe("JSON payload detection", () => { it("wraps JSON responses in code fences", () => { // The Scraper detects JSON payloads and wraps them. // Test the detection logic directly. const jsonBody = '{"key": "value", "items": [1, 2, 3]}'; // detectJsonPayload is not exported, but we can verify the behavior // by checking that valid JSON with 200 status would be detected const trimmed = jsonBody.trim(); const firstChar = trimmed[0]; const lastChar = trimmed[trimmed.length - 1]; const looksJson = (firstChar === "{" && lastChar === "}"); expect(looksJson).toBe(true); expect(() => JSON.parse(trimmed)).not.toThrow(); }); }); describe("conversion fallback", () => { it("htmlToMarkdown falls back to text extraction on empty result from large input", () => { // When supermarkdown returns "" for a large input, the formatter // falls back to tag stripping. We can't easily trigger this without // mocking supermarkdown, but we can verify the fallback behavior // by testing with input that works normally. const html = "

Simple content

"; const result = htmlToMarkdown(html); expect(result).toContain("Simple content"); }); }); describe("Wikipedia-like content", () => { const WIKIPEDIA_HTML = ` Web scraping - Wikipedia

Web scraping

Web scraping is data scraping used for extracting data from websites. Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser.

Techniques

Human copy-and-paste is the simplest form of web scraping.

MethodDescription
HTTPDirect request
BrowserDOM parsing
`; it("extracts title from Wikipedia HTML", () => { const metadata = extractMetadata(WIKIPEDIA_HTML, "https://en.wikipedia.org/wiki/Web_scraping"); expect(metadata.title).toBe("Web scraping - Wikipedia"); }); it("produces substantial markdown from Wikipedia content", () => { const cleaned = cleanContent(WIKIPEDIA_HTML, "https://en.wikipedia.org/wiki/Web_scraping", { onlyMainContent: true, }); const markdown = postprocessMarkdown(htmlToMarkdown(cleaned)); expect(markdown).toContain("Web scraping"); expect(markdown).toContain("Techniques"); expect(markdown).toContain("HTTP"); // Table should be present as GFM expect(markdown).toContain("|"); expect(markdown.length).toBeGreaterThan(200); }); it("does not include navigation in onlyMainContent mode", () => { const cleaned = cleanContent(WIKIPEDIA_HTML, "https://en.wikipedia.org/wiki/Web_scraping", { onlyMainContent: true, }); const markdown = postprocessMarkdown(htmlToMarkdown(cleaned)); expect(markdown).not.toContain("Main Page"); }); }); describe("SaaS landing page content", () => { const SAAS_HTML = ` Acme - Build faster

Build faster with Acme

Acme helps developers ship products 10x faster with our modern platform.

Features

  • Instant deployments
  • Edge functions
  • Database included
`; it("extracts title and OG image from SaaS page", () => { const metadata = extractMetadata(SAAS_HTML, "https://acme.com"); expect(metadata.title).toBe("Acme - Build faster"); expect(metadata.description).toBe("The modern platform for developers"); expect(metadata.openGraph?.image).toBe("https://acme.com/og.png"); }); it("produces markdown with heading and list", () => { const cleaned = cleanContent(SAAS_HTML, "https://acme.com", { onlyMainContent: true }); const markdown = postprocessMarkdown(htmlToMarkdown(cleaned)); expect(markdown).toContain("Build faster with Acme"); expect(markdown).toContain("Features"); expect(markdown).toContain("Instant deployments"); expect(markdown).toContain("- "); // list items }); }); describe("edge cases", () => { it("handles empty HTML", () => { const metadata = extractMetadata("", "https://example.com"); expect(metadata.title).toBeNull(); const markdown = htmlToMarkdown(""); expect(markdown).toBe(""); }); it("handles HTML with only scripts and styles", () => { const html = ""; const cleaned = cleanContent(html, "https://example.com", { onlyMainContent: false }); const markdown = htmlToMarkdown(cleaned); // Scripts and styles should be stripped expect(markdown).not.toContain("alert"); expect(markdown).not.toContain("body{}"); }); it("handles includeTags filter", () => { const html = `

Keep this

`; const cleaned = cleanContent(html, "https://example.com", { onlyMainContent: false, includeTags: [".content"], }); const markdown = htmlToMarkdown(cleaned); expect(markdown).toContain("Keep this"); expect(markdown).not.toContain("Remove this"); }); it("handles excludeTags filter", () => { const html = `

Keep this

Remove this ad

`;
      const cleaned = cleanContent(html, "https://example.com", {
        onlyMainContent: false,
        excludeTags: [".ads"],
      });
      const markdown = htmlToMarkdown(cleaned);
      expect(markdown).toContain("Keep this");
      expect(markdown).not.toContain("Remove this ad");
    });
  });
});

================================================
FILE: tests/unit/scraper-retry.test.ts
================================================
/**
 * Scraper Retry & Escalation Tests
 *
 * Tests the retry loop in Scraper.scrapeSingleUrlWithRetry:
 * 1. Datacenter attempt with 10s timeout
 * 2. Any failure → residential attempt with remaining time (up to 30s total)
 * 3. Any failure → done
 *
 * We replace `scrapeSingleUrl` on each Scraper instance with a mock so the
 * retry logic is tested in isolation without hitting real engines.
 */
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { Scraper } from "../../src/scraper";
import { ScrapeFailedError } from "../../src/engines/errors";
import { ProxyConnectionError, DNSError } from "../../src/errors";
import type { WebsiteScrapeResult } from "../../src/types";

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Builds a successful scrape result fixture.
 *
 * @param overrides Top-level fields to replace; spread last, so a supplied
 *   `metadata` object replaces the default one wholesale rather than merging.
 */
function makeResult(overrides?: Partial<WebsiteScrapeResult>): WebsiteScrapeResult {
  return {
    rawHtml: "

Hello World

This is real content.

", markdown: "# Hello World\n\nThis is real content with enough text.",
    metadata: {
      baseUrl: "https://example.com",
      statusCode: 200,
      engine: "hero",
      totalPages: 1,
      scrapedAt: new Date().toISOString(),
      duration: 100,
      website: { title: "Example", description: null } as any,
    },
    ...overrides,
  };
}

/**
 * Builds a Scraper for https://example.com with markdown output.
 * Keys in `overrides` are spread last, so they replace these defaults.
 */
function makeScraper(overrides?: Record<string, unknown>): Scraper {
  return new Scraper({ urls: ["https://example.com"], formats: ["markdown"], ...overrides });
}

/**
 * Swaps the scraper instance's `scrapeSingleUrl` for a `vi.fn()` mock and
 * stubs its logger, so each test scripts per-attempt outcomes on the mock.
 *
 * @returns The mock installed as `scrapeSingleUrl`.
 */
function spySingleUrl(scraper: Scraper) {
  // Both members are reached through `as any` because they are not part of
  // the Scraper's public surface.
  const spy = vi.fn() as any;
  (scraper as any).scrapeSingleUrl = spy;
  (scraper as any).logger = {
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
    debug: vi.fn(),
  };
  return spy;
}

// ── Tests ────────────────────────────────────────────────────────────────────

describe("Scraper retry & escalation", () => {
  beforeEach(() => {
    vi.useFakeTimers({ shouldAdvanceTime: true });
  });

  afterEach(() => {
    vi.useRealTimers();
  });

  // ── Happy path ──

  it("returns result on first success without escalation", async () => {
    const scraper = makeScraper();
    const spy = spySingleUrl(scraper);
    spy.mockResolvedValueOnce(makeResult());

    const { data } = await scraper.scrape();

    expect(data).toHaveLength(1);
    expect(data[0].markdown).toContain("Hello World");
    expect(spy).toHaveBeenCalledTimes(1);
  });

  // ── Non-retryable errors ──

  it("fast-fails on non-retryable errors without escalating", async () => {
    const scraper = makeScraper();
    const spy = spySingleUrl(scraper);
    spy.mockRejectedValueOnce(new DNSError("example.com"));

    const { data, batchMetadata } = await scraper.scrape();

    expect(data).toHaveLength(0);
    expect(batchMetadata.failedUrls).toBe(1);
    expect(spy).toHaveBeenCalledTimes(1); // No second attempt
  });

  // ── Escalation on failure ──

  it("escalates to residential on datacenter failure", async () => {
    const scraper = makeScraper();
    const spy = spySingleUrl(scraper);
    spy.mockRejectedValueOnce(
      new ScrapeFailedError(new Error("timeout"), { proxyBlock: true }),
    );
    spy.mockResolvedValueOnce(makeResult());

    const { data } =
await scraper.scrape(); expect(data).toHaveLength(1); expect(spy).toHaveBeenCalledTimes(2); // Second call should have proxyOverride = "residential" expect(spy.mock.calls[1][2]).toBe("residential"); }); // ── Escalation on proxy connection error ── it("escalates to residential on ProxyConnectionError", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockRejectedValueOnce(new ProxyConnectionError("datacenter")); spy.mockResolvedValueOnce(makeResult()); const { data } = await scraper.scrape(); expect(data).toHaveLength(1); expect(spy).toHaveBeenCalledTimes(2); expect(spy.mock.calls[1][2]).toBe("residential"); }); // ── Escalation on empty result ── it("escalates to residential when datacenter returns null", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockResolvedValueOnce(null); spy.mockResolvedValueOnce(makeResult()); const { data } = await scraper.scrape(); expect(data).toHaveLength(1); expect(spy).toHaveBeenCalledTimes(2); expect(spy.mock.calls[1][2]).toBe("residential"); }); // ── Escalation on blocked content ── it("escalates when result looks blocked (200 + bot page content)", async () => { const scraper = makeScraper({ blockDetection: { patterns: [/click the button below to continue shopping/i], shortContentThreshold: 500, }, }); const spy = spySingleUrl(scraper); spy.mockResolvedValueOnce(makeResult({ rawHtml: '

Click the button below to continue shopping

© Amazon.com

', markdown: "Click the button below to continue shopping", metadata: { baseUrl: "https://amazon.com/dp/123", statusCode: 200, engine: "hero", totalPages: 1, scrapedAt: new Date().toISOString(), duration: 50, website: { title: null, description: null } as any, }, })); spy.mockResolvedValueOnce(makeResult()); const { data } = await scraper.scrape(); expect(data).toHaveLength(1); expect(data[0].markdown).toContain("Hello World"); expect(spy).toHaveBeenCalledTimes(2); }); // ── Both attempts fail ── it("reports error when both datacenter and residential fail", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockRejectedValueOnce(new ScrapeFailedError(new Error("dc timeout"))); spy.mockRejectedValueOnce(new ScrapeFailedError(new Error("res timeout"))); const { data, batchMetadata } = await scraper.scrape(); expect(data).toHaveLength(0); expect(batchMetadata.failedUrls).toBe(1); expect(spy).toHaveBeenCalledTimes(2); }); // ── No third attempt ── it("does NOT retry a third time — max 2 attempts (dc + residential)", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockRejectedValueOnce(new ScrapeFailedError(new Error("fail 1"))); spy.mockRejectedValueOnce(new ScrapeFailedError(new Error("fail 2"))); await scraper.scrape(); expect(spy).toHaveBeenCalledTimes(2); }); // ── Timeout passed to attempts ── it("passes 10s timeout to datacenter attempt", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockResolvedValueOnce(makeResult()); await scraper.scrape(); // 4th arg is timeoutMs expect(spy.mock.calls[0][3]).toBe(10_000); }); it("passes remaining time to residential attempt", async () => { const scraper = makeScraper(); const spy = spySingleUrl(scraper); spy.mockRejectedValueOnce(new ScrapeFailedError(new Error("dc fail"))); spy.mockResolvedValueOnce(makeResult()); await scraper.scrape(); // Residential timeout should be <= 30s and > 0 const residentialTimeout = 
spy.mock.calls[1][3];
    expect(residentialTimeout).toBeGreaterThan(0);
    expect(residentialTimeout).toBeLessThanOrEqual(30_000);
  });

  // ── rawHtml is always present ──

  it("includes rawHtml in successful result", async () => {
    const scraper = makeScraper();
    const spy = spySingleUrl(scraper);
    spy.mockResolvedValueOnce(makeResult());

    const { data } = await scraper.scrape();

    // An empty needle matches every string, so assert on fixture content.
    expect(data[0].rawHtml).toContain("Hello World");
  });
});

================================================
FILE: tests/unit/tiered-pool.test.ts
================================================
import { describe, it, expect } from "vitest";
import pino from "pino";
import {
  TieredBrowserPool,
  buildTierConfigsFromPools,
} from "../../src/browser/tiered-pool";
import type { HeroFactory, HeroLike, TabLike } from "../../src/browser/proxy-bound-browser";
import { ProxyHealthTracker } from "../../src/proxy/health-tracker";

const silentLogger = pino({ level: "silent" });

/** Fake Hero that records the config it was created with and whether it was closed. */
interface FakeHero extends HeroLike {
  config: Record<string, unknown>;
  closed: boolean;
}

/** Builds a TabLike whose methods all resolve immediately without doing anything. */
function makeFakeTab(): TabLike {
  return {
    async goto() {
      return undefined;
    },
    get url() {
      return Promise.resolve("about:blank");
    },
    get document() {
      return {} as unknown;
    },
    async waitForLoad() {},
    async waitForPaintingStable() {},
    async waitForElement() {
      return undefined as unknown;
    },
    async close() {},
  };
}

/**
 * Builds a HeroFactory that hands out FakeHero instances.
 *
 * @param opts.failFor Proxy URLs for which `create` throws, simulating a
 *   failed browser launch.
 * @returns The factory plus the list of every hero it created, in creation order.
 */
function makeFakeFactory(opts: { failFor?: Set<string> } = {}): {
  factory: HeroFactory;
  instances: FakeHero[];
} {
  const instances: FakeHero[] = [];
  const factory: HeroFactory = {
    create(config: Record<string, unknown>) {
      const url = (config.upstreamProxyUrl as string | undefined) ?? null;
      if (url && opts.failFor?.has(url)) {
        throw new Error(`launch failed for ${url}`);
      }
      const hero: FakeHero = {
        config,
        closed: false,
        async newTab() {
          return makeFakeTab();
        },
        async closeTab(tab: TabLike) {
          await tab.close();
        },
        async close() {
          this.closed = true;
        },
      };
      instances.push(hero);
      return hero;
    },
  };
  return { factory, instances };
}

/** Yields to the event loop `n` times via `setImmediate` so queued async work can run. */
async function tick(n = 1) {
  for (let i = 0; i < n; i++) await new Promise((r) => setImmediate(r));
}

describe("TieredBrowserPool", () => {
  describe("construction + pre-warm", () => {
    it("launches one browser per proxy URL at startup", async () => {
      const { factory, instances } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://dc2", "http://dc3"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      expect(instances).toHaveLength(3);
      expect(pool.getStats().tiers[0].browsers).toHaveLength(3);
      await pool.close();
    });

    it("skips duplicate proxy URLs within a tier", async () => {
      const { factory, instances } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://dc1", "http://dc2"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      expect(instances).toHaveLength(2);
      await pool.close();
    });

    it("tolerates a per-browser launch failure and resolves ready anyway", async () => {
      const { factory } = makeFakeFactory({ failFor: new Set(["http://bad"]) });
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://bad", "http://dc2"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready; // should not throw
      const stats = pool.getStats();
      const dcBrowsers = stats.tiers.find((t) => t.tier === "datacenter")!.browsers;
      expect(dcBrowsers).toHaveLength(3);
      const closedCount = dcBrowsers.filter((b) => b.state === "closed").length;
      expect(closedCount).toBe(1);
      await pool.close();
    });
  });

  describe("acquire",
() => {
    it("returns least-loaded browser from the tier", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://dc2"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;

      // Hold dc1 with an in-flight page
      const dc1 = pool.acquire("datacenter").browser;
      let releaseDc1!: () => void;
      // Promise<void> so the resolver can be stored as a zero-argument callback.
      const heldDc1 = new Promise<void>((r) => (releaseDc1 = r));
      const dc1Page = dc1.withPage(async () => {
        await heldDc1;
      });
      await tick(2);

      // The next acquire should prefer the OTHER browser (dc2)
      const lease = pool.acquire("datacenter");
      expect(lease.browser).not.toBe(dc1);

      releaseDc1();
      await dc1Page;
      await pool.close();
    });

    it("throws when tier is unknown", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [{ tier: "datacenter", proxyUrls: ["http://dc1"] }],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      expect(() => pool.acquire("residential")).toThrow(/no browsers configured for tier/);
      await pool.close();
    });

    it("throws when all browsers in the tier are unavailable", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://dc2"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;

      // Keep the pool open but make every browser unavailable: look each one
      // up by its proxy URL and retire it directly.
      const b1 = pool.getBrowserByProxy("http://dc1")!;
      const b2 = pool.getBrowserByProxy("http://dc2")!;
      await Promise.all([b1.retire(), b2.retire()]);

      expect(() => pool.acquire("datacenter")).toThrow(/no available browsers/);
      await pool.close();
    });
  });

  describe("hasTier", () => {
    it("returns true for configured tiers", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [{ tier: "datacenter", proxyUrls: ["http://dc1"] }],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      expect(pool.hasTier("datacenter")).toBe(true);
      expect(pool.hasTier("residential")).toBe(false);
      expect(pool.hasTier("direct")).toBe(false);
      await pool.close();
    });
  });

  describe("getBrowserByProxy", () => {
    it("returns the browser bound to a proxy URL", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [
          {
            tier: "datacenter",
            proxyUrls: ["http://dc1", "http://dc2"],
          },
        ],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      const b1 = pool.getBrowserByProxy("http://dc1")!;
      const b2 = pool.getBrowserByProxy("http://dc2")!;
      expect(b1.proxyUrl).toBe("http://dc1");
      expect(b2.proxyUrl).toBe("http://dc2");
      expect(pool.getBrowserByProxy("http://dc3")).toBeNull();
      await pool.close();
    });

    it("resolves null for the direct lane", async () => {
      const { factory } = makeFakeFactory();
      const pool = new TieredBrowserPool({
        tiers: [{ tier: "direct", proxyUrls: [null] }],
        heroFactory: factory,
        logger: silentLogger,
      });
      await pool.ready;
      const direct = pool.getBrowserByProxy(null);
      expect(direct).not.toBeNull();
      expect(direct!.proxyUrl).toBeNull();
      await pool.close();
    });
  });

  describe("health tracker integration", () => {
    it("retires browser when its proxy is benched", async () => {
      const { factory } = makeFakeFactory();
      const tracker = new ProxyHealthTracker({ failureThreshold: 3, cooldownMs: 1000 });
      const pool = new TieredBrowserPool({
        tiers: [{ tier: "datacenter",
proxyUrls: ["http://dc1"] }], heroFactory: factory, healthTracker: tracker, logger: silentLogger, }); await pool.ready; for (let i = 0; i < 3; i++) tracker.recordFailure("http://dc1"); // Event handler schedules retire asynchronously await tick(5); const browser = pool.getBrowserByProxy("http://dc1")!; // retire is fire-and-forget; wait for it to settle for (let i = 0; i < 50 && browser.getState() !== "closed"; i++) { await tick(1); } expect(browser.getState()).toBe("closed"); await pool.close(); }); it("relaunches browser when its proxy is revived", async () => { const clock = { t: 1_000_000 }; const { factory } = makeFakeFactory(); const tracker = new ProxyHealthTracker({ failureThreshold: 3, cooldownMs: 1000, now: () => clock.t, }); const pool = new TieredBrowserPool({ tiers: [{ tier: "datacenter", proxyUrls: ["http://dc1"] }], heroFactory: factory, healthTracker: tracker, logger: silentLogger, }); await pool.ready; const browser = pool.getBrowserByProxy("http://dc1")!; // Bench for (let i = 0; i < 3; i++) tracker.recordFailure("http://dc1"); await tick(5); for (let i = 0; i < 50 && browser.getState() !== "closed"; i++) { await tick(1); } expect(browser.getState()).toBe("closed"); // Advance the fake clock past the cooldown, then trigger a health // check which will emit the revive event. 
clock.t += 2000; expect(tracker.isHealthy("http://dc1")).toBe(true); // Relaunch happens asynchronously via the event listener for (let i = 0; i < 50 && browser.getState() !== "active"; i++) { await tick(1); } expect(browser.getState()).toBe("active"); await pool.close(); }); it("acquire skips benched browsers", async () => { const { factory } = makeFakeFactory(); const tracker = new ProxyHealthTracker({ failureThreshold: 3, cooldownMs: 10000 }); const pool = new TieredBrowserPool({ tiers: [ { tier: "datacenter", proxyUrls: ["http://dc1", "http://dc2"], }, ], heroFactory: factory, healthTracker: tracker, logger: silentLogger, }); await pool.ready; for (let i = 0; i < 3; i++) tracker.recordFailure("http://dc1"); // Wait for dc1 retirement to settle for (let i = 0; i < 50; i++) { await tick(1); if (pool.getBrowserByProxy("http://dc1")!.getState() === "closed") break; } // Acquire should now always return dc2 for (let i = 0; i < 5; i++) { const lease = pool.acquire("datacenter"); expect(lease.browser.proxyUrl).toBe("http://dc2"); } await pool.close(); }); }); describe("close", () => { it("retires every browser across every tier", async () => { const { factory, instances } = makeFakeFactory(); const pool = new TieredBrowserPool({ tiers: [ { tier: "datacenter", proxyUrls: ["http://dc1", "http://dc2"] }, { tier: "residential", proxyUrls: ["http://res1"] }, ], heroFactory: factory, logger: silentLogger, }); await pool.ready; await pool.close(); expect(instances.every((i) => i.closed)).toBe(true); }); it("is safe to call close() twice", async () => { const { factory } = makeFakeFactory(); const pool = new TieredBrowserPool({ tiers: [{ tier: "datacenter", proxyUrls: ["http://dc1"] }], heroFactory: factory, logger: silentLogger, }); await pool.ready; await pool.close(); await pool.close(); }); it("acquire throws after close", async () => { const { factory } = makeFakeFactory(); const pool = new TieredBrowserPool({ tiers: [{ tier: "datacenter", proxyUrls: ["http://dc1"] }], 
heroFactory: factory, logger: silentLogger, }); await pool.ready; await pool.close(); expect(() => pool.acquire("datacenter")).toThrow(/closed/); }); }); }); describe("buildTierConfigsFromPools", () => { it("returns datacenter + residential when both configured, no direct", () => { const tiers = buildTierConfigsFromPools({ datacenter: [{ url: "http://dc1" }, { url: "http://dc2" }], residential: [{ url: "http://res1" }], }); expect(tiers).toHaveLength(2); expect(tiers[0]).toEqual({ tier: "datacenter", proxyUrls: ["http://dc1", "http://dc2"] }); expect(tiers[1]).toEqual({ tier: "residential", proxyUrls: ["http://res1"] }); }); it("returns only datacenter when residential is empty", () => { const tiers = buildTierConfigsFromPools({ datacenter: [{ url: "http://dc1" }], }); expect(tiers).toHaveLength(1); expect(tiers[0].tier).toBe("datacenter"); }); it("returns direct when no proxies configured (default size 1)", () => { const tiers = buildTierConfigsFromPools({}); expect(tiers).toHaveLength(1); expect(tiers[0]).toEqual({ tier: "direct", proxyUrls: [null] }); }); it("respects directPoolSize when creating direct tier", () => { const tiers = buildTierConfigsFromPools({}, { directPoolSize: 3 }); expect(tiers[0].proxyUrls).toEqual([null, null, null]); }); it("does NOT add a direct tier when any proxy is configured", () => { const tiers = buildTierConfigsFromPools({ datacenter: [{ url: "http://dc1" }], }); expect(tiers.find((t) => t.tier === "direct")).toBeUndefined(); }); it("treats undefined pools as empty", () => { const tiers = buildTierConfigsFromPools(undefined); expect(tiers).toHaveLength(1); expect(tiers[0].tier).toBe("direct"); }); it("filters out proxies with no URL", () => { const tiers = buildTierConfigsFromPools({ datacenter: [{ url: "http://dc1" }, {}, { url: "" }], }); expect(tiers[0].proxyUrls).toEqual(["http://dc1"]); }); }); ================================================ FILE: tests/unit/url-helpers.test.ts ================================================ 
import { describe, it, expect } from "vitest";
import { isValidUrl, getUrlKey, isSameDomain, resolveUrl } from "../../src/utils/url-helpers";

describe("isValidUrl", () => {
  it("accepts valid http URLs", () => {
    expect(isValidUrl("http://example.com")).toBe(true);
  });

  it("accepts valid https URLs", () => {
    expect(isValidUrl("https://example.com")).toBe(true);
  });

  it("accepts URLs with paths", () => {
    expect(isValidUrl("https://example.com/path/to/page")).toBe(true);
  });

  it("accepts URLs with query strings", () => {
    expect(isValidUrl("https://example.com?q=test&page=1")).toBe(true);
  });

  it("rejects empty string", () => {
    expect(isValidUrl("")).toBe(false);
  });

  it("rejects plain text", () => {
    expect(isValidUrl("not a url")).toBe(false);
  });

  it("handles javascript: URLs (implementation-dependent)", () => {
    // isValidUrl uses URL constructor which may accept javascript: protocol
    const result = isValidUrl("javascript:alert(1)");
    expect(typeof result).toBe("boolean");
  });
});

describe("getUrlKey", () => {
  it("normalizes www prefix", () => {
    expect(getUrlKey("https://www.example.com")).toBe(getUrlKey("https://example.com"));
  });

  it("removes hash fragments", () => {
    expect(getUrlKey("https://example.com#section")).toBe(getUrlKey("https://example.com"));
  });

  it("removes trailing slash", () => {
    expect(getUrlKey("https://example.com/")).toBe(getUrlKey("https://example.com"));
  });

  it("normalizes index files", () => {
    expect(getUrlKey("https://example.com/index.html")).toBe(getUrlKey("https://example.com/"));
  });

  it("preserves path differences", () => {
    expect(getUrlKey("https://example.com/a")).not.toBe(getUrlKey("https://example.com/b"));
  });

  it("lowercases the result", () => {
    const key = getUrlKey("https://EXAMPLE.COM/Path");
    expect(key).toBe(key.toLowerCase());
  });
});

describe("isSameDomain", () => {
  it("matches same domain", () => {
    expect(isSameDomain("https://example.com/a", "https://example.com/b")).toBe(true);
  });

  it("matches with www difference", () => {
    expect(isSameDomain("https://www.example.com", "https://example.com")).toBe(true);
  });

  it("rejects different domains", () => {
    expect(isSameDomain("https://example.com", "https://other.com")).toBe(false);
  });

  it("rejects subdomains (strict hostname match)", () => {
    expect(isSameDomain("https://blog.example.com", "https://example.com")).toBe(false);
    expect(isSameDomain("https://dashboard.stripe.com", "https://docs.stripe.com")).toBe(false);
  });
});

describe("resolveUrl", () => {
  it("resolves relative path against base", () => {
    const resolved = resolveUrl("/about", "https://example.com/page");
    expect(resolved).toBe("https://example.com/about");
  });

  it("returns absolute URL (may normalize trailing slash)", () => {
    const resolved = resolveUrl("https://other.com", "https://example.com");
    expect(resolved).toContain("other.com");
  });

  it("handles fragment-only URLs", () => {
    const resolved = resolveUrl("#section", "https://example.com/page");
    expect(resolved).toContain("example.com");
  });
});

================================================
FILE: tests/unit/url-rewriter.test.ts
================================================
import { describe, it, expect } from "vitest";
import { rewriteUrl, type UrlRewriteRule } from "../../src/utils/url-rewriter";

// Google rewrite rules — mimics what reader-api would provide

/**
 * Pulls the identifier out of a `/d/<id>` path segment.
 *
 * @param pathname URL pathname, e.g. `/document/d/abc123/edit`.
 * @returns The id (letters, digits, `_`, `-`), or `null` when the path has
 *   no `/d/<id>` segment.
 */
function extractGoogleDocId(pathname: string): string | null {
  const match = pathname.match(/\/d\/([a-zA-Z0-9_-]+)/);
  return match ? match[1] : null;
}

// Every rule's `match` also requires an extractable id. Without that guard a
// URL such as https://docs.google.com/document/u/0/ would match on its path
// prefix alone and `rewrite` would interpolate `null` into the export URL.
const GOOGLE_RULES: UrlRewriteRule[] = [
  {
    name: "google-docs",
    match: (url) =>
      url.hostname === "docs.google.com" &&
      url.pathname.startsWith("/document/") &&
      extractGoogleDocId(url.pathname) !== null,
    rewrite: (url) => {
      const id = extractGoogleDocId(url.pathname);
      return `https://docs.google.com/document/d/${id}/export?format=html`;
    },
  },
  {
    name: "google-sheets",
    match: (url) =>
      url.hostname === "docs.google.com" &&
      url.pathname.startsWith("/spreadsheets/") &&
      extractGoogleDocId(url.pathname) !== null,
    rewrite: (url) => {
      const id = extractGoogleDocId(url.pathname);
      return `https://docs.google.com/spreadsheets/d/${id}/export?format=html`;
    },
  },
  {
    name: "google-slides",
    match: (url) =>
      url.hostname === "docs.google.com" &&
      url.pathname.startsWith("/presentation/") &&
      extractGoogleDocId(url.pathname) !== null,
    rewrite: (url) => {
      const id = extractGoogleDocId(url.pathname);
      return `https://docs.google.com/presentation/d/${id}/export/html`;
    },
  },
  {
    name: "google-drive",
    match: (url) =>
      url.hostname === "drive.google.com" &&
      url.pathname.startsWith("/file/") &&
      extractGoogleDocId(url.pathname) !== null,
    rewrite: (url) => {
      const id = extractGoogleDocId(url.pathname);
      return `https://drive.google.com/uc?id=${id}&export=download`;
    },
  },
];

describe("rewriteUrl", () => {
  it("returns unchanged when no rules provided (unopinionated)", () => {
    const result = rewriteUrl("https://docs.google.com/document/d/abc123/edit");
    expect(result.rewritten).toBe(false);
    expect(result.url).toBe("https://docs.google.com/document/d/abc123/edit");
  });

  describe("Google Docs", () => {
    it("rewrites a Google Docs /edit URL to HTML export", () => {
      const result = rewriteUrl(
        "https://docs.google.com/document/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit",
        GOOGLE_RULES,
      );
      expect(result).toEqual({
        url: "https://docs.google.com/document/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/export?format=html",
        rewritten: true,
        reason: "google-docs",
      });
    });

    it("handles document IDs with hyphens and underscores", () => {
      const result = rewriteUrl(
        "https://docs.google.com/document/d/abc-123_DEF-456_ghi/edit",
        GOOGLE_RULES,
      );
      expect(result.rewritten).toBe(true);
      expect(result.reason).toBe("google-docs");
      // The whole id must survive, not just the part before the first "-" or "_".
      expect(result.url).toBe(
        "https://docs.google.com/document/d/abc-123_DEF-456_ghi/export?format=html",
      );
    });
  });

  describe("Google Sheets", () => {
    it("rewrites a Google Sheets URL to HTML export", () => {
      const result = rewriteUrl(
        "https://docs.google.com/spreadsheets/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit",
        GOOGLE_RULES,
      );
      expect(result.rewritten).toBe(true);
      expect(result.reason).toBe("google-sheets");
      expect(result.url).toBe(
        "https://docs.google.com/spreadsheets/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/export?format=html",
      );
    });
  });

  describe("Google Slides", () => {
    it("rewrites a Google Slides URL to HTML export", () => {
      const result = rewriteUrl(
        "https://docs.google.com/presentation/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit",
        GOOGLE_RULES,
      );
      expect(result.rewritten).toBe(true);
      expect(result.reason).toBe("google-slides");
      expect(result.url).toBe(
        "https://docs.google.com/presentation/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/export/html",
      );
    });
  });

  describe("Google Drive", () => {
    it("rewrites a Google Drive file URL to direct download", () => {
      const result = rewriteUrl(
        "https://drive.google.com/file/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/view",
        GOOGLE_RULES,
      );
      expect(result.rewritten).toBe(true);
      expect(result.reason).toBe("google-drive");
      expect(result.url).toBe(
        "https://drive.google.com/uc?id=1aBcDeFgHiJkLmNoPqRsTuVwXyZ&export=download",
      );
    });
  });

  describe("non-matching URLs", () => {
    it("returns non-Google URLs unchanged", () => {
      const result = rewriteUrl("https://example.com/some-page", GOOGLE_RULES);
      expect(result.rewritten).toBe(false);
    });

    it("returns invalid URLs unchanged", () => {
      const result = rewriteUrl("not-a-valid-url", GOOGLE_RULES);
      expect(result.rewritten).toBe(false);
    });

    it("does not rewrite Google Docs non-document paths like /forms/", () => {
      const result = rewriteUrl(
        "https://docs.google.com/forms/d/1aBcDeFgHiJkLmNoPqRsTuVwXyZ/edit",
        GOOGLE_RULES,
      );
      expect(result.rewritten).toBe(false);
    });

    it("does not rewrite Google Docs URLs that have no /d/<id> segment", () => {
      const result = rewriteUrl("https://docs.google.com/document/u/0/", GOOGLE_RULES);
      expect(result.rewritten).toBe(false);
    });
  });
});

================================================
FILE: tsconfig.json
================================================
{
  "compilerOptions": {
    "target": "ESNext",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "lib": ["ESNext", "DOM"],
    "outDir": "./dist",
    "strict": true,
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "removeComments": false,
    "noImplicitAny": true,
    "noImplicitReturns": false,
    "noImplicitThis": true,
    "noUnusedLocals": true,
    "noUnusedParameters": false,
    "exactOptionalPropertyTypes": false,
    "resolveJsonModule": true,
    "types": ["node"]
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist", "**/*.test.ts"]
}

================================================
FILE: tsup.config.ts
================================================
import { defineConfig } from "tsup";

// Packages that must NOT be bundled — they contain native modules, are
// CommonJS deps that use require() internally, or need to be resolved from
// node_modules at runtime. Every entry here MUST also be in package.json
// dependencies.
const external = [
  "@ulixee/hero",
  "@ulixee/hero-core",
  "@ulixee/net",
  "re2",
  "pino",
  "pino-pretty",
];

export default defineConfig([
  // Main library
  {
    entry: ["src/index.ts"],
    format: ["esm"],
    dts: true,
    clean: true,
    outDir: "dist",
    splitting: false,
    sourcemap: true,
    target: "node18",
    external,
  },
  // CLI (shebang preserved from source)
  {
    entry: ["src/cli/index.ts"],
    format: ["esm"],
    dts: false,
    outDir: "dist/cli",
    splitting: false,
    sourcemap: true,
    target: "node18",
    external,
  },
]);

================================================
FILE: vitest.config.ts
================================================
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    globals: true,
    environment: "node",
    include: ["tests/**/*.test.ts"],
    testTimeout: 30_000,
    hookTimeout: 15_000,
  },
});