Full Code of vakra-dev/reader for AI

main fbf5a54bff96 cached

147 files

751.3 KB

189.7k tokens

477 symbols

1 requests

Download .txt

Showing preview only (805K chars total). Download the full file or copy to clipboard to get everything.

Repository: vakra-dev/reader
Branch: main
Commit: fbf5a54bff96
Files: 147
Total size: 751.3 KB

Directory structure:
gitextract_cms0mrdu/

├── .eslintrc.json
├── .github/
│   └── workflows/
│       ├── ci.yml
│       └── publish.yml
├── .gitignore
├── .leasotrc
├── .nvmrc
├── .prettierrc
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── docs/
│   ├── api-reference.md
│   ├── architecture.md
│   ├── assets/
│   │   ├── .gitkeep
│   │   └── demo.tape
│   ├── deployment/
│   │   ├── docker.md
│   │   ├── job-queues.md
│   │   └── production-server.md
│   ├── getting-started.md
│   ├── guides/
│   │   ├── browser-pool.md
│   │   ├── browser-sessions.md
│   │   ├── cloudflare-bypass.md
│   │   ├── output-formats.md
│   │   └── proxy-configuration.md
│   └── troubleshooting.md
├── ecosystem.config.cjs
├── examples/
│   ├── .gitignore
│   ├── .nvmrc
│   ├── README.md
│   ├── ai-tools/
│   │   ├── README.md
│   │   ├── anthropic-summary.ts
│   │   ├── langchain-loader.ts
│   │   ├── llamaindex-loader.ts
│   │   ├── openai-summary.ts
│   │   ├── pinecone-ingest.ts
│   │   ├── qdrant-ingest.ts
│   │   └── vercel-ai-stream.ts
│   ├── basic/
│   │   ├── README.md
│   │   ├── all-formats.ts
│   │   ├── basic-scrape.ts
│   │   ├── batch-scrape.ts
│   │   ├── browser-pool-config.ts
│   │   ├── browser-session-actions.ts
│   │   ├── browser-session-puppeteer.ts
│   │   ├── browser-session-selenium.ts
│   │   ├── browser-session.ts
│   │   ├── cloudflare-bypass.ts
│   │   ├── crawl-website.ts
│   │   ├── large-batch-scrape.ts
│   │   ├── proxy-pool.ts
│   │   └── with-proxy.ts
│   ├── package.json
│   ├── production/
│   │   ├── README.md
│   │   ├── browser-pool-scaling/
│   │   │   ├── README.md
│   │   │   ├── package.json
│   │   │   └── src/
│   │   │       └── index.ts
│   │   ├── express-server/
│   │   │   ├── README.md
│   │   │   ├── package.json
│   │   │   └── src/
│   │   │       └── index.ts
│   │   └── job-queue-bullmq/
│   │       ├── README.md
│   │       ├── package.json
│   │       └── src/
│   │           ├── index.ts
│   │           ├── queue.ts
│   │           └── worker.ts
│   └── tsconfig.json
├── package.json
├── result.md
├── scripts/
│   └── release.sh
├── src/
│   ├── browser/
│   │   ├── hero-config.ts
│   │   ├── pool.ts
│   │   ├── proxy-bound-browser.ts
│   │   ├── tiered-pool.ts
│   │   └── types.ts
│   ├── browser-session.ts
│   ├── browser-types.ts
│   ├── cli/
│   │   └── index.ts
│   ├── client.ts
│   ├── cloudflare/
│   │   ├── detector.ts
│   │   ├── handler.ts
│   │   └── types.ts
│   ├── config/
│   │   └── domain-profiles.ts
│   ├── crawl-types.ts
│   ├── crawler.ts
│   ├── daemon/
│   │   ├── client.ts
│   │   ├── index.ts
│   │   └── server.ts
│   ├── engines/
│   │   ├── errors.ts
│   │   ├── hero/
│   │   │   └── index.ts
│   │   ├── index.ts
│   │   ├── orchestrator.ts
│   │   └── types.ts
│   ├── errors.ts
│   ├── formatters/
│   │   ├── html.ts
│   │   ├── index.ts
│   │   ├── markdown.ts
│   │   └── postprocess.ts
│   ├── index.ts
│   ├── proxy/
│   │   ├── config.ts
│   │   ├── env.ts
│   │   ├── health-tracker.ts
│   │   ├── proxy-gate.ts
│   │   └── verify.ts
│   ├── scraper.ts
│   ├── types.ts
│   └── utils/
│       ├── block-detector.ts
│       ├── content-cleaner.ts
│       ├── logger.ts
│       ├── metadata-extractor.ts
│       ├── rate-limiter.ts
│       ├── robots-parser.ts
│       ├── url-helpers.ts
│       └── url-rewriter.ts
├── tests/
│   ├── engines/
│   │   └── orchestrator.test.ts
│   ├── fixtures/
│   │   ├── amazon-bot-page.html
│   │   ├── cloudflare-challenge.html
│   │   ├── empty-page.html
│   │   └── simple-static.html
│   ├── integration/
│   │   └── daemon.test.ts
│   └── unit/
│       ├── block-detector-cloudflare.test.ts
│       ├── block-detector-fixtures.test.ts
│       ├── block-detector.test.ts
│       ├── browser-session.test.ts
│       ├── content-cleaner.test.ts
│       ├── crawler.test.ts
│       ├── daemon-dispatch.test.ts
│       ├── domain-profiles.test.ts
│       ├── errors.test.ts
│       ├── health-tracker.test.ts
│       ├── html-size-guard.test.ts
│       ├── markdown-formatter.test.ts
│       ├── metadata-extractor.test.ts
│       ├── postprocess.test.ts
│       ├── proxy-bound-browser.test.ts
│       ├── proxy-config.test.ts
│       ├── proxy-gate.test.ts
│       ├── proxy-verify.test.ts
│       ├── robots-parser.test.ts
│       ├── scraper-pipeline.test.ts
│       ├── scraper-retry.test.ts
│       ├── tiered-pool.test.ts
│       ├── url-helpers.test.ts
│       └── url-rewriter.test.ts
├── tsconfig.json
├── tsup.config.ts
└── vitest.config.ts

================================================
FILE CONTENTS
================================================

================================================
FILE: .eslintrc.json
================================================
{
  "root": true,
  "parser": "@typescript-eslint/parser",
  "parserOptions": {
    "ecmaVersion": "latest",
    "sourceType": "module",
    "project": true
  },
  "plugins": ["@typescript-eslint"],
  "extends": [
    "eslint:recommended",
    "plugin:@typescript-eslint/recommended"
  ],
  "env": {
    "node": true,
    "es2022": true
  },
  "rules": {
    "@typescript-eslint/no-explicit-any": "warn",
    "@typescript-eslint/no-unused-vars": ["error", { "argsIgnorePattern": "^_" }],
    "@typescript-eslint/explicit-function-return-type": "off",
    "@typescript-eslint/explicit-module-boundary-types": "off",
    "@typescript-eslint/no-non-null-assertion": "warn",
    "no-console": ["warn", { "allow": ["warn", "error"] }]
  },
  "ignorePatterns": ["dist/", "node_modules/", "*.js", "*.config.ts"]
}


================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"

      - run: npm ci

      - name: Typecheck
        run: npx tsc --noEmit

      - name: Lint
        run: npm run lint

      - name: Format check
        run: npm run format:check

      - name: Test
        run: npm test

      - name: Build
        run: npm run build


================================================
FILE: .github/workflows/publish.yml
================================================
name: Publish to npm

on:
  release:
    types: [published]

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          registry-url: "https://registry.npmjs.org"

      - run: npm ci

      - name: Verify version matches tag
        run: |
          TAG_VERSION="${GITHUB_REF_NAME#v}"
          PKG_VERSION=$(node -p "require('./package.json').version")
          if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
            echo "Error: Tag $TAG_VERSION does not match package.json $PKG_VERSION"
            exit 1
          fi
          echo "Version verified: $PKG_VERSION"

      - name: Build
        run: npm run build

      - name: Publish
        run: npm publish --access public
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}


================================================
FILE: .gitignore
================================================
# Dependencies
node_modules/

# Build output
dist/

# Environment files
.env
.env.local
.env.*.local

# Logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# OS files
.DS_Store
Thumbs.db

# IDE
.idea/
.vscode/
*.swp
*.swo

# Coverage
coverage/
.nyc_output/

# Package manager locks
# Note: package-lock.json is tracked for reproducible builds
yarn.lock

# Bun
bun.lockb

# Temporary files
tmp/
temp/
*.tmp

# Hero/Ulixee session data
.ulixee/

# Claude Code context
CLAUDE.md

# Deployment configs (contain sensitive data)
deploy/


================================================
FILE: .leasotrc
================================================
{
  "tags": ["TODO", "FIXME", "HACK", "XXX", "BUG", "OPTIMIZE", "REVIEW"],
  "ignore": ["node_modules/**", "dist/**"]
}


================================================
FILE: .nvmrc
================================================
v22.12.0


================================================
FILE: .prettierrc
================================================
{
  "semi": true,
  "singleQuote": false,
  "tabWidth": 2,
  "trailingComma": "es5",
  "printWidth": 100,
  "useTabs": false,
  "bracketSpacing": true,
  "arrowParens": "always",
  "endOfLine": "lf"
}


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use Reader in your research or project, please cite it."
title: "Reader: Open-source, production-grade web scraping engine built for LLMs"
type: software
authors:
  - family-names: Kaul
    given-names: Nihal
license: Apache-2.0
url: "https://github.com/vakra-dev/reader"
repository-code: "https://github.com/vakra-dev/reader"


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a welcoming experience for everyone, regardless of background or
identity.

## Our Standards

Examples of behavior that contributes to a positive environment:

- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Focusing on what is best for the community
- Showing empathy towards other community members

Examples of unacceptable behavior:

- Trolling, insulting or derogatory comments, and personal attacks
- Public or private harassment
- Publishing others' private information without explicit permission
- Other conduct which could reasonably be considered inappropriate in a professional setting

## Enforcement Responsibilities

Project maintainers are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate or harmful.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.

## Enforcement

Instances of unacceptable behavior may be reported to the project maintainers at
**nihal.codes@gmail.com**. All complaints will be reviewed and investigated
promptly and fairly.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact:** Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence:** A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the behavior
was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact:** A violation through a single incident or series of actions.

**Consequence:** A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact:** A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence:** A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact:** Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence:** A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Reader

Thank you for your interest in contributing to Reader! This document provides guidelines and instructions for contributing.

## Development Setup

### Prerequisites

- **Node.js** >= 18 (v22 recommended)
- **npm** for package management
- **Git**

> **Note:** Always run scripts with Node.js (`npx tsx` or `node`) as Hero has ESM compatibility issues with other runtimes.

### Getting Started

1. **Fork the repository** on GitHub

2. **Clone your fork:**

   ```bash
   git clone https://github.com/YOUR_USERNAME/reader.git
   cd reader
   ```

3. **Install dependencies:**

   ```bash
   npm install
   ```

4. **Verify setup:**

   ```bash
   npm run typecheck
   npm run build
   ```

5. **Test the CLI:**
   ```bash
   npx tsx src/cli/index.ts scrape https://example.com
   ```

## Project Structure

```
src/
├── index.ts              # Public API exports
├── client.ts             # ReaderClient - main API entry point
├── scraper.ts            # Scraper class - main scraping logic
├── crawler.ts            # Crawler class - link discovery
├── types.ts              # TypeScript types for scraping
├── crawl-types.ts        # TypeScript types for crawling
│
├── browser/
│   ├── pool.ts           # BrowserPool - manages Hero instances
│   ├── hero-config.ts    # Hero configuration
│   └── types.ts          # Pool types
│
├── cloudflare/
│   ├── detector.ts       # Challenge detection
│   ├── handler.ts        # Challenge resolution
│   └── types.ts          # Cloudflare types
│
├── formatters/
│   ├── markdown.ts       # Markdown formatter
│   ├── html.ts           # HTML formatter
│   ├── postprocess.ts    # Output post-processing
│   └── index.ts          # Re-exports
│
├── utils/
│   ├── content-cleaner.ts    # HTML content cleaning
│   ├── metadata-extractor.ts # Metadata extraction
│   ├── url-helpers.ts        # URL utilities
│   ├── rate-limiter.ts       # Rate limiting
│   └── logger.ts             # Logging
│
├── proxy/
│   └── config.ts         # Proxy configuration
│
├── daemon/
│   ├── index.ts          # Module exports
│   ├── server.ts         # DaemonServer - HTTP server with browser pool
│   └── client.ts         # DaemonClient - connects CLI to daemon
│
└── cli/
    └── index.ts          # CLI implementation
```

## Development Workflow

### Running the CLI

```bash
# Run CLI directly
npx tsx src/cli/index.ts scrape https://example.com

# With verbose output
npx tsx src/cli/index.ts scrape https://example.com -v

# Show browser window
npx tsx src/cli/index.ts scrape https://example.com --show-chrome
```

### Daemon Mode

```bash
# Start daemon with browser pool
npx tsx src/cli/index.ts start --pool-size 5

# Check daemon status
npx tsx src/cli/index.ts status

# Run commands (auto-connects to daemon)
npx tsx src/cli/index.ts scrape https://example.com

# Force standalone mode (bypass daemon)
npx tsx src/cli/index.ts scrape https://example.com --standalone

# Stop daemon
npx tsx src/cli/index.ts stop
```

### Code Quality

Run these commands before submitting a PR:

```bash
# Type checking
npm run typecheck

# Linting
npm run lint

# Auto-fix lint issues
npm run lint:fix

# Format code
npm run format

# Check formatting
npm run format:check

# Build
npm run build
```

### Finding TODOs

Track outstanding work:

```bash
npm run todo
```

## Making Changes

### Branch Naming

- `feature/description` - New features
- `fix/description` - Bug fixes
- `docs/description` - Documentation updates
- `refactor/description` - Code refactoring

### Commit Messages

Write clear, concise commit messages:

```
type: short description

Longer description if needed.
```

Types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`

Examples:

```
feat: add support for custom user agents
fix: resolve timeout issue with Cloudflare challenges
docs: update proxy configuration guide
refactor: simplify browser pool recycling logic
```

### Pull Request Process

1. Create a new branch from `main`
2. Make your changes
3. Run all checks (the same ones CI runs):
   ```bash
   npm run lint
   npm run format:check
   npm run typecheck
   npm test
   npm run build
   ```
4. Push your branch and create a PR
5. Fill out the PR template
6. Wait for review

## Common Tasks

### Adding a New Output Format

1. Create `src/formatters/newformat.ts`:

   ```typescript
   export function formatToNewFormat(
     pages: Page[],
     baseUrl: string,
     scrapedAt: string,
     duration: number,
     metadata?: WebsiteMetadata
   ): string {
     // Implementation
   }
   ```

2. Export from `src/formatters/index.ts`

3. Add to format type in `src/types.ts`

4. Call formatter in `src/scraper.ts`

5. Update CLI validation in `src/cli/index.ts`

### Adding a New ScrapeOption

1. Add to `ScrapeOptions` interface in `src/types.ts`
2. Add default in `DEFAULT_OPTIONS`
3. Use in `Scraper` class via `this.options.newOption`
4. Add CLI flag in `src/cli/index.ts` if applicable
5. Update documentation

### Modifying Cloudflare Detection

1. Detection patterns: `src/cloudflare/detector.ts`
2. Resolution logic: `src/cloudflare/handler.ts`
3. Test with known Cloudflare-protected sites

### Adjusting Browser Pool

1. Default config: `src/browser/types.ts`
2. Pool logic: `src/browser/pool.ts`

## Testing

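Automated tests live in `tests/` and run with `npm test`; CI runs them on every push and pull request to `main`. In addition, when adding new features, verify them manually: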

1. **Test basic functionality:**

   ```bash
   npx tsx src/cli/index.ts scrape https://example.com
   ```

2. **Test Cloudflare-protected sites:**

   ```bash
   npx tsx src/cli/index.ts scrape https://cloudflare-protected-site.com -v
   ```

3. **Test different output formats:**

   ```bash
   npx tsx src/cli/index.ts scrape https://example.com -f markdown,html
   ```

4. **Test crawling:**

   ```bash
   npx tsx src/cli/index.ts crawl https://example.com -d 2 -m 10
   ```

5. **Test batch scraping:**

   ```bash
   npx tsx src/cli/index.ts scrape url1 url2 url3 -c 3 -v
   ```

6. **Test daemon mode:**

   ```bash
   # Start daemon
   npx tsx src/cli/index.ts start --pool-size 3

   # Test scraping via daemon
   npx tsx src/cli/index.ts scrape https://example.com

   # Check status
   npx tsx src/cli/index.ts status

   # Stop daemon
   npx tsx src/cli/index.ts stop
   ```

## Running Examples

The `examples/` folder contains working examples:

```bash
cd examples
npm install

# Basic examples
npx tsx basic/basic-scrape.ts
npx tsx basic/batch-scrape.ts
npx tsx basic/crawl-website.ts

# AI integration examples (requires API keys)
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/openai-summary.ts https://example.com

# Production server
npx tsx production/express-server/src/index.ts
```

## Code Style

- Use TypeScript for all new code
- Follow existing patterns in the codebase
- Use async/await instead of callbacks
- Prefer explicit types over `any`
- Use meaningful variable and function names
- Add JSDoc comments for public APIs

## Documentation

When making changes:

1. Update relevant markdown files in `docs/`
2. Update README.md if adding new features
3. Add JSDoc comments to new public functions
4. If you keep a local `CLAUDE.md` for AI context (it is git-ignored and not part of the repository), update it when the architecture changes

### Documentation Files

| File                      | Purpose                         |
| ------------------------- | ------------------------------- |
| `README.md`               | Main documentation, quick start |
| `CONTRIBUTING.md`         | This file                       |
| `docs/getting-started.md` | Detailed setup guide            |
| `docs/api-reference.md`   | Complete API docs               |
| `docs/architecture.md`    | System design                   |
| `docs/troubleshooting.md` | Common issues                   |
| `docs/guides/`            | Feature guides                  |
| `docs/deployment/`        | Deployment guides               |

## Reporting Issues

When reporting bugs, please include:

- Operating system and version
- Node.js version (`node --version`)
- Reader version
- Steps to reproduce
- Expected vs actual behavior
- Error messages and stack traces
- Verbose output (`-v` flag)

## Code of Conduct

- Be respectful and inclusive
- Focus on constructive feedback
- Help others learn and grow
- Follow project guidelines

## License

By contributing, you agree that your contributions will be licensed under the Apache 2.0 License.

## Disclaimer

By using Reader, you agree to the following:

- You are solely responsible for respecting websites' policies when scraping and crawling
- You will adhere to applicable privacy policies and terms of use before initiating scraping activities
- Reader respects robots.txt directives by default, but ultimate compliance is your responsibility

## Questions?

- Check the [documentation](https://docs.reader.dev)
- Search [GitHub Issues](https://github.com/vakra-dev/reader/issues)
- Ask in [Discord](https://discord.gg/6tjkq7J5WV)
- Open a new issue or discussion

Thank you for contributing!


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to the Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   Copyright (c) 2026 vakra-dev

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: README.md
================================================
<p align="center">
  <img src="docs/assets/logo.png" alt="Reader Logo" width="200" />
</p>

<h1 align="center">Reader</h1>

<p align="center">
  <strong>Open source web infrastructure for AI.</strong>
</p>

<p align="center">
  Access the web without the complexity.
</p>

<p align="center">
  <a href="https://opensource.org/licenses/Apache-2.0"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0"></a>
  <a href="https://www.npmjs.com/package/@vakra-dev/reader"><img src="https://img.shields.io/npm/v/@vakra-dev/reader.svg" alt="npm version"></a>
  <a href="https://github.com/vakra-dev/reader/stargazers"><img src="https://img.shields.io/github/stars/vakra-dev/reader.svg?style=social" alt="GitHub stars"></a>
</p>

<p align="center">
  <a href="https://docs.reader.dev">Docs</a> · <a href="https://docs.reader.dev/home/examples">Examples</a> · <a href="https://discord.gg/6tjkq7J5WV">Discord</a>
</p>

<p align="center">
  <img src="./docs/assets/demo.gif" alt="Reader demo - scrape any URL to clean markdown" width="700" />
</p>

## The Problem

Building agents that need web access is frustrating. You piece together Puppeteer, add stealth plugins, fight Cloudflare, manage proxies, and it still breaks in production.

That's because production-grade web scraping isn't about rendering a page and converting HTML to Markdown. It's about everything underneath:

| Layer                    | What it actually takes                                              |
| ------------------------ | ------------------------------------------------------------------- |
| **Browser architecture** | Managing browser instances at scale, not one-off scripts            |
| **Anti-bot bypass**      | Cloudflare, Turnstile, JS challenges, they all block naive scrapers |
| **TLS fingerprinting**   | Real browsers have fingerprints. Puppeteer doesn't. Sites know.     |
| **Proxy infrastructure** | Datacenter vs residential, rotation strategies, sticky sessions     |
| **Resource management**  | Browser pooling, memory limits, graceful recycling                  |
| **Reliability**          | Rate limiting, retries, timeouts, caching, graceful degradation     |

I built **Reader**, a production-grade web scraping engine on top of [Ulixee Hero](https://ulixee.org/), a headless browser designed for exactly this.

## The Solution

Three primitives. That's it.

```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";

const reader = new ReaderClient();

// 1. Scrape URLs → clean markdown
const result = await reader.scrape({ urls: ["https://example.com"] });
console.log(result.data[0].markdown);

// 2. Crawl a site → discover + scrape pages
const pages = await reader.crawl({
  url: "https://example.com",
  depth: 2,
  scrape: true,
});
console.log(`Found ${pages.urls.length} pages`);

// 3. Browser session → full Playwright/Puppeteer control with stealth
const session = await reader.browser();
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const page = browser.contexts()[0].pages()[0];
await page.goto("https://example.com");
console.log(await page.title());
await session.close();
```

All the hard stuff (browser pooling, anti-bot bypass, proxy rotation, retries) happens under the hood. You get clean markdown. Your agents get the web. And when you need full browser control, `browser()` gives you a stealthed Chrome that Playwright or Puppeteer can drive.

> [!TIP]
> If Reader is useful to you, a [star on GitHub](https://github.com/vakra-dev/reader) helps others discover the project.

## Features

- **Browser Sessions** - Launch stealthed Chrome, connect Playwright/Puppeteer via CDP
- **Anti-Bot Bypass** - TLS fingerprinting, navigator spoofing, WebRTC masking, `webdriver=false`
- **Clean Output** - Markdown and HTML with automatic main content extraction
- **Smart Content Cleaning** - Removes nav, headers, footers, popups, cookie banners
- **CLI & API** - Use from command line or programmatically
- **Browser Pool** - Auto-recycling, health monitoring, tiered proxy pools
- **Concurrent Scraping** - Parallel URL processing with progress tracking
- **Website Crawling** - BFS link discovery with depth/page limits
- **Tiered Proxies** - Datacenter and residential pools with auto-escalation and health tracking

## Installation

```bash
npm install @vakra-dev/reader
```

**Requirements:** Node.js >= 18

> **Apple Silicon (M1/M2/M3):** Hero's bundled Chrome binary isn't available for arm64. Point to your system Chrome:
>
> ```bash
> export CHROME_139_BIN="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
> ```

## Quick Start

### Cloud (Fastest)

Get an API key at [app.reader.dev](https://app.reader.dev) and start scraping immediately:

```typescript
import { ReaderClient } from "@vakra-dev/reader-js";

const reader = new ReaderClient({ apiKey: process.env.READER_API_KEY });

const result = await reader.read({ url: "https://example.com" });
if (result.kind === "scrape") {
  console.log(result.data.markdown);
}
```

```bash
npm install @vakra-dev/reader-js
```

See the [cloud docs](https://docs.reader.dev) for the full API reference.

### Self-Hosted

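Install the Reader engine (`npm install @vakra-dev/reader`, see [Installation](#installation)) and run scraping on your own infrastructure. The examples below all use the self-hosted engine.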

### Basic Scrape

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown", "html"],
});

console.log(result.data[0].markdown);
console.log(result.data[0].html);

await reader.close();
```

### Batch Scraping with Concurrency

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com", "https://example.org", "https://example.net"],
  formats: ["markdown"],
  batchConcurrency: 3,
  onProgress: (progress) => {
    console.log(`${progress.completed}/${progress.total}: ${progress.currentUrl}`);
  },
});

console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);

await reader.close();
```

### Crawling

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.crawl({
  url: "https://example.com",
  depth: 2,
  maxPages: 20,
  scrape: true,
});

console.log(`Discovered ${result.urls.length} URLs`);
console.log(`Scraped ${result.scraped?.batchMetadata.successfulUrls} pages`);

await reader.close();
```

### Browser Session

Launch a stealthed Chrome and control it with Playwright or Puppeteer. The browser has anti-bot stealth active (`webdriver=false`, navigator spoofing, WebRTC masking). Your existing scripts just work.

```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";

const reader = new ReaderClient();

// Create a browser session - returns a CDP WebSocket URL
const session = await reader.browser();

// Connect Playwright (one-line change from a local script)
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const context = await browser.newContext();
const page = await context.newPage();

// Use Playwright normally - full stealth active
await page.goto("https://news.ycombinator.com/");
console.log(await page.title());

await browser.close();
await session.close();
await reader.close();
```

Also works with Puppeteer:

```typescript
import { connect } from "puppeteer-core";

const browser = await connect({ browserWSEndpoint: session.wsEndpoint });
```

### With Proxy

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown"],
  proxy: {
    type: "residential",
    host: "proxy.example.com",
    port: 8080,
    username: "username",
    password: "password",
    country: "us",
  },
});

await reader.close();
```

### With Tiered Proxy Pools

Configure datacenter (fast, cheap) and residential (anti-bot) proxy tiers. Reader automatically escalates from datacenter to residential proxies when a site blocks the request:

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  proxyPools: {
    datacenter: [
      { url: "http://user:pass@dc-proxy1:8080" },
      { url: "http://user:pass@dc-proxy2:8080" },
    ],
    residential: [{ url: "http://user:pass@res-proxy1:8080" }],
  },
});

const result = await reader.scrape({
  urls: ["https://example.com"],
  proxyTier: "auto", // datacenter first, escalate to residential on block
});

await reader.close();
```

Or via environment variables:

```bash
PROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080
PROXY_RESIDENTIAL=http://user:pass@res1:8080
```

### With Browser Pool Configuration

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  browserPool: {
    size: 5, // 5 browser instances
    retireAfterPages: 50, // Recycle after 50 pages
    retireAfterMinutes: 15, // Recycle after 15 minutes
  },
  verbose: true,
});

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
});

await reader.close();
```

## CLI Reference

### Daemon Mode

For multiple requests, start a daemon to keep the browser pool warm:

```bash
# Start daemon with browser pool
npx reader start --direct-pool-size 5

# All subsequent commands auto-connect to daemon
npx reader scrape https://example.com
npx reader crawl https://example.com -d 2

# Check daemon status
npx reader status

# Stop daemon
npx reader stop

# Force standalone mode (bypass daemon)
npx reader scrape https://example.com --standalone
```

### `reader scrape <urls...>`

Scrape one or more URLs.

```bash
# Scrape a single URL
npx reader scrape https://example.com

# Scrape with multiple formats
npx reader scrape https://example.com -f markdown,html

# Scrape multiple URLs concurrently
npx reader scrape https://example.com https://example.org -c 2

# Save to file
npx reader scrape https://example.com -o output.md
```

| Option                   | Type   | Default      | Description                                             |
| ------------------------ | ------ | ------------ | ------------------------------------------------------- |
| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html)         |
| `-o, --output <file>`    | string | stdout       | Output file path                                        |
| `-c, --concurrency <n>`  | number | `1`          | Parallel requests                                       |
| `-t, --timeout <ms>`     | number | `30000`      | Request timeout in milliseconds                         |
| `--batch-timeout <ms>`   | number | `300000`     | Total timeout for entire batch operation                |
| `--proxy <url>`          | string | -            | Proxy URL (e.g., http://user:pass@host:port)            |
| `--user-agent <string>`  | string | -            | Custom user agent string                                |
| `--show-chrome`          | flag   | -            | Show browser window for debugging                       |
| `--no-main-content`      | flag   | -            | Disable main content extraction (include full page)     |
| `--include-tags <sel>`   | string | -            | CSS selectors for elements to include (comma-separated) |
| `--exclude-tags <sel>`   | string | -            | CSS selectors for elements to exclude (comma-separated) |
| `-v, --verbose`          | flag   | -            | Enable verbose logging                                  |

### `reader crawl <url>`

Crawl a website to discover pages.

```bash
# Crawl with default settings
npx reader crawl https://example.com

# Crawl deeper with more pages
npx reader crawl https://example.com -d 3 -m 50

# Crawl and scrape content
npx reader crawl https://example.com -d 2 --scrape

# Filter URLs with patterns
npx reader crawl https://example.com --include "blog/*" --exclude "admin/*"
```

| Option                   | Type   | Default      | Description                                     |
| ------------------------ | ------ | ------------ | ----------------------------------------------- |
| `-d, --depth <n>`        | number | `1`          | Maximum crawl depth                             |
| `-m, --max-pages <n>`    | number | `20`         | Maximum pages to discover                       |
| `-s, --scrape`           | flag   | -            | Also scrape content of discovered pages         |
| `-f, --format <formats>` | string | `"markdown"` | Output formats when scraping (comma-separated)  |
| `-o, --output <file>`    | string | stdout       | Output file path                                |
| `--delay <ms>`           | number | `1000`       | Delay between requests in milliseconds          |
| `-t, --timeout <ms>`     | number | -            | Total timeout for crawl operation               |
| `--include <patterns>`   | string | -            | URL patterns to include (comma-separated regex) |
| `--exclude <patterns>`   | string | -            | URL patterns to exclude (comma-separated regex) |
| `--proxy <url>`          | string | -            | Proxy URL (e.g., http://user:pass@host:port)    |
| `--user-agent <string>`  | string | -            | Custom user agent string                        |
| `--show-chrome`          | flag   | -            | Show browser window for debugging               |
| `-v, --verbose`          | flag   | -            | Enable verbose logging                          |

### `reader browser`

Launch a browser session with a CDP WebSocket endpoint.

```bash
# Create a session (prints wsEndpoint, blocks until Ctrl+C)
npx reader browser create

# Create with options
npx reader browser create --timeout 60000 --show-chrome

# List active sessions (daemon mode)
npx reader browser list

# Stop a session
npx reader browser stop <sessionId>
```

| Option               | Type   | Default  | Description                      |
| -------------------- | ------ | -------- | -------------------------------- |
| `--proxy <url>`      | string | -        | Proxy URL                        |
| `-t, --timeout <ms>` | number | `300000` | Session lifetime in milliseconds |
| `--show-chrome`      | flag   | -        | Show browser window              |
| `--standalone`       | flag   | -        | Force standalone mode            |
| `-v, --verbose`      | flag   | -        | Enable verbose logging           |

## API Reference

### `ReaderClient`

The recommended way to use Reader. Manages HeroCore lifecycle automatically.

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({ verbose: true });

// Scrape
const result = await reader.scrape({ urls: ["https://example.com"] });

// Crawl
const crawlResult = await reader.crawl({ url: "https://example.com", depth: 2 });

// Browser session
const session = await reader.browser();
// → session.wsEndpoint for Playwright/Puppeteer

// Close when done (optional - auto-closes on exit)
await reader.close();
```

#### Constructor Options

| Option          | Type                | Default         | Description                                      |
| --------------- | ------------------- | --------------- | ------------------------------------------------ |
| `verbose`       | `boolean`           | `false`         | Enable verbose logging                           |
| `showChrome`    | `boolean`           | `false`         | Show browser window for debugging                |
| `browserPool`   | `BrowserPoolConfig` | `undefined`     | Browser pool configuration (size, recycling)     |
| `proxyPools`    | `ProxyPoolConfig`   | `undefined`     | Tiered proxy pools (datacenter + residential)    |
| `proxies`       | `ProxyConfig[]`     | `undefined`     | Array of proxies for rotation (legacy)           |
| `proxyRotation` | `string`            | `"round-robin"` | Rotation strategy: `"round-robin"` or `"random"` |

#### BrowserPoolConfig

| Option               | Type     | Default | Description                         |
| -------------------- | -------- | ------- | ----------------------------------- |
| `size`               | `number` | `2`     | Number of browser instances in pool |
| `retireAfterPages`   | `number` | `100`   | Recycle browser after N page loads  |
| `retireAfterMinutes` | `number` | `30`    | Recycle browser after N minutes     |
| `maxQueueSize`       | `number` | `100`   | Max pending requests in queue       |

#### Methods

| Method              | Description                                        |
| ------------------- | -------------------------------------------------- |
| `scrape(options)`   | Scrape one or more URLs                            |
| `crawl(options)`    | Crawl a website to discover pages                  |
| `browser(options?)` | Launch a stealthed browser session (CDP WebSocket) |
| `start()`           | Pre-initialize HeroCore (optional)                 |
| `isReady()`         | Check if client is initialized                     |
| `close()`           | Close client and release resources                 |

### `scrape(options): Promise<ScrapeResult>`

Scrape one or more URLs. Can be used directly or via `ReaderClient`.

| Option             | Type                          | Required | Default        | Description                                                     |
| ------------------ | ----------------------------- | -------- | -------------- | --------------------------------------------------------------- |
| `urls`             | `string[]`                    | Yes      | -              | Array of URLs to scrape                                         |
| `formats`          | `Array<"markdown" \| "html">` | No       | `["markdown"]` | Output formats                                                  |
| `onlyMainContent`  | `boolean`                     | No       | `true`         | Extract only main content (removes nav/header/footer)           |
| `includeTags`      | `string[]`                    | No       | `[]`           | CSS selectors for elements to keep                              |
| `excludeTags`      | `string[]`                    | No       | `[]`           | CSS selectors for elements to remove                            |
| `waitForSelector`  | `string`                      | No       | -              | CSS selector to wait for before the page is considered loaded   |
| `timeoutMs`        | `number`                      | No       | `30000`        | Request timeout in milliseconds                                 |
| `batchConcurrency` | `number`                      | No       | `1`            | Number of URLs to process in parallel                           |
| `batchTimeoutMs`   | `number`                      | No       | `300000`       | Total timeout for entire batch operation                        |
| `proxy`            | `ProxyConfig`                 | No       | -              | Proxy configuration object                                      |
| `proxyTier`        | `ProxyTier`                   | No       | -              | Proxy tier: `"datacenter"`, `"residential"`, `"auto"`           |
| `onProgress`       | `function`                    | No       | -              | Progress callback: `({ completed, total, currentUrl }) => void` |
| `verbose`          | `boolean`                     | No       | `false`        | Enable verbose logging                                          |
| `showChrome`       | `boolean`                     | No       | `false`        | Show Chrome window for debugging                                |

**Returns:** `Promise<ScrapeResult>`

```typescript
interface ScrapeResult {
  data: WebsiteScrapeResult[];
  batchMetadata: BatchMetadata;
}

interface WebsiteScrapeResult {
  markdown?: string;
  html?: string;
  metadata: {
    baseUrl: string;
    finalUrl?: string; // Present if URL redirected
    totalPages: number;
    scrapedAt: string;
    duration: number;
    website: WebsiteMetadata;
  };
}

interface BatchMetadata {
  totalUrls: number;
  successfulUrls: number;
  failedUrls: number;
  scrapedAt: string;
  totalDuration: number;
  errors?: Array<{ url: string; error: string }>;
}
```

### `crawl(options): Promise<CrawlResult>`

Crawl a website to discover pages.

| Option              | Type                          | Required | Default        | Description                                     |
| ------------------- | ----------------------------- | -------- | -------------- | ----------------------------------------------- |
| `url`               | `string`                      | Yes      | -              | Single seed URL to start crawling from          |
| `depth`             | `number`                      | No       | `1`            | Maximum depth to crawl                          |
| `maxPages`          | `number`                      | No       | `20`           | Maximum pages to discover                       |
| `scrape`            | `boolean`                     | No       | `false`        | Also scrape full content of discovered pages    |
| `delayMs`           | `number`                      | No       | `1000`         | Delay between requests in milliseconds          |
| `timeoutMs`         | `number`                      | No       | -              | Total timeout for entire crawl operation        |
| `includePatterns`   | `string[]`                    | No       | -              | URL patterns to include (regex strings)         |
| `excludePatterns`   | `string[]`                    | No       | -              | URL patterns to exclude (regex strings)         |
| `formats`           | `Array<"markdown" \| "html">` | No       | `["markdown"]` | Output formats for scraped content              |
| `scrapeConcurrency` | `number`                      | No       | `2`            | Number of URLs to scrape in parallel            |
| `proxy`             | `ProxyConfig`                 | No       | -              | Proxy configuration object                      |
| `userAgent`         | `string`                      | No       | -              | Custom user agent string                        |
| `verbose`           | `boolean`                     | No       | `false`        | Enable verbose logging                          |
| `showChrome`        | `boolean`                     | No       | `false`        | Show Chrome window for debugging                |
| `connectionToCore`  | `any`                         | No       | -              | Connection to shared Hero Core (for production) |

**Returns:** `Promise<CrawlResult>`

```typescript
interface CrawlResult {
  urls: CrawlUrl[];
  scraped?: ScrapeResult;
  metadata: CrawlMetadata;
}

interface CrawlUrl {
  url: string;
  title: string;
  description: string | null;
}

interface CrawlMetadata {
  totalUrls: number;
  maxDepth: number;
  totalDuration: number;
  seedUrl: string;
}
```

### `browser(options?): Promise<BrowserSession>`

Launch a stealthed Chrome and return a CDP WebSocket URL for Playwright/Puppeteer.

| Option       | Type          | Required | Default  | Description                                           |
| ------------ | ------------- | -------- | -------- | ----------------------------------------------------- |
| `proxy`      | `ProxyConfig` | No       | -        | Proxy configuration                                   |
| `proxyTier`  | `ProxyTier`   | No       | -        | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `showChrome` | `boolean`     | No       | `false`  | Show browser window                                   |
| `timeoutMs`  | `number`      | No       | `300000` | Session lifetime (auto-closes after)                  |
| `verbose`    | `boolean`     | No       | `false`  | Enable verbose logging                                |

**Returns:** `Promise<BrowserSession>`

```typescript
interface BrowserSession {
  sessionId: string; // Unique session identifier
  wsEndpoint: string; // CDP WebSocket URL for Playwright/Puppeteer
  createdAt: string; // ISO timestamp
  close(): Promise<void>; // Close session and release resources
}
```

**Stealth features active on all sessions:**

- `navigator.webdriver = false` (via `--disable-blink-features=AutomationControlled`)
- Proxy routing through authenticated proxy forwarder (if configured)
- Isolated user profile per session (no cookie/state leaks)

### ProxyConfig

| Option     | Type                            | Required | Default | Description                                             |
| ---------- | ------------------------------- | -------- | ------- | ------------------------------------------------------- |
| `url`      | `string`                        | No       | -       | Full proxy URL (takes precedence over other fields)     |
| `type`     | `"datacenter" \| "residential"` | No       | -       | Proxy type                                              |
| `host`     | `string`                        | No       | -       | Proxy host                                              |
| `port`     | `number`                        | No       | -       | Proxy port                                              |
| `username` | `string`                        | No       | -       | Proxy username                                          |
| `password` | `string`                        | No       | -       | Proxy password                                          |
| `country`  | `string`                        | No       | -       | Country code for residential proxies (e.g., 'us', 'uk') |

## Daemon Mode (Production)

For production servers, start Reader once at startup and reuse the same `ReaderClient`, so that all scrape/crawl/browser requests share the warm browser pool:

```typescript
import { ReaderClient } from "@vakra-dev/reader";

// Create once at startup
const reader = new ReaderClient({
  proxyPools: {
    datacenter: [{ url: "http://user:pass@dc-proxy:8080" }],
    residential: [{ url: "http://user:pass@res-proxy:8080" }],
  },
});

// Reuse for all requests
const result = await reader.scrape({ urls: ["https://example.com"] });

// Graceful shutdown
process.on("SIGTERM", () => reader.close());
```

## How It Works

### Anti-Bot Bypass

Reader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced anti-detection:

1. **TLS Fingerprinting** - Emulates real Chrome browser fingerprints via MITM proxy
2. **Navigator Spoofing** - `webdriver=false`, device memory, hardware concurrency
3. **DNS over TLS** - Uses Cloudflare DNS (1.1.1.1) to mimic Chrome behavior
4. **WebRTC IP Masking** - Prevents IP leaks through WebRTC connections
5. **WebGL/Canvas Fingerprinting** - Randomized rendering signatures

### Browser Pool

- **Tiered Proxy Pools** - Separate datacenter and residential pools with auto-escalation
- **Auto-Recycling** - Browsers are recycled after 100 page loads or 30 minutes by default (`retireAfterPages`, `retireAfterMinutes`)
- **Health Tracking** - Auto-benches failed proxies for 5 minutes, revives on recovery
- **Per-Proxy Concurrency** - Limits concurrent requests per proxy URL (default: 2)

### HTML to Markdown: supermarkdown

Reader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines.

**Why we built it:**

When you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown.

**What supermarkdown offers:**

| Feature              | Benefit                                              |
| -------------------- | ---------------------------------------------------- |
| **Written in Rust**  | Native performance with Node.js bindings via napi-rs |
| **Full GFM support** | Tables, task lists, strikethrough, autolinks         |
| **LLM-optimized**    | Clean output designed for AI consumption             |
| **Battle-tested**    | Handles malformed HTML from real web pages           |
| **CSS selectors**    | Include/exclude elements during conversion           |

supermarkdown is open source and available as both a Rust crate and npm package:

```bash
# npm
npm install @vakra-dev/supermarkdown

# Rust
cargo add supermarkdown
```

Check out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation.

## Server Deployment

Reader uses a real Chromium browser under the hood. On headless Linux servers (VPS, EC2, etc.), you need to install Chrome's system dependencies:

```bash
# Debian/Ubuntu
sudo apt-get install -y libnspr4 libnss3 libatk1.0-0 libatk-bridge2.0-0 \
  libcups2 libxcb1 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
  libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 libasound2
```

This is the same requirement that Puppeteer and Playwright have on headless Linux. macOS, Windows, and Linux desktops already have these libraries.

For Docker and production deployment guides, see the [deployment documentation](https://docs.reader.dev/documentation/guides/deployment).

## Documentation

Full documentation is available at **[docs.reader.dev](https://docs.reader.dev)**, including guides for scraping, crawling, proxy configuration, browser pool management, and deployment.

### Examples

| Example                                                                    | Description                                    |
| -------------------------------------------------------------------------- | ---------------------------------------------- |
| [Basic Scraping](examples/basic/basic-scrape.ts)                           | Simple single-URL scraping                     |
| [Batch Scraping](examples/basic/batch-scrape.ts)                           | Concurrent multi-URL scraping                  |
| [Crawl Website](examples/basic/crawl-website.ts)                           | Crawl and discover pages                       |
| [Browser Session (Playwright)](examples/basic/browser-session.ts)          | Navigate, extract data, screenshot             |
| [Browser Session (Actions)](examples/basic/browser-session-actions.ts)     | Click, type, search, wait for elements         |
| [Browser Session (Puppeteer)](examples/basic/browser-session-puppeteer.ts) | Puppeteer via `connect({ browserWSEndpoint })` |
| [Browser Session (Raw CDP)](examples/basic/browser-session-selenium.ts)    | Direct CDP WebSocket commands                  |
| [Browser Pool Config](examples/basic/browser-pool-config.ts)               | Configure browser pool for high throughput     |
| [Proxy Pool](examples/basic/proxy-pool.ts)                                 | Proxy rotation with multiple proxies           |
| [Cloudflare Bypass](examples/basic/cloudflare-bypass.ts)                   | Scrape Cloudflare-protected sites              |
| [All Formats](examples/basic/all-formats.ts)                               | Output in markdown and html                    |
| [AI Tools](examples/ai-tools/)                                             | OpenAI, Anthropic, LangChain integrations      |

## Development

```bash
# Install dependencies
npm install

# Run linting
npm run lint

# Format code
npm run format

# Type check
npm run typecheck

# Find TODOs
npm run todo
```

## Contributing

Contributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

## License

[Apache 2.0](LICENSE) - See LICENSE for details.

## Citation

If you use Reader in your research or project, please cite it:

```bibtex
@software{reader.dev,
  author = {Kaul, Nihal},
  title = {Reader: Open-source, production-grade web scraping engine built for LLMs},
  year = {2026},
  publisher = {GitHub},
  url = {https://github.com/vakra-dev/reader}
}
```

## Support

- [GitHub Issues](https://github.com/vakra-dev/reader/issues)
- [Documentation](https://docs.reader.dev)
- [Discord](https://discord.gg/6tjkq7J5WV)


================================================
FILE: SECURITY.md
================================================
# Security Policy

## Supported Versions

| Version | Supported |
| ------- | --------- |
| Latest  | Yes       |

We only provide security fixes for the latest release.

## Reporting a Vulnerability

If you discover a security vulnerability in Reader, please report it responsibly.

**Do not open a public GitHub issue for security vulnerabilities.**

Instead, email **nihal.codes@gmail.com** with:

- A description of the vulnerability
- Steps to reproduce the issue
- The potential impact
- Any suggested fixes (optional)

## What to Expect

- **Acknowledgment** within 48 hours of your report
- **Status update** within 7 days with an assessment and timeline
- **Credit** in the release notes (unless you prefer to remain anonymous)

## Scope

The following are in scope:

- The `@vakra-dev/reader` npm package
- The Reader CLI tool
- The Reader Cloud API (`cloud.reader.dev`)

The following are out of scope:

- Vulnerabilities in upstream dependencies (report these to the respective projects)
- Issues related to websites blocking scraping (this is expected behavior, not a vulnerability)

## Responsible Use

Reader is a web scraping tool. Users are responsible for complying with applicable laws and website terms of service. The project maintainers are not responsible for how the tool is used.

================================================
FILE: docs/api-reference.md
================================================
# API Reference

Complete API documentation for Reader.

## ReaderClient (Recommended)

The recommended way to use Reader. Manages HeroCore lifecycle automatically, reuses connections efficiently, and auto-closes on process exit.

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({ verbose: true });

// Scrape URLs
const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown"],
});

// Crawl a website
const crawlResult = await reader.crawl({
  url: "https://example.com",
  depth: 2,
});

// Launch a stealthed browser session
const session = await reader.browser();
// → session.wsEndpoint for Playwright/Puppeteer

// Close when done (optional - auto-closes on exit)
await reader.close();
```

### Constructor

```typescript
new ReaderClient(options?: ReaderClientOptions)
```

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `verbose` | `boolean` | `false` | Enable verbose logging |
| `showChrome` | `boolean` | `false` | Show browser window for debugging |
| `browserPool` | `BrowserPoolConfig` | - | Browser pool configuration |
| `proxyPools` | `ProxyPoolConfig` | - | Tiered proxy pools (datacenter + residential) |
| `proxies` | `ProxyConfig[]` | - | List of proxies to rotate through (legacy) |
| `proxyRotation` | `"round-robin" \| "random"` | `"round-robin"` | Proxy rotation strategy |

#### ProxyPoolConfig

```typescript
interface ProxyPoolConfig {
  datacenter?: ProxyConfig[];   // Fast, cheap - works for most sites
  residential?: ProxyConfig[];  // Slower, anti-bot sites (Amazon, LinkedIn)
}
```

#### BrowserPoolConfig

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Retire browser after N page loads |
| `retireAfterMinutes` | `number` | `30` | Retire browser after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests in queue |

### Methods

#### start()

Pre-initialize HeroCore. Called automatically on first scrape/crawl.

```typescript
await reader.start(): Promise<void>
```

#### scrape(options)

Scrape one or more URLs.

```typescript
const result = await reader.scrape(options): Promise<ScrapeResult>
```

See [ScrapeOptions](#scrapeoptions) for available options.

#### crawl(options)

Crawl a website to discover pages.

```typescript
const result = await reader.crawl(options): Promise<CrawlResult>
```

See [CrawlOptions](#crawloptions) for available options.

#### browser(options?)

Launch a stealthed browser session and return a CDP WebSocket URL for Playwright/Puppeteer.

```typescript
const session = await reader.browser(options?): Promise<BrowserSession>
```

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `proxy` | `ProxyConfig` | - | Proxy configuration |
| `proxyTier` | `ProxyTier` | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `showChrome` | `boolean` | `false` | Show browser window |
| `timeoutMs` | `number` | `300000` | Session lifetime in milliseconds; the session auto-closes once it elapses |
| `verbose` | `boolean` | `false` | Enable verbose logging |

Returns:

```typescript
interface BrowserSession {
  sessionId: string;       // Unique session identifier
  wsEndpoint: string;      // CDP WebSocket URL
  createdAt: string;       // ISO timestamp
  close(): Promise<void>;  // Close session and release resources
}
```

See the [Browser Sessions guide](guides/browser-sessions.md) for full examples.

#### isReady()

Check if the client is initialized and ready.

```typescript
reader.isReady(): boolean
```

#### close()

Close the client and release resources.

```typescript
await reader.close(): Promise<void>
```

---

## Direct Functions (Advanced)

For advanced use cases where you need custom HeroCore management, you can use the direct functions. Note that without `connectionToCore`, each call spawns a new HeroCore instance, which is less efficient.

### scrape(options)

Scrape one or more URLs and return content in specified formats.

```typescript
import { scrape } from "@vakra-dev/reader";

const result = await scrape({
  urls: ["https://example.com"],
  formats: ["markdown"],
});
```

#### Parameters

| Name | Type | Required | Default | Description |
|------|------|----------|---------|-------------|
| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
| `formats` | `FormatType[]` | No | `["markdown"]` | Output formats |
| `onlyMainContent` | `boolean` | No | `true` | Extract only main content |
| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |
| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |
| `userAgent` | `string` | No | - | Custom user agent string |
| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
| `batchConcurrency` | `number` | No | `1` | URLs to process in parallel |
| `batchTimeoutMs` | `number` | No | `300000` | Total batch timeout |
| `onProgress` | `ProgressCallback` | No | - | Progress callback function |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `waitForSelector` | `string` | No | - | CSS selector to wait for |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |

#### Returns

`Promise<ScrapeResult>`

```typescript
interface ScrapeResult {
  data: WebsiteScrapeResult[];
  batchMetadata: BatchMetadata;
}
```

#### Example

```typescript
// Using ReaderClient (recommended)
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://example.com", "https://example.org"],
  formats: ["markdown", "html"],
  batchConcurrency: 2,
  onProgress: ({ completed, total, currentUrl }) => {
    console.log(`[${completed}/${total}] ${currentUrl}`);
  },
});

for (const site of result.data) {
  console.log("URL:", site.metadata.baseUrl);
  console.log("Markdown:", site.markdown?.substring(0, 200));
}

await reader.close();
```

---

### crawl(options)

Crawl a website to discover pages, optionally scraping their content.

```typescript
// Using ReaderClient (recommended)
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();
const result = await reader.crawl({
  url: "https://example.com",
  depth: 2,
  maxPages: 20,
  scrape: true,
});
await reader.close();
```

#### Parameters

| Name | Type | Required | Default | Description |
|------|------|----------|---------|-------------|
| `url` | `string` | Yes | - | Seed URL to start crawling |
| `depth` | `number` | No | `1` | Maximum crawl depth |
| `maxPages` | `number` | No | `20` | Maximum pages to discover |
| `scrape` | `boolean` | No | `false` | Also scrape discovered pages |
| `delayMs` | `number` | No | `1000` | Delay between requests |
| `timeoutMs` | `number` | No | - | Total crawl timeout |
| `includePatterns` | `string[]` | No | - | URL patterns to include |
| `excludePatterns` | `string[]` | No | - | URL patterns to exclude |
| `formats` | `FormatType[]` | No | `["markdown", "html"]` | Output formats when scraping |
| `scrapeConcurrency` | `number` | No | `2` | Scraping parallelism |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `userAgent` | `string` | No | - | Custom user agent |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |

#### Returns

`Promise<CrawlResult>`

```typescript
interface CrawlResult {
  urls: CrawlUrl[];
  scraped?: ScrapeResult;
  metadata: CrawlMetadata;
}
```

#### Example

```typescript
const reader = new ReaderClient();
const result = await reader.crawl({
  url: "https://docs.example.com",
  depth: 3,
  maxPages: 50,
  includePatterns: ["docs/*"],
  excludePatterns: ["docs/archive/*"],
  scrape: true,
});

console.log(`Discovered ${result.urls.length} pages`);
result.urls.forEach((page) => {
  console.log(`- ${page.title}: ${page.url}`);
});

if (result.scraped) {
  console.log(`Scraped ${result.scraped.batchMetadata.successfulUrls} pages`);
}

await reader.close();
```

---

## Type Definitions

### ScrapeOptions

```typescript
interface ScrapeOptions {
  urls: string[];
  formats?: Array<"markdown" | "html">;
  onlyMainContent?: boolean;
  includeTags?: string[];
  excludeTags?: string[];
  userAgent?: string;
  timeoutMs?: number;
  batchConcurrency?: number;
  batchTimeoutMs?: number;
  onProgress?: (progress: ProgressInfo) => void;
  proxy?: ProxyConfig;
  proxyTier?: "datacenter" | "residential" | "auto";
  waitForSelector?: string;
  verbose?: boolean;
  showChrome?: boolean;
  connectionToCore?: any;
}
```

### CrawlOptions

```typescript
interface CrawlOptions {
  url: string;
  depth?: number;
  maxPages?: number;
  scrape?: boolean;
  delayMs?: number;
  timeoutMs?: number;
  includePatterns?: string[];
  excludePatterns?: string[];
  formats?: Array<"markdown" | "html">;
  scrapeConcurrency?: number;
  proxy?: ProxyConfig;
  userAgent?: string;
  verbose?: boolean;
  showChrome?: boolean;
  connectionToCore?: any;
}
```

### ProxyConfig

```typescript
interface ProxyConfig {
  url?: string;
  type?: "datacenter" | "residential";
  host?: string;
  port?: number;
  username?: string;
  password?: string;
  country?: string;
}
```

### ScrapeResult

```typescript
interface ScrapeResult {
  data: WebsiteScrapeResult[];
  batchMetadata: BatchMetadata;
}
```

### WebsiteScrapeResult

```typescript
interface WebsiteScrapeResult {
  markdown?: string;
  html?: string;
  metadata: {
    baseUrl: string;
    finalUrl?: string;  // Present if URL redirected
    totalPages: number;
    scrapedAt: string;
    duration: number;
    website: WebsiteMetadata;
    proxy?: ProxyMetadata;  // Included when proxy pooling is used
  };
}
```

### ProxyMetadata

```typescript
interface ProxyMetadata {
  host: string;
  port: number;
  country?: string;  // If geo-targeting was used
}
```

### BatchMetadata

```typescript
interface BatchMetadata {
  totalUrls: number;
  successfulUrls: number;
  failedUrls: number;
  scrapedAt: string;
  totalDuration: number;
  errors?: Array<{ url: string; error: string }>;
}
```

### CrawlResult

```typescript
interface CrawlResult {
  urls: CrawlUrl[];
  scraped?: ScrapeResult;
  metadata: CrawlMetadata;
}
```

### CrawlUrl

```typescript
interface CrawlUrl {
  url: string;
  title: string;
  description: string | null;
}
```

### CrawlMetadata

```typescript
interface CrawlMetadata {
  totalUrls: number;
  maxDepth: number;
  totalDuration: number;
  seedUrl: string;
}
```

### WebsiteMetadata

```typescript
interface WebsiteMetadata {
  title: string | null;
  description: string | null;
  author: string | null;
  language: string | null;
  charset: string | null;
  favicon: string | null;
  image: string | null;
  canonical: string | null;
  keywords: string[] | null;
  robots: string | null;
  themeColor: string | null;
  openGraph: {
    title: string | null;
    description: string | null;
    type: string | null;
    url: string | null;
    image: string | null;
    siteName: string | null;
    locale: string | null;
  } | null;
  twitter: {
    card: string | null;
    site: string | null;
    creator: string | null;
    title: string | null;
    description: string | null;
    image: string | null;
  } | null;
}
```

### ProgressInfo

```typescript
interface ProgressInfo {
  completed: number;
  total: number;
  currentUrl: string;
}
```

---

## Classes

### BrowserPool

Manages a pool of Hero browser instances for efficient scraping.

```typescript
import { BrowserPool } from "@vakra-dev/reader";

const pool = new BrowserPool({ size: 5 });
await pool.initialize();

const result = await pool.withBrowser(async (hero) => {
  await hero.goto("https://example.com");
  return await hero.document.title;
});

await pool.shutdown();
```

#### Constructor

```typescript
new BrowserPool(config?: PoolConfig)
```

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Recycle after N pages |
| `retireAfterMinutes` | `number` | `30` | Recycle after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests |
| `healthCheckIntervalMs` | `number` | `300000` | Health check interval |

#### Methods

##### initialize()

Initialize the browser pool.

```typescript
await pool.initialize(): Promise<void>
```

##### withBrowser(fn)

Execute a function with an acquired browser, automatically releasing it after.

```typescript
await pool.withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T>
```

##### acquire()

Manually acquire a browser instance. Must be paired with `release()`.

```typescript
const hero = await pool.acquire(): Promise<Hero>
```

##### release(hero)

Release a browser instance back to the pool.

```typescript
await pool.release(hero: Hero): Promise<void>
```

##### healthCheck()

Check the health of all pool instances.

```typescript
const health = await pool.healthCheck(): Promise<HealthCheckResult>
```

##### getStats()

Get current pool statistics.

```typescript
const stats = pool.getStats(): PoolStats
```

##### shutdown()

Shutdown all browser instances.

```typescript
await pool.shutdown(): Promise<void>
```

---

## Formatter Functions

### formatToMarkdown(pages, baseUrl, scrapedAt, duration, metadata?)

Convert scraped pages to Markdown format.

```typescript
import { formatToMarkdown } from "@vakra-dev/reader";

const markdown = formatToMarkdown(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```

---

### formatToHTML(pages, baseUrl, scrapedAt, duration, metadata?)

Convert scraped pages to a complete HTML document.

```typescript
import { formatToHTML } from "@vakra-dev/reader";

const html = formatToHTML(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```


---

## Utility Functions

### cleanContent(html)

Remove navigation, ads, scripts, and other non-content elements from HTML.

```typescript
import { cleanContent } from "@vakra-dev/reader";

const cleanHtml = cleanContent(rawHtml);
```

---

### extractMetadata(html)

Extract metadata from HTML including Open Graph and Twitter cards.

```typescript
import { extractMetadata } from "@vakra-dev/reader";

const metadata = extractMetadata(html);
console.log(metadata.title);
console.log(metadata.openGraph?.image);
```

---

## Default Values

```typescript
const DEFAULT_OPTIONS = {
  formats: ["markdown"],
  onlyMainContent: true,
  timeoutMs: 30000,
  batchConcurrency: 1,
  batchTimeoutMs: 300000,
  verbose: false,
  showChrome: false,
};

const DEFAULT_CRAWL_OPTIONS = {
  depth: 1,
  maxPages: 20,
  scrape: false,
  delayMs: 1000,
  formats: ["markdown", "html"],
  scrapeConcurrency: 2,
  verbose: false,
  showChrome: false,
};

const DEFAULT_POOL_CONFIG = {
  size: 2,
  retireAfterPages: 100,
  retireAfterMinutes: 30,
  maxQueueSize: 100,
  healthCheckIntervalMs: 300000,
};
```

---

## See Also

- [Getting Started](getting-started.md) - Quick start guide
- [Architecture](architecture.md) - System design
- [Browser Pool Guide](guides/browser-pool.md) - Pool management
- [Cloudflare Bypass Guide](guides/cloudflare-bypass.md) - Challenge handling


================================================
FILE: docs/architecture.md
================================================
# Architecture

This document describes the internal architecture of Reader, helping contributors understand how the system works.

## High-Level Overview

```
┌─────────────────────────────────────────────────────────────────┐
│                        Public API                                │
│              scrape() / crawl() / browser()                      │
└──────────┬─────────────────┬────────────────┬───────────────────┘
           │                 │                │
     ┌─────▼─────┐    ┌─────▼─────┐    ┌─────▼──────────┐
     │  Scraper  │    │  Crawler  │    │ BrowserSession │
     │  Class    │    │  Class    │    │ (CDP WebSocket)│
     └─────┬─────┘    └─────┬─────┘    └─────┬──────────┘
           │                │                │
           └────────┬───────┘                │ own HeroCore
                    │                        │
          ┌─────────▼─────────┐    ┌─────────▼─────────┐
          │ TieredBrowserPool │    │  Dedicated Chrome  │
          │ (shared, pooled)  │    │  (per-session)     │
          └─────────┬─────────┘    └───────────────────┘
                    │
    ┌───────────────┼─────────────────┐
    │               │                 │
┌───▼──────────┐ ┌──▼────────────┐ ┌──▼──────────────┐
│ Hero Config  │ │ Orchestrator  │ │ Formatters      │
│ (TLS, DNS,   │ │ Detection     │ │ (MD, HTML, etc) │
│  etc.)       │ │               │ │                 │
└──────────────┘ └───────────────┘ └─────────────────┘
```

## Directory Structure

```
src/
├── index.ts              # Public API exports
├── scraper.ts            # Scraper class - main scraping logic
├── crawler.ts            # Crawler class - link discovery + scraping
├── types.ts              # ScrapeOptions, ScrapeResult, etc.
├── crawl-types.ts        # CrawlOptions, CrawlResult, etc.
│
├── browser/
│   ├── pool.ts           # BrowserPool - manages Hero instances
│   ├── hero-config.ts    # Hero configuration (TLS, DNS, viewport)
│   └── types.ts          # IBrowserPool, PoolConfig, PoolStats
│
├── cloudflare/
│   ├── detector.ts       # detectChallenge() - DOM/text matching
│   ├── handler.ts        # waitForChallengeResolution() - polling
│   └── types.ts          # ChallengeDetection, ResolutionResult
│
├── formatters/
│   ├── markdown.ts       # formatToMarkdown() - uses supermarkdown
│   ├── html.ts           # formatToHTML() - full HTML document
│   ├── postprocess.ts    # Post-processing utilities
│   └── index.ts          # Re-exports all formatters
│
├── utils/
│   ├── content-cleaner.ts    # cleanContent() - removes nav, ads
│   ├── metadata-extractor.ts # extractMetadata() - OG tags, etc.
│   ├── url-helpers.ts        # URL validation, normalization
│   ├── rate-limiter.ts       # Simple delay-based rate limiting
│   └── logger.ts             # Pino logger with pretty print
│
├── proxy/
│   └── config.ts         # createProxyUrl(), parseProxyUrl()
│
└── cli/
    └── index.ts          # CLI using Commander.js
```

## Core Components

### Scraper

The `Scraper` class (`src/scraper.ts`) handles URL scraping:

```typescript
class Scraper {
  constructor(options: ScrapeOptions) { ... }

  async scrape(): Promise<ScrapeResult> {
    // 1. Initialize browser pool
    // 2. Process URLs with concurrency control (p-limit)
    // 3. For each URL: fetch, detect challenges, extract content
    // 4. Format to requested output formats
    // 5. Aggregate results and metadata
  }

  private async scrapeSingleUrl(url: string): Promise<WebsiteScrapeResult> {
    // 1. Acquire browser from pool
    // 2. Navigate to URL
    // 3. Detect Cloudflare challenge
    // 4. Wait for resolution if needed
    // 5. Extract HTML and metadata
    // 6. Clean content
    // 7. Format to outputs
    // 8. Release browser to pool
  }
}
```

**Key design decisions:**

- Uses `p-limit` for concurrency control
- Each URL gets its own browser instance from the pool
- Cloudflare detection runs before content extraction
- All formatters run in parallel for each URL

### Crawler

The `Crawler` class (`src/crawler.ts`) discovers links:

```typescript
class Crawler {
  async crawl(): Promise<CrawlResult> {
    // BFS (Breadth-First Search) algorithm
    // 1. Start with seed URL at depth 0
    // 2. Fetch page, extract links
    // 3. Filter links (same domain, patterns)
    // 4. Add to queue with depth + 1
    // 5. Repeat until maxPages or maxDepth
    // 6. Optionally scrape discovered URLs
  }
}
```

**Key design decisions:**

- BFS ensures shallow pages are discovered first
- Respects `maxPages` and `depth` limits
- Optional scraping reuses the Scraper class
- Delay between requests for rate limiting

### Browser Pool

The `BrowserPool` class (`src/browser/pool.ts`) manages Hero instances:

```typescript
class BrowserPool {
  private instances: HeroInstance[];
  private available: HeroInstance[];
  private queue: PendingRequest[];

  async initialize(): Promise<void> { ... }
  async acquire(): Promise<Hero> { ... }
  async release(hero: Hero): Promise<void> { ... }

  async withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T> {
    const hero = await this.acquire();
    try {
      return await fn(hero);
    } finally {
      await this.release(hero);
    }
  }
}
```

**Pool lifecycle:**

1. **Initialize** - Create `size` Hero instances
2. **Acquire** - Get available instance or queue the request
3. **Use** - Execute scraping logic
4. **Release** - Return to pool or recycle if stale
5. **Recycle** - Close old instance, create new one
6. **Shutdown** - Close all instances

**Recycling triggers:**

- After N pages (default: 100)
- After N minutes (default: 30)
- On health check failure

### Cloudflare Detection

Detection happens in two phases:

**1. Challenge Detection** (`src/cloudflare/detector.ts`):

```typescript
async function detectChallenge(hero: Hero): Promise<ChallengeDetection> {
  // Check DOM for challenge elements
  const signals = [];

  // CSS selectors that indicate challenges
  if (await hero.document.querySelector("#challenge-form")) {
    signals.push({ type: "dom", selector: "#challenge-form" });
  }

  // Text patterns that indicate challenges
  const bodyText = await hero.document.body.textContent;
  if (bodyText.includes("checking your browser")) {
    signals.push({ type: "text", pattern: "checking your browser" });
  }

  return {
    isChallenge: signals.length > 0,
    type: determineType(signals),
    signals,
  };
}
```

**2. Challenge Resolution** (`src/cloudflare/handler.ts`):

```typescript
async function waitForChallengeResolution(
  hero: Hero,
  options: ResolutionOptions
): Promise<ResolutionResult> {
  const startTime = Date.now();

  while (Date.now() - startTime < options.maxWaitMs) {
    // Check if URL changed (redirect after challenge)
    if ((await hero.url) !== options.initialUrl) {
      return { resolved: true, method: "redirect" };
    }

    // Check if challenge elements disappeared
    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      return { resolved: true, method: "element_removal" };
    }

    await sleep(options.pollIntervalMs);
  }

  return { resolved: false };
}
```

### Formatters

Each formatter transforms scraped pages into a specific format:

| Formatter | Input | Output |
|-----------|-------|--------|
| `formatToMarkdown` | Pages, metadata | Markdown document with frontmatter |
| `formatToHTML` | Pages, metadata | Complete HTML document with CSS |

**Markdown formatter** uses [supermarkdown](https://github.com/vakra-dev/supermarkdown) - a high-performance Rust-based HTML-to-Markdown converter with full GFM support.

## Data Flow

### Scrape Request Flow

```
scrape({ urls: ["https://example.com"], formats: ["markdown"] })
  │
  ├─► Scraper.scrape()
  │     │
  │     ├─► BrowserPool.initialize(size=concurrency)
  │     │
  │     ├─► For each URL (controlled by p-limit):
  │     │     │
  │     │     ├─► pool.withBrowser(async hero => {
  │     │     │     │
  │     │     │     ├─► hero.goto(url)
  │     │     │     │
  │     │     │     ├─► detectChallenge(hero)
  │     │     │     │     └─► Returns { isChallenge, type, signals }
  │     │     │     │
  │     │     │     ├─► if (isChallenge):
  │     │     │     │     └─► waitForChallengeResolution(hero)
  │     │     │     │
  │     │     │     ├─► Extract title, HTML
  │     │     │     │
  │     │     │     ├─► cleanContent(html)
  │     │     │     │     └─► Remove nav, ads, scripts
  │     │     │     │
  │     │     │     ├─► extractMetadata(html)
  │     │     │     │     └─► OG tags, Twitter cards, etc.
  │     │     │     │
  │     │     │     └─► Format to requested formats
  │     │     │   })
  │     │     │
  │     │     └─► Add to results array
  │     │
  │     ├─► pool.shutdown()
  │     │
  │     └─► Return ScrapeResult { data[], batchMetadata }
  │
  └─► Result returned to caller
```

### Crawl Request Flow

```
crawl({ url: "https://example.com", depth: 2, scrape: true })
  │
  ├─► Crawler.crawl()
  │     │
  │     ├─► Initialize queue with seed URL at depth 0
  │     │
  │     ├─► BFS loop (while queue not empty && pages < maxPages):
  │     │     │
  │     │     ├─► Dequeue next URL
  │     │     │
  │     │     ├─► Fetch page with Hero
  │     │     │
  │     │     ├─► Extract links via regex
  │     │     │
  │     │     ├─► Filter links:
  │     │     │     ├─► Same domain only
  │     │     │     ├─► Match includePatterns
  │     │     │     └─► Exclude excludePatterns
  │     │     │
  │     │     ├─► Add new links to queue with depth + 1
  │     │     │
  │     │     ├─► Rate limit (delay between requests)
  │     │     │
  │     │     └─► Add to discovered URLs
  │     │
  │     ├─► If scrape=true:
  │     │     └─► scrape({ urls: discoveredUrls })
  │     │
  │     └─► Return CrawlResult { urls[], scraped?, metadata }
  │
  └─► Result returned to caller
```

## Design Decisions

### Why Hero?

[Ulixee Hero](https://ulixee.org/) was chosen for:

1. **Stealth** - Advanced TLS fingerprinting and anti-detection
2. **Speed** - Optimized for headless automation
3. **API** - Clean async/await interface
4. **Stability** - Production-tested at scale

### Pool vs Per-Request Browsers

We use a pool because:

- Browser startup is slow (~2-3 seconds)
- Memory overhead per browser is high
- Connection reuse improves performance

Trade-off: Stale browsers can accumulate state, so we recycle them periodically.

### Cloudflare Detection Strategy

Multi-signal approach because:

- No single indicator is 100% reliable
- Cloudflare changes its challenge pages over time
- Different challenge types have different signatures

Detection signals include:
- DOM elements (`#challenge-form`, `.cf-browser-verification`)
- Text patterns ("checking your browser", "ray id")
- URL patterns (`/cdn-cgi/challenge-platform/`)
- HTTP status codes

### Content Cleaning

We clean HTML before formatting because:

- Navigation, ads, scripts bloat output
- LLMs perform better with focused content
- Reduces token usage

Cleaning removes:
- `<script>`, `<style>` tags
- Navigation elements
- Footer/sidebar content
- Ad containers
- Hidden elements

## Extension Points

### Adding a New Formatter

1. Create `src/formatters/newformat.ts`:
   ```typescript
   export function formatToNewFormat(
     pages: Page[],
     baseUrl: string,
     scrapedAt: string,
     duration: number,
     metadata?: WebsiteMetadata
   ): string {
     // Your formatting logic
   }
   ```

2. Export from `src/formatters/index.ts`

3. Add to format type in `src/types.ts`:
   ```typescript
   formats?: Array<"markdown" | "html" | "newformat">
   ```

4. Call formatter in `src/scraper.ts`

### Adding a New ScrapeOption

1. Add to `ScrapeOptions` in `src/types.ts`
2. Add default in `DEFAULT_OPTIONS`
3. Use in `Scraper` class via `this.options.newOption`
4. Add CLI flag in `src/cli/index.ts` if needed

### Modifying Cloudflare Detection

- Detection patterns: `src/cloudflare/detector.ts`
- Resolution logic: `src/cloudflare/handler.ts`

## Testing

```bash
cd reader && npx vitest run
```

415 unit tests across 26 test files covering scraping, crawling, browser sessions, formatters, content cleaning, proxy pools, and error handling.

## Related Guides

- [Browser Pool](guides/browser-pool.md) - Deep dive into pool management
- [Cloudflare Bypass](guides/cloudflare-bypass.md) - Understanding anti-bot bypass
- [Production Server](deployment/production-server.md) - Shared Hero Core pattern


================================================
FILE: docs/assets/.gitkeep
================================================


================================================
FILE: docs/assets/demo.tape
================================================
# VHS tape file for Reader demo GIF
# Run: vhs docs/assets/demo.tape

Output docs/assets/demo.gif

Set FontSize 16
Set Width 900
Set Height 500
Set Theme "Catppuccin Mocha"
Set Padding 20

# Scrape a URL and extract the markdown
Type "npx reader scrape https://reader.dev | jq -r '.data[0].markdown' | head -n 12"
Sleep 500ms
Enter
Sleep 3s

# Let output display
Sleep 3s


================================================
FILE: docs/deployment/docker.md
================================================
# Docker Deployment Guide

Deploy Reader in Docker containers.

## Quick Start

### Basic Dockerfile

```dockerfile
# Dockerfile
FROM node:22-slim

# Install Chrome dependencies
RUN apt-get update && apt-get install -y \
    chromium \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    xdg-utils \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

# Set Chrome path for Hero
ENV CHROME_PATH=/usr/bin/chromium

WORKDIR /app

# Copy package files
COPY package*.json ./

# Install dependencies
RUN npm ci --only=production

# Copy application
COPY . .

# Build if TypeScript
RUN npm run build 2>/dev/null || true

EXPOSE 3000

CMD ["node", "dist/server.js"]
```

### Build and Run

```bash
# Build image
docker build -t reader .

# Run container
docker run -p 3000:3000 reader
```

## Docker Compose

### Basic Setup

```yaml
# docker-compose.yml
version: "3.8"

services:
  reader:
    build: .
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      - LOG_LEVEL=info
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 2G
```

### With Redis (for job queues)

```yaml
# docker-compose.yml
version: "3.8"

services:
  api:
    build:
      context: .
      dockerfile: Dockerfile.api
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    depends_on:
      - redis
    restart: unless-stopped

  worker:
    build:
      context: .
      dockerfile: Dockerfile.worker
    environment:
      - NODE_ENV=production
      - REDIS_HOST=redis
      - REDIS_PORT=6379
    depends_on:
      - redis
    deploy:
      replicas: 3
      resources:
        limits:
          memory: 2G
    restart: unless-stopped

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    restart: unless-stopped

volumes:
  redis-data:
```

### Start Services

```bash
# Start all services
docker-compose up -d

# Scale workers
docker-compose up -d --scale worker=5

# View logs
docker-compose logs -f worker

# Stop services
docker-compose down
```

## Optimized Dockerfile

### Multi-stage Build

```dockerfile
# Dockerfile
# Build stage
FROM node:22-slim AS builder

WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Production stage
FROM node:22-slim

# Install Chrome dependencies
RUN apt-get update && apt-get install -y \
    chromium \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    xdg-utils \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

ENV CHROME_PATH=/usr/bin/chromium
ENV NODE_ENV=production

WORKDIR /app

# Copy only production dependencies
COPY package*.json ./
RUN npm ci --only=production

# Copy built application
COPY --from=builder /app/dist ./dist

# Non-root user for security
RUN groupadd -r app && useradd -r -g app app
USER app

EXPOSE 3000

CMD ["node", "dist/server.js"]
```

## Configuration

### Environment Variables

```yaml
# docker-compose.yml
services:
  reader:
    environment:
      - NODE_ENV=production
      - PORT=3000
      - LOG_LEVEL=info
      - CHROME_PATH=/usr/bin/chromium
      - MAX_CONCURRENT_REQUESTS=10
      - REQUEST_TIMEOUT_MS=60000
```

### Resource Limits

```yaml
services:
  reader:
    deploy:
      resources:
        limits:
          cpus: "2"
          memory: 4G
        reservations:
          cpus: "1"
          memory: 2G
```

### Health Checks

```yaml
services:
  reader:
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
```

## Chrome Configuration

### Sandbox Mode

Chrome's sandbox usually cannot start inside a default (unprivileged) Docker container, so Chrome requires special configuration in Docker:

```dockerfile
# Add to Dockerfile
ENV CHROME_FLAGS="--no-sandbox --disable-setuid-sandbox"
```

Or configure in Hero:

```typescript
// In your application
const pool = new BrowserPool({
  heroOptions: {
    noChromeSandbox: true,
  },
});
```

### Shared Memory

Chrome needs sufficient shared memory:

```yaml
services:
  reader:
    shm_size: "2gb"
```

Or share the host's `/dev/shm` with the container:

```yaml
services:
  reader:
    volumes:
      - /dev/shm:/dev/shm
```

## Production Considerations

### Logging

```yaml
services:
  reader:
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
```

### Networking

```yaml
services:
  reader:
    networks:
      - internal
      - external

networks:
  internal:
    internal: true
  external:
```

### Secrets

```yaml
services:
  reader:
    secrets:
      - proxy_credentials

secrets:
  proxy_credentials:
    file: ./secrets/proxy.txt
```

### Volumes for Data

```yaml
services:
  reader:
    volumes:
      - ./data:/app/data
      - ./logs:/app/logs
```

## Scaling

### Docker Swarm

```yaml
# docker-stack.yml
version: "3.8"

services:
  reader:
    image: reader:latest
    deploy:
      replicas: 5
      update_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
    networks:
      - traefik

networks:
  traefik:
    external: true
```

Deploy:

```bash
docker stack deploy -c docker-stack.yml reader
```

### Kubernetes

```yaml
# deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: reader
spec:
  replicas: 3
  selector:
    matchLabels:
      app: reader
  template:
    metadata:
      labels:
        app: reader
    spec:
      containers:
        - name: reader
          image: reader:latest
          ports:
            - containerPort: 3000
          resources:
            limits:
              memory: "2Gi"
              cpu: "1"
          env:
            - name: NODE_ENV
              value: "production"
---
apiVersion: v1
kind: Service
metadata:
  name: reader
spec:
  selector:
    app: reader
  ports:
    - port: 80
      targetPort: 3000
```

## Troubleshooting

### Chrome Won't Start

```bash
# Check Chrome installation
docker exec -it container_name chromium --version

# Test Chrome manually
docker exec -it container_name chromium --headless --no-sandbox --dump-dom https://example.com
```

### Memory Issues

```yaml
# Increase limits
services:
  reader:
    deploy:
      resources:
        limits:
          memory: 4G
    shm_size: "2gb"
```

### Network Issues

```bash
# Debug networking
docker exec -it container_name curl https://example.com

# Check DNS
docker exec -it container_name nslookup example.com
```

## Complete Example

See [examples/deployment/docker/](../../examples/deployment/docker/) for a complete Docker setup.

## Related Guides

- [Production Server](production-server.md) - Server setup
- [Job Queues](job-queues.md) - Async processing


================================================
FILE: docs/deployment/job-queues.md
================================================
# Job Queues Guide

Use job queues for async scraping at scale with BullMQ.

## Overview

For high-volume scraping, use a job queue to:
- Process requests asynchronously
- Handle retries automatically
- Scale workers independently
- Monitor job progress
- Avoid overwhelming target sites

## Architecture

```
┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   API       │────▶│   Redis     │────▶│   Workers   │
│   Server    │     │   Queue     │     │   (N)       │
└─────────────┘     └─────────────┘     └─────────────┘
       │                                       │
       │         ┌─────────────┐              │
       └────────▶│   Results   │◀─────────────┘
                 │   Store     │
                 └─────────────┘
```

## Setup

### Installation

```bash
npm install bullmq ioredis @vakra-dev/reader
```

### Basic Queue Setup

```typescript
// queue.ts
import { Queue } from "bullmq";

// Redis connection settings, overridable through REDIS_HOST / REDIS_PORT
const connection = {
  host: process.env.REDIS_HOST || "localhost",
  port: parseInt(process.env.REDIS_PORT || "6379", 10),
};

// Create queue
export const scrapeQueue = new Queue("scrape", { connection });

// Job data interface
interface ScrapeJobData {
  urls: string[];
  formats: ("markdown" | "html")[];
  callbackUrl?: string;
}

/**
 * Add a scrape job to the queue.
 *
 * Each job gets up to 3 attempts with exponential backoff starting at 5 seconds.
 *
 * @param data URLs to scrape, output formats, and an optional callback URL
 * @returns the id of the queued job
 */
export async function enqueueScrape(data: ScrapeJobData) {
  const job = await scrapeQueue.add("scrape", data, {
    attempts: 3,
    backoff: {
      type: "exponential",
      delay: 5000,
    },
  });

  return job.id;
}
```

### Worker Process

```typescript
// worker.ts
import { Worker, Job } from "bullmq";
import HeroCore from "@ulixee/hero-core";
import { TransportBridge } from "@ulixee/net";
import { ConnectionToHeroCore } from "@ulixee/hero";
import { scrape } from "@vakra-dev/reader";

const connection = {
  host: process.env.REDIS_HOST || "localhost",
  port: parseInt(process.env.REDIS_PORT || "6379", 10),
};

// Shared Hero Core, assigned in start() before the worker begins taking jobs
let heroCore: HeroCore;

// Open a new in-process connection to the shared Hero Core
async function createConnection() {
  const bridge = new TransportBridge();
  heroCore.addConnection(bridge.transportToClient);
  return new ConnectionToHeroCore(bridge.transportToCore);
}

// Process jobs
const worker = new Worker(
  "scrape",
  async (job: Job) => {
    const { urls, formats } = job.data;

    console.log(`Processing job ${job.id}: ${urls.length} URLs`);

    const result = await scrape({
      urls,
      formats,
      connectionToCore: await createConnection(),
      onProgress: async ({ completed, total }) => {
        await job.updateProgress((completed / total) * 100);
      },
    });

    // Callback if provided
    if (job.data.callbackUrl) {
      await fetch(job.data.callbackUrl, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(result),
      });
    }

    return result;
  },
  {
    connection,
    concurrency: 5,
    // Do not start pulling jobs at construction time. start() calls
    // worker.run() once Hero Core is ready, so createConnection() never
    // runs against an uninitialized heroCore.
    autorun: false,
  }
);

// Event handlers
worker.on("completed", (job) => {
  console.log(`Job ${job.id} completed`);
});

worker.on("failed", (job, err) => {
  console.error(`Job ${job?.id} failed:`, err.message);
});

// Start worker
async function start() {
  heroCore = new HeroCore();
  await heroCore.start();

  // run() only settles when the worker stops, so it is deliberately not awaited
  worker.run().catch((err) => {
    console.error("Worker stopped unexpectedly:", err);
  });

  console.log("Worker started, waiting for jobs...");
}

// Graceful shutdown: stop the worker first, then close Hero Core
async function shutdown() {
  console.log("Shutting down worker...");
  await worker.close();
  if (heroCore) await heroCore.close();
  process.exit(0);
}

process.on("SIGTERM", shutdown);
process.on("SIGINT", shutdown);

start().catch(console.error);
```

### API Server

```typescript
// api.ts
import express from "express";
import { scrapeQueue, enqueueScrape } from "./queue";

const app = express();
app.use(express.json());

// Enqueue scrape job
app.post("/scrape", async (req, res) => {
  const { urls, formats, callbackUrl } = req.body;

  const jobId = await enqueueScrape({ urls, formats, callbackUrl });

  res.json({ jobId, status: "queued" });
});

// Get job status
app.get("/job/:id", async (req, res) => {
  const job = await scrapeQueue.getJob(req.params.id);

  if (!job) {
    return res.status(404).json({ error: "Job not found" });
  }

  const state = await job.getState();
  const progress = job.progress;

  res.json({
    id: job.id,
    state,
    progress,
    data: job.data,
    result: job.returnvalue,
    failedReason: job.failedReason,
  });
});

// Get job result
app.get("/job/:id/result", async (req, res) => {
  const job = await scrapeQueue.getJob(req.params.id);

  if (!job) {
    return res.status(404).json({ error: "Job not found" });
  }

  const state = await job.getState();

  if (state !== "completed") {
    return res.status(202).json({ status: state, progress: job.progress });
  }

  res.json(job.returnvalue);
});

app.listen(3000, () => {
  console.log("API server running on port 3000");
});
```

## Job Options

### Retry Configuration

```typescript
// attempts counts the first try, so 5 attempts means at most 4 retries
await scrapeQueue.add("scrape", data, {
  attempts: 5,
  backoff: {
    type: "exponential",
    delay: 5000,  // Retries wait 5s, 10s, 20s, 40s
  },
});
```

### Priority

```typescript
// High priority (lower number = higher priority)
await scrapeQueue.add("scrape", urgentData, { priority: 1 });

// Normal priority
await scrapeQueue.add("scrape", normalData, { priority: 5 });

// Low priority
await scrapeQueue.add("scrape", bulkData, { priority: 10 });
```

### Delayed Jobs

```typescript
// Process after 5 minutes
await scrapeQueue.add("scrape", data, {
  delay: 5 * 60 * 1000,
});
```

### Rate Limiting

```typescript
// Max 10 jobs per minute
const worker = new Worker("scrape", processor, {
  connection,
  limiter: {
    max: 10,          // Jobs allowed per window
    duration: 60000,  // Window length in milliseconds
  },
});
```

## Scaling Workers

### Multiple Workers

Run multiple worker processes:

```bash
# Terminal 1
WORKER_ID=1 npx tsx worker.ts

# Terminal 2
WORKER_ID=2 npx tsx worker.ts

# Terminal 3
WORKER_ID=3 npx tsx worker.ts
```

### Worker Concurrency

```typescript
const worker = new Worker("scrape", processor, {
  connection,
  concurrency: 5,  // Process 5 jobs simultaneously
});
```

### Auto-Scaling

```typescript
// Scale based on queue depth
async function checkScale() {
  // The two counts are independent, so fetch them in parallel
  const [waiting, active] = await Promise.all([
    scrapeQueue.getWaitingCount(),
    scrapeQueue.getActiveCount(),
  ]);

  console.log(`Queue: ${waiting} waiting, ${active} active`);

  if (waiting > 100) {
    // Signal to scale up
    await notifyScaleUp();
  }
}

// Catch failures here: a rejected promise from a timer callback would
// otherwise surface as an unhandled rejection
setInterval(() => {
  checkScale().catch((err) => console.error("Scale check failed:", err));
}, 30000);
```

## Monitoring

### Queue Dashboard (Bull Board)

```typescript
import { createBullBoard } from "@bull-board/api";
import { BullMQAdapter } from "@bull-board/api/bullMQAdapter";
import { ExpressAdapter } from "@bull-board/express";

const serverAdapter = new ExpressAdapter();
serverAdapter.setBasePath("/admin/queues");

createBullBoard({
  queues: [new BullMQAdapter(scrapeQueue)],
  serverAdapter,
});

app.use("/admin/queues", serverAdapter.getRouter());
```

### Metrics

```typescript
// Queue stats: job counts per state, fetched in parallel
async function getQueueStats() {
  const [waiting, active, completed, failed, delayed] = await Promise.all([
    scrapeQueue.getWaitingCount(),
    scrapeQueue.getActiveCount(),
    scrapeQueue.getCompletedCount(),
    scrapeQueue.getFailedCount(),
    scrapeQueue.getDelayedCount(),
  ]);

  return { waiting, active, completed, failed, delayed };
}

app.get("/stats", async (req, res) => {
  res.json(await getQueueStats());
});
```

### Events

```typescript
import { QueueEvents } from "bullmq";

// Job lifecycle events (completed, failed, ...) are published through
// QueueEvents, not the Queue instance. Listeners receive the job id rather
// than the Job object.
const queueEvents = new QueueEvents("scrape", { connection });

queueEvents.on("completed", async ({ jobId }) => {
  metrics.increment("jobs.completed");

  // Look the job up to read its timestamps; it may already have been removed
  const job = await scrapeQueue.getJob(jobId);
  if (job?.processedOn && job.finishedOn) {
    // Processing time: from when a worker picked the job up until it finished
    metrics.timing("jobs.duration", job.finishedOn - job.processedOn);
  }
});

queueEvents.on("failed", ({ jobId, failedReason }) => {
  metrics.increment("jobs.failed");
  alerting.notify(`Job ${jobId} failed: ${failedReason}`);
});
```

## Error Handling

### Retry Strategy

```typescript
import { UnrecoverableError, Worker } from "bullmq";

const worker = new Worker(
  "scrape",
  async (job) => {
    try {
      return await scrape(job.data);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);

      // Don't retry on certain errors: UnrecoverableError fails the job
      // immediately and skips its remaining attempts
      if (message.includes("Invalid URL")) {
        throw new UnrecoverableError(`Permanent failure: ${message}`);
      }
      // Retry on transient errors
      throw error;
    }
  },
  {
    connection,
    settings: {
      // Used only for jobs added with `backoff: { type: "custom" }`
      backoffStrategy: (attemptsMade) => {
        // Custom backoff: 5s, 30s, 2m, 10m
        const delays = [5000, 30000, 120000, 600000];
        return delays[Math.min(attemptsMade - 1, delays.length - 1)];
      },
    },
  }
);
```

### Dead Letter Queue

```typescript
// Keep jobs that fail all 3 attempts in the queue's failed set, which then
// serves as a simple dead letter queue
await scrapeQueue.add("scrape", data, {
  attempts: 3,
  removeOnFail: {
    age: 24 * 3600,  // Retain failed jobs for 24 hours (value is in seconds)
  },
});

// Inspect the failed jobs manually
const failedJobs = await scrapeQueue.getFailed();
for (const job of failedJobs) {
  console.log(`Failed job ${job.id}: ${job.failedReason}`);
  // Re-queue the job; drop this call if failed jobs should only be inspected
  await job.retry();
}
```

## Complete Example

```typescript
// complete-example.ts
import { Queue, Worker, Job } from "bullmq";
import express from "express";
import HeroCore from "@ulixee/hero-core";
import { TransportBridge } from "@ulixee/net";
import { ConnectionToHeroCore } from "@ulixee/hero";
import { scrape, ScrapeResult } from "@vakra-dev/reader";

const app = express();
app.use(express.json());

// Redis connection
const connection = { host: "localhost", port: 6379 };

// Queue
const scrapeQueue = new Queue("scrape", { connection });

// Shared Hero Core, assigned in start() before the worker begins taking jobs
let heroCore: HeroCore;

// Open a new in-process connection to the shared Hero Core
async function createConnection() {
  const bridge = new TransportBridge();
  heroCore.addConnection(bridge.transportToClient);
  return new ConnectionToHeroCore(bridge.transportToCore);
}

// Worker
const worker = new Worker<any, ScrapeResult>(
  "scrape",
  async (job: Job) => {
    const result = await scrape({
      ...job.data,
      connectionToCore: await createConnection(),
    });
    return result;
  },
  // autorun: false delays job processing until start() calls worker.run(),
  // after Hero Core is ready
  { connection, concurrency: 3, autorun: false }
);

// API endpoints
app.post("/scrape/async", async (req, res) => {
  const job = await scrapeQueue.add("scrape", req.body);
  res.json({ jobId: job.id });
});

app.get("/scrape/:jobId", async (req, res) => {
  const job = await scrapeQueue.getJob(req.params.jobId);
  if (!job) return res.status(404).json({ error: "Not found" });

  const state = await job.getState();
  res.json({
    state,
    progress: job.progress,
    result: state === "completed" ? job.returnvalue : null,
  });
});

// Start
async function start() {
  heroCore = new HeroCore();
  await heroCore.start();

  // run() only settles when the worker stops, so it is deliberately not awaited
  worker.run().catch(console.error);

  app.listen(3000, () => console.log("Server running"));
}

start().catch(console.error);
```

## Related Guides

- [Production Server](production-server.md) - Basic server setup
- [Docker](docker.md) - Containerized deployment
- [Browser Pool](../guides/browser-pool.md) - Managing browsers


================================================
FILE: docs/deployment/production-server.md
================================================
# Production Server Guide

Deploy Reader as a production-ready API server.

## Overview

For production servers, use a **shared Hero Core** pattern instead of spawning individual Chrome processes per request. This dramatically reduces resource usage and improves performance.

## Architecture

```
┌─────────────────────────────────────────────────┐
│                Express Server                    │
├─────────────────────────────────────────────────┤
│              Shared Hero Core                    │
│         (Single Chrome Process)                  │
├─────────────────────────────────────────────────┤
│   Browser 1  │  Browser 2  │  Browser 3  │ ...  │
│   (Tab)      │  (Tab)      │  (Tab)      │      │
└─────────────────────────────────────────────────┘
```

**Benefits:**
- Single Chrome process instead of one per request
- Lower memory footprint
- Faster browser creation
- Better resource utilization

## Basic Setup

### Installation

```bash
npm install @vakra-dev/reader express
npm install @ulixee/hero-core @ulixee/net  # For shared Core
```

### Server Code

```typescript
// server.ts
import express from "express";
import HeroCore from "@ulixee/hero-core";
import { TransportBridge } from "@ulixee/net";
import { ConnectionToHeroCore } from "@ulixee/hero";
import { scrape, crawl } from "@vakra-dev/reader";

const app = express();
app.use(express.json());

// Shared Hero Core - initialized once
let heroCore: HeroCore;

async function createConnection() {
  const bridge = new TransportBridge();
  heroCore.addConnection(bridge.transportToClient);
  return new ConnectionToHeroCore(bridge.transportToCore);
}

// Scrape endpoint
app.post("/scrape", async (req, res) => {
  const { urls, formats = ["markdown"] } = req.body;

  try {
    const result = await scrape({
      urls,
      formats,
      connectionToCore: await createConnection(),
    });

    res.json(result);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// Crawl endpoint
app.post("/crawl", async (req, res) => {
  const { url, depth = 2, maxPages = 20, scrape: doScrape = false } = req.body;

  try {
    const result = await crawl({
      url,
      depth,
      maxPages,
      scrape: doScrape,
      connectionToCore: await createConnection(),
    });

    res.json(result);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// Health check
app.get("/health", (req, res) => {
  res.json({ status: "ok", heroCore: heroCore ? "running" : "stopped" });
});

// Start server
async function start() {
  // Initialize shared Hero Core
  heroCore = new HeroCore();
  await heroCore.start();
  console.log("Hero Core started");

  const PORT = process.env.PORT || 3000;
  app.listen(PORT, () => {
    console.log(`Server running on port ${PORT}`);
  });
}

// Graceful shutdown
async function shutdown() {
  console.log("Shutting down...");
  if (heroCore) {
    await heroCore.close();
  }
  process.exit(0);
}

process.on("SIGTERM", shutdown);
process.on("SIGINT", shutdown);

start().catch(console.error);
```

### Run the Server

```bash
npx tsx server.ts
```

### Test Endpoints

```bash
# Scrape
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{"urls": ["https://example.com"], "formats": ["markdown"]}'

# Crawl
curl -X POST http://localhost:3000/crawl \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "depth": 2, "scrape": true}'
```

## Production Configuration

### Environment Variables

```bash
# .env
PORT=3000
NODE_ENV=production
LOG_LEVEL=info
MAX_CONCURRENT_REQUESTS=10
REQUEST_TIMEOUT_MS=60000
```

### Request Limits

```typescript
import rateLimit from "express-rate-limit";

// Rate limiting
const limiter = rateLimit({
  windowMs: 60 * 1000,  // 1 minute
  max: 100,             // 100 requests per minute
});

app.use(limiter);

// Request timeout
app.use((req, res, next) => {
  res.setTimeout(60000, () => {
    res.status(408).json({ error: "Request timeout" });
  });
  next();
});
```

### Request Validation

```typescript
import { z } from "zod";

const scrapeSchema = z.object({
  urls: z.array(z.string().url()).min(1).max(100),
  formats: z.array(z.enum(["markdown", "html"])).optional(),
  batchConcurrency: z.number().min(1).max(10).optional(),
});

app.post("/scrape", async (req, res) => {
  const parsed = scrapeSchema.safeParse(req.body);

  if (!parsed.success) {
    return res.status(400).json({ error: parsed.error.issues });
  }

  // ... handle request
});
```

## Concurrency Control

### Request Queue

```typescript
import PQueue from "p-queue";

// Caps how many scrapes run at once; extra requests wait in the queue
const requestQueue = new PQueue({
  concurrency: parseInt(process.env.MAX_CONCURRENT_REQUESTS || "10", 10),
});

app.post("/scrape", async (req, res) => {
  try {
    // The task must be async so that it can await createConnection().
    // The connection is opened only when the queue actually runs the task.
    const result = await requestQueue.add(async () =>
      scrape({
        urls: req.body.urls,
        formats: req.body.formats,
        connectionToCore: await createConnection(),
      })
    );

    res.json(result);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
```

### Timeout Handling

```typescript
/**
 * Run a scrape, but reject if it takes longer than `timeoutMs`.
 *
 * The timeout only stops the caller from waiting: the scrape that was
 * already started is not cancelled and may keep running in the background.
 *
 * @param options  scrape options; `connectionToCore` is supplied here
 * @param timeoutMs maximum time to wait, in milliseconds
 * @returns the scrape result if it finishes in time
 * @throws Error if the timeout elapses first, or whatever the scrape throws
 */
async function scrapeWithTimeout(options: ScrapeOptions, timeoutMs: number) {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timedOut = new Promise<never>((_, reject) => {
    timer = setTimeout(
      () => reject(new Error(`Scrape timed out after ${timeoutMs}ms`)),
      timeoutMs
    );
  });

  // Wrapped in an async function so connection setup counts toward the timeout
  const work = (async () =>
    scrape({
      ...options,
      connectionToCore: await createConnection(),
    }))();

  try {
    return await Promise.race([work, timedOut]);
  } finally {
    clearTimeout(timer);
  }
}
```

## Monitoring

### Health Checks

```typescript
let activeRequests = 0;
let totalRequests = 0;
let failedRequests = 0;

app.use((req, res, next) => {
  activeRequests++;
  totalRequests++;

  res.on("finish", () => {
    activeRequests--;
    if (res.statusCode >= 500) failedRequests++;
  });

  next();
});

app.get("/health", (req, res) => {
  res.json({
    status: "ok",
    heroCore: heroCore ? "running" : "stopped",
    stats: {
      activeRequests,
      totalRequests,
      failedRequests,
      queueSize: requestQueue.size,
      queuePending: requestQueue.pending,
    },
  });
});
```

### Logging

```typescript
import pino from "pino";
import pinoHttp from "pino-http";

const logger = pino({
  level: process.env.LOG_LEVEL || "info",
});

app.use(pinoHttp({ logger }));

// Log scrape requests
app.post("/scrape", async (req, res) => {
  const startTime = Date.now();

  try {
    const result = await scrape({ ... });

    logger.info({
      type: "scrape",
      urls: req.body.urls.length,
      duration: Date.now() - startTime,
      successful: result.batchMetadata.successfulUrls,
    });

    res.json(result);
  } catch (error) {
    logger.error({ type: "scrape_error", error: error.message });
    res.status(500).json({ error: error.message });
  }
});
```

## Scaling

### Horizontal Scaling

Run multiple server instances behind a load balancer:

```bash
# Start multiple instances
PORT=3001 npx tsx server.ts &
PORT=3002 npx tsx server.ts &
PORT=3003 npx tsx server.ts &
```

### PM2 Cluster Mode

```javascript
// ecosystem.config.js
module.exports = {
  apps: [{
    name: "reader",
    script: "server.ts",
    interpreter: "npx",
    interpreter_args: "tsx",
    instances: "max",
    exec_mode: "cluster",
    env: {
      NODE_ENV: "production",
      PORT: 3000,
    },
  }],
};
```

```bash
pm2 start ecosystem.config.js
```

### Memory Limits

```javascript
// ecosystem.config.js
module.exports = {
  apps: [{
    name: "reader",
    script: "server.ts",
    max_memory_restart: "2G",
    node_args: "--max-old-space-size=2048",
  }],
};
```

## Complete Example

See [examples/production/express-server/](../../examples/production/express-server/) for a complete production server implementation.

## Related Guides

- [Docker Deployment](docker.md) - Containerized deployment
- [Job Queues](job-queues.md) - Async job processing
- [Browser Pool](../guides/browser-pool.md) - Pool management


================================================
FILE: docs/getting-started.md
================================================
# Getting Started

This guide walks you through setting up Reader, verifying your installation, and running your first scrape.

## Prerequisites

- **Node.js >= 18** (v22 recommended)
- **npm** package manager

> **Note:** The Hero browser runtime requires Node.js. Always run your scripts with `node` or `npx tsx`.

## Installation

### From npm

```bash
npm install @vakra-dev/reader
```

### From source

```bash
git clone https://github.com/vakra-dev/reader.git
cd reader
npm install
npm run build
```

## Verify Installation

### Test the CLI

```bash
npx reader scrape https://example.com
```

You should see markdown output of the example.com page.

### Test the API

Create a file `test-scrape.ts`:

```typescript
import { ReaderClient } from "@vakra-dev/reader";

async function main() {
  const reader = new ReaderClient();

  const result = await reader.scrape({
    urls: ["https://example.com"],
    formats: ["markdown"],
  });

  console.log("Success:", result.batchMetadata.successfulUrls === 1);
  console.log("Content length:", result.data[0].markdown?.length);

  await reader.close();
}

main().catch(console.error);
```

Run it:

```bash
npx tsx test-scrape.ts
```

## Your First Scrape

### Single URL

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://news.ycombinator.com"],
  formats: ["markdown"],
});

// Access the markdown content
console.log(result.data[0].markdown);

// Access metadata
console.log("Title:", result.data[0].metadata.website.title);
console.log("Duration:", result.data[0].metadata.duration, "ms");

await reader.close();
```

### Multiple URLs

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: [
    "https://example.com",
    "https://example.org",
    "https://example.net",
  ],
  formats: ["markdown"],
  batchConcurrency: 3,
  onProgress: ({ completed, total, currentUrl }) => {
    console.log(`[${completed}/${total}] Scraping: ${currentUrl}`);
  },
});

console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);
console.log(`Failed: ${result.batchMetadata.failedUrls}`);

await reader.close();
```

### Crawl a Website

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.crawl({
  url: "https://example.com",
  depth: 2,
  maxPages: 10,
  scrape: true,
});

console.log(`Discovered ${result.urls.length} URLs:`);
result.urls.forEach((page) => {
  console.log(`  - ${page.title}: ${page.url}`);
});

if (result.scraped) {
  console.log(`\nScraped ${result.scraped.batchMetadata.successfulUrls} pages`);
}

await reader.close();
```

### Browser Session

Launch a stealthed Chrome and drive it with Playwright or Puppeteer:

```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";

const reader = new ReaderClient();

const session = await reader.browser();
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const context = await browser.newContext();
const page = await context.newPage();

await page.goto("https://news.ycombinator.com");
console.log("Title:", await page.title());

// Full Playwright API - click, type, screenshot, evaluate
const stories = await page.evaluate(() =>
  Array.from(document.querySelectorAll(".athing")).slice(0, 5).map((r) =>
    r.querySelector(".titleline > a")?.textContent
  )
);
console.log("Top stories:", stories);

await browser.close();
await session.close();
await reader.close();
```

Install Playwright: `npm install playwright-core`

For more examples, see the [Browser Sessions guide](guides/browser-sessions.md).

## Understanding the Output

### ScrapeResult Structure

```typescript
interface ScrapeResult {
  // Array of scraped websites (one per URL)
  data: WebsiteScrapeResult[];

  // Metadata about the batch operation
  batchMetadata: {
    totalUrls: number;
    successfulUrls: number;
    failedUrls: number;
    scrapedAt: string;      // ISO timestamp
    totalDuration: number;  // milliseconds
    errors?: Array<{ url: string; error: string }>;
  };
}

interface WebsiteScrapeResult {
  // Content in requested formats
  markdown?: string;
  html?: string;

  // Metadata about this specific scrape
  metadata: {
    baseUrl: string;
    finalUrl?: string;  // Present if URL redirected
    totalPages: number;
    scrapedAt: string;
    duration: number;
    website: WebsiteMetadata;  // Title, description, OG tags, etc.
  };
}
```

### CrawlResult Structure

```typescript
interface CrawlResult {
  // Discovered URLs with basic info
  urls: Array<{
    url: string;
    title: string;
    description: string | null;
  }>;

  // Full scrape results (only when scrape: true)
  scraped?: ScrapeResult;

  // Crawl operation metadata
  metadata: {
    totalUrls: number;
    maxDepth: number;
    totalDuration: number;
    seedUrl: string;
  };
}
```

## CLI Quick Reference

### Daemon Mode (Recommended for Multiple Requests)

```bash
# Start daemon (once, in a separate terminal or background)
npx reader start --pool-size 5

# Scrape (auto-detects and uses daemon if running)
npx reader scrape https://example.com

# Crawl (auto-detects and uses daemon if running)
npx reader crawl https://example.com -d 2

# Check daemon status
npx reader status

# Stop daemon
npx reader stop

# Force standalone mode (bypass daemon)
npx reader scrape https://example.com --standalone
```

### Scraping

```bash
# Scrape a URL to markdown
npx reader scrape https://example.com

# Scrape with multiple formats
npx reader scrape https://example.com -f markdown,html

# Scrape multiple URLs concurrently
npx reader scrape url1 url2 url3 -c 3

# Save output to file
npx reader scrape https://example.com -o output.md

# Enable verbose logging
npx reader scrape https://example.com -v

# Show browser window (debugging)
npx reader scrape https://example.com --show-chrome
```

### Crawling

```bash
# Crawl a website
npx reader crawl https://example.com -d 2 -m 20

# Crawl and scrape content
npx reader crawl https://example.com -d 2 --scrape
```

## Environment Variables

| Variable | Description |
|----------|-------------|
| `LOG_LEVEL` | Logging level: `debug`, `info`, `warn`, `error` (default: `info`) |
| `NODE_ENV` | Set to `development` for pretty-printed logs |

## Common Issues

### "Chrome/Chromium not found"

Hero automatically downloads Chrome on first run. If this fails:

```bash
# Manually install Chrome dependencies (Ubuntu/Debian)
sudo apt-get install -y chromium-browser

# Or use the system Chrome
export CHROME_PATH=/usr/bin/chromium-browser
```

### "ECONNREFUSED" errors

This means the connection to the target host was refused. The site may be down or unreachable from your network, or it may be blocking your requests. Try:

1. Use a proxy: `--proxy http://user:pass@host:port`
2. Add delays between requests: `--delay 2000`
3. Use verbose mode to see what's happening: `-v`

### ESM/CommonJS issues

Reader is ESM-only. Make sure your `package.json` has:

```json
{
  "type": "module"
}
```

Or use the `.mjs` extension for your files.

## Next Steps

Based on your use case, explore these guides:

| Use Case | Guide |
|----------|-------|
| Understanding Cloudflare bypass | [Cloudflare Bypass](guides/cloudflare-bypass.md) |
| Setting up proxies | [Proxy Configuration](guides/proxy-configuration.md) |
| Production server deployment | [Production Server](deployment/production-server.md) |
| High-volume scraping | [Browser Pool](guides/browser-pool.md) |
| Docker deployment | [Docker](deployment/docker.md) |

## Need Help?

- Check the [Troubleshooting Guide](troubleshooting.md)
- Browse [Examples](../examples/)
- Open an issue on [GitHub](https://github.com/vakra-dev/reader/issues)


================================================
FILE: docs/guides/browser-pool.md
================================================
# Browser Pool Guide

This guide covers browser pool management for production-grade scraping.

## When to Use BrowserPool vs ReaderClient

| Use Case | Recommended |
|----------|-------------|
| Simple scraping/crawling | `ReaderClient` |
| Scripts and CLI tools | `ReaderClient` |
| Custom browser control | `BrowserPool` |
| Express/production servers | `BrowserPool` or Shared Hero Core |
| Low-level page interaction | `BrowserPool` |

For most use cases, **ReaderClient is recommended** as it manages the HeroCore lifecycle automatically. Use `BrowserPool` when you need direct access to Hero browser instances for custom logic.

## Overview

Browser instances are expensive:
- ~2-3 seconds to start
- ~200-500MB memory each
- Can accumulate state over time

The `BrowserPool` class manages a pool of reusable browser instances, handling lifecycle, recycling, and health monitoring.

## Basic Usage

### Using ReaderClient (Recommended)

The simplest way to configure browser pool settings:

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  browserPool: {
    size: 5,                   // Number of browser instances
    retireAfterPages: 50,      // Recycle after N pages
    retireAfterMinutes: 15,    // Recycle after N minutes
    maxQueueSize: 100,         // Max pending requests
  },
});

// All scrape/crawl operations use the configured pool
const result = await reader.scrape({
  urls: ["https://example.com", "https://example.org"],
  batchConcurrency: 3,
});

await reader.close();
```

### Using BrowserPool Directly (Advanced)

For custom browser control:

```typescript
import { BrowserPool } from "@vakra-dev/reader";

const pool = new BrowserPool({ size: 5 });
await pool.initialize();

// Use withBrowser for automatic acquire/release
const title = await pool.withBrowser(async (hero) => {
  await hero.goto("https://example.com");
  return await hero.document.title;
});

await pool.shutdown();
```

## Configuration

```typescript
const pool = new BrowserPool({
  size: 5,                    // Number of browser instances
  retireAfterPages: 100,      // Recycle after N pages
  retireAfterMinutes: 30,     // Recycle after N minutes
  maxQueueSize: 100,          // Max pending requests
  healthCheckIntervalMs: 300000, // Health check interval (5 min)
});
```

### Configuration Options

| Option | Default | Description |
|--------|---------|-------------|
| `size` | `2` | Number of browser instances in the pool |
| `retireAfterPages` | `100` | Recycle browser after this many pages |
| `retireAfterMinutes` | `30` | Recycle browser after this many minutes |
| `maxQueueSize` | `100` | Maximum requests that can wait for a browser |
| `healthCheckIntervalMs` | `300000` | Interval between health checks (5 minutes) |

## Pool Lifecycle

### Initialization

```typescript
const pool = new BrowserPool({ size: 5 });
await pool.initialize();
```

This:
1. Creates `size` Hero instances
2. Starts background health checking
3. Makes pool ready for requests

### Acquire and Release

**Recommended: Use `withBrowser`**

```typescript
const result = await pool.withBrowser(async (hero) => {
  await hero.goto("https://example.com");
  const title = await hero.document.title;
  return title;
});
```

Benefits:
- Automatic acquire/release
- Exception-safe (always releases on error)
- Clean, readable code

**Manual acquire/release (advanced)**

```typescript
const hero = await pool.acquire();
try {
  await hero.goto("https://example.com");
  // ... do work
} finally {
  await pool.release(hero);
}
```

### Recycling

Browsers are automatically recycled when:

1. **Page limit reached** - After `retireAfterPages` navigations
2. **Time limit reached** - After `retireAfterMinutes`
3. **Health check failure** - If browser becomes unresponsive

Recycling closes the old browser and creates a fresh one.

### Shutdown

```typescript
await pool.shutdown();
```

This:
1. Stops health checking
2. Closes all browser instances
3. Clears the queue

## Monitoring

### Get Pool Stats

```typescript
const stats = pool.getStats();
console.log(stats);
// {
//   total: 5,
//   available: 3,
//   inUse: 2,
//   queueSize: 0,
//   totalAcquired: 150,
//   totalRecycled: 3
// }
```

### Health Check

```typescript
const health = await pool.healthCheck();
console.log(health);
// {
//   healthy: true,
//   instances: [
//     { id: 0, healthy: true, pages: 45, ageMinutes: 12 },
//     { id: 1, healthy: true, pages: 38, ageMinutes: 10 },
//     ...
//   ]
// }
```

## Production Patterns

### Shared Pool for Express Server

```typescript
import express from "express";
import { BrowserPool } from "@vakra-dev/reader";

const app = express();
const pool = new BrowserPool({ size: 10 });

// Initialize on startup
pool.initialize().then(() => {
  console.log("Browser pool ready");
});

app.get("/scrape", async (req, res) => {
  const url = req.query.url as string;

  try {
    const result = await pool.withBrowser(async (hero) => {
      await hero.goto(url);
      return await hero.document.body.innerHTML;
    });

    res.json({ html: result });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

// Graceful shutdown
process.on("SIGTERM", async () => {
  await pool.shutdown();
  process.exit(0);
});

app.listen(3000);
```

### Queue Management

When all browsers are busy, requests queue up:

```typescript
const pool = new BrowserPool({
  size: 5,
  maxQueueSize: 100,  // Max 100 waiting requests
});

// If queue is full, acquire() throws an error
try {
  const hero = await pool.acquire();
} catch (error) {
  if (error.message.includes("queue full")) {
    // Handle backpressure
    console.log("Too many pending requests");
  }
}
```

### Scaling Guidelines

| Concurrent Users | Pool Size | Memory (approx) |
|------------------|-----------|-----------------|
| 1-5 | 2-3 | 1-1.5 GB |
| 5-20 | 5-10 | 2.5-5 GB |
| 20-50 | 10-20 | 5-10 GB |
| 50+ | Consider distributed pools | 10+ GB |

## Shared Hero Core Pattern

For production servers, use a shared Hero Core instead of individual cores per browser:

```typescript
import HeroCore from "@ulixee/hero-core";
import { TransportBridge } from "@ulixee/net";
import { ConnectionToHeroCore } from "@ulixee/hero";

// Initialize once at startup
const heroCore = new HeroCore();
await heroCore.start();

// Create connection for each scrape
function createConnection() {
  const bridge = new TransportBridge();
  heroCore.addConnection(bridge.transportToClient);
  return new ConnectionToHeroCore(bridge.transportToCore);
}

// Use with scrape
const result = await scrape({
  urls: ["https://example.com"],
  connectionToCore: createConnection(),
});

// Shutdown on exit
await heroCore.close();
```

**Why use shared Core?**

- Single Chrome process manages all browsers
- Lower memory overhead
- Better resource utilization
- Faster browser creation

See [Production Server Guide](../deployment/production-server.md) for complete examples.

## Memory Management

### Reduce Memory Usage

```typescript
const pool = new BrowserPool({
  size: 3,                   // Fewer browsers
  retireAfterPages: 50,      // Recycle more often
  retireAfterMinutes: 15,    // Shorter lifetime
});
```

### Monitor Memory

```typescript
import { memoryUsage } from "process";

setInterval(() => {
  const usage = memoryUsage();
  console.log(`Memory: ${Math.round(usage.heapUsed / 1024 / 1024)} MB`);

  const stats = pool.getStats();
  console.log(`Pool: ${stats.inUse}/${stats.total} in use`);
}, 30000);
```

### Force Garbage Collection

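Between large batch operations, pause briefly so the runtime has a chance to reclaim memory (this only yields to the event loop; it does not explicitly trigger a collection):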

```typescript
const reader = new ReaderClient();

// Process batch
await reader.scrape({ urls: batch1 });

// Allow GC before next batch
await new Promise(r => setTimeout(r, 1000));

// Process next batch
await reader.scrape({ urls: batch2 });

await reader.close();
```

## Error Handling

### Browser Crashes

If a browser crashes, the pool automatically:
1. Removes it from the pool
2. Creates a replacement
3. Continues serving requests

### Timeout Handling

```typescript
const result = await pool.withBrowser(async (hero) => {
  // Set navigation timeout
  await hero.goto(url, { timeoutMs: 30000 });

  // ... rest of logic
}, { timeoutMs: 60000 }); // Overall operation timeout
```

### Retry Logic

```typescript
async function scrapeWithRetry(url: string, maxRetries = 3) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      return await pool.withBrowser(async (hero) => {
        await hero.goto(url);
        return await hero.document.body.innerHTML;
      });
    } catch (error) {
      if (attempt === maxRetries) throw error;
      console.log(`Attempt ${attempt} failed, retrying...`);
      await new Promise(r => setTimeout(r, 1000 * attempt));
    }
  }
}
```

## Best Practices

1. **Always use `withBrowser`** - Ensures proper acquire/release
2. **Size pool appropriately** - Balance memory vs throughput
3. **Enable recycling** - Prevents memory leaks from long-running browsers
4. **Monitor stats** - Track pool utilization
5. **Handle shutdown gracefully** - Close pool on process exit
6. **Use shared Hero Core** - For production servers

## Related Guides

- [Production Server](../deployment/production-server.md) - Shared Hero Core setup
- [Cloudflare Bypass](cloudflare-bypass.md) - Challenge handling
- [Troubleshooting](../troubleshooting.md) - Common issues


================================================
FILE: docs/guides/browser-sessions.md
================================================
# Browser Sessions

A browser session launches a stealth-enabled Chrome instance and returns a CDP (Chrome DevTools Protocol) WebSocket URL. You connect Playwright, Puppeteer, or any other CDP client to that URL and get full browser automation with anti-bot stealth active.

## When to Use Browser Sessions

| Use case | Primitive |
|----------|-----------|
| Extract content from a URL → markdown | `scrape()` |
| Discover pages on a site | `crawl()` |
| Click buttons, fill forms, navigate multi-page flows | `browser()` |
| Scrape pages behind login/auth | `browser()` |
| Take screenshots, generate PDFs | `browser()` |
| Run existing Playwright/Puppeteer scripts with stealth | `browser()` |

## Quick Start

```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";

const reader = new ReaderClient();

// Create a session
const session = await reader.browser();

// Connect Playwright - one-line change from local scripts
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const context = await browser.newContext();
const page = await context.newPage();

// Use Playwright normally
await page.goto("https://example.com");
console.log(await page.title());

// Cleanup
await browser.close();
await session.close();
await reader.close();
```

## Stealth Features

Every browser session has these anti-bot features active automatically:

| Feature | What it does |
|---------|-------------|
| `navigator.webdriver = false` | Hides the automation flag that most bot detectors check first |
| Navigator spoofing | Realistic `deviceMemory`, `hardwareConcurrency`, `platform` values |
| WebGL/Canvas fingerprinting | Randomized rendering signatures |
| WebRTC IP masking | Prevents real IP leaks through WebRTC connections |
| Chrome plugin array | Simulates real Chrome extension presence |
| Permission API behavior | Matches real Chrome permission responses |

These are injected at the browser level via `Page.addScriptToEvaluateOnNewDocument` and apply to all pages, including pages created by Playwright/Puppeteer.

## Connecting with Playwright

```typescript
import { chromium } from "playwright-core";

const session = await reader.browser();
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const context = await browser.newContext();
const page = await context.newPage();

// Full Playwright API available
await page.goto("https://example.com");
await page.click("#login-button");
await page.fill("#email", "user@example.com");
await page.screenshot({ path: "screenshot.png" });
await page.pdf({ path: "page.pdf" });

const cookies = await context.cookies();
```

Install: `npm install playwright-core`

## Connecting with Puppeteer

```typescript
import { connect } from "puppeteer-core";

const session = await reader.browser();
const browser = await connect({
  browserWSEndpoint: session.wsEndpoint,
  defaultViewport: null,
});

const page = await browser.newPage();
await page.goto("https://example.com");
console.log(await page.title());
```

Install: `npm install puppeteer-core`

## Connecting with Raw CDP

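For any language or tool that speaks the Chrome DevTools Protocol. The `sendCDP` and `sendPageCDP` helpers in this snippet are not defined here; they stand for functions that send a CDP command over the WebSocket and resolve with its result: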

```typescript
import WebSocket from "ws";

const session = await reader.browser();
const ws = new WebSocket(session.wsEndpoint);

// Create a page target
const target = await sendCDP(ws, "Target.createTarget", { url: "about:blank" });

// Attach and navigate
const attached = await sendCDP(ws, "Target.attachToTarget", {
  targetId: target.targetId,
  flatten: true,
});

await sendPageCDP(ws, attached.sessionId, "Page.navigate", {
  url: "https://example.com",
});
```

## Session Lifecycle

```
reader.browser()
  │
  ├── Launches Chrome with stealth (Hero emulation scripts)
  ├── Extracts CDP WebSocket URL
  ├── Starts auto-close timeout (default: 5 minutes)
  │
  ▼
session.wsEndpoint
  │
  ├── Connect Playwright/Puppeteer
  ├── Navigate, interact, extract
  │
  ▼
session.close()  OR  timeout expires
  │
  └── Chrome process terminated, resources released
```

### Timeout

Sessions auto-close after `timeoutMs` (default: 300,000ms = 5 minutes). Set a longer timeout for extended automation:

```typescript
const session = await reader.browser({
  timeoutMs: 600_000, // 10 minutes
});
```

### Cleanup

Always close sessions when done to release Chrome processes:

```typescript
const session = await reader.browser();
try {
  // ... use session ...
} finally {
  await session.close();
}

## CLI Usage

```bash
# Create a session (prints wsEndpoint JSON, blocks until Ctrl+C)
npx reader browser create

# Create with options
npx reader browser create --timeout 60000 --show-chrome

# List active sessions (daemon mode)
npx reader browser list

# Stop a session
npx reader browser stop <sessionId>
```

## Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `proxy` | `ProxyConfig` | - | Proxy to route browser traffic through |
| `proxyTier` | `ProxyTier` | - | Use a proxy from the configured pool tier |
| `showChrome` | `boolean` | `false` | Show the browser window |
| `timeoutMs` | `number` | `300000` | Session lifetime (auto-closes after) |
| `verbose` | `boolean` | `false` | Enable verbose logging |

## Notes

- Each session launches its own Chrome process (~300MB memory)
- Sessions are isolated from the scrape/crawl browser pool
- MITM proxy (TLS fingerprinting) is disabled for sessions; emulation scripts provide the stealth layer instead.
- Selenium/chromedriver is not supported (requires exclusive Chrome access). Use Playwright, Puppeteer, or raw CDP instead.


================================================
FILE: docs/guides/cloudflare-bypass.md
================================================
# Cloudflare Bypass Guide

This guide explains how Reader bypasses Cloudflare and other bot detection systems.

## Overview

Many websites use Cloudflare to protect against bots. Reader uses [Ulixee Hero](https://ulixee.org/) which employs multiple techniques to appear as a legitimate browser.

## How It Works

### 1. TLS Fingerprinting

Every browser has a unique TLS (HTTPS) fingerprint based on:
- Supported cipher suites
- TLS extensions order
- ALPN protocols

Hero emulates Chrome's exact TLS fingerprint, making connections indistinguishable from a real browser.

### 2. DNS over TLS

Chrome uses DNS over HTTPS/TLS to Cloudflare's 1.1.1.1 servers. Hero replicates this behavior, which Cloudflare can detect and use as a trust signal.

### 3. WebRTC IP Masking

WebRTC can leak your real IP even behind a proxy. Hero masks WebRTC to prevent IP detection that could reveal automation.

### 4. JavaScript Environment

Hero creates a complete browser environment:
- Navigator properties match real Chrome
- WebGL fingerprints are realistic
- Canvas fingerprints are consistent
- Plugin arrays match real installations

## Challenge Types

Reader detects and handles these challenge types:

| Challenge | Detection | Bypass Method |
|-----------|-----------|---------------|
| **JS Challenge** | "Checking your browser" text | Wait for auto-resolution |
| **Turnstile** | Turnstile widget in DOM | Wait for user interaction simulation |
| **Under Attack Mode** | Interstitial page | Extended wait with polling |
| **CAPTCHA** | hCaptcha/reCAPTCHA widget | Cannot bypass (requires human) |
| **WAF Block** | 403/1020 error codes | Cannot bypass (IP blocked) |

## How Detection Works

Challenge detection and resolution are handled automatically by the engine. You don't need to call any detection functions manually; Reader detects and resolves challenges during every scrape.

### Detection Signals

The detector looks for multiple signals:

**DOM Signals:**
- `#challenge-form` - Main challenge container
- `.cf-browser-verification` - Verification widget
- `#turnstile-wrapper` - Turnstile CAPTCHA
- `#cf-hcaptcha-container` - hCaptcha container

**Text Signals:**
- "Checking your browser"
- "Please wait..."
- "DDoS protection by Cloudflare"
- "Ray ID:"

**URL Signals:**
- `/cdn-cgi/challenge-platform/`
- `__cf_chl_` parameters

## Resolution

The engine automatically resolves challenges using two methods:

1. **Redirect Detection** - URL changes after challenge is solved
2. **Element Removal** - Challenge DOM elements disappear

Resolution runs automatically during every scrape with a 45-second timeout.

## Improving Success Rate

### Use Residential Proxies

Cloudflare trusts residential IPs more than datacenter IPs:

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://protected-site.com"],
  proxy: {
    type: "residential",
    host: "proxy.example.com",
    port: 8080,
    username: "username",
    password: "password",
    country: "us",
  },
});
await reader.close();
```

### Add Delays

Rate limiting makes your traffic look more human:

```typescript
const reader = new ReaderClient();

// For crawling
const result = await reader.crawl({
  url: "https://protected-site.com",
  delayMs: 3000,  // 3 seconds between requests
});

// For batch scraping, lower concurrency
const batchResult = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 1,  // One at a time
});

await reader.close();
```

### Rotate User Agents

Some sites track user agent patterns:

```typescript
const userAgents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36...",
];

const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://example.com"],
  userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],
});
await reader.close();
```

### Increase Timeout

Challenges can take 30+ seconds to resolve:

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://protected-site.com"],
  timeoutMs: 60000,  // 60 seconds
});
await reader.close();
```

## What Can't Be Bypassed

### CAPTCHAs

CAPTCHAs require human interaction. Reader cannot solve:
- hCaptcha
- reCAPTCHA
- Cloudflare Turnstile (interactive mode)

For these, consider:
- CAPTCHA solving services (2Captcha, Anti-Captcha)
- Manual solving workflows
- Alternative data sources

### IP Bans

If your IP is blocked by Cloudflare's WAF:
- You'll see 403 or 1020 errors
- No amount of browser emulation helps
- Solution: Use different IPs (proxies)

### Rate Limits

Excessive requests trigger blocks:
- Implement delays between requests
- Use multiple proxies
- Reduce concurrency

## Debugging Challenges

### Visual Debugging

See exactly what's happening:

```typescript
const reader = new ReaderClient({ showChrome: true, verbose: true });
const result = await reader.scrape({
  urls: ["https://protected-site.com"],
});
await reader.close();
```

### Verbose Mode

Enable verbose logging to see challenge detection and resolution in action:

```typescript
const reader = new ReaderClient({ verbose: true });
const result = await reader.scrape({
  urls: ["https://protected-site.com"],
});
await reader.close();
```

## Best Practices

1. **Start with verbose mode** to understand what's happening
2. **Use residential proxies** for heavily protected sites
3. **Implement delays** to avoid triggering rate limits
4. **Handle failures gracefully** - not every request will succeed
5. **Rotate IPs** for large-scale scraping
6. **Respect robots.txt** when possible
7. **Cache results** to minimize repeat requests

## Example: Scraping a Cloudflare-Protected Site

Challenge handling is automatic. Just scrape normally:

```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  proxyPools: {
    datacenter: [{ url: "http://user:pass@dc-proxy:8080" }],
    residential: [{ url: "http://user:pass@res-proxy:8080" }],
  },
});

// Reader auto-detects Cloudflare and escalates to residential proxy if needed
const result = await reader.scrape({
  urls: ["https://cloudflare-protected-site.com"],
  proxyTier: "auto",
});

console.log(result.data[0].markdown);
await reader.close();
```

## Related Guides

- [Proxy Configuration](proxy-configuration.md) - Setting up proxies
- [Browser Pool](browser-pool.md) - Managing browser instances
- [Troubleshooting](../troubleshooting.md) - Common issues


================================================
FILE: docs/guides/output-formats.md
================================================
# Output Formats

Reader supports two output formats: **Markdown** and **HTML**.

| Format | Best For | What You Get |
|--------|----------|-------------|
| **markdown** | LLM consumption, RAG pipelines | Clean markdown with headings, lists, links |
| **html** | Rendering, further processing | Cleaned HTML with semantic structure |

## Specifying Formats

```typescript
const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown", "html"],
});

console.log(result.data[0].markdown);
console.log(result.data[0].html);
```

### CLI

```bash
npx reader scrape https://example.com -f markdown,html
```

Default format is `["markdown"]` if not specified.

## Markdown Output

Markdown is the recommended format for LLM consumption. Reader uses [supermarkdown](https://github.com/vakra-dev/supermarkdown), a Rust-based HTML to markdown converter built specifically for web scraping and LLM pipelines.

Features:
- Full GitHub Flavored Markdown (GFM) support
- Tables, task lists, strikethrough, autolinks
- Handles malformed HTML from real web pages
- LLM-optimized output (clean, no artifacts)

## HTML Output

HTML output is the cleaned, semantic HTML after content extraction. It includes:
- Main content only (nav/header/footer removed when `onlyMainContent: true`)
- Scripts, styles, and hidden elements removed
- Base64 images stripped
- URLs resolved to absolute paths

## Content Cleaning

Both formats benefit from the content cleaning pipeline:

```typescript
// Extract only main content (default)
await reader.scrape({ urls, onlyMainContent: true });

// Include specific elements only
await reader.scrape({ urls, includeTags: [".article-body"] });

// Exclude specific elements
await reader.scrape({ urls, excludeTags: [".comments", ".sidebar"] });

// Full page (no cleaning)
await reader.scrape({ urls, onlyMainContent: false });
```

## Metadata

Every scrape result includes metadata regardless of format:

```typescript
result.data[0].metadata.website.title       // Page title
result.data[0].metadata.website.description // Meta description
result.data[0].metadata.website.language    // Language
result.data[0].metadata.baseUrl             // Original URL
result.data[0].metadata.finalUrl            // URL after redirects (if different)
result.data[0].metadata.statusCode          // HTTP status
result.data[0].metadata.duration            // Scrape duration (ms)
```


================================================
FILE: docs/guides/proxy-configuration.md
================================================
# Proxy Configuration Guide

This guide covers proxy setup for Reader.

## Overview

Proxies help with:
- Bypassing IP-based blocks
- Accessing geo-restricted content
- Distributing requests across multiple IPs
- Avoiding rate limits

## Quick Start

### Using Proxy URL

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://example.com"],
  proxy: {
    url: "http://username:password@proxy.example.com:8080",
  },
});
await reader.close();
```

### Using Structured Config

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://example.com"],
  proxy: {
    type: "residential",
    host: "proxy.example.com",
    port: 8080,
    username: "username",
    password: "password",
    country: "us",
  },
});
await reader.close();
```

### CLI Usage

```bash
npx reader scrape https://example.com --proxy http://user:pass@host:port
```

## Proxy Types

### Datacenter Proxies

- **Pros:** Fast, cheap, reliable
- **Cons:** Easily detected, often blocked
- **Best for:** Sites without bot protection

```typescript
proxy: {
  type: "datacenter",
  host: "proxy.example.com",
  port: 8080,
  username: "username",
  password: "password",
}
```

### Residential Proxies

- **Pros:** Real IPs, hard to detect, trusted by Cloudflare
- **Cons:** Slower, more expensive, limited bandwidth
- **Best for:** Cloudflare-protected sites, sensitive scraping

```typescript
proxy: {
  type: "residential",
  host: "proxy.example.com",
  port: 8080,
  username: "username",
  password: "password",
  country: "us",
}
```

### Mobile Proxies

- **Pros:** Highest trust level (each mobile IP is shared by many real users, so sites are reluctant to block it)
- **Cons:** Most expensive, limited availability
- **Best for:** Most aggressive anti-bot systems

## Configuration Options

| Option | Type | Description |
|--------|------|-------------|
| `url` | `string` | Full proxy URL (takes precedence over the individual fields below) |
| `type` | `"datacenter" \| "residential"` | Proxy type |
| `host` | `string` | Proxy server hostname |
| `port` | `number` | Proxy server port |
| `username` | `string` | Authentication username |
| `password` | `string` | Authentication password |
| `country` | `string` | Country code (e.g., "us", "uk", "de") |

## Provider Examples

### IPRoyal

```typescript
proxy: {
  type: "residential",
  host: "geo.iproyal.com",
  port: 12321,
  username: "customer-username",
  password: "password",
  country: "us",
}
```

### Bright Data (Luminati)

```typescript
proxy: {
  type: "residential",
  host: "brd.superproxy.io",
  port: 22225,
  username: "customer-zone-residential",
  password: "password",
  country: "us",
}
```

### Oxylabs

```typescript
proxy: {
  type: "residential",
  host: "pr.oxylabs.io",
  port: 7777,
  username: "customer-username",
  password: "password",
  country: "us",
}
```

### SmartProxy

```typescript
proxy: {
  type: "residential",
  host: "gate.smartproxy.com",
  port: 7000,
  username: "user",
  password: "pass",
  country: "us",
}
```

## Proxy Pooling

Reader supports built-in proxy pooling with automatic rotation:

```typescript
const reader = new ReaderClient({
  // Configure multiple proxies
  proxies: [
    { host: "proxy1.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy2.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy3.example.com", port: 8080, username: "user", password: "pass", country: "us" },
  ],
  // Rotation strategy: "round-robin" (default) or "random"
  proxyRotation: "round-robin",
});

// Each request automatically uses the next proxy in rotation
const result = await reader.scrape({
  urls: ["https://example1.com", "https://example2.com", "https://example3.com"],
});

// Check which proxy handled each request
result.data.forEach((site) => {
  console.log(`${site.metadata.baseUrl} -> ${site.metadata.proxy?.host}:${site.metadata.proxy?.port}`);
});

await reader.close();
```

### Proxy Metadata in Response

When using proxy pooling, each result includes metadata about which proxy was used:

```typescript
interface ProxyMetadata {
  host: string;    // Proxy host that handled the request
  port: number;    // Proxy port
  country?: string; // Country code if geo-targeting was used
}
```

## Tiered Proxy Pools (Recommended)

Instead of a flat proxy list, configure separate datacenter and residential pools. Reader auto-escalates from datacenter to residential when a site blocks:

```typescript
const reader = new ReaderClient({
  proxyPools: {
    datacenter: [
      { url: "http://user:pass@dc-proxy1:8080" },
      { url: "http://user:pass@dc-proxy2:8080" },
    ],
    residential: [
      { url: "http://user:pass@res-proxy1:8080" },
    ],
  },
});

const result = await reader.scrape({
  urls: ["https://example.com"],
  proxyTier: "auto", // datacenter first, escalate to residential on block
});
```

### Proxy Tiers

| Tier | When used | Credits |
|------|-----------|---------|
| `"datacenter"` | Fast, most sites | 1 per scrape |
| `"residential"` | Anti-bot sites (Amazon, LinkedIn) | 3 per scrape |
| `"auto"` | Starts datacenter, escalates on block | 1 or 3 |

### Environment Variables

Configure proxy pools via environment variables (useful for daemons):

```bash
PROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080
PROXY_RESIDENTIAL=http://user:pass@res1:8080
```

### Health Tracking

Reader monitors proxy health automatically:
- **Circuit breaker:** After 10 consecutive failures, a proxy is benched for 5 minutes
- **Auto-recovery:** Benched proxies are automatically revived after the cooldown
- **Only proxy faults count:** Bot blocks (403, captcha) don't count against the proxy; those are the site's behavior, not the proxy's.

### Per-Proxy Concurrency

Each proxy URL has a concurrency limit (default: 2 simultaneous requests). This prevents overwhelming a single proxy IP, which can trigger rate limits.

## Rotation Strategies

### Per-Request Rotation

Most residential proxy providers rotate IPs automatically:

```typescript
const reader = new ReaderClient();

// Each request gets a different IP
for (const url of urls) {
  await reader.scrape({
    urls: [url],
    proxy: proxyConfig,
  });
}

await reader.close();
```

### Sticky Sessions

Keep the same IP for multiple requests:

```typescript
// Some providers support session IDs
proxy: {
  host: "proxy.example.com",
  port: 8080,
  username: "user-session-abc123",  // Session in username
  password: "pass",
}
```

### Manual Rotation

Rotate through a list of proxies:

```typescript
const proxies = [
  { host: "proxy1.example.com", port: 8080 },
  { host: "proxy2.example.com", port: 8080 },
  { host: "proxy3.example.com", port: 8080 },
];

let proxyIndex = 0;
const reader = new ReaderClient();

async function scrapeWithRotation(url: string) {
  const proxy = proxies[proxyIndex % proxies.length];
  proxyIndex++;

  return await reader.scrape({
    urls: [url],
    proxy: {
      ...proxy,
      username: "username",
      password: "password",
    },
  });
}

// Don't forget to close when done
// await reader.close();
```

## Geo-Targeting

Target specific countries for localized content:

```typescript
const reader = new ReaderClient();

// US content
const usResult = await reader.scrape({
  urls: ["https://example.com"],
  proxy: { ...baseProxy, country: "us" },
});

// UK content
const ukResult = await reader.scrape({
  urls: ["https://example.com"],
  proxy: { ...baseProxy, country: "uk" },
});

await reader.close();
```

Common country codes:
- `us` - United States
- `uk` or `gb` - United Kingdom
- `de` - Germany
- `fr` - France
- `jp` - Japan
- `au` - Australia

## Error Handling

### Proxy Failures

```typescript
const reader = new ReaderClient();

async function scrapeWithFallback(url: string) {
  const proxies = [residentialProxy, datacenterProxy, null];

  for (const proxy of proxies) {
    try {
      return await reader.scrape({
        urls: [url],
        proxy,
        timeoutMs: 30000,
      });
    } catch (error) {
      console.log(`Proxy failed: ${proxy?.host || "direct"}`);
      continue;
    }
  }

  throw new Error("All proxies failed");
}

// Don't forget to close when done
// await reader.close();
```

### Connection Errors

Common proxy errors and solutions:

| Error | Cause | Solution |
|-------|-------|----------|
| `ECONNREFUSED` | Proxy server down | Try different proxy |
| `407 Proxy Auth Required` | Wrong credentials | Check username/password |
| `403 Forbidden` | Proxy blocked by site | Use residential proxy |
| `Timeout` | Slow proxy | Increase timeout |

## Testing Proxies

### Verify Proxy Works

```typescript
const reader = new ReaderClient();

async function testProxy(proxy: ProxyConfig): Promise<boolean> {
  try {
    const result = await reader.scrape({
      urls: ["https://httpbin.org/ip"],
      formats: ["markdown"],
      proxy,
      timeoutMs: 10000,
    });

    console.log("Proxy IP:", result.data[0].markdown);
    return true;
  } catch (error) {
    console.log("Proxy failed:", error.message);
    return false;
  }
}

await reader.close();
```

### Check Geo-Location

```typescript
const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://ipinfo.io/json"],
  formats: ["markdown"],
  proxy: { ...proxyConfig, country: "uk" },
});

console.log(result.data[0].markdown);  // Contains the IP info

await reader.close();
```

## Best Practices

1. **Start with datacenter proxies** - Cheaper, see if you need more
2. **Upgrade to residential** - When blocked or for Cloudflare sites
3. **Use geo-targeting** - Match target site's expected users
4. **Implement rotation** - Spread requests across IPs
5. **Handle failures gracefully** - Have fallback proxies
6. **Monitor bandwidth** - Residential proxies charge by GB
7. **Test before deploying** - Verify proxies work with target site

## Cost Considerations

| Proxy Type | Typical Cost | Best For |
|------------|--------------|----------|
| Datacenter | $0.50-2/GB | Unprotected sites |
| Residential | $3-15/GB | Cloudflare, sensitive sites |
| Mobile | $20-50/GB | Highest security sites |

## Related Guides

- [Cloudflare Bypass](cloudflare-bypass.md) - Works best with residential proxies
- [Browser Pool](browser-pool.md) - Managing browser instances
- [Troubleshooting](../troubleshooting.md) - Common proxy issues


================================================
FILE: docs/troubleshooting.md
================================================
# Troubleshooting

This guide covers common issues and their solutions when using Reader.

## Quick Diagnostics

Before diving into specific issues, try these debugging steps:

```bash
# Enable verbose logging
npx reader scrape https://example.com -v

# Show the browser window to see what's happening
npx reader scrape https://example.com --show-chrome

# Check Node.js version (should be >= 18)
node --version
```

## Common Errors

### Chrome/Chromium Not Found

**Error:**
```
Error: Could not find Chrome installation
```

**Cause:** Hero needs Chrome/Chromium to run. It tries to download it automatically on first run.

**Solutions:**

1. **Let Hero download Chrome:**
   ```bash
   # Clear any cached downloads and try again
   rm -rf ~/.cache/ulixee
   npx reader scrape https://example.com
   ```

2. **Install Chrome manually (Ubuntu/Debian):**
   ```bash
   sudo apt-get update
   sudo apt-get install -y chromium-browser
   ```

3. **Install Chrome manually (macOS):**
   ```bash
   brew install --cask chromium
   ```

4. **Point to existing Chrome:**
   ```bash
   export CHROME_PATH=/usr/bin/chromium-browser
   # or on macOS
   export CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
   ```

### Connection Refused (ECONNREFUSED)

**Error:**
```
Error: connect ECONNREFUSED 127.0.0.1:9222
```

**Cause:** Hero couldn't start or connect to Chrome.

**Solutions:**

1. **Check if Chrome is running:**
   ```bash
   ps aux | grep chrome
   # Kill any zombie processes
   pkill -f chrome
   ```

2. **Check for port conflicts:**
   ```bash
   lsof -i :9222
   ```

3. **Try with a fresh browser instance:**
   ```typescript
   const reader = new ReaderClient({ showChrome: true });
   const result = await reader.scrape({
     urls: ["https://example.com"],
   });
   await reader.close();
   ```

### Request Timeout

**Error:**
```
Error: Navigation timeout of 30000 ms exceeded
```

**Cause:** The page took too long to load, or a Cloudflare challenge took too long to resolve.

**Solutions:**

1. **Increase timeout:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.scrape({
     urls: ["https://example.com"],
     timeoutMs: 60000,  // 60 seconds
   });
   await reader.close();
   ```

2. **For batch operations, increase batch timeout:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.scrape({
     urls: [...manyUrls],
     batchTimeoutMs: 600000,  // 10 minutes total
   });
   await reader.close();
   ```

3. **Check if the site is accessible:**
   ```bash
   curl -I https://example.com
   ```

### Cloudflare Block (403/1020)

**Error:**
```
Error: Access denied (Error code 1020)
```

**Cause:** Cloudflare detected automated access and blocked the request.

**Solutions:**

1. **Use a proxy:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.scrape({
     urls: ["https://example.com"],
     proxy: {
       type: "residential",
       host: "proxy.example.com",
       port: 8080,
       username: "username",
       password: "password",
     },
   });
   await reader.close();
   ```

2. **Add delays between requests:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.crawl({
     url: "https://example.com",
     delayMs: 3000,  // 3 seconds between requests
   });
   await reader.close();
   ```

3. **Try a different user agent:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.scrape({
     urls: ["https://example.com"],
     userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
   });
   await reader.close();
   ```

4. **Enable verbose mode to see challenge detection:**
   ```typescript
   const reader = new ReaderClient({ verbose: true, showChrome: true });
   const result = await reader.scrape({
     urls: ["https://example.com"],
   });
   await reader.close();
   ```

### Memory Issues

**Error:**
```
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
```

**Cause:** Too many browser instances or large pages consuming memory.

**Solutions:**

1. **Reduce concurrency:**
   ```typescript
   const reader = new ReaderClient();
   const result = await reader.scrape({
     urls: [...manyUrls],
     batchConcurrency: 2,  // Lower concurrency
   });
   await reader.close();
   ```

2. **Increase Node.js memory:**
   ```bash
   NODE_OPTIONS="--max-old-space-size=4096" npx reader scrape ...
   ```

3. **Use browser pool recycling (happens automatically, but you can tune it):**
   ```typescript
   import { BrowserPool } from "@vakra-dev/reader";

   const pool = new BrowserPool({
     size: 2,
     retireAfterPages: 50,  // Recycle browsers more frequently
   });
   ```

### ESM/CommonJS Issues

**Error:**
```
SyntaxError: Cannot use import statement outside a module
```

**Cause:** Reader is ESM-only, but your project is using CommonJS.

**Solutions:**

1. **Add to package.json:**
   ```json
   {
     "type": "module"
   }
   ```

2. **Or use .mjs extension:**
   ```bash
   mv script.js script.mjs
   node script.mjs
   ```

3. **Or use dynamic import in CommonJS:**
   ```javascript
   // script.cjs
   async function main() {
     const { scrape } = await import("@vakra-dev/reader");
     // ...
   }
   main();
   ```

### "Bun runtime not supported"

**Error:**
```
Error: Hero doesn't work with Bun runtime
```

**Cause:** Hero requires Node.js runtime and is not compatible with Bun.

**Solution:** Use Node.js to run your scripts:

```bash
# Use npx tsx
npx tsx script.ts

# or node with the tsx loader (on Node.js older than 18.19 / 20.6, use --loader tsx instead)
node --import tsx script.ts
```

## Debugging Tips

### Enable Verbose Logging

```typescript
const reader = new ReaderClient({ verbose: true });
const result = await reader.scrape({
  urls: ["https://example.com"],
});
await reader.close();
```

This shows:
- Cloudflare challenge detection
- Page navigation events
- Timing information
- Error details

### Show Browser Window

```typescript
const reader = new ReaderClient({ showChrome: true });
const result = await reader.scrape({
  urls: ["https://example.com"],
});
await reader.close();
```

This opens a visible Chrome window so you can see:
- What the page looks like
- Cloudflare challenges appearing
- JavaScript errors in DevTools

### Debug Challenge Detection

Challenge detection and resolution happen automatically. To see what's happening, enable verbose logging with `new ReaderClient({ verbose: true })`.

### Log Progress

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 3,
  onProgress: ({ completed, total, currentUrl }) => {
    console.log(`[${completed}/${total}] ${currentUrl}`);
  },
});
await reader.close();
```

## Performance Issues

### Slow Scraping

1. **Increase concurrency (if resources allow):**
   ```typescript
   batchConcurrency: 5  // Default is 1
   ```

2. **Use browser pool for repeated scrapes:**
   ```typescript
   import { BrowserPool } from "@vakra-dev/reader";

   const pool = new BrowserPool({ size: 5 });
   await pool.initialize();

   // Reuse pool for multiple operations
   for (const url of urls) {
     await pool.withBrowser(async (hero) => {
       await hero.goto(url);
       // ...
     });
   }

   await pool.shutdown();
   ```

3. **Use shared Hero Core for production:**
   See [Production Server Guide](deployment/production-server.md)

### High Memory Usage

1. **Reduce pool size:**
   ```typescript
   const pool = new BrowserPool({ size: 2 });
   ```

2. **Enable more aggressive recycling:**
   ```typescript
   const pool = new BrowserPool({
     size: 3,
     retireAfterPages: 30,      // Default: 100
     retireAfterMinutes: 15,    // Default: 30
   });
   ```

3. **Process URLs in smaller batches:**
   ```typescript
   const reader = new ReaderClient();
   const batchSize = 10;
   for (let i = 0; i < urls.length; i += batchSize) {
     const batch = urls.slice(i, i + batchSize);
     await reader.scrape({ urls: batch, batchConcurrency: 3 });
     // Allow garbage collection between batches
     await new Promise(r => setTimeout(r, 1000));
   }
   await reader.close();
   ```

## Site-Specific Issues

### JavaScript-Heavy Sites

Some sites require waiting for JavaScript to render:

```typescript
const reader = new ReaderClient();
const result = await reader.scrape({
  urls: ["https://spa-site.com"],
  waitForSelector: ".main-content",  // Wait for this element
  timeoutMs: 60000,
});
await reader.close();
```

### Sites with Infinite Scroll

Crawling may not discover all content. Consider:

1. Limiting depth and using specific URL patterns
2. Using the API directly with custom scroll logic

### Login-Protected Content

Reader doesn't handle authentication directly. Options:

1. Use cookies from an authenticated session
2. Build custom authentication logic using the Browser Pool
3. Use a headless browser automation tool for login, then Reader for scraping

## Getting More Help

1. **Check the logs** with `-v` flag
2. **Search existing issues** on [GitHub](https://github.com/vakra-dev/reader/issues)
3. **Open a new issue** with:
   - Node.js version
   - Reader version
   - Operating system
   - Error message and stack trace
   - Minimal reproduction steps

## Related Guides

- [Getting Started](getting-started.md)
- [Cloudflare Bypass](guides/cloudflare-bypass.md)
- [Browser Pool](guides/browser-pool.md)
- [Proxy Configuration](guides/proxy-configuration.md)


================================================
FILE: ecosystem.config.cjs
================================================
/**
 * PM2 ecosystem config for reader daemon.
 *
 * Two separate instances on different ports, each with its own proxy pool.
 * NOT cluster mode: Hero browser pool is stateful (proxy-bound browsers).
 *
 * Proxy sets are split via READER_PROXIES env var in each instance's .env file.
 * Example:
 *   Instance 1 (.env.1): READER_PROXIES=dc1,dc2,dc3,dc4,dc5,res1,res2
 *   Instance 2 (.env.2): READER_PROXIES=dc6,dc7,dc8,dc9,dc10,res3,res4
 */
module.exports = {
  apps: [
    {
      name: "reader-daemon-1",
      script: "dist/cli/index.js",
      args: "start --port 6003",
      node_args: "--env-file=.env.1",
      instances: 1,
      autorestart: true,
      max_memory_restart: "2G",
      env: {
        NODE_ENV: "production",
      },
    },
    {
      name: "reader-daemon-2",
      script: "dist/cli/index.js",
      args: "start --port 6004",
      node_args: "--env-file=.env.2",
      instances: 1,
      autorestart: true,
      max_memory_restart: "2G",
      env: {
        NODE_ENV: "production",
      },
    },
  ],
};


================================================
FILE: examples/.gitignore
================================================
# Dependencies
node_modules/
bun.lockb

# Build outputs
dist/
*.js
*.d.ts
*.map

# Environment
.env
.env.local
.env.*.local

# Logs
*.log
npm-debug.log*

# OS
.DS_Store

# IDE
.idea/
.vscode/
*.swp
*.swo


================================================
FILE: examples/.nvmrc
================================================
v22.12.0


================================================
FILE: examples/README.md
================================================
# Reader Examples

Examples demonstrating various uses of Reader.

## Structure

```
examples/
├── basic/                    # Basic usage examples
│   ├── basic-scrape.ts       # Single URL scraping
│   ├── batch-scrape.ts       # Concurrent multi-URL scraping
│   ├── large-batch-scrape.ts # Large-scale batch scraping (1000+ URLs)
│   ├── browser-pool-config.ts # Browser pool configuration
│   ├── proxy-pool.ts         # Proxy rotation with multiple proxies
│   ├── cloudflare-bypass.ts  # Cloudflare-protected site scraping
│   ├── crawl-website.ts      # Website crawling
│   ├── all-formats.ts        # All output formats
│   └── with-proxy.ts         # Single proxy configuration
│
├── ai-tools/                 # AI framework integrations
│   ├── openai-summary.ts     # OpenAI summarization
│   ├── anthropic-summary.ts  # Anthropic summarization
│   ├── vercel-ai-stream.ts   # Vercel AI SDK streaming
│   ├── langchain-loader.ts   # LangChain document loader
│   ├── llamaindex-loader.ts  # LlamaIndex document loader
│   ├── pinecone-ingest.ts    # Pinecone vector store
│   └── qdrant-ingest.ts      # Qdrant vector store
│
├── production/               # Production-ready setups
│   └── express-server/       # REST API server
│
└── deployment/               # Cloud deployment guides
    ├── docker/               # Docker + docker-compose
    ├── aws-lambda/           # AWS Lambda (container)
    └── vercel-functions/     # Vercel serverless
```

## Quick Start

1. Install dependencies from the examples folder:

```bash
cd examples
npm install
```

2. Start Ulixee Cloud (in a separate terminal):

```bash
npx @ulixee/cloud start
```

3. Run any example using tsx:

```bash
# Basic examples
npx tsx basic/basic-scrape.ts
npx tsx basic/batch-scrape.ts
npx tsx basic/large-batch-scrape.ts  # Large-scale (1000+ URLs)
npx tsx basic/browser-pool-config.ts
npx tsx basic/proxy-pool.ts
npx tsx basic/cloudflare-bypass.ts
npx tsx basic/crawl-website.ts

# AI tools examples (requires API keys)
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/openai-summary.ts https://example.com

export ANTHROPIC_API_KEY="sk-ant-..."
npx tsx ai-tools/anthropic-summary.ts https://example.com

# Production server
npx tsx production/express-server/src/index.ts
```

### Deploy with Docker

```bash
cd examples/deployment/docker
docker-compose up -d
```

## Requirements

- **Node.js** >= 18
- For LLM examples: API keys for OpenAI/Anthropic
- For deployment: Docker, cloud CLI tools

## Contributing

Have an example to share? Open a PR!


================================================
FILE: examples/ai-tools/README.md
================================================
# AI Tools Examples

Examples showing how to integrate Reader with AI frameworks, LLMs, and vector stores.

## Prerequisites

Start Ulixee Cloud in a separate terminal:

```bash
npx @ulixee/cloud start
```

## Examples

### LLM Summarization

Scrape webpages and summarize with LLMs.

| Example | Description | API Key Required |
|---------|-------------|------------------|
| [openai-summary.ts](./openai-summary.ts) | Summarize with GPT | `OPENAI_API_KEY` |
| [anthropic-summary.ts](./anthropic-summary.ts) | Summarize with Claude | `ANTHROPIC_API_KEY` |
| [vercel-ai-stream.ts](./vercel-ai-stream.ts) | Streaming summary with Vercel AI SDK | `OPENAI_API_KEY` |

```bash
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/openai-summary.ts https://example.com

export ANTHROPIC_API_KEY="sk-ant-..."
npx tsx ai-tools/anthropic-summary.ts https://example.com
```

### RAG Frameworks

Load scraped content into RAG frameworks for retrieval-augmented generation.

| Example | Description |
|---------|-------------|
| [langchain-loader.ts](./langchain-loader.ts) | Custom LangChain document loader |
| [llamaindex-loader.ts](./llamaindex-loader.ts) | LlamaIndex document loader |

```bash
npx tsx ai-tools/langchain-loader.ts
npx tsx ai-tools/llamaindex-loader.ts
```

### Vector Stores

Scrape and ingest content directly into vector databases for semantic search.

| Example | Description | API Keys Required |
|---------|-------------|-------------------|
| [pinecone-ingest.ts](./pinecone-ingest.ts) | Ingest into Pinecone | `PINECONE_API_KEY`, `OPENAI_API_KEY` |
| [qdrant-ingest.ts](./qdrant-ingest.ts) | Ingest into Qdrant | `OPENAI_API_KEY`, optionally `QDRANT_URL` |

```bash
# Pinecone
export PINECONE_API_KEY="..."
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/pinecone-ingest.ts

# Qdrant (local)
docker run -p 6333:6333 qdrant/qdrant
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/qdrant-ingest.ts
```

## Tips

- Use `markdown` format for LLM input (cleaner than HTML)
- Truncate content if it exceeds token limits
- For production, consider chunking large documents before embedding


================================================
FILE: examples/ai-tools/anthropic-summary.ts
================================================
/**
 * Anthropic (Claude) Summarization Example
 *
 * Scrapes a webpage and uses Claude to summarize the content.
 *
 * Usage:
 *   npx tsx ai-tools/anthropic-summary.ts https://example.com
 *
 * Requirements:
 *   - Set ANTHROPIC_API_KEY environment variable
 */

import { ReaderClient } from "@vakra-dev/reader";
import Anthropic from "@anthropic-ai/sdk";

/**
 * Scrapes the URL given as the first CLI argument (default: https://example.com)
 * as markdown, asks Claude for a 2-3 paragraph summary, and prints the summary
 * together with model and token-usage metadata.
 *
 * Failures after the reader has been created set `process.exitCode` instead of
 * calling `process.exit()`: `process.exit()` terminates the process
 * immediately, which would skip the `finally` block that closes the reader.
 */
async function main() {
  const url = process.argv[2] || "https://example.com";

  console.log(`Scraping ${url}...\n`);

  // Check for API key. No reader exists yet, so exiting immediately is safe here.
  if (!process.env.ANTHROPIC_API_KEY) {
    console.error("Error: ANTHROPIC_API_KEY environment variable is required");
    process.exit(1);
  }

  const reader = new ReaderClient();

  try {
    // Step 1: Scrape the webpage
    const result = await reader.scrape({
      urls: [url],
      formats: ["markdown"], // Markdown is best for LLM consumption
    });

    const content = result.data[0]?.markdown;
    if (!content) {
      console.error("No content scraped");
      process.exitCode = 1;
      return; // falls through to `finally`, which closes the reader
    }

    console.log(`Scraped ${content.length} characters`);
    console.log("Sending to Claude for summarization...\n");

    // Step 2: Summarize with Claude
    const anthropic = new Anthropic();

    const message = await anthropic.messages.create({
      model: "claude-3-haiku-20240307",
      max_tokens: 500,
      messages: [
        {
          role: "user",
          // Only the first 10,000 characters are sent to keep the prompt bounded.
          content: `Please summarize the following webpage content in 2-3 paragraphs:\n\n${content.slice(0, 10000)}`,
        },
      ],
    });

    // Use the first content block if it is text; otherwise fall back to an empty summary.
    const firstBlock = message.content[0];
    const summary = firstBlock?.type === "text" ? firstBlock.text : "";

    console.log("=== SUMMARY ===\n");
    console.log(summary);
    console.log("\n=== METADATA ===");
    console.log(`Source: ${url}`);
    console.log(`Content length: ${content.length} chars`);
    console.log(`Model: ${message.model}`);
    console.log(`Tokens: ${message.usage.input_tokens} in / ${message.usage.output_tokens} out`);
  } catch (error: unknown) {
    console.error("Error:", error instanceof Error ? error.message : error);
    process.exitCode = 1;
  } finally {
    await reader.close();
  }
}

// Only reachable if closing the reader itself fails; report it and signal failure.
main().catch((error: unknown) => {
  console.error("Error:", error instanceof Error ? error.message : error);
  process.exitCode = 1;
});


================================================
FILE: examples/ai-tools/langchain-loader.ts
================================================
/**
 * LangChain Document Loader Example
 *
 * Creates a custom LangChain document loader using Reader.
 *
 * Usage:
 *   npx tsx ai-tools/langchain-loader.ts
 */

import { ReaderClient } from "@vakra-dev/reader";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";

/**
 * LangChain document loader backed by Reader.
 *
 * Crawls when `crawl` is enabled and exactly one URL was supplied; in every
 * other case the supplied URLs are scraped directly. Either way, each page is
 * turned into a Document holding its markdown and basic page metadata.
 */
class ReaderEngineLoader extends BaseDocumentLoader {
  private urls: string[];
  private crawlMode: boolean;
  private maxPages: number;
  private depth: number;
  private reader: ReaderClient;

  constructor(options: {
    urls: string[];
    crawl?: boolean;
    maxPages?: number;
    depth?: number;
    reader: ReaderClient;
  }) {
    super();
    const { urls, crawl, maxPages, depth, reader } = options;
    this.reader = reader;
    this.urls = urls;
    // Defaults: no crawling, depth 1, at most 20 pages.
    this.crawlMode = crawl ?? false;
    this.depth = depth ?? 1;
    this.maxPages = maxPages ?? 20;
  }

  /**
   * Fetches the pages (by crawling or scraping) and wraps each one in a
   * Document with source, title, description and scrapedAt metadata.
   */
  async load(): Promise<Document[]> {
    const shouldCrawl = this.crawlMode && this.urls.length === 1;
    const pages = shouldCrawl ? await this.crawlPages() : await this.scrapePages();

    return pages.map(
      (page) =>
        new Document({
          pageContent: page.markdown || "",
          metadata: {
            source: page.metadata.baseUrl,
            title: page.metadata.website.title,
            description: page.metadata.website.description,
            scrapedAt: page.metadata.scrapedAt,
          },
        })
    );
  }

  /** Crawl mode: discover pages from the single seed URL; empty when nothing was scraped. */
  private async crawlPages() {
    const outcome = await this.reader.crawl({
      url: this.urls[0],
      depth: this.depth,
      maxPages: this.maxPages,
      scrape: true,
    });

    return outcome.scraped ? outcome.scraped.data : [];
  }

  /** Scrape mode: fetch exactly the configured URLs as markdown. */
  private async scrapePages() {
    const outcome = await this.reader.scrape({
      urls: this.urls,
      formats: ["markdown"],
      batchConcurrency: 2,
    });

    return outcome.data;
  }
}

// Example usage: load documents by direct scrape, then by crawl, using one shared reader
async function main() {
  console.log("LangChain Document Loader Example\n");

  const reader = new ReaderClient({ verbose: true });

  // Prints one line per document: its source and content length.
  const printDocs = (docs: Document[]) => {
    docs.forEach((doc) => {
      console.log(`  - ${doc.metadata.source}: ${doc.pageContent.length} chars`);
    });
  };

  try {
    // Example 1: Load specific URLs
    console.log("--- Example 1: Load specific URLs ---");
    const scrapedDocs = await new ReaderEngineLoader({
      urls: ["https://example.com", "https://example.org"],
      reader,
    }).load();
    console.log(`Loaded ${scrapedDocs.length} documents`);
    printDocs(scrapedDocs);

    // Example 2: Crawl a website
    console.log("\n--- Example 2: Crawl a website ---");
    const crawledDocs = await new ReaderEngineLoader({
      urls: ["https://example.com"],
      crawl: true,
      depth: 1,
      maxPages: 5,
      reader,
    }).load();
    console.log(`Crawled and loaded ${crawledDocs.length} documents`);
    printDocs(crawledDocs);

    // From here the documents can be fed into LangChain:
    // - Text splitters for chunking
    // - Vector stores for embeddings
    // - RAG pipelines
    // - etc.
  } finally {
    // Always release the reader, even when loading fails.
    await reader.close();
  }
}

main().catch((err) => console.error(err));


================================================
FILE: examples/ai-tools/llamaindex-loader.ts
================================================
/**
 * LlamaIndex Document Loader Example
 *
 * Creates a custom LlamaIndex document loader using Reader.
 *
 * Usage:
 *   npx tsx ai-tools/llamaindex-loader.ts
 */

import { ReaderClient } from "@vakra-dev/reader";
import { Document } from "llamaindex";

/**
 * Scrapes the given URLs as markdown with Reader and wraps each page in a
 * LlamaIndex Document (text = markdown, plus source/title/description/scrapedAt).
 */
async function loadDocuments(reader: ReaderClient, urls: string[]): Promise<Document[]> {
  const { data: pages } = await reader.scrape({
    urls,
    formats: ["markdown"],
    batchConcurrency: 2,
  });

  return pages.map(({ markdown, metadata }) => {
    const { baseUrl, website, scrapedAt } = metadata;

    return new Document({
      text: markdown || "",
      metadata: {
        source: baseUrl,
        // Nullish title/description are normalised to undefined.
        title: website.title ?? undefined,
        description: website.description ?? undefined,
        scrapedAt,
      },
    });
  });
}

/**
 * Crawls a site starting at `url` (default depth 1, at most 20 pages) with
 * scraping enabled, and returns every scraped page as a LlamaIndex Document.
 * Returns an empty list when the crawl produced no scraped result.
 */
async function crawlAndLoadDocuments(
  reader: ReaderClient,
  url: string,
  options: { depth?: number; maxPages?: number } = {}
): Promise<Document[]> {
  const depth = options.depth ?? 1;
  const maxPages = options.maxPages ?? 20;

  const { scraped } = await reader.crawl({ url, depth, maxPages, scrape: true });

  if (scraped) {
    return scraped.data.map(({ markdown, metadata }) => {
      const { baseUrl, website, scrapedAt } = metadata;

      return new Document({
        text: markdown || "",
        metadata: {
          source: baseUrl,
          // Nullish title/description are normalised to undefined.
          title: website.title ?? undefined,
          description: website.description ?? undefined,
          scrapedAt,
        },
      });
    });
  }

  return [];
}

// Example usage: load documents by direct scrape, then by crawl, using one shared reader
async function main() {
  console.log("LlamaIndex Document Loader Example\n");

  const reader = new ReaderClient({ verbose: true });

  // Prints one line per document: its source and text length.
  const printDocs = (docs: Document[]) => {
    docs.forEach((doc) => {
      console.log(`  - ${doc.metadata.source}: ${doc.getText().length} chars`);
    });
  };

  try {
    // Example 1: Load specific URLs
    console.log("--- Example 1: Load specific URLs ---");
    const scrapedDocs = await loadDocuments(reader, ["https://example.com", "https://example.org"]);
    console.log(`Loaded ${scrapedDocs.length} documents`);
    printDocs(scrapedDocs);

    // Example 2: Crawl a website
    console.log("\n--- Example 2: Crawl a website ---");
    const crawlLimits = { depth: 1, maxPages: 5 };
    const crawledDocs = await crawlAndLoadDocuments(reader, "https://example.com", crawlLimits);
    console.log(`Crawled and loaded ${crawledDocs.length} documents`);
    printDocs(crawledDocs);

    // From here the documents can be fed into LlamaIndex:
    // - VectorStoreIndex for similarity search
    // - SummaryIndex for summarization
    // - KnowledgeGraphIndex for graph-based retrieval
  } finally {
    // Always release the reader, even when loading fails.
    await reader.close();
  }
}

main().catch((err) => console.error(err));


================================================
FILE: examples/ai-tools/openai-summary.ts
================================================
/**
 * OpenAI Summarization Example
 *
 * Scrapes a webpage and uses OpenAI to summarize the content.
 *
 * Usage:
 *   npx tsx ai-tools/openai-summary.ts https://example.com
 *
 * Requirements:
 *   - Set OPENAI_API_KEY environment variable
 */

import { ReaderClient } from "@vakra-dev/reader";
import OpenAI from "openai";

/**
 * Scrapes the URL given as the first CLI argument (default: https://example.com)
 * as markdown, asks OpenAI for a concise summary, and prints the summary
 * together with model and token-usage metadata.
 *
 * Failures after the reader has been created set `process.exitCode` instead of
 * calling `process.exit()`: `process.exit()` terminates the process
 * immediately, which would skip the `finally` block that closes the reader.
 */
async function main() {
  const url = process.argv[2] || "https://example.com";

  console.log(`Scraping ${url}...\n`);

  // Check for API key. No reader exists yet, so exiting immediately is safe here.
  if (!process.env.OPENAI_API_KEY) {
    console.error("Error: OPENAI_API_KEY environment variable is required");
    process.exit(1);
  }

  const reader = new ReaderClient();

  try {
    // Step 1: Scrape the webpage
    const result = await reader.scrape({
      urls: [url],
      formats: ["markdown"], // Markdown is best for LLM consumption
    });

    const content = result.data[0]?.markdown;
    if (!content) {
      console.error("No content scraped");
      process.exitCode = 1;
      return; // falls through to `finally`, which closes the reader
    }

    console.log(`Scraped ${content.length} characters`);
    console.log("Sending to OpenAI for summarization...\n");

    // Step 2: Summarize with OpenAI
    const openai = new OpenAI();

    const completion = await openai.chat.completions.create({
      model: "gpt-4o-mini",
      messages: [
        {
          role: "system",
          content:
            "You are a helpful assistant that summarizes web content. Provide a concise summary in 2-3 paragraphs.",
        },
        {
          role: "user",
          // Only the first 10,000 characters are sent to keep the prompt bounded.
          content: `Please summarize the following webpage content:\n\n${content.slice(0, 10000)}`,
        },
      ],
      max_tokens: 500,
    });

    const summary = completion.choices[0]?.message?.content;

    console.log("=== SUMMARY ===\n");
    console.log(summary);
    console.log("\n=== METADATA ===");
    console.log(`Source: ${url}`);
    console.log(`Content length: ${content.length} chars`);
    console.log(`Model: ${completion.model}`);
    console.log(`Tokens used: ${completion.usage?.total_tokens}`);
  } catch (error: unknown) {
    console.error("Error:", error instanceof Error ? error.message : error);
    process.exitCode = 1;
  } finally {
    await reader.close();
  }
}

// Only reachable if closing the reader itself fails; report it and signal failure.
main().catch((error: unknown) => {
  console.error("Error:", error instanceof Error ? error.message : error);
  process.exitCode = 1;
});


================================================
FILE: examples/ai-tools/pinecone-ingest.ts
================================================
/**
 * Pinecone Vector Store Ingestion Example
 *
 * Scrapes webpages and ingests them into Pinecone for semantic search.
 *
 * Usage:
 *   npx tsx ai-tools/pinecone-ingest.ts
 *
 * Requirements:
 *   - Set PINECONE_API_KEY environment variable
 *   - Set OPENAI_API_KEY environment variable (for embeddings)
 *   - Create a Pinecone index with dimension 1536 (for text-embedding-3-small)
 */

import { ReaderClient } from "@vakra-dev/reader";
import { Pinecone } from "@pinecone-database/pinecone";
import OpenAI from "openai";

const INDEX_NAME = "reader-docs";

/**
 * Scrapes two example URLs as markdown, embeds each non-empty page with
 * OpenAI's text-embedding-3-small model, upserts the vectors into the
 * `INDEX_NAME` Pinecone index, and finishes with a sample similarity query.
 *
 * Exits with code 1 when PINECONE_API_KEY or OPENAI_API_KEY is missing.
 * When no page yields content, the upsert and query are skipped instead of
 * sending an empty record list to Pinecone.
 */
async function main() {
  // Check for required API keys (before any client or browser resource is created)
  if (!process.env.PINECONE_API_KEY) {
    console.error("Error: PINECONE_API_KEY environment variable is required");
    process.exit(1);
  }
  if (!process.env.OPENAI_API_KEY) {
    console.error("Error: OPENAI_API_KEY environment variable is required");
    process.exit(1);
  }

  console.log("Pinecone Vector Store Ingestion Example\n");

  // Initialize clients
  const pinecone = new Pinecone();
  const openai = new OpenAI();
  const reader = new ReaderClient({ verbose: true });

  try {
    // Step 1: Scrape webpages
    const urls = ["https://example.com", "https://example.org"];

    console.log(`Scraping ${urls.length} URLs...`);
    const result = await reader.scrape({
      urls,
      formats: ["markdown"],
      batchConcurrency: 2,
    });

    console.log(`Scraped ${result.batchMetadata.successfulUrls} pages`);

    // Step 2: Generate embeddings and prepare vectors
    console.log("\nGenerating embeddings...");
    const index = pinecone.index(INDEX_NAME);

    const vectors = [];
    for (const page of result.data) {
      const content = page.markdown || "";
      if (!content) continue; // nothing to embed for this page

      // Truncate content to fit embedding model limits
      const truncatedContent = content.slice(0, 8000);

      // Generate embedding
      const embeddingResponse = await openai.embeddings.create({
        model: "text-embedding-3-small",
        input: truncatedContent,
      });

      const embedding = embeddingResponse.data[0].embedding;

      vectors.push({
        // The base64-encoded URL serves as the vector id.
        id: Buffer.from(page.metadata.baseUrl).toString("base64"),
        values: embedding,
        metadata: {
          url: page.metadata.baseUrl,
          title: page.metadata.website.title || "",
          description: page.metadata.website.description || "",
          content: truncatedContent.slice(0, 1000), // Store preview in metadata
          scrapedAt: page.metadata.scrapedAt,
        },
      });

      console.log(`  - Embedded: ${page.metadata.baseUrl}`);
    }

    // An empty record list is not a valid upsert; stop here when nothing was embedded.
    if (vectors.length === 0) {
      console.log("\nNo scraped content to ingest; skipping upsert and query.");
      return;
    }

    // Step 3: Upsert to Pinecone
    console.log(`\nUpserting ${vectors.length} vectors to Pinecone...`);
    await index.upsert(vectors);

    console.log("\nDone! Vectors are now searchable in Pinecone.");
    console.log(`Index: ${INDEX_NAME}`);

    // Example: Query the index
    console.log("\n--- Example Query ---");
    const queryText = "example domain";
    const queryEmbedding = await openai.embeddings.create({
      model: "text-embedding-3-small",
      input: queryText,
    });

    const queryResponse = await index.query({
      vector: queryEmbedding.data[0].embedding,
      topK: 3,
      includeMetadata: true,
    });

    console.log(`Query: "${queryText}"`);
    console.log("Results:");
    for (const match of queryResponse.matches) {
      console.log(`  - ${match.metadata?.title} (score: ${match.score?.toFixed(3)})`);
      console.log(`    URL: ${match.metadata?.url}`);
    }
  } finally {
    // Always release the reader, including on the early return above.
    await reader.close();
  }
}

main().catch(console.error);


================================================
FILE: examples/ai-tools/qdrant-ingest.ts
================================================
/**
 * Qdrant Vector Store Ingestion Example
 *
 * Scrapes webpages and ingests them into Qdrant for semantic search.
 *
 * Usage:
 *   npx tsx ai-tools/qdrant-ingest.ts
 *
 * Requirements:
 *   - Set QDRANT_URL environment variable (default: http://localhost:6333)
 *   - Set QDRANT_API_KEY environment variable (optional, for Qdrant Cloud)
 *   - Set OPENAI_API_KEY environment variable (for embeddings)
 */

import { ReaderClient } from "@vakra-dev/reader";
import { QdrantClient } from "@qdrant/js-client-rest";
import OpenAI from "openai";

// Name of the Qdrant collection this example looks up, creates if missing,
// upserts into, and searches.
const COLLECTION_NAME = "reader-docs";
// Vector size passed to createCollection; it has to agree with the length of
// the embeddings produced below.
const VECTOR_SIZE = 1536; // text-embedding-3-small dimension

/**
 * Scrapes a fixed list of URLs, embeds each page's markdown with OpenAI,
 * upserts the resulting points into the Qdrant collection, and finishes with
 * one sample similarity search.
 *
 * Exits with code 1 up front when OPENAI_API_KEY is not set. Once the reader
 * has been created it is always closed in `finally`, whether the run succeeds
 * or throws.
 */
async function main() {
  // Check for required API keys
  if (!process.env.OPENAI_API_KEY) {
    console.error("Error: OPENAI_API_KEY environment variable is required");
    process.exit(1);
  }

  console.log("Qdrant Vector Store Ingestion Example\n");

  // Initialize clients
  const qdrantUrl = process.env.QDRANT_URL || "http://localhost:6333";
  const qdrant = new QdrantClient({
    url: qdrantUrl,
    apiKey: process.env.QDRANT_API_KEY,
  });
  const openai = new OpenAI();
  const reader = new ReaderClient({ verbose: true });

  try {
    // Ensure collection exists.
    // Any getCollection failure is treated as "collection missing"; the
    // createCollection call below is not guarded, so its error propagates.
    try {
      await qdrant.getCollection(COLLECTION_NAME);
      console.log(`Using existing collection: ${COLLECTION_NAME}`);
    } catch {
      console.log(`Creating collection: ${COLLECTION_NAME}`);
      await qdrant.createCollection(COLLECTION_NAME, {
        vectors: {
          size: VECTOR_SIZE,
          distance: "Cosine",
        },
      });
    }

    // Step 1: Scrape webpages
    const urls = ["https://example.com", "https://example.org"];

    console.log(`\nScraping ${urls.length} URLs...`);
    const result = await reader.scrape({
      urls,
      formats: ["markdown"],
      batchConcurrency: 2,
    });

    console.log(`Scraped ${result.batchMetadata.successfulUrls} pages`);

    // Step 2: Generate embeddings and prepare points
    console.log("\nGenerating embeddings...");
    const points = [];

    for (let i = 0; i < result.data.length; i++) {
      const page = result.data[i];
      const content = page.markdown || "";
      // Pages without markdown are skipped; nothing to embed.
      if (!content) continue;

      // Truncate content to fit embedding model limits
      const truncatedContent = content.slice(0, 8000);

      // Generate embedding
      const embeddingResponse = await openai.embeddings.create({
        model: "text-embedding-3-small",
        input: truncatedContent,
      });

      const embedding = embeddingResponse.data[0].embedding;

      points.push({
        // Qdrant requires positive integers or UUIDs.
        // NOTE(review): the ID is the page's 1-based position in this run's
        // results, so a later run with different URLs presumably overwrites
        // the same IDs — confirm this is acceptable for your use case.
        id: i + 1,
        vector: embedding,
        payload: {
          url: page.metadata.baseUrl,
          title: page.metadata.website.title || "",
          description: page.metadata.website.description || "",
          content: truncatedContent.slice(0, 1000), // Store preview in payload
          scrapedAt: page.metadata.scrapedAt,
        },
      });

      console.log(`  - Embedded: ${page.metadata.baseUrl}`);
    }

    // Step 3: Upsert to Qdrant
    if (points.length === 0) {
      // Every page was skipped (or nothing was scraped): avoid sending an
      // empty upsert request and say so instead of reporting success.
      console.log("\nNo pages with content were scraped; skipping upsert.");
    } else {
      console.log(`\nUpserting ${points.length} points to Qdrant...`);
      await qdrant.upsert(COLLECTION_NAME, {
        wait: true,
        points,
      });

      console.log("\nDone! Points are now searchable in Qdrant.");
    }

    console.log(`Collection: ${COLLECTION_NAME}`);
    console.log(`Qdrant URL: ${qdrantUrl}`);

    // Example: Query the collection
    console.log("\n--- Example Query ---");
    const queryText = "example domain";
    const queryEmbedding = await openai.embeddings.create({
      model: "text-embedding-3-small",
      input: queryText,
    });

    const searchResponse = await qdrant.search(COLLECTION_NAME, {
      vector: queryEmbedding.data[0].embedding,
      limit: 3,
      with_payload: true,
    });

    console.log(`Query: "${queryText}"`);
    console.log("Results:");
    // Named `hit` so it does not shadow the scrape `result` above.
    for (const hit of searchResponse) {
      console.log(`  - ${hit.payload?.title} (score: ${hit.score.toFixed(3)})`);
      console.log(`    URL: ${hit.payload?.url}`);
    }
  } finally {
    await reader.close();
  }
}

// Log the failure and mark the process as failed so callers see a non-zero
// exit code.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});


================================================
FILE: examples/ai-tools/vercel-ai-stream.ts
================================================
/**
 * Vercel AI SDK Streaming Example
 *
 * Scrapes a webpage and streams a summary using the Vercel AI SDK.
 *
 * Usage:
 *   npx tsx ai-tools/vercel-ai-stream.ts https://example.com
 *
 * Requirements:
 *   - Set OPENAI_API_KEY environment variable
 */

import { ReaderClient } from "@vakra-dev/reader";
import { openai } from "@ai-sdk/openai";
import { streamText } from "ai";

/**
 * Scrapes the URL given as the first CLI argument (default:
 * https://example.com) and streams a model-generated summary of its markdown
 * to stdout.
 *
 * Failure paths inside the `try` set `process.exitCode` instead of calling
 * `process.exit()`: exiting immediately would skip the `finally` block, and
 * the reader would never be closed.
 */
async function main() {
  const url = process.argv[2] || "https://example.com";

  console.log(`Scraping ${url}...\n`);

  // Check for API key (the reader does not exist yet, so an immediate exit
  // leaves nothing to clean up)
  if (!process.env.OPENAI_API_KEY) {
    console.error("Error: OPENAI_API_KEY environment variable is required");
    process.exit(1);
  }

  const reader = new ReaderClient({ verbose: true });

  try {
    // Step 1: Scrape the webpage
    const result = await reader.scrape({
      urls: [url],
      formats: ["markdown"],
    });

    const content = result.data[0]?.markdown;
    if (!content) {
      console.error("No content scraped");
      // Report failure but return normally so `finally` closes the reader.
      process.exitCode = 1;
      return;
    }

    console.log(`Scraped ${content.length} characters`);
    console.log("Streaming summary...\n");
    console.log("=== STREAMING SUMMARY ===\n");

    // Step 2: Stream summary with Vercel AI SDK
    const { textStream } = await streamText({
      model: openai("gpt-4o-mini"),
      system:
        "You are a helpful assistant that summarizes web content. Provide a concise summary in 2-3 paragraphs.",
      // Only the first 10,000 characters of the page are sent to the model.
      prompt: `Please summarize the following webpage content:\n\n${content.slice(0, 10000)}`,
      maxTokens: 500,
    });

    // Stream the response to stdout
    for await (const chunk of textStream) {
      process.stdout.write(chunk);
    }

    console.log("\n\n=== METADATA ===");
    console.log(`Source: ${url}`);
    console.log(`Content length: ${content.length} chars`);
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances; narrow first.
    console.error("Error:", error instanceof Error ? error.message : error);
    process.exitCode = 1;
  } finally {
    await reader.close();
  }
}

main();


================================================
FILE: examples/basic/README.md
================================================
# Basic Examples

Simple examples demonstrating core Reader functionality.

## Running Examples

Run all commands from the `reader` repository root. Requires Node v22+ (`nvm use v22`).

```bash
npx tsx --tsconfig examples/tsconfig.json examples/basic/<example>.ts
```

If Hero's bundled Chrome binary isn't available (e.g. Apple Silicon), point to your local Chrome:

```bash
export CHROME_139_BIN="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
```

## Scraping

| Example | Description |
|---------|-------------|
| `basic-scrape.ts` | Scrape a single URL and display markdown output |
| `batch-scrape.ts` | Scrape multiple URLs concurrently with progress tracking |
| `all-formats.ts` | Output content in all supported formats (markdown, html) |

## Crawling

| Example | Description |
|---------|-------------|
| `crawl-website.ts` | Crawl a website to discover and optionally scrape pages |

## Browser Sessions

Browser sessions launch a stealthed Chrome and return a CDP WebSocket URL.
Connect with Playwright, Puppeteer, or any CDP client. Anti-bot stealth is
active (`webdriver=false`, navigator spoofing, WebRTC masking).

| Example | Description |
|---------|-------------|
| `browser-session.ts` | Playwright: navigate, extract data, screenshot |
| `browser-session-actions.ts` | Playwright: click, type, search, wait for elements |
| `browser-session-puppeteer.ts` | Puppeteer: same flow via `connect({ browserWSEndpoint })` |
| `browser-session-selenium.ts` | Raw CDP: direct WebSocket commands, no framework needed |

### Dependencies

```bash
npm install --save-dev playwright-core   # for Playwright examples
npm install --save-dev puppeteer-core    # for Puppeteer example
npm install --save-dev ws                # for raw CDP example
```

## Configuration

| Example | Description |
|---------|-------------|
| `with-proxy.ts` | Scrape using a proxy server |
| `proxy-pool.ts` | Rotate through multiple proxies |
| `browser-pool-config.ts` | Configure pool size, retirement, and queue limits |
| `cloudflare-bypass.ts` | Scrape a Cloudflare-protected site |


================================================
FILE: examples/basic/all-formats.ts
================================================
#!/usr/bin/env node
/**
 * All Formats Example
 *
 * Demonstrates outputting content in all supported formats (markdown and html)
 */

import { ReaderClient } from "@vakra-dev/reader";

async function main() {
  console.log("Starting all-formats example\n");

  const reader = new ReaderClient({ verbose: true });

  try {
    const result = await reader.scrape({
      urls: ["https://example.com"],
      formats: ["markdown", "html"],
    });

    const page = result.data[0]

Download .txt

gitextract_cms0mrdu/

├── .eslintrc.json
├── .github/
│   └── workflows/
│       ├── ci.yml
│       └── publish.yml
├── .gitignore
├── .leasotrc
├── .nvmrc
├── .prettierrc
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── docs/
│   ├── api-reference.md
│   ├── architecture.md
│   ├── assets/
│   │   ├── .gitkeep
│   │   └── demo.tape
│   ├── deployment/
│   │   ├── docker.md
│   │   ├── job-queues.md
│   │   └── production-server.md
│   ├── getting-started.md
│   ├── guides/
│   │   ├── browser-pool.md
│   │   ├── browser-sessions.md
│   │   ├── cloudflare-bypass.md
│   │   ├── output-formats.md
│   │   └── proxy-configuration.md
│   └── troubleshooting.md
├── ecosystem.config.cjs
├── examples/
│   ├── .gitignore
│   ├── .nvmrc
│   ├── README.md
│   ├── ai-tools/
│   │   ├── README.md
│   │   ├── anthropic-summary.ts
│   │   ├── langchain-loader.ts
│   │   ├── llamaindex-loader.ts
│   │   ├── openai-summary.ts
│   │   ├── pinecone-ingest.ts
│   │   ├── qdrant-ingest.ts
│   │   └── vercel-ai-stream.ts
│   ├── basic/
│   │   ├── README.md
│   │   ├── all-formats.ts
│   │   ├── basic-scrape.ts
│   │   ├── batch-scrape.ts
│   │   ├── browser-pool-config.ts
│   │   ├── browser-session-actions.ts
│   │   ├── browser-session-puppeteer.ts
│   │   ├── browser-session-selenium.ts
│   │   ├── browser-session.ts
│   │   ├── cloudflare-bypass.ts
│   │   ├── crawl-website.ts
│   │   ├── large-batch-scrape.ts
│   │   ├── proxy-pool.ts
│   │   └── with-proxy.ts
│   ├── package.json
│   ├── production/
│   │   ├── README.md
│   │   ├── browser-pool-scaling/
│   │   │   ├── README.md
│   │   │   ├── package.json
│   │   │   └── src/
│   │   │       └── index.ts
│   │   ├── express-server/
│   │   │   ├── README.md
│   │   │   ├── package.json
│   │   │   └── src/
│   │   │       └── index.ts
│   │   └── job-queue-bullmq/
│   │       ├── README.md
│   │       ├── package.json
│   │       └── src/
│   │           ├── index.ts
│   │           ├── queue.ts
│   │           └── worker.ts
│   └── tsconfig.json
├── package.json
├── result.md
├── scripts/
│   └── release.sh
├── src/
│   ├── browser/
│   │   ├── hero-config.ts
│   │   ├── pool.ts
│   │   ├── proxy-bound-browser.ts
│   │   ├── tiered-pool.ts
│   │   └── types.ts
│   ├── browser-session.ts
│   ├── browser-types.ts
│   ├── cli/
│   │   └── index.ts
│   ├── client.ts
│   ├── cloudflare/
│   │   ├── detector.ts
│   │   ├── handler.ts
│   │   └── types.ts
│   ├── config/
│   │   └── domain-profiles.ts
│   ├── crawl-types.ts
│   ├── crawler.ts
│   ├── daemon/
│   │   ├── client.ts
│   │   ├── index.ts
│   │   └── server.ts
│   ├── engines/
│   │   ├── errors.ts
│   │   ├── hero/
│   │   │   └── index.ts
│   │   ├── index.ts
│   │   ├── orchestrator.ts
│   │   └── types.ts
│   ├── errors.ts
│   ├── formatters/
│   │   ├── html.ts
│   │   ├── index.ts
│   │   ├── markdown.ts
│   │   └── postprocess.ts
│   ├── index.ts
│   ├── proxy/
│   │   ├── config.ts
│   │   ├── env.ts
│   │   ├── health-tracker.ts
│   │   ├── proxy-gate.ts
│   │   └── verify.ts
│   ├── scraper.ts
│   ├── types.ts
│   └── utils/
│       ├── block-detector.ts
│       ├── content-cleaner.ts
│       ├── logger.ts
│       ├── metadata-extractor.ts
│       ├── rate-limiter.ts
│       ├── robots-parser.ts
│       ├── url-helpers.ts
│       └── url-rewriter.ts
├── tests/
│   ├── engines/
│   │   └── orchestrator.test.ts
│   ├── fixtures/
│   │   ├── amazon-bot-page.html
│   │   ├── cloudflare-challenge.html
│   │   ├── empty-page.html
│   │   └── simple-static.html
│   ├── integration/
│   │   └── daemon.test.ts
│   └── unit/
│       ├── block-detector-cloudflare.test.ts
│       ├── block-detector-fixtures.test.ts
│       ├── block-detector.test.ts
│       ├── browser-session.test.ts
│       ├── content-cleaner.test.ts
│       ├── crawler.test.ts
│       ├── daemon-dispatch.test.ts
│       ├── domain-profiles.test.ts
│       ├── errors.test.ts
│       ├── health-tracker.test.ts
│       ├── html-size-guard.test.ts
│       ├── markdown-formatter.test.ts
│       ├── metadata-extractor.test.ts
│       ├── postprocess.test.ts
│       ├── proxy-bound-browser.test.ts
│       ├── proxy-config.test.ts
│       ├── proxy-gate.test.ts
│       ├── proxy-verify.test.ts
│       ├── robots-parser.test.ts
│       ├── scraper-pipeline.test.ts
│       ├── scraper-retry.test.ts
│       ├── tiered-pool.test.ts
│       ├── url-helpers.test.ts
│       └── url-rewriter.test.ts
├── tsconfig.json
├── tsup.config.ts
└── vitest.config.ts

Download .txt

SYMBOL INDEX (477 symbols across 81 files)

FILE: examples/ai-tools/anthropic-summary.ts
  function main (line 16) | async function main() {

FILE: examples/ai-tools/langchain-loader.ts
  class ReaderEngineLoader (line 17) | class ReaderEngineLoader extends BaseDocumentLoader {
    method constructor (line 24) | constructor(options: {
    method load (line 39) | async load(): Promise<Document[]> {
  function main (line 94) | async function main() {

FILE: examples/ai-tools/llamaindex-loader.ts
  function loadDocuments (line 16) | async function loadDocuments(reader: ReaderClient, urls: string[]): Prom...
  function crawlAndLoadDocuments (line 40) | async function crawlAndLoadDocuments(
  function main (line 71) | async function main() {

FILE: examples/ai-tools/openai-summary.ts
  function main (line 16) | async function main() {

FILE: examples/ai-tools/pinecone-ingest.ts
  constant INDEX_NAME (line 19) | const INDEX_NAME = "reader-docs";
  function main (line 21) | async function main() {

FILE: examples/ai-tools/qdrant-ingest.ts
  constant COLLECTION_NAME (line 19) | const COLLECTION_NAME = "reader-docs";
  constant VECTOR_SIZE (line 20) | const VECTOR_SIZE = 1536;
  function main (line 22) | async function main() {

FILE: examples/ai-tools/vercel-ai-stream.ts
  function main (line 17) | async function main() {

FILE: examples/basic/all-formats.ts
  function main (line 10) | async function main() {

FILE: examples/basic/basic-scrape.ts
  function main (line 10) | async function main() {

FILE: examples/basic/batch-scrape.ts
  function main (line 10) | async function main() {

FILE: examples/basic/browser-pool-config.ts
  function main (line 11) | async function main() {

FILE: examples/basic/browser-session-actions.ts
  function main (line 17) | async function main() {

FILE: examples/basic/browser-session-puppeteer.ts
  function main (line 15) | async function main() {

FILE: examples/basic/browser-session-selenium.ts
  function sendCDP (line 21) | function sendCDP(
  function main (line 45) | async function main() {

FILE: examples/basic/browser-session.ts
  function main (line 22) | async function main() {

FILE: examples/basic/cloudflare-bypass.ts
  function main (line 14) | async function main() {

FILE: examples/basic/crawl-website.ts
  function main (line 10) | async function main() {

FILE: examples/basic/large-batch-scrape.ts
  function generateSampleUrls (line 34) | function generateSampleUrls(count: number): string[] {
  function main (line 44) | async function main() {

FILE: examples/basic/proxy-pool.ts
  function main (line 15) | async function main() {

FILE: examples/basic/with-proxy.ts
  function main (line 10) | async function main() {

FILE: examples/production/browser-pool-scaling/src/index.ts
  function createConnectionToCore (line 23) | function createConnectionToCore(): ConnectionToHeroCore {
  constant PORT (line 64) | const PORT = process.env.PORT || 3003;
  function formatDuration (line 319) | function formatDuration(ms: number): string {
  function getUtilizationStatus (line 331) | function getUtilizationStatus(stats: { total: number; busy: number; queu...
  function startServer (line 365) | async function startServer() {

FILE: examples/production/express-server/src/index.ts
  constant PORT (line 21) | const PORT = process.env.PORT || 3001;
  function startServer (line 243) | async function startServer() {

FILE: examples/production/job-queue-bullmq/src/index.ts
  constant PORT (line 21) | const PORT = process.env.PORT || 3002;
  function startServer (line 265) | async function startServer() {

FILE: examples/production/job-queue-bullmq/src/queue.ts
  type ScrapeJobData (line 37) | interface ScrapeJobData {
  type ScrapeJobResult (line 51) | interface ScrapeJobResult {
  function addScrapeJob (line 75) | async function addScrapeJob(
  function getJob (line 89) | async function getJob(jobId: string) {
  function getQueueStats (line 96) | async function getQueueStats() {

FILE: examples/production/job-queue-bullmq/src/worker.ts
  function processJob (line 20) | async function processJob(job: Job<ScrapeJobData>): Promise<ScrapeJobRes...
  function startWorker (line 118) | async function startWorker() {

FILE: src/browser-session.ts
  constant DEFAULT_SESSION_TIMEOUT_MS (line 42) | const DEFAULT_SESSION_TIMEOUT_MS = 300_000;
  constant CHROME_LAUNCH_TIMEOUT_MS (line 43) | const CHROME_LAUNCH_TIMEOUT_MS = 15_000;
  function findChromePath (line 49) | function findChromePath(): string {
  function startAuthProxy (line 93) | function startAuthProxy(
  function parseProxy (line 179) | function parseProxy(proxyUrl: string): {
  function createBrowserSession (line 205) | async function createBrowserSession(

FILE: src/browser-types.ts
  type BrowserOptions (line 11) | interface BrowserOptions {
  type BrowserSession (line 50) | interface BrowserSession {
  type BrowserSessionInternalOptions (line 68) | interface BrowserSessionInternalOptions extends BrowserOptions {

FILE: src/browser/hero-config.ts
  type HeroConfigOptions (line 7) | interface HeroConfigOptions {
  function createHeroConfig (line 39) | function createHeroConfig(options: HeroConfigOptions = {}): any {
  function getDefaultHeroConfig (line 122) | function getDefaultHeroConfig(): any {

FILE: src/browser/pool.ts
  constant DEFAULT_POOL_CONFIG (line 17) | const DEFAULT_POOL_CONFIG: PoolConfig = {
  function generateId (line 31) | function generateId(): string {
  class BrowserPool (line 56) | class BrowserPool implements IBrowserPool {
    method constructor (line 72) | constructor(
    method initialize (line 90) | async initialize(): Promise<void> {
    method shutdown (line 116) | async shutdown(): Promise<void> {
    method acquire (line 158) | async acquire(): Promise<Hero> {
    method release (line 188) | release(hero: Hero): void {
    method withBrowser (line 218) | async withBrowser<T>(callback: (hero: Hero) => Promise<T>): Promise<T> {
    method getStats (line 238) | getStats(): PoolStats {
    method healthCheck (line 258) | async healthCheck(): Promise<HealthStatus> {
    method createInstance (line 291) | private async createInstance(): Promise<BrowserInstance> {
    method shouldRecycle (line 313) | private shouldRecycle(instance: BrowserInstance): boolean {
    method recycleInstance (line 324) | private async recycleInstance(instance: BrowserInstance): Promise<void> {
    method queueRequest (line 361) | private queueRequest(): Promise<Hero> {
    method processQueue (line 391) | private processQueue(): void {
    method startRecycling (line 410) | private startRecycling(): void {
    method startHealthChecks (line 425) | private startHealthChecks(): void {

FILE: src/browser/proxy-bound-browser.ts
  type TabLike (line 40) | interface TabLike {
  type HeroLike (line 54) | interface HeroLike {
  type HeroFactory (line 65) | interface HeroFactory {
  function createDefaultHeroFactory (line 83) | function createDefaultHeroFactory(): HeroFactory {
  type BrowserState (line 110) | type BrowserState = "launching" | "active" | "retired" | "closed";
  type ProxyBoundBrowserStats (line 116) | interface ProxyBoundBrowserStats {
  type ProxyBoundBrowserOptions (line 128) | interface ProxyBoundBrowserOptions {
  class ProxyBoundBrowser (line 192) | class ProxyBoundBrowser {
    method constructor (line 223) | constructor(options: ProxyBoundBrowserOptions) {
    method getState (line 275) | getState(): BrowserState {
    method isAvailable (line 283) | isAvailable(): boolean {
    method getActiveTabs (line 291) | getActiveTabs(): number {
    method getStats (line 298) | getStats(): ProxyBoundBrowserStats {
    method withPage (line 319) | async withPage<T>(fn: (tab: TabLike) => Promise<T>): Promise<T> {
    method retire (line 404) | async retire(): Promise<void> {
    method relaunch (line 447) | async relaunch(): Promise<void> {
    method launch (line 475) | private async launch(): Promise<void> {
    method drainLimit (line 499) | private async drainLimit(): Promise<void> {
  function redactProxyUrl (line 510) | function redactProxyUrl(proxyUrl: string | null): string {
  function makeDeferred (line 526) | function makeDeferred<T>(): { promise: Promise<T>; resolve: (v: T) => vo...

FILE: src/browser/tiered-pool.ts
  type PoolTier (line 44) | type PoolTier = "datacenter" | "residential" | "direct";
  type TierConfig (line 52) | interface TierConfig {
  type TieredBrowserPoolOptions (line 62) | interface TieredBrowserPoolOptions {
  type BrowserLease (line 125) | interface BrowserLease {
  type TierStats (line 135) | interface TierStats {
  type PoolStatsSnapshot (line 143) | interface PoolStatsSnapshot {
  class TieredBrowserPool (line 150) | class TieredBrowserPool {
    method constructor (line 173) | constructor(options: TieredBrowserPoolOptions) {
    method acquire (line 234) | acquire(tier: PoolTier): BrowserLease {
    method hasTier (line 274) | hasTier(tier: PoolTier): boolean {
    method getBrowserByProxy (line 285) | getBrowserByProxy(proxyUrl: string | null): ProxyBoundBrowser | null {
    method getStats (line 296) | getStats(): PoolStatsSnapshot {
    method close (line 311) | async close(): Promise<void> {
    method createBrowser (line 326) | private createBrowser(proxyUrl: string | null, timezoneId?: string): P...
    method attachHealthListeners (line 352) | private attachHealthListeners(tracker: ProxyHealthTracker): void {
  function buildTierConfigsFromPools (line 390) | function buildTierConfigsFromPools(
  function proxyUrlKey (line 439) | function proxyUrlKey(proxyUrl: string | null | undefined): string {

FILE: src/browser/types.ts
  type BrowserInstance (line 6) | interface BrowserInstance {
  type QueueItem (line 29) | interface QueueItem {
  type PoolConfig (line 43) | interface PoolConfig {
  type PoolStats (line 72) | interface PoolStats {
  type HealthStatus (line 101) | interface HealthStatus {
  type IBrowserPool (line 115) | interface IBrowserPool {

FILE: src/client.ts
  type ProxyRotation (line 48) | type ProxyRotation = "round-robin" | "random";
  type ReaderClientOptions (line 53) | interface ReaderClientOptions {
  class ReaderClient (line 97) | class ReaderClient {
    method constructor (line 110) | constructor(options: ReaderClientOptions = {}) {
    method getNextProxy (line 128) | private getNextProxy(): ProxyConfig | undefined {
    method getProxyForTier (line 149) | getProxyForTier(tier: "datacenter" | "residential"): ProxyConfig | und...
    method resolveProxy (line 173) | private resolveProxy(proxyTier?: import("./types").ProxyTier): ProxyCo...
    method hasProxyTier (line 196) | hasProxyTier(tier: "datacenter" | "residential"): boolean {
    method start (line 207) | async start(): Promise<void> {
    method initializeCore (line 241) | private async initializeCore(): Promise<void> {
    method createConnection (line 334) | private createConnection(): ConnectionToHeroCore {
    method ensureInitialized (line 347) | private async ensureInitialized(): Promise<void> {
    method scrape (line 369) | async scrape(options: Omit<ScrapeOptions, "connectionToCore" | "pool">...
    method crawl (line 407) | async crawl(options: Omit<CrawlOptions, "connectionToCore" | "pool">):...
    method browser (line 451) | async browser(options: Omit<BrowserOptions, "connectionToCore"> = {}):...
    method isReady (line 483) | isReady(): boolean {
    method close (line 492) | async close(): Promise<void> {
    method registerCleanup (line 564) | private registerCleanup(): void {
    method removeCleanupHandlers (line 584) | private removeCleanupHandlers(): void {

FILE: src/cloudflare/detector.ts
  constant CLOUDFLARE_CHALLENGE_SELECTORS (line 10) | const CLOUDFLARE_CHALLENGE_SELECTORS = [
  constant CLOUDFLARE_TEXT_PATTERNS (line 26) | const CLOUDFLARE_TEXT_PATTERNS = [
  constant CLOUDFLARE_INFRA_PATTERNS (line 38) | const CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm",...
  constant CLOUDFLARE_BLOCKED_PATTERNS (line 45) | const CLOUDFLARE_BLOCKED_PATTERNS = ["sorry, you have been blocked", "ra...
  function detectChallenge (line 60) | async function detectChallenge(hero: Hero): Promise<ChallengeDetection> {
  function isChallengePage (line 174) | async function isChallengePage(hero: Hero): Promise<boolean> {

FILE: src/cloudflare/handler.ts
  function waitForChallengeResolution (line 28) | async function waitForChallengeResolution(
  function waitForSelector (line 118) | async function waitForSelector(
  function handleChallenge (line 164) | async function handleChallenge(

FILE: src/cloudflare/types.ts
  type ChallengeDetection (line 4) | interface ChallengeDetection {
  type ChallengeResolutionResult (line 21) | interface ChallengeResolutionResult {
  type ChallengeWaitOptions (line 35) | interface ChallengeWaitOptions {

FILE: src/config/domain-profiles.ts
  type DomainProfile (line 17) | interface DomainProfile {
  function getDomainProfile (line 39) | function getDomainProfile(
  function applyDomainProfile (line 73) | function applyDomainProfile<T extends Partial<ScrapeOptions>>(

FILE: src/crawl-types.ts
  type CrawlOptions (line 7) | interface CrawlOptions {
  type CrawlUrl (line 109) | interface CrawlUrl {
  type CrawlResult (line 123) | interface CrawlResult {
  type CrawlMetadata (line 137) | interface CrawlMetadata {

FILE: src/crawler.ts
  class Crawler (line 24) | class Crawler {
    method constructor (line 32) | constructor(options: CrawlOptions) {
    method crawl (line 49) | async crawl(): Promise<CrawlResult> {
    method fetchPage (line 125) | private async fetchPage(url: string): Promise<{ crawlUrl: CrawlUrl; ht...
    method extractLinks (line 169) | private extractLinks(
    method scrapeDiscoveredUrls (line 241) | private async scrapeDiscoveredUrls(): Promise<ScrapeResult> {
  function crawl (line 267) | async function crawl(options: CrawlOptions): Promise<CrawlResult> {

FILE: src/daemon/client.ts
  type DaemonClientOptions (line 26) | interface DaemonClientOptions {
  class DaemonClient (line 38) | class DaemonClient {
    method constructor (line 41) | constructor(options: DaemonClientOptions = {}) {
    method scrape (line 52) | async scrape(options: Omit<ScrapeOptions, "connectionToCore">): Promis...
    method crawl (line 62) | async crawl(options: Omit<CrawlOptions, "connectionToCore">): Promise<...
    method status (line 72) | async status(): Promise<DaemonStatus> {
    method shutdown (line 81) | async shutdown(): Promise<void> {
    method browserCreate (line 90) | async browserCreate(
    method browserStop (line 102) | async browserStop(sessionId: string): Promise<void> {
    method browserList (line 112) | async browserList(): Promise<BrowserSessionInfo[]> {
    method isRunning (line 121) | async isRunning(): Promise<boolean> {
    method request (line 133) | private request<T>(body: object): Promise<T> {
  function isDaemonRunning (line 199) | async function isDaemonRunning(port: number = DEFAULT_DAEMON_PORT): Prom...

FILE: src/daemon/server.ts
  constant DEFAULT_DAEMON_PORT (line 38) | const DEFAULT_DAEMON_PORT = 6003;
  constant PID_FILE_NAME (line 39) | const PID_FILE_NAME = ".reader-daemon.pid";
  constant SHUTDOWN_TIMEOUT_MS (line 40) | const SHUTDOWN_TIMEOUT_MS = 30_000;
  type DaemonServerOptions (line 45) | interface DaemonServerOptions {
  type ScrapeRequest (line 61) | interface ScrapeRequest {
  type CrawlRequest (line 66) | interface CrawlRequest {
  type StatusRequest (line 71) | interface StatusRequest {
  type ShutdownRequest (line 75) | interface ShutdownRequest {
  type BrowserCreateRequest (line 79) | interface BrowserCreateRequest {
  type BrowserStopRequest (line 84) | interface BrowserStopRequest {
  type BrowserListRequest (line 89) | interface BrowserListRequest {
  type DaemonRequest (line 93) | type DaemonRequest =
  type SuccessResponse (line 105) | interface SuccessResponse<T> {
  type ErrorResponse (line 110) | interface ErrorResponse {
  type DaemonResponse (line 115) | type DaemonResponse<T> = SuccessResponse<T> | ErrorResponse;
  type DaemonStatus (line 120) | interface DaemonStatus {
  type BrowserSessionInfo (line 133) | interface BrowserSessionInfo {
  class DaemonServer (line 142) | class DaemonServer {
    method constructor (line 151) | constructor(options: DaemonServerOptions = {}) {
    method start (line 164) | async start(): Promise<void> {
    method stop (line 244) | async stop(): Promise<void> {
    method getPort (line 268) | getPort(): number {
    method checkAuth (line 276) | private checkAuth(req: http.IncomingMessage, res: http.ServerResponse)...
    method handleRequest (line 290) | private async handleRequest(req: http.IncomingMessage, res: http.Serve...
    method handleScrape (line 390) | private async handleScrape(
    method handleCrawl (line 406) | private async handleCrawl(
    method handleStatus (line 422) | private handleStatus(res: http.ServerResponse): void {
    method handleShutdown (line 438) | private async handleShutdown(res: http.ServerResponse): Promise<void> {
    method gracefulStop (line 450) | async gracefulStop(): Promise<void> {
    method handleBrowserCreate (line 491) | private async handleBrowserCreate(
    method handleBrowserStop (line 515) | private async handleBrowserStop(res: http.ServerResponse, sessionId: s...
    method handleBrowserList (line 530) | private handleBrowserList(res: http.ServerResponse): void {
    method sendResponse (line 542) | private sendResponse<T>(
    method writePidFile (line 554) | private async writePidFile(): Promise<void> {
    method removePidFile (line 572) | private async removePidFile(): Promise<void> {
  function getPidFilePath (line 589) | async function getPidFilePath(): Promise<string> {
  function getDaemonInfo (line 598) | async function getDaemonInfo(): Promise<{

FILE: src/engines/errors.ts
  class EngineError (line 13) | class EngineError extends Error {
    method constructor (line 17) | constructor(
  class InsufficientContentError (line 37) | class InsufficientContentError extends EngineError {
    method constructor (line 41) | constructor(engine: EngineName, contentLength: number, threshold: numb...
  class HttpError (line 54) | class HttpError extends EngineError {
    method constructor (line 57) | constructor(engine: EngineName, statusCode: number, statusText?: strin...
  class EngineTimeoutError (line 68) | class EngineTimeoutError extends EngineError {
    method constructor (line 71) | constructor(engine: EngineName, timeoutMs: number) {
  class EngineUnavailableError (line 81) | class EngineUnavailableError extends EngineError {
    method constructor (line 82) | constructor(engine: EngineName, reason?: string) {
  class ScrapeFailedError (line 94) | class ScrapeFailedError extends Error {
    method constructor (line 98) | constructor(error: Error, options?: { proxyBlock?: boolean }) {

FILE: src/engines/hero/index.ts
  constant MIN_CONTENT_LENGTH (line 32) | const MIN_CONTENT_LENGTH = 100;
  class HeroEngine (line 37) | class HeroEngine implements Engine {
    method scrape (line 40) | async scrape(meta: EngineMeta): Promise<EngineResult> {
    method extractText (line 176) | private extractText(html: string): string {
    method isAvailable (line 185) | isAvailable(): boolean {
  function resolveTierFromOptions (line 198) | function resolveTierFromOptions(proxyTier: string | undefined): PoolTier {

FILE: src/engines/orchestrator.ts
  type OrchestratorOptions (line 17) | interface OrchestratorOptions {
  type OrchestratorResult (line 27) | interface OrchestratorResult extends EngineResult {
  class EngineOrchestrator (line 42) | class EngineOrchestrator {
    method constructor (line 45) | constructor(options: OrchestratorOptions = {}) {
    method assessQuality (line 53) | private assessQuality(result: EngineResult): {
    method scrape (line 81) | async scrape(meta: EngineMeta): Promise<OrchestratorResult> {
  function createOrchestrator (line 131) | function createOrchestrator(options: OrchestratorOptions = {}): EngineOr...

FILE: src/engines/types.ts
  type EngineName (line 14) | type EngineName = "hero";
  type EngineResult (line 19) | interface EngineResult {
  type EngineMeta (line 40) | interface EngineMeta {
  type EngineConfig (line 54) | interface EngineConfig {
  type EngineFeatures (line 68) | interface EngineFeatures {
  type Engine (line 84) | interface Engine {
  constant ENGINE_CONFIG (line 102) | const ENGINE_CONFIG: EngineConfig = {

FILE: src/errors.ts
  type ReaderErrorCode (line 11) | enum ReaderErrorCode {
  class ReaderError (line 59) | class ReaderError extends Error {
    method constructor (line 66) | constructor(
    method toJSON (line 92) | toJSON(): Record<string, unknown> {
  class NetworkError (line 109) | class NetworkError extends ReaderError {
    method constructor (line 110) | constructor(message: string, options?: { url?: string; cause?: Error }) {
  class TimeoutError (line 122) | class TimeoutError extends ReaderError {
    method constructor (line 125) | constructor(message: string, timeoutMs: number, options?: { url?: stri...
    method toJSON (line 134) | toJSON(): Record<string, unknown> {
  class CloudflareError (line 145) | class CloudflareError extends ReaderError {
    method constructor (line 148) | constructor(challengeType: string, options?: { url?: string; cause?: E...
    method toJSON (line 161) | toJSON(): Record<string, unknown> {
  class AccessDeniedError (line 172) | class AccessDeniedError extends ReaderError {
    method constructor (line 175) | constructor(message: string, options?: { url?: string; statusCode?: nu...
    method toJSON (line 184) | toJSON(): Record<string, unknown> {
  class ContentExtractionError (line 195) | class ContentExtractionError extends ReaderError {
    method constructor (line 196) | constructor(message: string, options?: { url?: string; cause?: Error }) {
  class ValidationError (line 208) | class ValidationError extends ReaderError {
    method constructor (line 211) | constructor(message: string, options?: { field?: string; url?: string ...
    method toJSON (line 220) | toJSON(): Record<string, unknown> {
  class InvalidUrlError (line 231) | class InvalidUrlError extends ReaderError {
    method constructor (line 232) | constructor(url: string, reason?: string) {
  class RobotsBlockedError (line 248) | class RobotsBlockedError extends ReaderError {
    method constructor (line 249) | constructor(url: string) {
  class BrowserPoolError (line 265) | class BrowserPoolError extends ReaderError {
    method constructor (line 266) | constructor(message: string, options?: { cause?: Error }) {
  class ClientClosedError (line 278) | class ClientClosedError extends ReaderError {
    method constructor (line 279) | constructor() {
  class NotInitializedError (line 294) | class NotInitializedError extends ReaderError {
    method constructor (line 295) | constructor(component: string) {
  class DNSError (line 314) | class DNSError extends ReaderError {
    method constructor (line 317) | constructor(hostname: string, options?: { url?: string; cause?: Error ...
    method toJSON (line 326) | toJSON(): Record<string, unknown> {
  class TLSError (line 334) | class TLSError extends ReaderError {
    method constructor (line 335) | constructor(detail: string, options?: { url?: string; cause?: Error }) {
  class BotDetectedError (line 351) | class BotDetectedError extends ReaderError {
    method constructor (line 354) | constructor(signal: string, options?: { url?: string; cause?: Error }) {
    method toJSON (line 363) | toJSON(): Record<string, unknown> {
  class ProxyConnectionError (line 375) | class ProxyConnectionError extends ReaderError {
    method constructor (line 378) | constructor(proxyTier: string, options?: { url?: string; cause?: Error...
    method toJSON (line 387) | toJSON(): Record<string, unknown> {
  class ProxyExhaustedError (line 395) | class ProxyExhaustedError extends ReaderError {
    method constructor (line 396) | constructor(options?: { url?: string; cause?: Error }) {
  class ContentTooLargeError (line 416) | class ContentTooLargeError extends ReaderError {
    method constructor (line 420) | constructor(sizeBytes: number, limitBytes: number, options?: { url?: s...
    method toJSON (line 431) | toJSON(): Record<string, unknown> {
  class MarkdownConversionError (line 439) | class MarkdownConversionError extends ReaderError {
    method constructor (line 440) | constructor(detail: string, options?: { url?: string; cause?: Error }) {
  class EmptyContentError (line 452) | class EmptyContentError extends ReaderError {
    method constructor (line 455) | constructor(contentLength: number, options?: { url?: string }) {
    method toJSON (line 465) | toJSON(): Record<string, unknown> {
  function wrapError (line 484) | function wrapError(error: unknown, url?: string): ReaderError {

FILE: src/formatters/html.ts
  function formatToHTML (line 15) | function formatToHTML(html: string): string {

FILE: src/formatters/markdown.ts
  function htmlToMarkdown (line 20) | function htmlToMarkdown(html: string): string {
  function fallbackTextExtract (line 53) | function fallbackTextExtract(html: string): string {

FILE: src/formatters/postprocess.ts
  function postprocessMarkdown (line 11) | function postprocessMarkdown(md: string): string {
  function deduplicateImageLinks (line 37) | function deduplicateImageLinks(md: string): string {

FILE: src/proxy/config.ts
  function createProxyUrl (line 23) | function createProxyUrl(config: ProxyConfig): string {
  function parseProxyUrl (line 54) | function parseProxyUrl(url: string): ProxyConfig {

FILE: src/proxy/env.ts
  function parseList (line 30) | function parseList(raw: string | undefined, tierLabel: string): ProxyCon...
  type ParsedProxyPools (line 55) | interface ParsedProxyPools {
  function parseProxyPoolsFromEnv (line 66) | function parseProxyPoolsFromEnv(env: NodeJS.ProcessEnv = process.env): P...

FILE: src/proxy/health-tracker.ts
  constant DEFAULT_FAILURE_THRESHOLD (line 39) | const DEFAULT_FAILURE_THRESHOLD = 10;
  constant DEFAULT_COOLDOWN_MS (line 40) | const DEFAULT_COOLDOWN_MS = 5 * 60 * 1000;
  type ProxyHealthEvents (line 45) | interface ProxyHealthEvents {
  type ProxyHealthTrackerOptions (line 57) | interface ProxyHealthTrackerOptions {
  type ProxyState (line 72) | interface ProxyState {
  type ProxyHealthSnapshot (line 89) | interface ProxyHealthSnapshot {
  class ProxyHealthTracker (line 124) | class ProxyHealthTracker extends EventEmitter {
    method constructor (line 130) | constructor(options: ProxyHealthTrackerOptions = {}) {
    method on (line 150) | override on<E extends keyof ProxyHealthEvents>(event: E, listener: Pro...
    method once (line 153) | override once<E extends keyof ProxyHealthEvents>(event: E, listener: P...
    method emit (line 156) | override emit<E extends keyof ProxyHealthEvents>(
    method isHealthy (line 168) | isHealthy(proxyUrl: string): boolean {
    method recordSuccess (line 188) | recordSuccess(proxyUrl: string): void {
    method recordFailure (line 208) | recordFailure(proxyUrl: string): void {
    method snapshot (line 236) | snapshot(proxyUrl: string): ProxyHealthSnapshot | null {
    method allSnapshots (line 254) | allSnapshots(): ProxyHealthSnapshot[] {
    method reset (line 264) | reset(proxyUrl: string): void {
    method ensureState (line 268) | private ensureState(proxyUrl: string): ProxyState {

FILE: src/proxy/proxy-gate.ts
  type PerProxyGateOptions (line 30) | interface PerProxyGateOptions {
  type PerProxyRelease (line 45) | type PerProxyRelease = () => void;
  type PerProxyStats (line 50) | interface PerProxyStats {
  class PerProxyGate (line 74) | class PerProxyGate {
    method constructor (line 79) | constructor(options: PerProxyGateOptions = {}) {
    method setOverride (line 97) | setOverride(proxyUrl: string, max: number): void {
    method acquire (line 115) | async acquire(proxyUrl: string | null | undefined): Promise<PerProxyRe...
    method withSlot (line 152) | async withSlot<T>(proxyUrl: string | null | undefined, fn: () => Promi...
    method stats (line 165) | stats(proxyUrl: string): PerProxyStats | null {
    method allStats (line 179) | allStats(): PerProxyStats[] {
    method gateFor (line 191) | private gateFor(proxyUrl: string): { limit: ReturnType<typeof pLimit>;...
  function makeRelease (line 204) | function makeRelease(resolve: () => void): PerProxyRelease {
  function noopRelease (line 216) | function noopRelease(): PerProxyRelease {

FILE: src/proxy/verify.ts
  constant IP_CHECK_URL (line 25) | const IP_CHECK_URL = "https://api.ipify.org?format=json";
  constant IP_CHECK_TIMEOUT_MS (line 26) | const IP_CHECK_TIMEOUT_MS = 10_000;
  type ProxyTierName (line 28) | type ProxyTierName = "datacenter" | "residential";
  type VerifiedProxy (line 30) | interface VerifiedProxy {
  type ProxyVerificationFailure (line 36) | interface ProxyVerificationFailure {
  type ProxyVerificationResult (line 42) | interface ProxyVerificationResult {
  type EgressIpFetcher (line 52) | type EgressIpFetcher = (proxyUrl: string) => Promise<string>;
  type VerifyProxiesOptions (line 54) | interface VerifyProxiesOptions {
  function verifyProxies (line 67) | async function verifyProxies(
  function verifyProxiesOrThrow (line 105) | async function verifyProxiesOrThrow(
  function defaultFetcher (line 132) | async function defaultFetcher(proxyUrl: string): Promise<string> {

FILE: src/scraper.ts
  constant DEFAULT_HARD_DEADLINE_MS (line 34) | const DEFAULT_HARD_DEADLINE_MS = 30_000;
  constant DEFAULT_DATACENTER_TIMEOUT_MS (line 37) | const DEFAULT_DATACENTER_TIMEOUT_MS = 10_000;
  class Scraper (line 49) | class Scraper {
    method constructor (line 54) | constructor(options: ScrapeOptions) {
    method getRobotsRules (line 64) | private async getRobotsRules(url: string): Promise<RobotsRules | null> {
    method scrape (line 76) | async scrape(): Promise<ScrapeResult> {
    method scrapeWithConcurrency (line 85) | private async scrapeWithConcurrency(): Promise<
    method scrapeSingleUrlWithRetry (line 115) | private async scrapeSingleUrlWithRetry(
    method scrapeSingleUrl (line 200) | private async scrapeSingleUrl(
    method buildScrapeResult (line 485) | private buildScrapeResult(
  function detectJsonPayload (line 519) | function detectJsonPayload(body: string, statusCode: number): string | n...
  function scrape (line 544) | async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {

FILE: src/types.ts
  type ProxyConfig (line 6) | interface ProxyConfig {
  type ProxyTier (line 32) | type ProxyTier = "datacenter" | "residential" | "auto";
  type ProxyPoolConfig (line 37) | interface ProxyPoolConfig {
  type ProxyMetadata (line 47) | interface ProxyMetadata {
  type BrowserPoolConfig (line 61) | interface BrowserPoolConfig {
  type ScrapeOptions (line 75) | interface ScrapeOptions {
  type WebsiteMetadata (line 269) | interface WebsiteMetadata {
  type Page (line 314) | interface Page {
  type WebsiteScrapeResult (line 337) | interface WebsiteScrapeResult {
  type BatchMetadata (line 378) | interface BatchMetadata {
  type ScrapeResult (line 401) | interface ScrapeResult {
  type CrawlerState (line 412) | interface CrawlerState {
  type ScraperConfig (line 426) | interface ScraperConfig {
  constant DEFAULT_OPTIONS (line 440) | const DEFAULT_OPTIONS: Omit<
  function isValidFormat (line 499) | function isValidFormat(format: string): format is "markdown" | "html" {
  function shouldCrawlUrl (line 506) | function shouldCrawlUrl(url: URL, baseDomain: string): boolean {

FILE: src/utils/block-detector.ts
  type BlockDetectionConfig (line 18) | interface BlockDetectionConfig {
  function toRegExp (line 30) | function toRegExp(p: RegExp | string): RegExp {
  function detectBotPage (line 39) | function detectBotPage(html: string, config?: BlockDetectionConfig): boo...
  function detectBotTitle (line 60) | function detectBotTitle(title: string, config?: BlockDetectionConfig): b...
  function isBlockedResponse (line 73) | function isBlockedResponse(
  function stripTags (line 97) | function stripTags(html: string): string {

FILE: src/utils/content-cleaner.ts
  type CleaningOptions (line 25) | interface CleaningOptions {
  constant ALWAYS_REMOVE_SELECTORS (line 43) | const ALWAYS_REMOVE_SELECTORS = ["script", "style", "noscript", "meta", ...
  constant NAVIGATION_SELECTORS (line 50) | const NAVIGATION_SELECTORS = [
  constant FORCE_INCLUDE_SELECTORS (line 125) | const FORCE_INCLUDE_SELECTORS = [
  function removeElements (line 144) | function removeElements(document: Document, selectors: string[]): void {
  function removeWithProtection (line 158) | function removeWithProtection(
  function cleanHtml (line 201) | function cleanHtml(html: string, baseUrl: string, options: CleaningOptio...
  function removeBase64ImagesFromDocument (line 272) | function removeBase64ImagesFromDocument(document: Document): void {
  function resolveSrcsets (line 302) | function resolveSrcsets(document: Document): void {
  function convertRelativeUrls (line 334) | function convertRelativeUrls(document: Document, baseUrl: string): void {
  function cleanContent (line 369) | function cleanContent(html: string, baseUrl: string, options: CleaningOp...

FILE: src/utils/logger.ts
  type Logger (line 6) | type Logger = ReturnType<typeof createLogger>;
  function hasPinoPretty (line 11) | function hasPinoPretty(): boolean {
  function createLogger (line 27) | function createLogger(

FILE: src/utils/metadata-extractor.ts
  function extractMetadata (line 9) | function extractMetadata(html: string, baseUrl: string): WebsiteMetadata {
  function extractWebsiteMetadata (line 16) | function extractWebsiteMetadata(html: string, baseUrl: string): WebsiteM...
  function extractTitle (line 65) | function extractTitle(document: Document): string | null {
  function extractMetaContent (line 80) | function extractMetaContent(document: Document, name: string): string | ...
  function extractLanguage (line 101) | function extractLanguage(document: Document): string | null {
  function extractCharset (line 109) | function extractCharset(document: Document): string | null {
  function extractFavicon (line 133) | function extractFavicon(document: Document, baseUrl: string): string | n...
  function extractCanonical (line 163) | function extractCanonical(document: Document, baseUrl: string): string |...
  function extractKeywords (line 178) | function extractKeywords(document: Document): string[] | null {
  function extractOpenGraph (line 193) | function extractOpenGraph(document: Document): WebsiteMetadata["openGrap...
  function extractTwitterCard (line 223) | function extractTwitterCard(document: Document): WebsiteMetadata["twitte...
  function extractStructuredData (line 251) | function extractStructuredData(html: string): unknown[] {
  function extractMicrodata (line 270) | function extractMicrodata(_html: string): unknown[] {
  function getMetadataSummary (line 280) | function getMetadataSummary(metadata: WebsiteMetadata): string {

FILE: src/utils/rate-limiter.ts
  function rateLimit (line 6) | async function rateLimit(ms: number): Promise<void> {
  class RateLimiter (line 13) | class RateLimiter {
    method constructor (line 16) | constructor(requestsPerSecond: number) {
    method execute (line 29) | async execute<T>(fn: () => Promise<T>): Promise<T> {
    method waitForNextSlot (line 39) | private async waitForNextSlot(): Promise<void> {
    method executeAll (line 55) | async executeAll<T>(functions: Array<() => Promise<T>>): Promise<T[]> {

FILE: src/utils/robots-parser.ts
  type RobotsRules (line 5) | interface RobotsRules {
  function parseRobotsTxt (line 14) | function parseRobotsTxt(content: string, userAgent: string = "*"): Robot...
  function isPathAllowed (line 63) | function isPathAllowed(path: string, rules: RobotsRules): boolean {
  function pathMatches (line 89) | function pathMatches(path: string, pattern: string): boolean {
  function fetchRobotsTxt (line 119) | async function fetchRobotsTxt(baseUrl: string): Promise<RobotsRules | nu...
  function isUrlAllowed (line 144) | function isUrlAllowed(url: string, rules: RobotsRules | null): boolean {

FILE: src/utils/url-helpers.ts
  function resolveUrl (line 11) | function resolveUrl(relative: string, base: string): string {
  function isValidUrl (line 22) | function isValidUrl(string: string): boolean {
  function normalizeUrl (line 34) | function normalizeUrl(url: string, baseUrl?: string): string {
  function extractBaseDomain (line 58) | function extractBaseDomain(url: string): string {
  function isSameDomain (line 74) | function isSameDomain(url: string, baseUrl: string): boolean {
  function getUrlKey (line 96) | function getUrlKey(url: string): string {
  function validateUrls (line 143) | function validateUrls(urls: string[]): {
  function matchesPatterns (line 206) | function matchesPatterns(url: string, patterns: string[]): boolean {
  function shouldIncludeUrl (line 227) | function shouldIncludeUrl(
  function isContentUrl (line 253) | function isContentUrl(url: string): boolean {
  function shouldCrawlUrl (line 293) | function shouldCrawlUrl(

FILE: src/utils/url-rewriter.ts
  type UrlRewriteRule (line 16) | interface UrlRewriteRule {
  type RewriteResult (line 28) | interface RewriteResult {
  function rewriteUrl (line 42) | function rewriteUrl(inputUrl: string, rules?: UrlRewriteRule[]): Rewrite...

FILE: tests/engines/orchestrator.test.ts
  function createMeta (line 7) | function createMeta(url = "https://example.com") {

FILE: tests/integration/daemon.test.ts
  function request (line 16) | function request(

FILE: tests/unit/block-detector-cloudflare.test.ts
  constant FIXTURES_DIR (line 6) | const FIXTURES_DIR = join(__dirname, "..", "fixtures");
  function loadFixture (line 8) | function loadFixture(name: string): string {
  constant CF_CONFIG (line 12) | const CF_CONFIG: BlockDetectionConfig = {

FILE: tests/unit/block-detector-fixtures.test.ts
  constant FIXTURES_DIR (line 6) | const FIXTURES_DIR = join(__dirname, "..", "fixtures");
  function loadFixture (line 8) | function loadFixture(name: string): string {
  constant AMAZON_CONFIG (line 12) | const AMAZON_CONFIG: BlockDetectionConfig = {

FILE: tests/unit/block-detector.test.ts
  constant TEST_CONFIG (line 5) | const TEST_CONFIG: BlockDetectionConfig = {

FILE: tests/unit/crawler.test.ts
  function mockPool (line 28) | function mockPool(): IBrowserPool {
  function createTestCrawler (line 41) | function createTestCrawler(options: {
  function makeHtml (line 70) | function makeHtml(links: string[], title = "Test Page"): string {
  function pageResult (line 76) | function pageResult(url: string, html: string, title = "Test Page") {

FILE: tests/unit/daemon-dispatch.test.ts
  function mockReq (line 17) | function mockReq(
  type CapturedResponse (line 44) | interface CapturedResponse {
  function mockRes (line 51) | function mockRes(): { res: http.ServerResponse; captured: () => Captured...

FILE: tests/unit/domain-profiles.test.ts
  constant TEST_PROFILES (line 5) | const TEST_PROFILES = {

FILE: tests/unit/health-tracker.test.ts
  function fakeClock (line 7) | function fakeClock(start = 1_000_000_000_000) {

FILE: tests/unit/html-size-guard.test.ts
  constant DEFAULT_MAX (line 10) | const DEFAULT_MAX = 307200;
  function applyGuard (line 12) | function applyGuard(html: string, maxBytes: number = DEFAULT_MAX): { tru...

FILE: tests/unit/proxy-bound-browser.test.ts
  type FakeTab (line 19) | interface FakeTab extends TabLike {
  function makeFakeTab (line 23) | function makeFakeTab(): FakeTab {
  type FakeHero (line 41) | interface FakeHero extends HeroLike {
  function makeFakeFactory (line 47) | function makeFakeFactory(opts: {
  function tick (line 94) | async function tick(n = 1) {

FILE: tests/unit/proxy-gate.test.ts
  function defer (line 8) | function defer<T = void>() {
  function tick (line 18) | async function tick(n = 1) {

FILE: tests/unit/proxy-verify.test.ts
  function makeFakeFetcher (line 10) | function makeFakeFetcher(

FILE: tests/unit/scraper-pipeline.test.ts
  function makeScraper (line 15) | function makeScraper(options?: Record<string, unknown>): Scraper {
  function mockPipeline (line 28) | function mockPipeline(scraper: Scraper, html: string, url = "https://exa...

FILE: tests/unit/scraper-retry.test.ts
  function makeResult (line 21) | function makeResult(overrides?: Partial<WebsiteScrapeResult>): WebsiteSc...
  function makeScraper (line 38) | function makeScraper(overrides?: Record<string, unknown>): Scraper {
  function spySingleUrl (line 42) | function spySingleUrl(scraper: Scraper) {

FILE: tests/unit/tiered-pool.test.ts
  type FakeHero (line 12) | interface FakeHero extends HeroLike {
  function makeFakeTab (line 17) | function makeFakeTab(): TabLike {
  function makeFakeFactory (line 29) | function makeFakeFactory(opts: { failFor?: Set<string> } = {}): {
  function tick (line 56) | async function tick(n = 1) {

FILE: tests/unit/url-rewriter.test.ts
  function extractGoogleDocId (line 5) | function extractGoogleDocId(pathname: string): string | null {
  constant GOOGLE_RULES (line 10) | const GOOGLE_RULES: UrlRewriteRule[] = [

Download .json

Condensed preview — 147 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (824K chars).

[
  {
    "path": ".eslintrc.json",
    "chars": 807,
    "preview": "{\n  \"root\": true,\n  \"parser\": \"@typescript-eslint/parser\",\n  \"parserOptions\": {\n    \"ecmaVersion\": \"latest\",\n    \"source"
  },
  {
    "path": ".github/workflows/ci.yml",
    "chars": 544,
    "preview": "name: CI\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n\njobs:\n  test:\n    runs-on: ubuntu-lates"
  },
  {
    "path": ".github/workflows/publish.yml",
    "chars": 910,
    "preview": "name: Publish to npm\n\non:\n  release:\n    types: [published]\n\njobs:\n  publish:\n    runs-on: ubuntu-latest\n    permissions"
  },
  {
    "path": ".gitignore",
    "chars": 538,
    "preview": "# Dependencies\nnode_modules/\n\n# Build output\ndist/\n\n# Environment files\n.env\n.env.local\n.env.*.local\n\n# Logs\n*.log\nnpm-d"
  },
  {
    "path": ".leasotrc",
    "chars": 120,
    "preview": "{\n  \"tags\": [\"TODO\", \"FIXME\", \"HACK\", \"XXX\", \"BUG\", \"OPTIMIZE\", \"REVIEW\"],\n  \"ignore\": [\"node_modules/**\", \"dist/**\"]\n}\n"
  },
  {
    "path": ".nvmrc",
    "chars": 9,
    "preview": "v22.12.0\n"
  },
  {
    "path": ".prettierrc",
    "chars": 201,
    "preview": "{\n  \"semi\": true,\n  \"singleQuote\": false,\n  \"tabWidth\": 2,\n  \"trailingComma\": \"es5\",\n  \"printWidth\": 100,\n  \"useTabs\": f"
  },
  {
    "path": "CITATION.cff",
    "chars": 363,
    "preview": "cff-version: 1.2.0\nmessage: \"If you use Reader in your research or project, please cite it.\"\ntitle: \"Reader: Open-source"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "chars": 3603,
    "preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participa"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 8919,
    "preview": "# Contributing to Reader\n\nThank you for your interest in contributing to Reader! This document provides guidelines and i"
  },
  {
    "path": "LICENSE",
    "chars": 10761,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 32108,
    "preview": "<p align=\"center\">\n  <img src=\"docs/assets/logo.png\" alt=\"Reader Logo\" width=\"200\" />\n</p>\n\n<h1 align=\"center\">Reader</h"
  },
  {
    "path": "SECURITY.md",
    "chars": 1304,
    "preview": "# Security Policy\n\n## Supported Versions\n\n| Version | Supported |\n| ------- | --------- |\n| Latest  | Yes       |\n\nWe on"
  },
  {
    "path": "docs/api-reference.md",
    "chars": 15635,
    "preview": "# API Reference\n\nComplete API documentation for Reader.\n\n## ReaderClient (Recommended)\n\nThe recommended way to use Reade"
  },
  {
    "path": "docs/architecture.md",
    "chars": 12451,
    "preview": "# Architecture\n\nThis document describes the internal architecture of Reader, helping contributors understand how the sys"
  },
  {
    "path": "docs/assets/.gitkeep",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "docs/assets/demo.tape",
    "chars": 372,
    "preview": "# VHS tape file for Reader demo GIF\n# Run: vhs docs/assets/demo.tape\n\nOutput docs/assets/demo.gif\n\nSet FontSize 16\nSet W"
  },
  {
    "path": "docs/deployment/docker.md",
    "chars": 7057,
    "preview": "# Docker Deployment Guide\n\nDeploy Reader in Docker containers.\n\n## Quick Start\n\n### Basic Dockerfile\n\n```dockerfile\n# Do"
  },
  {
    "path": "docs/deployment/job-queues.md",
    "chars": 10359,
    "preview": "# Job Queues Guide\n\nUse job queues for async scraping at scale with BullMQ.\n\n## Overview\n\nFor high-volume scraping, use "
  },
  {
    "path": "docs/deployment/production-server.md",
    "chars": 7928,
    "preview": "# Production Server Guide\n\nDeploy Reader as a production-ready API server.\n\n## Overview\n\nFor production servers, use a *"
  },
  {
    "path": "docs/getting-started.md",
    "chars": 7801,
    "preview": "# Getting Started\n\nThis guide walks you through setting up Reader, verifying your installation, and running your first s"
  },
  {
    "path": "docs/guides/browser-pool.md",
    "chars": 9451,
    "preview": "# Browser Pool Guide\n\nThis guide covers browser pool management for production-grade scraping.\n\n## When to Use BrowserPo"
  },
  {
    "path": "docs/guides/browser-sessions.md",
    "chars": 5574,
    "preview": "# Browser Sessions\n\nBrowser sessions launch a stealthed Chrome and return a CDP (Chrome DevTools Protocol) WebSocket URL"
  },
  {
    "path": "docs/guides/cloudflare-bypass.md",
    "chars": 6554,
    "preview": "# Cloudflare Bypass Guide\n\nThis guide explains how Reader bypasses Cloudflare and other bot detection systems.\n\n## Overv"
  },
  {
    "path": "docs/guides/output-formats.md",
    "chars": 2418,
    "preview": "# Output Formats\n\nReader supports two output formats: **Markdown** and **HTML**.\n\n| Format | Best For | What You Get |\n|"
  },
  {
    "path": "docs/guides/proxy-configuration.md",
    "chars": 10396,
    "preview": "# Proxy Configuration Guide\n\nThis guide covers proxy setup for Reader.\n\n## Overview\n\nProxies help with:\n- Bypassing IP-b"
  },
  {
    "path": "docs/troubleshooting.md",
    "chars": 9492,
    "preview": "# Troubleshooting\n\nThis guide covers common issues and their solutions when using Reader.\n\n## Quick Diagnostics\n\nBefore "
  },
  {
    "path": "ecosystem.config.cjs",
    "chars": 1043,
    "preview": "/**\n * PM2 ecosystem config for reader daemon.\n *\n * Two separate instances on different ports, each with its own proxy "
  },
  {
    "path": "examples/.gitignore",
    "chars": 204,
    "preview": "# Dependencies\nnode_modules/\nbun.lockb\n\n# Build outputs\ndist/\n*.js\n*.d.ts\n*.map\n\n# Environment\n.env\n.env.local\n.env.*.lo"
  },
  {
    "path": "examples/.nvmrc",
    "chars": 9,
    "preview": "v22.12.0\n"
  },
  {
    "path": "examples/README.md",
    "chars": 2533,
    "preview": "# Reader Examples\n\nExamples demonstrating various uses of Reader.\n\n## Structure\n\n```\nexamples/\n├── basic/               "
  },
  {
    "path": "examples/ai-tools/README.md",
    "chars": 2096,
    "preview": "# AI Tools Examples\n\nExamples showing how to integrate Reader with AI frameworks, LLMs, and vector stores.\n\n## Prerequis"
  },
  {
    "path": "examples/ai-tools/anthropic-summary.ts",
    "chars": 2091,
    "preview": "/**\n * Anthropic (Claude) Summarization Example\n *\n * Scrapes a webpage and uses Claude to summarize the content.\n *\n * "
  },
  {
    "path": "examples/ai-tools/langchain-loader.ts",
    "chars": 3751,
    "preview": "/**\n * LangChain Document Loader Example\n *\n * Creates a custom LangChain document loader using Reader.\n *\n * Usage:\n * "
  },
  {
    "path": "examples/ai-tools/llamaindex-loader.ts",
    "chars": 2836,
    "preview": "/**\n * LlamaIndex Document Loader Example\n *\n * Creates a custom LlamaIndex document loader using Reader.\n *\n * Usage:\n "
  },
  {
    "path": "examples/ai-tools/openai-summary.ts",
    "chars": 2154,
    "preview": "/**\n * OpenAI Summarization Example\n *\n * Scrapes a webpage and uses OpenAI to summarize the content.\n *\n * Usage:\n *   "
  },
  {
    "path": "examples/ai-tools/pinecone-ingest.ts",
    "chars": 3592,
    "preview": "/**\n * Pinecone Vector Store Ingestion Example\n *\n * Scrapes webpages and ingests them into Pinecone for semantic search"
  },
  {
    "path": "examples/ai-tools/qdrant-ingest.ts",
    "chars": 4176,
    "preview": "/**\n * Qdrant Vector Store Ingestion Example\n *\n * Scrapes webpages and ingests them into Qdrant for semantic search.\n *"
  },
  {
    "path": "examples/ai-tools/vercel-ai-stream.ts",
    "chars": 1937,
    "preview": "/**\n * Vercel AI SDK Streaming Example\n *\n * Scrapes a webpage and streams a summary using the Vercel AI SDK.\n *\n * Usag"
  },
  {
    "path": "examples/basic/README.md",
    "chars": 2087,
    "preview": "# Basic Examples\n\nSimple examples demonstrating core Reader functionality.\n\n## Running Examples\n\nAll commands run from t"
  },
  {
    "path": "examples/basic/all-formats.ts",
    "chars": 1317,
    "preview": "#!/usr/bin/env node\n/**\n * All Formats Example\n *\n * Demonstrates outputting content in all supported formats (markdown "
  },
  {
    "path": "examples/basic/basic-scrape.ts",
    "chars": 1591,
    "preview": "#!/usr/bin/env node\n/**\n * Basic Scraping Example\n *\n * Demonstrates simple single-URL scraping with reader\n */\n\nimport "
  },
  {
    "path": "examples/basic/batch-scrape.ts",
    "chars": 1739,
    "preview": "#!/usr/bin/env node\n/**\n * Batch Scraping Example\n *\n * Demonstrates concurrent scraping of multiple URLs\n */\n\nimport { "
  },
  {
    "path": "examples/basic/browser-pool-config.ts",
    "chars": 2315,
    "preview": "#!/usr/bin/env node\n/**\n * Browser Pool Configuration Example\n *\n * Demonstrates configuring the browser pool for high-t"
  },
  {
    "path": "examples/basic/browser-session-actions.ts",
    "chars": 3431,
    "preview": "#!/usr/bin/env node\n/**\n * Browser Session — Actions Example\n *\n * Demonstrates performing browser actions: clicking, ty"
  },
  {
    "path": "examples/basic/browser-session-puppeteer.ts",
    "chars": 2386,
    "preview": "#!/usr/bin/env node\n/**\n * Browser Session — Puppeteer Example\n *\n * Same browser session primitive, but using Puppeteer"
  },
  {
    "path": "examples/basic/browser-session-selenium.ts",
    "chars": 4834,
    "preview": "#!/usr/bin/env node\n/**\n * Browser Session — Selenium CDP Example\n *\n * Selenium 4+ supports direct CDP connections, byp"
  },
  {
    "path": "examples/basic/browser-session.ts",
    "chars": 3249,
    "preview": "#!/usr/bin/env node\n/**\n * Browser Session Example\n *\n * Demonstrates the browser() primitive — launches a Hero-stealthe"
  },
  {
    "path": "examples/basic/cloudflare-bypass.ts",
    "chars": 2182,
    "preview": "#!/usr/bin/env node\n/**\n * Cloudflare Bypass Example\n *\n * Demonstrates scraping a Cloudflare-protected website.\n * Read"
  },
  {
    "path": "examples/basic/crawl-website.ts",
    "chars": 1737,
    "preview": "#!/usr/bin/env node\n/**\n * Crawling Example\n *\n * Demonstrates website crawling with link discovery\n */\n\nimport { Reader"
  },
  {
    "path": "examples/basic/large-batch-scrape.ts",
    "chars": 5531,
    "preview": "#!/usr/bin/env node\n/**\n * Large-Scale Batch Scraping Example (1000 URLs)\n *\n * Demonstrates how to configure Reader for"
  },
  {
    "path": "examples/basic/proxy-pool.ts",
    "chars": 2942,
    "preview": "#!/usr/bin/env node\n/**\n * Proxy Pool Example\n *\n * Demonstrates configuring multiple proxies with rotation for scraping"
  },
  {
    "path": "examples/basic/with-proxy.ts",
    "chars": 1847,
    "preview": "#!/usr/bin/env node\n/**\n * Proxy Example\n *\n * Demonstrates scraping with a proxy configuration\n */\n\nimport { ReaderClie"
  },
  {
    "path": "examples/package.json",
    "chars": 819,
    "preview": "{\n  \"name\": \"reader-examples\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Examples for @vakra-dev/reader"
  },
  {
    "path": "examples/production/README.md",
    "chars": 1793,
    "preview": "# Production Examples\n\nProduction-ready setups for running Reader at scale.\n\n## Available Examples\n\n### [Express Server]"
  },
  {
    "path": "examples/production/browser-pool-scaling/README.md",
    "chars": 5863,
    "preview": "# Browser Pool Scaling\n\nAdvanced browser pool configuration with metrics, health monitoring, and scaling.\n\n## Overview\n\n"
  },
  {
    "path": "examples/production/browser-pool-scaling/package.json",
    "chars": 479,
    "preview": "{\n  \"name\": \"browser-pool-scaling-example\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Browser pool scal"
  },
  {
    "path": "examples/production/browser-pool-scaling/src/index.ts",
    "chars": 13240,
    "preview": "/**\n * Browser Pool Scaling Example\n *\n * Demonstrates advanced browser pool configuration with:\n * - Pool metrics endpo"
  },
  {
    "path": "examples/production/express-server/README.md",
    "chars": 2731,
    "preview": "# Express Server Example\n\nA production-ready Express server exposing Reader as a REST API.\n\n## Features\n\n- Health check "
  },
  {
    "path": "examples/production/express-server/package.json",
    "chars": 610,
    "preview": "{\n  \"name\": \"reader-express-server\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Express server example f"
  },
  {
    "path": "examples/production/express-server/src/index.ts",
    "chars": 7742,
    "preview": "/**\n * Express Server Example for Reader\n *\n * Demonstrates how to run Reader as a REST API.\n * Uses ReaderClient which "
  },
  {
    "path": "examples/production/job-queue-bullmq/README.md",
    "chars": 5095,
    "preview": "# Job Queue with BullMQ\n\nAsync job processing for Reader using BullMQ and Redis.\n\n## Overview\n\nThis example demonstrates"
  },
  {
    "path": "examples/production/job-queue-bullmq/package.json",
    "chars": 761,
    "preview": "{\n  \"name\": \"job-queue-bullmq-example\",\n  \"version\": \"1.0.0\",\n  \"private\": true,\n  \"description\": \"Async job queue examp"
  },
  {
    "path": "examples/production/job-queue-bullmq/src/index.ts",
    "chars": 8139,
    "preview": "/**\n * Job Queue API Server\n *\n * REST API for submitting and monitoring scrape jobs.\n * Jobs are processed asynchronous"
  },
  {
    "path": "examples/production/job-queue-bullmq/src/queue.ts",
    "chars": 2334,
    "preview": "/**\n * Queue Configuration\n *\n * Defines the BullMQ queue and job types for async scraping.\n */\n\nimport { Queue } from \""
  },
  {
    "path": "examples/production/job-queue-bullmq/src/worker.ts",
    "chars": 5126,
    "preview": "/**\n * Scrape Worker\n *\n * Processes scrape jobs from the BullMQ queue.\n * Run this as a separate process from the API s"
  },
  {
    "path": "examples/tsconfig.json",
    "chars": 596,
    "preview": "{\n  \"compilerOptions\": {\n    \"ignoreDeprecations\": \"6.0\",\n    \"target\": \"ESNext\",\n    \"module\": \"ESNext\",\n    \"moduleRes"
  },
  {
    "path": "package.json",
    "chars": 2481,
    "preview": "{\n  \"name\": \"@vakra-dev/reader\",\n  \"version\": \"0.2.0\",\n  \"description\": \"Open source, production grade web scraping engi"
  },
  {
    "path": "result.md",
    "chars": 1020,
    "preview": "{\n  \"data\": [\n    {\n      \"markdown\": \"Example Domain\\n\\n# Example Domain\\n\\nThis domain is for use in documentation exa"
  },
  {
    "path": "scripts/release.sh",
    "chars": 4216,
    "preview": "#!/usr/bin/env bash\n#\n# Release script for reader\n#\n# Usage:\n#   ./scripts/release.sh 0.2.0\n#   ./scripts/release.sh 0.2"
  },
  {
    "path": "src/browser/hero-config.ts",
    "chars": 4789,
    "preview": "import type { ProxyConfig } from \"../types\";\nimport { createProxyUrl } from \"../proxy/config\";\n\n/**\n * Hero configuratio"
  },
  {
    "path": "src/browser/pool.ts",
    "chars": 11653,
    "preview": "import Hero from \"@ulixee/hero\";\nimport { createHeroConfig } from \"./hero-config\";\nimport type {\n  BrowserInstance,\n  Qu"
  },
  {
    "path": "src/browser/proxy-bound-browser.ts",
    "chars": 18876,
    "preview": "/**\n * ProxyBoundBrowser — a single Hero instance pinned to exactly one proxy URL.\n *\n * This is the per-IP unit of the "
  },
  {
    "path": "src/browser/tiered-pool.ts",
    "chars": 15471,
    "preview": "/**\n * TieredBrowserPool — the top-level browser pool for Reader.\n *\n * Composes N ProxyBoundBrowser instances grouped b"
  },
  {
    "path": "src/browser/types.ts",
    "chars": 2625,
    "preview": "import type Hero from \"@ulixee/hero\";\n\n/**\n * Browser instance in the pool\n */\nexport interface BrowserInstance {\n  /** "
  },
  {
    "path": "src/browser-session.ts",
    "chars": 12288,
    "preview": "/**\n * Browser Session\n *\n * Launches a Chrome instance directly and returns a CDP WebSocket URL.\n * No Hero involvement"
  },
  {
    "path": "src/browser-types.ts",
    "chars": 2131,
    "preview": "import type { ProxyConfig, ProxyTier } from \"./types\";\n\n/**\n * Options for creating a browser session.\n *\n * A browser s"
  },
  {
    "path": "src/cli/index.ts",
    "chars": 21493,
    "preview": "#!/usr/bin/env node\n// Load .env from cwd before any code reads process.env. This makes\n// `PROXY_DATACENTER` / `PROXY_R"
  },
  {
    "path": "src/client.ts",
    "chars": 18003,
    "preview": "/**\n * ReaderClient\n *\n * A client wrapper that manages HeroCore lifecycle and provides\n * a simple interface for scrapi"
  },
  {
    "path": "src/cloudflare/detector.ts",
    "chars": 5769,
    "preview": "import type Hero from \"@ulixee/hero\";\nimport type { ChallengeDetection } from \"./types\";\n\n/**\n * CLOUDFLARE-SPECIFIC DOM"
  },
  {
    "path": "src/cloudflare/handler.ts",
    "chars": 5960,
    "preview": "import type Hero from \"@ulixee/hero\";\nimport { detectChallenge } from \"./detector\";\nimport type { ChallengeResolutionRes"
  },
  {
    "path": "src/cloudflare/types.ts",
    "chars": 1045,
    "preview": "/**\n * Cloudflare challenge detection result\n */\nexport interface ChallengeDetection {\n  /** Whether a challenge was det"
  },
  {
    "path": "src/config/domain-profiles.ts",
    "chars": 2517,
    "preview": "/**\n * Domain Profiles\n *\n * Per-domain scrape configuration overrides. Reader ships with NO\n * built-in profiles — the "
  },
  {
    "path": "src/crawl-types.ts",
    "chars": 4077,
    "preview": "import type { ScrapeResult, ProxyConfig, ProxyTier } from \"./types\";\nimport type { IBrowserPool } from \"./browser/types\""
  },
  {
    "path": "src/crawler.ts",
    "chars": 7965,
    "preview": "import { parseHTML } from \"linkedom\";\nimport {\n  resolveUrl,\n  isValidUrl,\n  isSameDomain,\n  getUrlKey,\n  isContentUrl,\n"
  },
  {
    "path": "src/daemon/client.ts",
    "chars": 5015,
    "preview": "/**\n * Daemon Client\n *\n * A client that connects to the daemon server via HTTP.\n * Used by CLI commands when a daemon i"
  },
  {
    "path": "src/daemon/index.ts",
    "chars": 326,
    "preview": "/**\n * Daemon module exports\n */\n\nexport { DaemonServer, DEFAULT_DAEMON_PORT, getDaemonInfo, getPidFilePath } from \"./se"
  },
  {
    "path": "src/daemon/server.ts",
    "chars": 17336,
    "preview": "/**\n * Daemon Server\n *\n * An HTTP server that wraps ReaderClient, allowing multiple CLI\n * commands to share a single b"
  },
  {
    "path": "src/engines/errors.ts",
    "chars": 2855,
    "preview": "/**\n * Engine error classes\n *\n * Used by the Hero engine and orchestrator to signal specific failure\n * conditions. Con"
  },
  {
    "path": "src/engines/hero/index.ts",
    "chars": 6398,
    "preview": "/**\n * Hero Engine - Full browser with JavaScript execution\n *\n * Uses Hero browser automation with a tiered browser poo"
  },
  {
    "path": "src/engines/index.ts",
    "chars": 689,
    "preview": "/**\n * Scraping Engine\n *\n * Hero-only engine with orchestrator for quality checks and\n * proxy block detection.\n */\n\n//"
  },
  {
    "path": "src/engines/orchestrator.ts",
    "chars": 4059,
    "preview": "/**\n * Engine Orchestrator\n *\n * Runs Hero against a URL, applies a minimal quality check, and returns\n * the result. De"
  },
  {
    "path": "src/engines/types.ts",
    "chars": 2334,
    "preview": "/**\n * Engine types for the scraping engine.\n *\n * Reader uses a single engine: Hero (Ulixee), a full browser with\n * Ja"
  },
  {
    "path": "src/errors.ts",
    "chars": 16056,
    "preview": "/**\n * Typed error classes for Reader\n *\n * Provides actionable error messages and structured error information\n * for b"
  },
  {
    "path": "src/formatters/html.ts",
    "chars": 469,
    "preview": "/**\n * HTML formatter\n *\n * Returns the cleaned HTML content as-is.\n * The content has already been processed by content"
  },
  {
    "path": "src/formatters/index.ts",
    "chars": 127,
    "preview": "// Export all formatters\nexport { formatToMarkdown, htmlToMarkdown } from \"./markdown\";\nexport { formatToHTML } from \"./"
  },
  {
    "path": "src/formatters/markdown.ts",
    "chars": 2015,
    "preview": "import { convert } from \"@vakra-dev/supermarkdown\";\nimport { logger } from \"../utils/logger.js\";\n\nconst log = logger.chi"
  },
  {
    "path": "src/formatters/postprocess.ts",
    "chars": 1549,
    "preview": "/**\n * Markdown post-processing.\n *\n * Light-touch cleanup on the markdown output from supermarkdown. Only\n * fixes patt"
  },
  {
    "path": "src/index.ts",
    "chars": 5069,
    "preview": "/**\n * @vakra-dev/reader\n *\n * Production-grade web scraping engine for LLMs.\n * Clean markdown output, ready for your a"
  },
  {
    "path": "src/proxy/config.ts",
    "chars": 1956,
    "preview": "import type { ProxyConfig } from \"../types\";\n\n/**\n * Create proxy URL from configuration\n *\n * Supports both datacenter "
  },
  {
    "path": "src/proxy/env.ts",
    "chars": 2999,
    "preview": "/**\n * Environment-driven proxy pool configuration.\n *\n * Lets operators configure datacenter and residential proxy pool"
  },
  {
    "path": "src/proxy/health-tracker.ts",
    "chars": 9539,
    "preview": "/**\n * ProxyHealthTracker — minimal per-proxy circuit breaker.\n *\n * Goal: detect a dead or blacklisted proxy mid-sessio"
  },
  {
    "path": "src/proxy/proxy-gate.ts",
    "chars": 7126,
    "preview": "/**\n * PerProxyGate — per-IP concurrency cap.\n *\n * Enforces a hard limit on the number of simultaneous scrapes that can"
  },
  {
    "path": "src/proxy/verify.ts",
    "chars": 5169,
    "preview": "/**\n * Startup-time proxy verification.\n *\n * Before the daemon declares itself ready, every configured proxy URL is\n * "
  },
  {
    "path": "src/scraper.ts",
    "chars": 18380,
    "preview": "import pLimit from \"p-limit\";\nimport { htmlToMarkdown } from \"./formatters/markdown\";\nimport { postprocessMarkdown } fro"
  },
  {
    "path": "src/types.ts",
    "chars": 15231,
    "preview": "import type { IBrowserPool } from \"./browser/types\";\n\n/**\n * Proxy configuration for Hero\n */\nexport interface ProxyConf"
  },
  {
    "path": "src/utils/block-detector.ts",
    "chars": 3571,
    "preview": "/**\n * Block Detector\n *\n * Detects bot-block pages that return HTTP 200 but contain\n * anti-bot content instead of actu"
  },
  {
    "path": "src/utils/content-cleaner.ts",
    "chars": 10202,
    "preview": "import { parseHTML } from \"linkedom\";\n\n/**\n * HTML content cleaning — minimal approach.\n *\n * Philosophy: strip only wha"
  },
  {
    "path": "src/utils/logger.ts",
    "chars": 1146,
    "preview": "import pino from \"pino\";\n\n/**\n * Logger type\n */\nexport type Logger = ReturnType<typeof createLogger>;\n\n/**\n * Check if "
  },
  {
    "path": "src/utils/metadata-extractor.ts",
    "chars": 8443,
    "preview": "import { parseHTML } from \"linkedom\";\nimport type { WebsiteMetadata } from \"../types\";\nimport { normalizeUrl } from \"./u"
  },
  {
    "path": "src/utils/rate-limiter.ts",
    "chars": 1578,
    "preview": "import pLimit from \"p-limit\";\n\n/**\n * Simple rate limit function\n */\nexport async function rateLimit(ms: number): Promis"
  },
  {
    "path": "src/utils/robots-parser.ts",
    "chars": 3972,
    "preview": "/**\n * Simple robots.txt parser for crawler compliance\n */\n\nexport interface RobotsRules {\n  disallowedPaths: string[];\n"
  },
  {
    "path": "src/utils/url-helpers.ts",
    "chars": 11366,
    "preview": "import { URL } from \"url\";\nimport RE2 from \"re2\";\n\n/**\n * URL validation and normalization utilities\n */\n\n/**\n * Resolve"
  },
  {
    "path": "src/utils/url-rewriter.ts",
    "chars": 1654,
    "preview": "/**\n * URL Rewriter\n *\n * Rewrites certain URLs to their export/download equivalents before scraping.\n * Reader ships wi"
  },
  {
    "path": "tests/engines/orchestrator.test.ts",
    "chars": 3303,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { EngineOrchestrator } from \"../../src/engines/orchestrator\";\nimpo"
  },
  {
    "path": "tests/fixtures/amazon-bot-page.html",
    "chars": 1627,
    "preview": "<html class=\"a-no-js\" lang=\"en-us\"><head>\n<title dir=\"ltr\">Amazon.com</title>\n</head>\n<body>\n<div class=\"a-container a-p"
  },
  {
    "path": "tests/fixtures/cloudflare-challenge.html",
    "chars": 747,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <title>Just a moment...</title>\n</head>\n<body>\n  <div id=\"challenge-running\">\n    <div c"
  },
  {
    "path": "tests/fixtures/empty-page.html",
    "chars": 71,
    "preview": "<!DOCTYPE html>\n<html><head><title></title></head><body></body></html>\n"
  },
  {
    "path": "tests/fixtures/simple-static.html",
    "chars": 1066,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n  <meta charset=\"utf-8\">\n  <title>Simple Test Page</title>\n  <meta name=\"descrip"
  },
  {
    "path": "tests/integration/daemon.test.ts",
    "chars": 6628,
    "preview": "import { describe, it, expect, beforeAll, afterAll } from \"vitest\";\nimport http from \"http\";\n\n/**\n * Daemon integration "
  },
  {
    "path": "tests/unit/block-detector-cloudflare.test.ts",
    "chars": 1771,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { readFileSync } from \"fs\";\nimport { join } from \"path\";\nimport { "
  },
  {
    "path": "tests/unit/block-detector-fixtures.test.ts",
    "chars": 993,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { readFileSync } from \"fs\";\nimport { join } from \"path\";\nimport { "
  },
  {
    "path": "tests/unit/block-detector.test.ts",
    "chars": 5614,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { detectBotPage, detectBotTitle, isBlockedResponse, type BlockDete"
  },
  {
    "path": "tests/unit/browser-session.test.ts",
    "chars": 1553,
    "preview": "/**\n * Browser Session Unit Tests\n *\n * Tests the findChromePath logic and session structure.\n * Full integration is tes"
  },
  {
    "path": "tests/unit/content-cleaner.test.ts",
    "chars": 9271,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { cleanContent } from \"../../src/utils/content-cleaner\";\n\ndescribe"
  },
  {
    "path": "tests/unit/crawler.test.ts",
    "chars": 13548,
    "preview": "/**\n * Crawler Tests\n *\n * Tests link extraction, depth limiting, maxPages cap, URL dedup,\n * same-domain filtering, and"
  },
  {
    "path": "tests/unit/daemon-dispatch.test.ts",
    "chars": 11077,
    "preview": "import { describe, it, expect, beforeEach, vi } from \"vitest\";\nimport { Readable } from \"stream\";\nimport http from \"http"
  },
  {
    "path": "tests/unit/domain-profiles.test.ts",
    "chars": 4928,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { getDomainProfile, applyDomainProfile } from \"../../src/config/do"
  },
  {
    "path": "tests/unit/errors.test.ts",
    "chars": 9444,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport {\n  ReaderError,\n  ReaderErrorCode,\n  NetworkError,\n  TimeoutError"
  },
  {
    "path": "tests/unit/health-tracker.test.ts",
    "chars": 8288,
    "preview": "import { describe, it, expect, vi, beforeEach } from \"vitest\";\nimport { ProxyHealthTracker } from \"../../src/proxy/healt"
  },
  {
    "path": "tests/unit/html-size-guard.test.ts",
    "chars": 1681,
    "preview": "import { describe, it, expect } from \"vitest\";\n\n/**\n * HTML Size Guard tests.\n *\n * The scraper truncates HTML > MAX_HTM"
  },
  {
    "path": "tests/unit/markdown-formatter.test.ts",
    "chars": 4407,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { htmlToMarkdown, formatToMarkdown } from \"../../src/formatters/ma"
  },
  {
    "path": "tests/unit/metadata-extractor.test.ts",
    "chars": 4053,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { extractMetadata } from \"../../src/utils/metadata-extractor\";\n\nde"
  },
  {
    "path": "tests/unit/postprocess.test.ts",
    "chars": 4740,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { postprocessMarkdown } from \"../../src/formatters/postprocess\";\n\n"
  },
  {
    "path": "tests/unit/proxy-bound-browser.test.ts",
    "chars": 12735,
    "preview": "import { describe, it, expect, vi } from \"vitest\";\nimport pino from \"pino\";\nimport {\n  ProxyBoundBrowser,\n  redactProxyU"
  },
  {
    "path": "tests/unit/proxy-config.test.ts",
    "chars": 1684,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { createProxyUrl, parseProxyUrl } from \"../../src/proxy/config\";\n\n"
  },
  {
    "path": "tests/unit/proxy-gate.test.ts",
    "chars": 9124,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { PerProxyGate } from \"../../src/proxy/proxy-gate\";\n\n/**\n * Helper"
  },
  {
    "path": "tests/unit/proxy-verify.test.ts",
    "chars": 5094,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { verifyProxies, verifyProxiesOrThrow } from \"../../src/proxy/veri"
  },
  {
    "path": "tests/unit/robots-parser.test.ts",
    "chars": 8136,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport {\n  parseRobotsTxt,\n  isPathAllowed,\n  isUrlAllowed,\n  type Robots"
  },
  {
    "path": "tests/unit/scraper-pipeline.test.ts",
    "chars": 12712,
    "preview": "/**\n * Scraper Content Pipeline Tests\n *\n * Tests the end-to-end content pipeline: raw HTML → metadata extraction →\n * c"
  },
  {
    "path": "tests/unit/scraper-retry.test.ts",
    "chars": 7558,
    "preview": "/**\n * Scraper Retry & Escalation Tests\n *\n * Tests the retry loop in Scraper.scrapeSingleUrlWithRetry:\n *   1. Datacent"
  },
  {
    "path": "tests/unit/tiered-pool.test.ts",
    "chars": 14386,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport pino from \"pino\";\nimport {\n  TieredBrowserPool,\n  buildTierConfigs"
  },
  {
    "path": "tests/unit/url-helpers.test.ts",
    "chars": 3198,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { isValidUrl, getUrlKey, isSameDomain, resolveUrl } from \"../../sr"
  },
  {
    "path": "tests/unit/url-rewriter.test.ts",
    "chars": 4358,
    "preview": "import { describe, it, expect } from \"vitest\";\nimport { rewriteUrl, type UrlRewriteRule } from \"../../src/utils/url-rewr"
  },
  {
    "path": "tsconfig.json",
    "chars": 760,
    "preview": "{\n  \"compilerOptions\": {\n    \"target\": \"ESNext\",\n    \"module\": \"ESNext\",\n    \"moduleResolution\": \"bundler\",\n    \"lib\": ["
  },
  {
    "path": "tsup.config.ts",
    "chars": 904,
    "preview": "import { defineConfig } from \"tsup\";\n\n// Packages that should not be bundled (native modules, CommonJS deps)\n// Packages"
  },
  {
    "path": "vitest.config.ts",
    "chars": 227,
    "preview": "import { defineConfig } from \"vitest/config\";\n\nexport default defineConfig({\n  test: {\n    globals: true,\n    environmen"
  }
]

About this extraction

This page contains the full source code of the vakra-dev/reader GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 147 files (751.3 KB), approximately 189.7k tokens, and a symbol index with 477 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo