Repository: vakra-dev/reader
Branch: main
Commit: fbf5a54bff96
Files: 147
Total size: 751.3 KB
Directory structure:
gitextract_cms0mrdu/
├── .eslintrc.json
├── .github/
│ └── workflows/
│ ├── ci.yml
│ └── publish.yml
├── .gitignore
├── .leasotrc
├── .nvmrc
├── .prettierrc
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── docs/
│ ├── api-reference.md
│ ├── architecture.md
│ ├── assets/
│ │ ├── .gitkeep
│ │ └── demo.tape
│ ├── deployment/
│ │ ├── docker.md
│ │ ├── job-queues.md
│ │ └── production-server.md
│ ├── getting-started.md
│ ├── guides/
│ │ ├── browser-pool.md
│ │ ├── browser-sessions.md
│ │ ├── cloudflare-bypass.md
│ │ ├── output-formats.md
│ │ └── proxy-configuration.md
│ └── troubleshooting.md
├── ecosystem.config.cjs
├── examples/
│ ├── .gitignore
│ ├── .nvmrc
│ ├── README.md
│ ├── ai-tools/
│ │ ├── README.md
│ │ ├── anthropic-summary.ts
│ │ ├── langchain-loader.ts
│ │ ├── llamaindex-loader.ts
│ │ ├── openai-summary.ts
│ │ ├── pinecone-ingest.ts
│ │ ├── qdrant-ingest.ts
│ │ └── vercel-ai-stream.ts
│ ├── basic/
│ │ ├── README.md
│ │ ├── all-formats.ts
│ │ ├── basic-scrape.ts
│ │ ├── batch-scrape.ts
│ │ ├── browser-pool-config.ts
│ │ ├── browser-session-actions.ts
│ │ ├── browser-session-puppeteer.ts
│ │ ├── browser-session-selenium.ts
│ │ ├── browser-session.ts
│ │ ├── cloudflare-bypass.ts
│ │ ├── crawl-website.ts
│ │ ├── large-batch-scrape.ts
│ │ ├── proxy-pool.ts
│ │ └── with-proxy.ts
│ ├── package.json
│ ├── production/
│ │ ├── README.md
│ │ ├── browser-pool-scaling/
│ │ │ ├── README.md
│ │ │ ├── package.json
│ │ │ └── src/
│ │ │ └── index.ts
│ │ ├── express-server/
│ │ │ ├── README.md
│ │ │ ├── package.json
│ │ │ └── src/
│ │ │ └── index.ts
│ │ └── job-queue-bullmq/
│ │ ├── README.md
│ │ ├── package.json
│ │ └── src/
│ │ ├── index.ts
│ │ ├── queue.ts
│ │ └── worker.ts
│ └── tsconfig.json
├── package.json
├── result.md
├── scripts/
│ └── release.sh
├── src/
│ ├── browser/
│ │ ├── hero-config.ts
│ │ ├── pool.ts
│ │ ├── proxy-bound-browser.ts
│ │ ├── tiered-pool.ts
│ │ └── types.ts
│ ├── browser-session.ts
│ ├── browser-types.ts
│ ├── cli/
│ │ └── index.ts
│ ├── client.ts
│ ├── cloudflare/
│ │ ├── detector.ts
│ │ ├── handler.ts
│ │ └── types.ts
│ ├── config/
│ │ └── domain-profiles.ts
│ ├── crawl-types.ts
│ ├── crawler.ts
│ ├── daemon/
│ │ ├── client.ts
│ │ ├── index.ts
│ │ └── server.ts
│ ├── engines/
│ │ ├── errors.ts
│ │ ├── hero/
│ │ │ └── index.ts
│ │ ├── index.ts
│ │ ├── orchestrator.ts
│ │ └── types.ts
│ ├── errors.ts
│ ├── formatters/
│ │ ├── html.ts
│ │ ├── index.ts
│ │ ├── markdown.ts
│ │ └── postprocess.ts
│ ├── index.ts
│ ├── proxy/
│ │ ├── config.ts
│ │ ├── env.ts
│ │ ├── health-tracker.ts
│ │ ├── proxy-gate.ts
│ │ └── verify.ts
│ ├── scraper.ts
│ ├── types.ts
│ └── utils/
│ ├── block-detector.ts
│ ├── content-cleaner.ts
│ ├── logger.ts
│ ├── metadata-extractor.ts
│ ├── rate-limiter.ts
│ ├── robots-parser.ts
│ ├── url-helpers.ts
│ └── url-rewriter.ts
├── tests/
│ ├── engines/
│ │ └── orchestrator.test.ts
│ ├── fixtures/
│ │ ├── amazon-bot-page.html
│ │ ├── cloudflare-challenge.html
│ │ ├── empty-page.html
│ │ └── simple-static.html
│ ├── integration/
│ │ └── daemon.test.ts
│ └── unit/
│ ├── block-detector-cloudflare.test.ts
│ ├── block-detector-fixtures.test.ts
│ ├── block-detector.test.ts
│ ├── browser-session.test.ts
│ ├── content-cleaner.test.ts
│ ├── crawler.test.ts
│ ├── daemon-dispatch.test.ts
│ ├── domain-profiles.test.ts
│ ├── errors.test.ts
│ ├── health-tracker.test.ts
│ ├── html-size-guard.test.ts
│ ├── markdown-formatter.test.ts
│ ├── metadata-extractor.test.ts
│ ├── postprocess.test.ts
│ ├── proxy-bound-browser.test.ts
│ ├── proxy-config.test.ts
│ ├── proxy-gate.test.ts
│ ├── proxy-verify.test.ts
│ ├── robots-parser.test.ts
│ ├── scraper-pipeline.test.ts
│ ├── scraper-retry.test.ts
│ ├── tiered-pool.test.ts
│ ├── url-helpers.test.ts
│ └── url-rewriter.test.ts
├── tsconfig.json
├── tsup.config.ts
└── vitest.config.ts
================================================
FILE CONTENTS
================================================
================================================
FILE: .eslintrc.json
================================================
{
"root": true,
"parser": "@typescript-eslint/parser",
"parserOptions": {
"ecmaVersion": "latest",
"sourceType": "module",
"project": true
},
"plugins": ["@typescript-eslint"],
"extends": [
"eslint:recommended",
"plugin:@typescript-eslint/recommended"
],
"env": {
"node": true,
"es2022": true
},
"rules": {
"@typescript-eslint/no-explicit-any": "warn",
"@typescript-eslint/no-unused-vars": ["error", { "argsIgnorePattern": "^_" }],
"@typescript-eslint/explicit-function-return-type": "off",
"@typescript-eslint/explicit-module-boundary-types": "off",
"@typescript-eslint/no-non-null-assertion": "warn",
"no-console": ["warn", { "allow": ["warn", "error"] }]
},
"ignorePatterns": ["dist/", "node_modules/", "*.js", "*.config.ts"]
}
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
- run: npm ci
- name: Typecheck
run: npx tsc --noEmit
- name: Lint
run: npm run lint
- name: Format check
run: npm run format:check
- name: Test
run: npm test
- name: Build
run: npm run build
================================================
FILE: .github/workflows/publish.yml
================================================
name: Publish to npm
on:
release:
types: [published]
jobs:
publish:
runs-on: ubuntu-latest
permissions:
contents: read
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "22"
registry-url: "https://registry.npmjs.org"
- run: npm ci
- name: Verify version matches tag
run: |
TAG_VERSION="${GITHUB_REF_NAME#v}"
PKG_VERSION=$(node -p "require('./package.json').version")
if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
echo "Error: Tag $TAG_VERSION does not match package.json $PKG_VERSION"
exit 1
fi
echo "Version verified: $PKG_VERSION"
- name: Build
run: npm run build
- name: Publish
run: npm publish --access public
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
================================================
FILE: .gitignore
================================================
# Dependencies
node_modules/
# Build output
dist/
# Environment files
.env
.env.local
.env.*.local
# Logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# OS files
.DS_Store
Thumbs.db
# IDE
.idea/
.vscode/
*.swp
*.swo
# Coverage
coverage/
.nyc_output/
# Package manager locks
# Note: package-lock.json is tracked for reproducible builds
yarn.lock
# Bun
bun.lockb
# Temporary files
tmp/
temp/
*.tmp
# Hero/Ulixee session data
.ulixee/
# Claude Code context
CLAUDE.md
# Deployment configs (contain sensitive data)
deploy/
================================================
FILE: .leasotrc
================================================
{
"tags": ["TODO", "FIXME", "HACK", "XXX", "BUG", "OPTIMIZE", "REVIEW"],
"ignore": ["node_modules/**", "dist/**"]
}
================================================
FILE: .nvmrc
================================================
v22.12.0
================================================
FILE: .prettierrc
================================================
{
"semi": true,
"singleQuote": false,
"tabWidth": 2,
"trailingComma": "es5",
"printWidth": 100,
"useTabs": false,
"bracketSpacing": true,
"arrowParens": "always",
"endOfLine": "lf"
}
================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use Reader in your research or project, please cite it."
title: "Reader: Open-source, production-grade web scraping engine built for LLMs"
type: software
authors:
- family-names: Kaul
given-names: Nihal
license: Apache-2.0
url: "https://github.com/vakra-dev/reader"
repository-code: "https://github.com/vakra-dev/reader"
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a welcoming experience for everyone, regardless of background or
identity.
## Our Standards
Examples of behavior that contributes to a positive environment:
- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Focusing on what is best for the community
- Showing empathy towards other community members
Examples of unacceptable behavior:
- Trolling, insulting or derogatory comments, and personal attacks
- Public or private harassment
- Publishing others' private information without explicit permission
- Other conduct which could reasonably be considered inappropriate in a professional setting
## Enforcement Responsibilities
Project maintainers are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate or harmful.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
## Enforcement
Instances of unacceptable behavior may be reported to the project maintainers at
**nihal.codes@gmail.com**. All complaints will be reviewed and investigated
promptly and fairly.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact:** Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence:** A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the behavior
was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact:** A violation through a single incident or series of actions.
**Consequence:** A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact:** A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence:** A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact:** Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence:** A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Reader
Thank you for your interest in contributing to Reader! This document provides guidelines and instructions for contributing.
## Development Setup
### Prerequisites
- **Node.js** >= 18 (v22 recommended)
- **npm** for package management
- **Git**
> **Note:** Always run scripts with Node.js (`npx tsx` or `node`) as Hero has ESM compatibility issues with other runtimes.
### Getting Started
1. **Fork the repository** on GitHub
2. **Clone your fork:**
```bash
git clone https://github.com/YOUR_USERNAME/reader.git
cd reader
```
3. **Install dependencies:**
```bash
npm install
```
4. **Verify setup:**
```bash
npm run typecheck
npm run build
```
5. **Test the CLI:**
```bash
npx tsx src/cli/index.ts scrape https://example.com
```
## Project Structure
```
src/
├── index.ts # Public API exports
├── client.ts # ReaderClient - main API entry point
├── scraper.ts # Scraper class - main scraping logic
├── crawler.ts # Crawler class - link discovery
├── types.ts # TypeScript types for scraping
├── crawl-types.ts # TypeScript types for crawling
│
├── browser/
│ ├── pool.ts # BrowserPool - manages Hero instances
│ ├── hero-config.ts # Hero configuration
│ └── types.ts # Pool types
│
├── cloudflare/
│ ├── detector.ts # Challenge detection
│ ├── handler.ts # Challenge resolution
│ └── types.ts # Cloudflare types
│
├── formatters/
│ ├── markdown.ts # Markdown formatter
│ ├── html.ts # HTML formatter
│ ├── postprocess.ts # Output post-processing
│ └── index.ts # Re-exports
│
├── utils/
│ ├── content-cleaner.ts # HTML content cleaning
│ ├── metadata-extractor.ts # Metadata extraction
│ ├── url-helpers.ts # URL utilities
│ ├── rate-limiter.ts # Rate limiting
│ └── logger.ts # Logging
│
├── proxy/
│ └── config.ts # Proxy configuration
│
├── daemon/
│ ├── index.ts # Module exports
│ ├── server.ts # DaemonServer - HTTP server with browser pool
│ └── client.ts # DaemonClient - connects CLI to daemon
│
└── cli/
└── index.ts # CLI implementation
```
## Development Workflow
### Running the CLI
```bash
# Run CLI directly
npx tsx src/cli/index.ts scrape https://example.com
# With verbose output
npx tsx src/cli/index.ts scrape https://example.com -v
# Show browser window
npx tsx src/cli/index.ts scrape https://example.com --show-chrome
```
### Daemon Mode
```bash
# Start daemon with browser pool
npx tsx src/cli/index.ts start --pool-size 5
# Check daemon status
npx tsx src/cli/index.ts status
# Run commands (auto-connects to daemon)
npx tsx src/cli/index.ts scrape https://example.com
# Force standalone mode (bypass daemon)
npx tsx src/cli/index.ts scrape https://example.com --standalone
# Stop daemon
npx tsx src/cli/index.ts stop
```
### Code Quality
Run these commands before submitting a PR:
```bash
# Type checking
npm run typecheck
# Linting
npm run lint
# Auto-fix lint issues
npm run lint:fix
# Format code
npm run format
# Check formatting
npm run format:check
# Build
npm run build
```
### Finding TODOs
Track outstanding work:
```bash
npm run todo
```
## Making Changes
### Branch Naming
- `feature/description` - New features
- `fix/description` - Bug fixes
- `docs/description` - Documentation updates
- `refactor/description` - Code refactoring
### Commit Messages
Write clear, concise commit messages:
```
type: short description
Longer description if needed.
```
Types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`
Examples:
```
feat: add support for custom user agents
fix: resolve timeout issue with Cloudflare challenges
docs: update proxy configuration guide
refactor: simplify browser pool recycling logic
```
### Pull Request Process
1. Create a new branch from `main`
2. Make your changes
3. Run all checks:
```bash
npm run lint
npm run format:check
npm run typecheck
npm test
npm run build
```
4. Push your branch and create a PR
5. Fill out the PR template
6. Wait for review
## Common Tasks
### Adding a New Output Format
1. Create `src/formatters/newformat.ts`:
```typescript
export function formatToNewFormat(
pages: Page[],
baseUrl: string,
scrapedAt: string,
duration: number,
metadata?: WebsiteMetadata
): string {
// Implementation
}
```
2. Export from `src/formatters/index.ts`
3. Add to format type in `src/types.ts`
4. Call formatter in `src/scraper.ts`
5. Update CLI validation in `src/cli/index.ts`
### Adding a New ScrapeOption
1. Add to `ScrapeOptions` interface in `src/types.ts`
2. Add default in `DEFAULT_OPTIONS`
3. Use in `Scraper` class via `this.options.newOption`
4. Add CLI flag in `src/cli/index.ts` if applicable
5. Update documentation
### Modifying Cloudflare Detection
1. Detection patterns: `src/cloudflare/detector.ts`
2. Resolution logic: `src/cloudflare/handler.ts`
3. Test with known Cloudflare-protected sites
### Adjusting Browser Pool
1. Default config: `src/browser/types.ts`
2. Pool logic: `src/browser/pool.ts`
## Testing
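Automated tests live in `tests/` and run with `npm test` (CI runs them on every push and pull request to `main`). When adding new features, also verify them manually: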
1. **Test basic functionality:**
```bash
npx tsx src/cli/index.ts scrape https://example.com
```
2. **Test Cloudflare-protected sites:**
```bash
npx tsx src/cli/index.ts scrape https://cloudflare-protected-site.com -v
```
3. **Test different output formats:**
```bash
npx tsx src/cli/index.ts scrape https://example.com -f markdown,html,json,text
```
4. **Test crawling:**
```bash
npx tsx src/cli/index.ts crawl https://example.com -d 2 -m 10
```
5. **Test batch scraping:**
```bash
npx tsx src/cli/index.ts scrape url1 url2 url3 -c 3 -v
```
6. **Test daemon mode:**
```bash
# Start daemon
npx tsx src/cli/index.ts start --pool-size 3
# Test scraping via daemon
npx tsx src/cli/index.ts scrape https://example.com
# Check status
npx tsx src/cli/index.ts status
# Stop daemon
npx tsx src/cli/index.ts stop
```
## Running Examples
The `examples/` folder contains working examples:
```bash
cd examples
npm install
# Basic examples
npx tsx basic/basic-scrape.ts
npx tsx basic/batch-scrape.ts
npx tsx basic/crawl-website.ts
# AI integration examples (requires API keys)
export OPENAI_API_KEY="sk-..."
npx tsx ai-tools/openai-summary.ts https://example.com
# Production server
npx tsx production/express-server/src/index.ts
```
## Code Style
- Use TypeScript for all new code
- Follow existing patterns in the codebase
- Use async/await instead of callbacks
- Prefer explicit types over `any`
- Use meaningful variable and function names
- Add JSDoc comments for public APIs
## Documentation
When making changes:
1. Update relevant markdown files in `docs/`
2. Update README.md if adding new features
3. Add JSDoc comments to new public functions
4. If you keep a local `CLAUDE.md` for AI context, update it when the architecture changes (the file is git-ignored, so it will not be part of your PR)
### Documentation Files
| File | Purpose |
| ------------------------- | ------------------------------- |
| `README.md` | Main documentation, quick start |
| `CONTRIBUTING.md` | This file |
| `docs/getting-started.md` | Detailed setup guide |
| `docs/api-reference.md` | Complete API docs |
| `docs/architecture.md` | System design |
| `docs/troubleshooting.md` | Common issues |
| `docs/guides/` | Feature guides |
| `docs/deployment/` | Deployment guides |
## Reporting Issues
When reporting bugs, please include:
- Operating system and version
- Node.js version (`node --version`)
- Reader version
- Steps to reproduce
- Expected vs actual behavior
- Error messages and stack traces
- Verbose output (`-v` flag)
## Code of Conduct
- Be respectful and inclusive
- Focus on constructive feedback
- Help others learn and grow
- Follow project guidelines
## License
By contributing, you agree that your contributions will be licensed under the Apache 2.0 License.
## Disclaimer
By using Reader, you agree to the following:
- You are solely responsible for respecting websites' policies when scraping and crawling
- You will adhere to applicable privacy policies and terms of use before initiating scraping activities
- Reader respects robots.txt directives by default, but ultimate compliance is your responsibility
## Questions?
- Check the [documentation](https://docs.reader.dev)
- Search [GitHub Issues](https://github.com/vakra-dev/reader/issues)
- Ask in [Discord](https://discord.gg/6tjkq7J5WV)
- Open a new issue or discussion
Thank you for contributing!
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to the Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright (c) 2026 vakra-dev
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
Reader
Open source web infrastructure for AI.
Access the web without the complexity.
Docs · Examples · Discord
## The Problem
Building agents that need web access is frustrating. You piece together Puppeteer, add stealth plugins, fight Cloudflare, manage proxies, and it still breaks in production.
Because production-grade web scraping isn't about rendering a page and converting HTML to markdown. It's about everything underneath:
| Layer | What it actually takes |
| ------------------------ | ------------------------------------------------------------------- |
| **Browser architecture** | Managing browser instances at scale, not one-off scripts |
| **Anti-bot bypass** | Cloudflare, Turnstile, JS challenges, they all block naive scrapers |
| **TLS fingerprinting** | Real browsers have fingerprints. Puppeteer doesn't. Sites know. |
| **Proxy infrastructure** | Datacenter vs residential, rotation strategies, sticky sessions |
| **Resource management** | Browser pooling, memory limits, graceful recycling |
| **Reliability** | Rate limiting, retries, timeouts, caching, graceful degradation |
I built **Reader**, a production-grade web scraping engine on top of [Ulixee Hero](https://ulixee.org/), a headless browser designed for exactly this.
## The Solution
Three primitives. That's it.
```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";
const reader = new ReaderClient();
// 1. Scrape URLs → clean markdown
const result = await reader.scrape({ urls: ["https://example.com"] });
console.log(result.data[0].markdown);
// 2. Crawl a site → discover + scrape pages
const pages = await reader.crawl({
url: "https://example.com",
depth: 2,
scrape: true,
});
console.log(`Found ${pages.urls.length} pages`);
// 3. Browser session → full Playwright/Puppeteer control with stealth
const session = await reader.browser();
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const page = browser.contexts()[0].pages()[0];
await page.goto("https://example.com");
console.log(await page.title());
await session.close();
```
All the hard stuff (browser pooling, anti-bot bypass, proxy rotation, retries) happens under the hood. You get clean markdown. Your agents get the web. And when you need full browser control, `browser()` gives you a stealthed Chrome that Playwright or Puppeteer can drive.
> [!TIP]
> If Reader is useful to you, a [star on GitHub](https://github.com/vakra-dev/reader) helps others discover the project.
## Features
- **Browser Sessions** - Launch stealthed Chrome, connect Playwright/Puppeteer via CDP
- **Anti-Bot Bypass** - TLS fingerprinting, navigator spoofing, WebRTC masking, `webdriver=false`
- **Clean Output** - Markdown and HTML with automatic main content extraction
- **Smart Content Cleaning** - Removes nav, headers, footers, popups, cookie banners
- **CLI & API** - Use from command line or programmatically
- **Browser Pool** - Auto-recycling, health monitoring, tiered proxy pools
- **Concurrent Scraping** - Parallel URL processing with progress tracking
- **Website Crawling** - BFS link discovery with depth/page limits
- **Tiered Proxies** - Datacenter and residential pools with auto-escalation and health tracking
## Installation
```bash
npm install @vakra-dev/reader
```
**Requirements:** Node.js >= 18
> **Apple Silicon (M1/M2/M3):** Hero's bundled Chrome binary isn't available for arm64. Point to your system Chrome:
>
> ```bash
> export CHROME_139_BIN="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
> ```
## Quick Start
### Cloud (Fastest)
Get an API key at [app.reader.dev](https://app.reader.dev) and start scraping immediately:
```typescript
import { ReaderClient } from "@vakra-dev/reader-js";
const reader = new ReaderClient({ apiKey: process.env.READER_API_KEY });
const result = await reader.read({ url: "https://example.com" });
if (result.kind === "scrape") {
console.log(result.data.markdown);
}
```
```bash
npm install @vakra-dev/reader-js
```
See the [cloud docs](https://docs.reader.dev) for the full API reference.
### Self-Hosted
Install the reader engine and run scraping on your own infrastructure:
### Basic Scrape
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient();
const result = await reader.scrape({
urls: ["https://example.com"],
formats: ["markdown", "html"],
});
console.log(result.data[0].markdown);
console.log(result.data[0].html);
await reader.close();
```
### Batch Scraping with Concurrency
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient();
const result = await reader.scrape({
urls: ["https://example.com", "https://example.org", "https://example.net"],
formats: ["markdown"],
batchConcurrency: 3,
onProgress: (progress) => {
console.log(`${progress.completed}/${progress.total}: ${progress.currentUrl}`);
},
});
console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);
await reader.close();
```
### Crawling
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient();
const result = await reader.crawl({
url: "https://example.com",
depth: 2,
maxPages: 20,
scrape: true,
});
console.log(`Discovered ${result.urls.length} URLs`);
console.log(`Scraped ${result.scraped?.batchMetadata.successfulUrls} pages`);
await reader.close();
```
### Browser Session
Launch a stealthed Chrome and control it with Playwright or Puppeteer. The browser has anti-bot stealth active (`webdriver=false`, navigator spoofing, WebRTC masking). Your existing scripts just work.
```typescript
import { ReaderClient } from "@vakra-dev/reader";
import { chromium } from "playwright-core";
const reader = new ReaderClient();
// Create a browser session - returns a CDP WebSocket URL
const session = await reader.browser();
// Connect Playwright (one-line change from a local script)
const browser = await chromium.connectOverCDP(session.wsEndpoint);
const context = await browser.newContext();
const page = await context.newPage();
// Use Playwright normally - full stealth active
await page.goto("https://news.ycombinator.com/");
console.log(await page.title());
await browser.close();
await session.close();
await reader.close();
```
Also works with Puppeteer:
```typescript
import { connect } from "puppeteer-core";
const browser = await connect({ browserWSEndpoint: session.wsEndpoint });
```
### With Proxy
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient();
const result = await reader.scrape({
urls: ["https://example.com"],
formats: ["markdown"],
proxy: {
type: "residential",
host: "proxy.example.com",
port: 8080,
username: "username",
password: "password",
country: "us",
},
});
await reader.close();
```
### With Tiered Proxy Pools
Configure datacenter (fast, cheap) and residential (anti-bot) proxy tiers. Reader auto-escalates from datacenter to residential when sites block:
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient({
proxyPools: {
datacenter: [
{ url: "http://user:pass@dc-proxy1:8080" },
{ url: "http://user:pass@dc-proxy2:8080" },
],
residential: [{ url: "http://user:pass@res-proxy1:8080" }],
},
});
const result = await reader.scrape({
urls: ["https://example.com"],
proxyTier: "auto", // datacenter first, escalate to residential on block
});
await reader.close();
```
Or via environment variables:
```bash
PROXY_DATACENTER=http://user:pass@dc1:8080,http://user:pass@dc2:8080
PROXY_RESIDENTIAL=http://user:pass@res1:8080
```
### With Browser Pool Configuration
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient({
browserPool: {
size: 5, // 5 browser instances
retireAfterPages: 50, // Recycle after 50 pages
retireAfterMinutes: 15, // Recycle after 15 minutes
},
verbose: true,
});
const result = await reader.scrape({
urls: manyUrls,
batchConcurrency: 5,
});
await reader.close();
```
## CLI Reference
### Daemon Mode
For multiple requests, start a daemon to keep the browser pool warm:
```bash
# Start daemon with browser pool
npx reader start --direct-pool-size 5
# All subsequent commands auto-connect to daemon
npx reader scrape https://example.com
npx reader crawl https://example.com -d 2
# Check daemon status
npx reader status
# Stop daemon
npx reader stop
# Force standalone mode (bypass daemon)
npx reader scrape https://example.com --standalone
```
### `reader scrape <urls...>`
Scrape one or more URLs.
```bash
# Scrape a single URL
npx reader scrape https://example.com
# Scrape with multiple formats
npx reader scrape https://example.com -f markdown,html
# Scrape multiple URLs concurrently
npx reader scrape https://example.com https://example.org -c 2
# Save to file
npx reader scrape https://example.com -o output.md
```
| Option | Type | Default | Description |
| ------------------------ | ------ | ------------ | ------------------------------------------------------- |
| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html) |
| `-o, --output <file>` | string | stdout | Output file path |
| `-c, --concurrency <n>` | number | `1` | Parallel requests |
| `-t, --timeout <ms>` | number | `30000` | Request timeout in milliseconds |
| `--batch-timeout <ms>` | number | `300000` | Total timeout for entire batch operation |
| `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) |
| `--user-agent <string>` | string | - | Custom user agent string |
| `--show-chrome` | flag | - | Show browser window for debugging |
| `--no-main-content` | flag | - | Disable main content extraction (include full page) |
| `--include-tags <tags>` | string | - | CSS selectors for elements to include (comma-separated) |
| `--exclude-tags <tags>` | string | - | CSS selectors for elements to exclude (comma-separated) |
| `-v, --verbose` | flag | - | Enable verbose logging |
### `reader crawl <url>`
Crawl a website to discover pages.
```bash
# Crawl with default settings
npx reader crawl https://example.com
# Crawl deeper with more pages
npx reader crawl https://example.com -d 3 -m 50
# Crawl and scrape content
npx reader crawl https://example.com -d 2 --scrape
# Filter URLs with patterns
npx reader crawl https://example.com --include "blog/*" --exclude "admin/*"
```
| Option | Type | Default | Description |
| ------------------------ | ------ | ------------ | ----------------------------------------------- |
| `-d, --depth <n>` | number | `1` | Maximum crawl depth |
| `-m, --max-pages <n>` | number | `20` | Maximum pages to discover |
| `-s, --scrape` | flag | - | Also scrape content of discovered pages |
| `-f, --format <formats>` | string | `"markdown"` | Output formats when scraping (comma-separated) |
| `-o, --output <file>` | string | stdout | Output file path |
| `--delay <ms>` | number | `1000` | Delay between requests in milliseconds |
| `-t, --timeout <ms>` | number | - | Total timeout for crawl operation |
| `--include <patterns>` | string | - | URL patterns to include (comma-separated regex) |
| `--exclude <patterns>` | string | - | URL patterns to exclude (comma-separated regex) |
| `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) |
| `--user-agent <string>` | string | - | Custom user agent string |
| `--show-chrome` | flag | - | Show browser window for debugging |
| `-v, --verbose` | flag | - | Enable verbose logging |
### `reader browser`
Launch a browser session with a CDP WebSocket endpoint.
```bash
# Create a session (prints wsEndpoint, blocks until Ctrl+C)
npx reader browser create
# Create with options
npx reader browser create --timeout 60000 --show-chrome
# List active sessions (daemon mode)
npx reader browser list
# Stop a session
npx reader browser stop <session-id>
```
| Option | Type | Default | Description |
| -------------------- | ------ | -------- | -------------------------------- |
| `--proxy <url>` | string | - | Proxy URL |
| `-t, --timeout <ms>` | number | `300000` | Session lifetime in milliseconds |
| `--show-chrome` | flag | - | Show browser window |
| `--standalone` | flag | - | Force standalone mode |
| `-v, --verbose` | flag | - | Enable verbose logging |
## API Reference
### `ReaderClient`
The recommended way to use Reader. Manages HeroCore lifecycle automatically.
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient({ verbose: true });
// Scrape
const result = await reader.scrape({ urls: ["https://example.com"] });
// Crawl
const crawlResult = await reader.crawl({ url: "https://example.com", depth: 2 });
// Browser session
const session = await reader.browser();
// → session.wsEndpoint for Playwright/Puppeteer
// Close when done (optional - auto-closes on exit)
await reader.close();
```
#### Constructor Options
| Option | Type | Default | Description |
| --------------- | ------------------- | --------------- | ------------------------------------------------ |
| `verbose` | `boolean` | `false` | Enable verbose logging |
| `showChrome` | `boolean` | `false` | Show browser window for debugging |
| `browserPool` | `BrowserPoolConfig` | `undefined` | Browser pool configuration (size, recycling) |
| `proxyPools` | `ProxyPoolConfig` | `undefined` | Tiered proxy pools (datacenter + residential) |
| `proxies` | `ProxyConfig[]` | `undefined` | Array of proxies for rotation (legacy) |
| `proxyRotation` | `string` | `"round-robin"` | Rotation strategy: `"round-robin"` or `"random"` |
#### BrowserPoolConfig
| Option | Type | Default | Description |
| -------------------- | -------- | ------- | ----------------------------------- |
| `size` | `number` | `2` | Number of browser instances in pool |
| `retireAfterPages` | `number` | `100` | Recycle browser after N page loads |
| `retireAfterMinutes` | `number` | `30` | Recycle browser after N minutes |
| `maxQueueSize` | `number` | `100` | Max pending requests in queue |
#### Methods
| Method | Description |
| ------------------- | -------------------------------------------------- |
| `scrape(options)` | Scrape one or more URLs |
| `crawl(options)` | Crawl a website to discover pages |
| `browser(options?)` | Launch a stealthed browser session (CDP WebSocket) |
| `start()` | Pre-initialize HeroCore (optional) |
| `isReady()` | Check if client is initialized |
| `close()` | Close client and release resources |
### `scrape(options): Promise<ScrapeResult>`
Scrape one or more URLs. Can be used directly or via `ReaderClient`.
| Option | Type | Required | Default | Description |
| ------------------ | ----------------------------- | -------- | -------------- | --------------------------------------------------------------- |
| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
| `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats |
| `onlyMainContent` | `boolean` | No | `true` | Extract only main content (removes nav/header/footer) |
| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |
| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |
| `waitForSelector` | `string` | No | - | CSS selector to wait for before the page is considered loaded |
| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
| `batchConcurrency` | `number` | No | `1` | Number of URLs to process in parallel |
| `batchTimeoutMs` | `number` | No | `300000` | Total timeout for entire batch operation |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration object |
| `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `onProgress` | `function` | No | - | Progress callback: `({ completed, total, currentUrl }) => void` |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging |
**Returns:** `Promise<ScrapeResult>`
```typescript
interface ScrapeResult {
data: WebsiteScrapeResult[];
batchMetadata: BatchMetadata;
}
interface WebsiteScrapeResult {
markdown?: string;
html?: string;
metadata: {
baseUrl: string;
finalUrl?: string; // Present if URL redirected
totalPages: number;
scrapedAt: string;
duration: number;
website: WebsiteMetadata;
};
}
interface BatchMetadata {
totalUrls: number;
successfulUrls: number;
failedUrls: number;
scrapedAt: string;
totalDuration: number;
errors?: Array<{ url: string; error: string }>;
}
```
### `crawl(options): Promise<CrawlResult>`
Crawl a website to discover pages.
| Option | Type | Required | Default | Description |
| ------------------- | ----------------------------- | -------- | -------------- | ----------------------------------------------- |
| `url` | `string` | Yes | - | Single seed URL to start crawling from |
| `depth` | `number` | No | `1` | Maximum depth to crawl |
| `maxPages` | `number` | No | `20` | Maximum pages to discover |
| `scrape` | `boolean` | No | `false` | Also scrape full content of discovered pages |
| `delayMs` | `number` | No | `1000` | Delay between requests in milliseconds |
| `timeoutMs` | `number` | No | - | Total timeout for entire crawl operation |
| `includePatterns` | `string[]` | No | - | URL patterns to include (regex strings) |
| `excludePatterns` | `string[]` | No | - | URL patterns to exclude (regex strings) |
| `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats for scraped content |
| `scrapeConcurrency` | `number` | No | `2` | Number of URLs to scrape in parallel |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration object |
| `userAgent` | `string` | No | - | Custom user agent string |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging |
| `connectionToCore` | `any` | No | - | Connection to shared Hero Core (for production) |
**Returns:** `Promise<CrawlResult>`
```typescript
interface CrawlResult {
urls: CrawlUrl[];
scraped?: ScrapeResult;
metadata: CrawlMetadata;
}
interface CrawlUrl {
url: string;
title: string;
description: string | null;
}
interface CrawlMetadata {
totalUrls: number;
maxDepth: number;
totalDuration: number;
seedUrl: string;
}
```
### `browser(options?): Promise<BrowserSession>`
Launch a stealthed Chrome and return a CDP WebSocket URL for Playwright/Puppeteer.
| Option | Type | Required | Default | Description |
| ------------ | ------------- | -------- | -------- | ----------------------------------------------------- |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `timeoutMs` | `number` | No | `300000` | Session lifetime (auto-closes after) |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
**Returns:** `Promise<BrowserSession>`
```typescript
interface BrowserSession {
sessionId: string; // Unique session identifier
wsEndpoint: string; // CDP WebSocket URL for Playwright/Puppeteer
createdAt: string; // ISO timestamp
close(): Promise<void>; // Close session and release resources
}
```
**Stealth features active on all sessions:**
- `navigator.webdriver = false` (via `--disable-blink-features=AutomationControlled`)
- Proxy routing through authenticated proxy forwarder (if configured)
- Isolated user profile per session (no cookie/state leaks)
### ProxyConfig
| Option | Type | Required | Default | Description |
| ---------- | ------------------------------- | -------- | ------- | ------------------------------------------------------- |
| `url` | `string` | No | - | Full proxy URL (takes precedence over other fields) |
| `type` | `"datacenter" \| "residential"` | No | - | Proxy type |
| `host` | `string` | No | - | Proxy host |
| `port` | `number` | No | - | Proxy port |
| `username` | `string` | No | - | Proxy username |
| `password` | `string` | No | - | Proxy password |
| `country` | `string` | No | - | Country code for residential proxies (e.g., 'us', 'uk') |
## Daemon Mode (Production)
For production servers, start the daemon once and all scrape/crawl/browser requests share the warm browser pool:
```typescript
import { ReaderClient } from "@vakra-dev/reader";
// Create once at startup
const reader = new ReaderClient({
proxyPools: {
datacenter: [{ url: "http://user:pass@dc-proxy:8080" }],
residential: [{ url: "http://user:pass@res-proxy:8080" }],
},
});
// Reuse for all requests
const result = await reader.scrape({ urls: ["https://example.com"] });
// Graceful shutdown
process.on("SIGTERM", () => reader.close());
```
## How It Works
### Anti-Bot Bypass
Reader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced anti-detection:
1. **TLS Fingerprinting** - Emulates real Chrome browser fingerprints via MITM proxy
2. **Navigator Spoofing** - `webdriver=false`, device memory, hardware concurrency
3. **DNS over TLS** - Uses Cloudflare DNS (1.1.1.1) to mimic Chrome behavior
4. **WebRTC IP Masking** - Prevents IP leaks through WebRTC connections
5. **WebGL/Canvas Fingerprinting** - Randomized rendering signatures
### Browser Pool
- **Tiered Proxy Pools** - Separate datacenter and residential pools with auto-escalation
- **Auto-Recycling** - Browsers recycled after 100 requests or 30 minutes
- **Health Tracking** - Auto-benches failed proxies for 5 minutes, revives on recovery
- **Per-Proxy Concurrency** - Limits concurrent requests per proxy URL (default: 2)
### HTML to Markdown: supermarkdown
Reader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines.
**Why we built it:**
When you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown.
**What supermarkdown offers:**
| Feature | Benefit |
| -------------------- | ---------------------------------------------------- |
| **Written in Rust** | Native performance with Node.js bindings via napi-rs |
| **Full GFM support** | Tables, task lists, strikethrough, autolinks |
| **LLM-optimized** | Clean output designed for AI consumption |
| **Battle-tested** | Handles malformed HTML from real web pages |
| **CSS selectors** | Include/exclude elements during conversion |
supermarkdown is open source and available as both a Rust crate and npm package:
```bash
# npm
npm install @vakra-dev/supermarkdown
# Rust
cargo add supermarkdown
```
Check out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation.
## Server Deployment
Reader uses a real Chromium browser under the hood. On headless Linux servers (VPS, EC2, etc.), you need to install Chrome's system dependencies:
```bash
# Debian/Ubuntu
sudo apt-get install -y libnspr4 libnss3 libatk1.0-0 libatk-bridge2.0-0 \
libcups2 libxcb1 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 libasound2
```
This is the same requirement that Puppeteer and Playwright have on headless Linux. macOS, Windows, and Linux desktops already have these libraries.
For Docker and production deployment guides, see the [deployment documentation](https://docs.reader.dev/documentation/guides/deployment).
## Documentation
Full documentation is available at **[docs.reader.dev](https://docs.reader.dev)**, including guides for scraping, crawling, proxy configuration, browser pool management, and deployment.
### Examples
| Example | Description |
| -------------------------------------------------------------------------- | ---------------------------------------------- |
| [Basic Scraping](examples/basic/basic-scrape.ts) | Simple single-URL scraping |
| [Batch Scraping](examples/basic/batch-scrape.ts) | Concurrent multi-URL scraping |
| [Crawl Website](examples/basic/crawl-website.ts) | Crawl and discover pages |
| [Browser Session (Playwright)](examples/basic/browser-session.ts) | Navigate, extract data, screenshot |
| [Browser Session (Actions)](examples/basic/browser-session-actions.ts) | Click, type, search, wait for elements |
| [Browser Session (Puppeteer)](examples/basic/browser-session-puppeteer.ts) | Puppeteer via `connect({ browserWSEndpoint })` |
| [Browser Session (Raw CDP)](examples/basic/browser-session-selenium.ts) | Direct CDP WebSocket commands |
| [Browser Pool Config](examples/basic/browser-pool-config.ts) | Configure browser pool for high throughput |
| [Proxy Pool](examples/basic/proxy-pool.ts) | Proxy rotation with multiple proxies |
| [Cloudflare Bypass](examples/basic/cloudflare-bypass.ts) | Scrape Cloudflare-protected sites |
| [All Formats](examples/basic/all-formats.ts) | Output in markdown and html |
| [AI Tools](examples/ai-tools/) | OpenAI, Anthropic, LangChain integrations |
## Development
```bash
# Install dependencies
npm install
# Run linting
npm run lint
# Format code
npm run format
# Type check
npm run typecheck
# Find TODOs
npm run todo
```
## Contributing
Contributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
## License
[Apache 2.0](LICENSE) - See LICENSE for details.
## Citation
If you use Reader in your research or project, please cite it:
```bibtex
@software{reader.dev,
author = {Kaul, Nihal},
title = {Reader: Open-source, production-grade web scraping engine built for LLMs},
year = {2026},
publisher = {GitHub},
url = {https://github.com/vakra-dev/reader}
}
```
## Support
- [GitHub Issues](https://github.com/vakra-dev/reader/issues)
- [Documentation](https://docs.reader.dev)
- [Discord](https://discord.gg/6tjkq7J5WV)
================================================
FILE: SECURITY.md
================================================
# Security Policy
## Supported Versions
| Version | Supported |
| ------- | --------- |
| Latest | Yes |
We only provide security fixes for the latest release.
## Reporting a Vulnerability
If you discover a security vulnerability in Reader, please report it responsibly.
**Do not open a public GitHub issue for security vulnerabilities.**
Instead, email **nihal.codes@gmail.com** with:
- A description of the vulnerability
- Steps to reproduce the issue
- The potential impact
- Any suggested fixes (optional)
## What to Expect
- **Acknowledgment** within 48 hours of your report
- **Status update** within 7 days with an assessment and timeline
- **Credit** in the release notes (unless you prefer to remain anonymous)
## Scope
The following are in scope:
- The `@vakra-dev/reader` npm package
- The Reader CLI tool
- The Reader Cloud API (`cloud.reader.dev`)
The following are out of scope:
- Vulnerabilities in upstream dependencies (report these to the respective projects)
- Issues related to websites blocking scraping (this is expected behavior, not a vulnerability)
## Responsible Use
Reader is a web scraping tool. Users are responsible for complying with applicable laws and website terms of service. The project maintainers are not responsible for how the tool is used.
================================================
FILE: docs/api-reference.md
================================================
# API Reference
Complete API documentation for Reader.
## ReaderClient (Recommended)
The recommended way to use Reader. Manages HeroCore lifecycle automatically, reuses connections efficiently, and auto-closes on process exit.
```typescript
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient({ verbose: true });
// Scrape URLs
const result = await reader.scrape({
urls: ["https://example.com"],
formats: ["markdown"],
});
// Crawl a website
const crawlResult = await reader.crawl({
url: "https://example.com",
depth: 2,
});
// Launch a stealthed browser session
const session = await reader.browser();
// → session.wsEndpoint for Playwright/Puppeteer
// Close when done (optional - auto-closes on exit)
await reader.close();
```
### Constructor
```typescript
new ReaderClient(options?: ReaderClientOptions)
```
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `verbose` | `boolean` | `false` | Enable verbose logging |
| `showChrome` | `boolean` | `false` | Show browser window for debugging |
| `browserPool` | `BrowserPoolConfig` | - | Browser pool configuration |
| `proxyPools` | `ProxyPoolConfig` | - | Tiered proxy pools (datacenter + residential) |
| `proxies` | `ProxyConfig[]` | - | List of proxies to rotate through (legacy) |
| `proxyRotation` | `"round-robin" \| "random"` | `"round-robin"` | Proxy rotation strategy |
#### ProxyPoolConfig
```typescript
interface ProxyPoolConfig {
datacenter?: ProxyConfig[]; // Fast, cheap - works for most sites
residential?: ProxyConfig[]; // Slower, anti-bot sites (Amazon, LinkedIn)
}
```
#### BrowserPoolConfig
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Retire browser after N page loads |
| `retireAfterMinutes` | `number` | `30` | Retire browser after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests in queue |
### Methods
#### start()
Pre-initialize HeroCore. Called automatically on first scrape/crawl.
```typescript
await reader.start(): Promise<void>
```
#### scrape(options)
Scrape one or more URLs.
```typescript
const result = await reader.scrape(options): Promise<ScrapeResult>
```
See [ScrapeOptions](#scrapeoptions) for available options.
#### crawl(options)
Crawl a website to discover pages.
```typescript
const result = await reader.crawl(options): Promise<CrawlResult>
```
See [CrawlOptions](#crawloptions) for available options.
#### browser(options?)
Launch a stealthed browser session and return a CDP WebSocket URL for Playwright/Puppeteer.
```typescript
const session = await reader.browser(options?): Promise<BrowserSession>
```
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `proxy` | `ProxyConfig` | - | Proxy configuration |
| `proxyTier` | `ProxyTier` | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `showChrome` | `boolean` | `false` | Show browser window |
| `timeoutMs` | `number` | `300000` | Session lifetime (auto-closes after) |
| `verbose` | `boolean` | `false` | Enable verbose logging |
Returns:
```typescript
interface BrowserSession {
sessionId: string; // Unique session identifier
wsEndpoint: string; // CDP WebSocket URL
createdAt: string; // ISO timestamp
close(): Promise<void>; // Close session and release resources
}
```
See the [Browser Sessions guide](guides/browser-sessions.md) for full examples.
#### isReady()
Check if the client is initialized and ready.
```typescript
reader.isReady(): boolean
```
#### close()
Close the client and release resources.
```typescript
await reader.close(): Promise<void>
```
---
## Direct Functions (Advanced)
For advanced use cases where you need custom HeroCore management, you can use the direct functions. Note that without `connectionToCore`, each call spawns a new HeroCore instance, which is less efficient.
### scrape(options)
Scrape one or more URLs and return content in specified formats.
```typescript
import { scrape } from "@vakra-dev/reader";
const result = await scrape({
urls: ["https://example.com"],
formats: ["markdown"],
});
```
#### Parameters
| Name | Type | Required | Default | Description |
|------|------|----------|---------|-------------|
| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
| `formats` | `FormatType[]` | No | `["markdown"]` | Output formats |
| `onlyMainContent` | `boolean` | No | `true` | Extract only main content |
| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |
| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |
| `userAgent` | `string` | No | - | Custom user agent string |
| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
| `batchConcurrency` | `number` | No | `1` | URLs to process in parallel |
| `batchTimeoutMs` | `number` | No | `300000` | Total batch timeout |
| `onProgress` | `ProgressCallback` | No | - | Progress callback function |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `proxyTier` | `ProxyTier` | No | - | Proxy tier: `"datacenter"`, `"residential"`, `"auto"` |
| `waitForSelector` | `string` | No | - | CSS selector to wait for |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |
#### Returns
`Promise<ScrapeResult>`
```typescript
interface ScrapeResult {
data: WebsiteScrapeResult[];
batchMetadata: BatchMetadata;
}
```
#### Example
```typescript
// Using ReaderClient (recommended)
const reader = new ReaderClient();
const result = await reader.scrape({
urls: ["https://example.com", "https://example.org"],
formats: ["markdown", "html"],
batchConcurrency: 2,
onProgress: ({ completed, total, currentUrl }) => {
console.log(`[${completed}/${total}] ${currentUrl}`);
},
});
for (const site of result.data) {
console.log("URL:", site.metadata.baseUrl);
console.log("Markdown:", site.markdown?.substring(0, 200));
}
await reader.close();
```
---
### crawl(options)
Crawl a website to discover pages, optionally scraping their content.
```typescript
// Using ReaderClient (recommended)
import { ReaderClient } from "@vakra-dev/reader";
const reader = new ReaderClient();
const result = await reader.crawl({
url: "https://example.com",
depth: 2,
maxPages: 20,
scrape: true,
});
await reader.close();
```
#### Parameters
| Name | Type | Required | Default | Description |
|------|------|----------|---------|-------------|
| `url` | `string` | Yes | - | Seed URL to start crawling |
| `depth` | `number` | No | `1` | Maximum crawl depth |
| `maxPages` | `number` | No | `20` | Maximum pages to discover |
| `scrape` | `boolean` | No | `false` | Also scrape discovered pages |
| `delayMs` | `number` | No | `1000` | Delay between requests |
| `timeoutMs` | `number` | No | - | Total crawl timeout |
| `includePatterns` | `string[]` | No | - | URL patterns to include |
| `excludePatterns` | `string[]` | No | - | URL patterns to exclude |
| `formats` | `FormatType[]` | No | `["markdown", "html"]` | Output formats when scraping |
| `scrapeConcurrency` | `number` | No | `2` | Scraping parallelism |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `userAgent` | `string` | No | - | Custom user agent |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |
#### Returns
`Promise<CrawlResult>`
```typescript
interface CrawlResult {
urls: CrawlUrl[];
scraped?: ScrapeResult;
metadata: CrawlMetadata;
}
```
#### Example
```typescript
const reader = new ReaderClient();
const result = await reader.crawl({
url: "https://docs.example.com",
depth: 3,
maxPages: 50,
includePatterns: ["docs/*"],
excludePatterns: ["docs/archive/*"],
scrape: true,
});
console.log(`Discovered ${result.urls.length} pages`);
result.urls.forEach((page) => {
console.log(`- ${page.title}: ${page.url}`);
});
if (result.scraped) {
console.log(`Scraped ${result.scraped.batchMetadata.successfulUrls} pages`);
}
await reader.close();
```
---
## Type Definitions
### ScrapeOptions
```typescript
interface ScrapeOptions {
urls: string[];
formats?: Array<"markdown" | "html">;
onlyMainContent?: boolean;
includeTags?: string[];
excludeTags?: string[];
userAgent?: string;
timeoutMs?: number;
batchConcurrency?: number;
batchTimeoutMs?: number;
onProgress?: (progress: ProgressInfo) => void;
proxy?: ProxyConfig;
proxyTier?: "datacenter" | "residential" | "auto";
waitForSelector?: string;
verbose?: boolean;
showChrome?: boolean;
connectionToCore?: any;
}
```
### CrawlOptions
```typescript
interface CrawlOptions {
url: string;
depth?: number;
maxPages?: number;
scrape?: boolean;
delayMs?: number;
timeoutMs?: number;
includePatterns?: string[];
excludePatterns?: string[];
formats?: Array<"markdown" | "html">;
scrapeConcurrency?: number;
proxy?: ProxyConfig;
userAgent?: string;
verbose?: boolean;
showChrome?: boolean;
connectionToCore?: any;
}
```
### ProxyConfig
```typescript
interface ProxyConfig {
url?: string;
type?: "datacenter" | "residential";
host?: string;
port?: number;
username?: string;
password?: string;
country?: string;
}
```
### ScrapeResult
```typescript
interface ScrapeResult {
data: WebsiteScrapeResult[];
batchMetadata: BatchMetadata;
}
```
### WebsiteScrapeResult
```typescript
interface WebsiteScrapeResult {
markdown?: string;
html?: string;
metadata: {
baseUrl: string;
finalUrl?: string; // Present if URL redirected
totalPages: number;
scrapedAt: string;
duration: number;
website: WebsiteMetadata;
proxy?: ProxyMetadata; // Included when proxy pooling is used
};
}
```
### ProxyMetadata
```typescript
interface ProxyMetadata {
host: string;
port: number;
country?: string; // If geo-targeting was used
}
```
### BatchMetadata
```typescript
interface BatchMetadata {
totalUrls: number;
successfulUrls: number;
failedUrls: number;
scrapedAt: string;
totalDuration: number;
errors?: Array<{ url: string; error: string }>;
}
```
### CrawlResult
```typescript
interface CrawlResult {
urls: CrawlUrl[];
scraped?: ScrapeResult;
metadata: CrawlMetadata;
}
```
### CrawlUrl
```typescript
interface CrawlUrl {
url: string;
title: string;
description: string | null;
}
```
### CrawlMetadata
```typescript
interface CrawlMetadata {
totalUrls: number;
maxDepth: number;
totalDuration: number;
seedUrl: string;
}
```
### WebsiteMetadata
```typescript
interface WebsiteMetadata {
title: string | null;
description: string | null;
author: string | null;
language: string | null;
charset: string | null;
favicon: string | null;
image: string | null;
canonical: string | null;
keywords: string[] | null;
robots: string | null;
themeColor: string | null;
openGraph: {
title: string | null;
description: string | null;
type: string | null;
url: string | null;
image: string | null;
siteName: string | null;
locale: string | null;
} | null;
twitter: {
card: string | null;
site: string | null;
creator: string | null;
title: string | null;
description: string | null;
image: string | null;
} | null;
}
```
### ProgressInfo
```typescript
interface ProgressInfo {
completed: number;
total: number;
currentUrl: string;
}
```
---
## Classes
### BrowserPool
Manages a pool of Hero browser instances for efficient scraping.
```typescript
import { BrowserPool } from "@vakra-dev/reader";
const pool = new BrowserPool({ size: 5 });
await pool.initialize();
const result = await pool.withBrowser(async (hero) => {
await hero.goto("https://example.com");
return await hero.document.title;
});
await pool.shutdown();
```
#### Constructor
```typescript
new BrowserPool(config?: PoolConfig)
```
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Recycle after N pages |
| `retireAfterMinutes` | `number` | `30` | Recycle after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests |
| `healthCheckIntervalMs` | `number` | `300000` | Health check interval |
#### Methods
##### initialize()
Initialize the browser pool.
```typescript
await pool.initialize(): Promise<void>
```
##### withBrowser(fn)
Execute a function with an acquired browser, automatically releasing it afterwards.
```typescript
await pool.withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T>
```
##### acquire()
Manually acquire a browser instance. Must be paired with `release()`.
```typescript
const hero = await pool.acquire(): Promise<Hero>
```
##### release(hero)
Release a browser instance back to the pool.
```typescript
await pool.release(hero: Hero): Promise<void>
```
##### healthCheck()
Check the health of all pool instances.
```typescript
const health = await pool.healthCheck(): Promise
```
##### getStats()
Get current pool statistics.
```typescript
const stats = pool.getStats(): PoolStats
```
##### shutdown()
Shutdown all browser instances.
```typescript
await pool.shutdown(): Promise<void>
```
---
## Formatter Functions
### formatToMarkdown(pages, baseUrl, scrapedAt, duration, metadata?)
Convert scraped pages to Markdown format.
```typescript
import { formatToMarkdown } from "@vakra-dev/reader";
const markdown = formatToMarkdown(
pages,
"https://example.com",
new Date().toISOString(),
1500,
metadata
);
```
---
### formatToHTML(pages, baseUrl, scrapedAt, duration, metadata?)
Convert scraped pages to a complete HTML document.
```typescript
import { formatToHTML } from "@vakra-dev/reader";
const html = formatToHTML(
pages,
"https://example.com",
new Date().toISOString(),
1500,
metadata
);
```
---
## Utility Functions
### cleanContent(html)
Remove navigation, ads, scripts, and other non-content elements from HTML.
```typescript
import { cleanContent } from "@vakra-dev/reader";
const cleanHtml = cleanContent(rawHtml);
```
---
### extractMetadata(html)
Extract metadata from HTML including Open Graph and Twitter cards.
```typescript
import { extractMetadata } from "@vakra-dev/reader";
const metadata = extractMetadata(html);
console.log(metadata.title);
console.log(metadata.openGraph?.image);
```
---
## Default Values
```typescript
const DEFAULT_OPTIONS = {
formats: ["markdown"],
onlyMainContent: true,
timeoutMs: 30000,
batchConcurrency: 1,
batchTimeoutMs: 300000,
verbose: false,
showChrome: false,
};
const DEFAULT_CRAWL_OPTIONS = {
depth: 1,
maxPages: 20,
scrape: false,
delayMs: 1000,
formats: ["markdown", "html"],
scrapeConcurrency: 2,
verbose: false,
showChrome: false,
};
const DEFAULT_POOL_CONFIG = {
size: 2,
retireAfterPages: 100,
retireAfterMinutes: 30,
maxQueueSize: 100,
healthCheckIntervalMs: 300000,
};
```
---
## See Also
- [Getting Started](getting-started.md) - Quick start guide
- [Architecture](architecture.md) - System design
- [Browser Pool Guide](guides/browser-pool.md) - Pool management
- [Cloudflare Bypass Guide](guides/cloudflare-bypass.md) - Challenge handling
================================================
FILE: docs/architecture.md
================================================
# Architecture
This document describes the internal architecture of Reader, helping contributors understand how the system works.
## High-Level Overview
```
┌─────────────────────────────────────────────────────────────────┐
│ Public API │
│ scrape() / crawl() / browser() │
└──────────┬─────────────────┬────────────────┬───────────────────┘
│ │ │
┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼──────────┐
│ Scraper │ │ Crawler │ │ BrowserSession │
│ Class │ │ Class │ │ (CDP WebSocket)│
└─────┬─────┘ └─────┬─────┘ └─────┬──────────┘
│ │ │
└────────┬───────┘ │ own HeroCore
│ │
┌─────────▼─────────┐ ┌─────────▼─────────┐
│ TieredBrowserPool │ │ Dedicated Chrome │
│ (shared, pooled) │ │ (per-session) │
└─────────┬─────────┘ └───────────────────┘
│
┌───────────────┼───────────────┐
│ │ │
┌───▼──────────┐ ┌──▼──────────┐ ┌──▼────────────┐
│ Hero Config │ │ Orchestrator│ │ Formatters │
│ (TLS, DNS, etc.) │ │ Detection │ │ (MD, HTML, etc) │
└──────────────────┘ └─────────────────┘ └─────────────────┘
```
## Directory Structure
```
src/
├── index.ts # Public API exports
├── scraper.ts # Scraper class - main scraping logic
├── crawler.ts # Crawler class - link discovery + scraping
├── types.ts # ScrapeOptions, ScrapeResult, etc.
├── crawl-types.ts # CrawlOptions, CrawlResult, etc.
│
├── browser/
│ ├── pool.ts # BrowserPool - manages Hero instances
│ ├── hero-config.ts # Hero configuration (TLS, DNS, viewport)
│ └── types.ts # IBrowserPool, PoolConfig, PoolStats
│
├── cloudflare/
│ ├── detector.ts # detectChallenge() - DOM/text matching
│ ├── handler.ts # waitForChallengeResolution() - polling
│ └── types.ts # ChallengeDetection, ResolutionResult
│
├── formatters/
│ ├── markdown.ts # formatToMarkdown() - uses supermarkdown
│ ├── html.ts # formatToHTML() - full HTML document
│ ├── postprocess.ts # Post-processing utilities
│ └── index.ts # Re-exports all formatters
│
├── utils/
│ ├── content-cleaner.ts # cleanContent() - removes nav, ads
│ ├── metadata-extractor.ts # extractMetadata() - OG tags, etc.
│ ├── url-helpers.ts # URL validation, normalization
│ ├── rate-limiter.ts # Simple delay-based rate limiting
│ └── logger.ts # Pino logger with pretty print
│
├── proxy/
│ └── config.ts # createProxyUrl(), parseProxyUrl()
│
└── cli/
└── index.ts # CLI using Commander.js
```
## Core Components
### Scraper
The `Scraper` class (`src/scraper.ts`) handles URL scraping:
```typescript
class Scraper {
constructor(options: ScrapeOptions) { ... }
async scrape(): Promise<ScrapeResult> {
// 1. Initialize browser pool
// 2. Process URLs with concurrency control (p-limit)
// 3. For each URL: fetch, detect challenges, extract content
// 4. Format to requested output formats
// 5. Aggregate results and metadata
}
private async scrapeSingleUrl(url: string): Promise<WebsiteScrapeResult> {
// 1. Acquire browser from pool
// 2. Navigate to URL
// 3. Detect Cloudflare challenge
// 4. Wait for resolution if needed
// 5. Extract HTML and metadata
// 6. Clean content
// 7. Format to outputs
// 8. Release browser to pool
}
}
```
**Key design decisions:**
- Uses `p-limit` for concurrency control
- Each URL gets its own browser instance from the pool
- Cloudflare detection runs before content extraction
- All formatters run in parallel for each URL
### Crawler
The `Crawler` class (`src/crawler.ts`) discovers links:
```typescript
class Crawler {
async crawl(): Promise<CrawlResult> {
// BFS (Breadth-First Search) algorithm
// 1. Start with seed URL at depth 0
// 2. Fetch page, extract links
// 3. Filter links (same domain, patterns)
// 4. Add to queue with depth + 1
// 5. Repeat until maxPages or maxDepth
// 6. Optionally scrape discovered URLs
}
}
```
**Key design decisions:**
- BFS ensures shallow pages are discovered first
- Respects `maxPages` and `depth` limits
- Optional scraping reuses the Scraper class
- Delay between requests for rate limiting
### Browser Pool
The `BrowserPool` class (`src/browser/pool.ts`) manages Hero instances:
```typescript
class BrowserPool {
private instances: HeroInstance[];
private available: HeroInstance[];
private queue: PendingRequest[];
async initialize(): Promise<void> { ... }
async acquire(): Promise<Hero> { ... }
async release(hero: Hero): Promise<void> { ... }
async withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T> {
const hero = await this.acquire();
try {
return await fn(hero);
} finally {
await this.release(hero);
}
}
}
```
**Pool lifecycle:**
1. **Initialize** - Create `size` Hero instances
2. **Acquire** - Get available instance or queue the request
3. **Use** - Execute scraping logic
4. **Release** - Return to pool or recycle if stale
5. **Recycle** - Close old instance, create new one
6. **Shutdown** - Close all instances
**Recycling triggers:**
- After N pages (default: 100)
- After N minutes (default: 30)
- On health check failure
### Cloudflare Detection
Cloudflare challenges are handled in two phases: detection, then resolution.
**1. Challenge Detection** (`src/cloudflare/detector.ts`):
```typescript
async function detectChallenge(hero: Hero): Promise<ChallengeDetection> {
// Check DOM for challenge elements
const signals = [];
// CSS selectors that indicate challenges
if (await hero.document.querySelector("#challenge-form")) {
signals.push({ type: "dom", selector: "#challenge-form" });
}
// Text patterns that indicate challenges
const bodyText = await hero.document.body.textContent;
if (bodyText.includes("checking your browser")) {
signals.push({ type: "text", pattern: "checking your browser" });
}
return {
isChallenge: signals.length > 0,
type: determineType(signals),
signals,
};
}
```
**2. Challenge Resolution** (`src/cloudflare/handler.ts`):
```typescript
async function waitForChallengeResolution(
hero: Hero,
options: ResolutionOptions
): Promise<ResolutionResult> {
const startTime = Date.now();
while (Date.now() - startTime < options.maxWaitMs) {
// Check if URL changed (redirect after challenge)
if ((await hero.url) !== options.initialUrl) {
return { resolved: true, method: "redirect" };
}
// Check if challenge elements disappeared
const detection = await detectChallenge(hero);
if (!detection.isChallenge) {
return { resolved: true, method: "element_removal" };
}
await sleep(options.pollIntervalMs);
}
return { resolved: false };
}
```
### Formatters
Each formatter transforms scraped pages into a specific format:
| Formatter | Input | Output |
|-----------|-------|--------|
| `formatToMarkdown` | Pages, metadata | Markdown document with frontmatter |
| `formatToHTML` | Pages, metadata | Complete HTML document with CSS |
**Markdown formatter** uses [supermarkdown](https://github.com/vakra-dev/supermarkdown) - a high-performance Rust-based HTML-to-Markdown converter with full GFM support.
## Data Flow
### Scrape Request Flow
```
scrape({ urls: ["https://example.com"], formats: ["markdown"] })
│
├─► Scraper.scrape()
│ │
│ ├─► BrowserPool.initialize(size=concurrency)
│ │
│ ├─► For each URL (controlled by p-limit):
│ │ │
│ │ ├─► pool.withBrowser(async hero => {
│ │ │ │
│ │ │ ├─► hero.goto(url)
│ │ │ │
│ │ │ ├─► detectChallenge(hero)
│ │ │ │ └─► Returns { isChallenge, type, signals }
│ │ │ │
│ │ │ ├─► if (isChallenge):
│ │ │ │ └─► waitForChallengeResolution(hero)
│ │ │ │
│ │ │ ├─► Extract title, HTML
│ │ │ │
│ │ │ ├─► cleanContent(html)
│ │ │ │ └─► Remove nav, ads, scripts
│ │ │ │
│ │ │ ├─► extractMetadata(html)
│ │ │ │ └─► OG tags, Twitter cards, etc.
│ │ │ │
│ │ │ └─► Format to requested formats
│ │ │ })
│ │ │
│ │ └─► Add to results array
│ │
│ ├─► pool.shutdown()
│ │
│ └─► Return ScrapeResult { data[], batchMetadata }
│
└─► Result returned to caller
```
### Crawl Request Flow
```
crawl({ url: "https://example.com", depth: 2, scrape: true })
│
├─► Crawler.crawl()
│ │
│ ├─► Initialize queue with seed URL at depth 0
│ │
│ ├─► BFS loop (while queue not empty && pages < maxPages):
│ │ │
│ │ ├─► Dequeue next URL
│ │ │
│ │ ├─► Fetch page with Hero
│ │ │
│ │ ├─► Extract links via regex
│ │ │
│ │ ├─► Filter links:
│ │ │ ├─► Same domain only
│ │ │ ├─► Match includePatterns
│ │ │ └─► Exclude excludePatterns
│ │ │
│ │ ├─► Add new links to queue with depth + 1
│ │ │
│ │ ├─► Rate limit (delay between requests)
│ │ │
│ │ └─► Add to discovered URLs
│ │
│ ├─► If scrape=true:
│ │ └─► scrape({ urls: discoveredUrls })
│ │
│ └─► Return CrawlResult { urls[], scraped?, metadata }
│
└─► Result returned to caller
```
## Design Decisions
### Why Hero?
[Ulixee Hero](https://ulixee.org/) was chosen for:
1. **Stealth** - Advanced TLS fingerprinting and anti-detection
2. **Speed** - Optimized for headless automation
3. **API** - Clean async/await interface
4. **Stability** - Production-tested at scale
### Pool vs Per-Request Browsers
We use a pool because:
- Browser startup is slow (~2-3 seconds)
- Memory overhead per browser is high
- Connection reuse improves performance
Trade-off: Long-lived browsers can accumulate state and become stale, so we recycle them periodically.
### Cloudflare Detection Strategy
Multi-signal approach because:
- No single indicator is 100% reliable
- Cloudflare changes their challenge pages
- Different challenge types have different signatures
Detection signals include:
- DOM elements (`#challenge-form`, `.cf-browser-verification`)
- Text patterns ("checking your browser", "ray id")
- URL patterns (`/cdn-cgi/challenge-platform/`)
- HTTP status codes
### Content Cleaning
We clean HTML before formatting because:
- Navigation, ads, scripts bloat output
- LLMs perform better with focused content
- Reduces token usage
Cleaning removes:
- `Content