[
  {
    "path": ".editorconfig",
    "content": "root = true\n\n[*]\nindent_style = space\nindent_size = 4\ncharset = utf-8\ntrim_trailing_whitespace = true\ninsert_final_newline = true\nend_of_line = lf\n\n[Makefile]\nindent_style = tab\n\n[{*.yaml,*.yml}]\nindent_size = 2\n"
  },
  {
    "path": ".github/CODEOWNERS",
    "content": "# Documentation codeowner\n\n/docs/*.md @TC-MO\n/docs/*.mdx @TC-MO\n"
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "### Description\n\n<!-- The purpose of the PR, list of the changes, ... -->\n\n- TODO\n\n### Issues\n\n<!-- If applicable, reference any related GitHub issues -->\n\n- Closes: #TODO\n\n### Testing\n\n<!-- Describe the testing process for these changes -->\n\n- TODO\n\n### Checklist\n\n- [ ] CI passed\n"
  },
  {
    "path": ".github/workflows/_check_code.yaml",
    "content": "name: Code checks\n\non:\n  # Runs when manually triggered from the GitHub UI.\n  workflow_dispatch:\n\n  # Runs when invoked by another workflow.\n  workflow_call:\n\npermissions:\n  contents: read\n\njobs:\n  actions_lint_check:\n    name: Actions lint check\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n      - name: Run actionlint\n        uses: rhysd/actionlint@v1.7.11\n\n  spell_check:\n    name: Spell check\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n      - name: Check spelling with typos\n        uses: crate-ci/typos@v1\n\n  lint_check:\n    name: Lint check\n    uses: apify/workflows/.github/workflows/python_lint_check.yaml@main\n    with:\n      python_versions: '[\"3.10\", \"3.11\", \"3.12\", \"3.13\", \"3.14\"]'\n\n  type_check:\n    name: Type check\n    uses: apify/workflows/.github/workflows/python_type_check.yaml@main\n    with:\n      python_versions: '[\"3.10\", \"3.11\", \"3.12\", \"3.13\", \"3.14\"]'\n"
  },
  {
    "path": ".github/workflows/_check_docs.yaml",
    "content": "name: Doc checks\n\non:\n  # Runs when manually triggered from the GitHub UI.\n  workflow_dispatch:\n\n  # Runs when invoked by another workflow.\n  workflow_call:\n\npermissions:\n  contents: read\n\njobs:\n  doc_checks:\n    name: Doc checks\n    uses: apify/workflows/.github/workflows/python_docs_check.yaml@main\n"
  },
  {
    "path": ".github/workflows/_release_docs.yaml",
    "content": "name: Doc release\n\non:\n  # Runs when manually triggered from the GitHub UI.\n  workflow_dispatch:\n\n  # Runs when invoked by another workflow.\n  workflow_call:\n    inputs:\n      ref:\n        required: true\n        type: string\n\npermissions:\n  contents: read\n\nenv:\n  NODE_VERSION: 22\n  PYTHON_VERSION: 3.14\n  # In a called workflow `github.event_name` is the caller's event, never 'workflow_call',\n  # so select on the input itself; it is empty when this workflow is run manually.\n  CHECKOUT_REF: ${{ inputs.ref || github.ref }}\n\njobs:\n  release_docs:\n    name: Doc release\n    environment:\n      name: github-pages\n    permissions:\n      contents: write\n      pages: write\n      id-token: write\n    runs-on: ubuntu-latest\n\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n        with:\n          token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}\n          ref: ${{ env.CHECKOUT_REF }}\n\n      - name: Set up Node\n        uses: actions/setup-node@v6\n        with:\n          node-version: ${{ env.NODE_VERSION }}\n\n      - name: Set up Python\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ env.PYTHON_VERSION }}\n\n      - name: Set up uv package manager\n        uses: astral-sh/setup-uv@v7\n        with:\n          python-version: ${{ env.PYTHON_VERSION }}\n\n      - name: Install Python dependencies\n        run: uv run poe install-dev\n\n      - name: Build Docusaurus docs\n        run: uv run poe build-docs\n        env:\n          APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}\n          SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}\n\n      - name: Set up GitHub Pages\n        uses: actions/configure-pages@v5\n\n      - name: Upload GitHub Pages artifact\n        uses: actions/upload-pages-artifact@v4\n        with:\n          path: ./website/build\n\n      - name: Deploy artifact to GitHub Pages\n        uses: actions/deploy-pages@v4\n\n      - name: Invalidate CloudFront cache\n        run: |\n          gh workflow run invalidate-cloudfront.yml \\\n            --repo apify/apify-docs-private \\\n            --field deployment=crawlee-web\n          echo \"✅ CloudFront cache invalidation workflow triggered successfully\"\n        env:\n          GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}\n"
  },
  {
    "path": ".github/workflows/_tests.yaml",
    "content": "name: Tests\n\non:\n  # Runs when manually triggered from the GitHub UI.\n  workflow_dispatch:\n\n  # Runs when invoked by another workflow.\n  workflow_call:\n\npermissions:\n  contents: read\n\njobs:\n  unit_tests:\n    name: Unit tests\n    uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main\n    secrets: inherit\n    with:\n      python_versions: '[\"3.10\", \"3.11\", \"3.12\", \"3.13\", \"3.14\"]'\n      operating_systems: '[\"ubuntu-latest\", \"windows-latest\", \"macos-latest\"]'\n      python_version_for_codecov: \"3.14\"\n      operating_system_for_codecov: ubuntu-latest\n      tests_concurrency: \"8\"\n"
  },
  {
    "path": ".github/workflows/manual_release_stable.yaml",
    "content": "name: Stable release\n\non:\n  # Runs when manually triggered from the GitHub UI, with options to specify the type of release.\n  workflow_dispatch:\n    inputs:\n      release_type:\n        description: Release type\n        required: true\n        type: choice\n        default: auto\n        options:\n          - auto\n          - custom\n          - patch\n          - minor\n          - major\n      custom_version:\n        description: The custom version to bump to (only for \"custom\" type)\n        required: false\n        type: string\n        default: \"\"\n\nconcurrency:\n  group: release\n  cancel-in-progress: false\n\npermissions:\n  contents: read\n\njobs:\n  code_checks:\n    name: Code checks\n    uses: ./.github/workflows/_check_code.yaml\n\n  release_prepare:\n    name: Release prepare\n    needs: [code_checks]\n    runs-on: ubuntu-latest\n    outputs:\n      version_number: ${{ steps.release_prepare.outputs.version_number }}\n      tag_name: ${{ steps.release_prepare.outputs.tag_name }}\n      changelog: ${{ steps.release_prepare.outputs.changelog }}\n      release_notes: ${{ steps.release_prepare.outputs.release_notes }}\n    steps:\n      - uses: apify/workflows/git-cliff-release@main\n        name: Release prepare\n        id: release_prepare\n        with:\n          release_type: ${{ inputs.release_type }}\n          custom_version: ${{ inputs.custom_version }}\n          existing_changelog_path: CHANGELOG.md\n\n  changelog_update:\n    name: Changelog update\n    needs: [release_prepare]\n    permissions:\n      contents: write\n    uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main\n    with:\n      version_number: ${{ needs.release_prepare.outputs.version_number }}\n      changelog: ${{ needs.release_prepare.outputs.changelog }}\n    secrets: inherit\n\n  github_release:\n    name: GitHub release\n    needs: [release_prepare, changelog_update]\n    runs-on: ubuntu-latest\n    permissions:\n   
   contents: write\n    env:\n      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n    steps:\n      - name: GitHub release\n        uses: softprops/action-gh-release@v2\n        with:\n          tag_name: ${{ needs.release_prepare.outputs.tag_name }}\n          name: ${{ needs.release_prepare.outputs.version_number }}\n          target_commitish: ${{ needs.changelog_update.outputs.changelog_commitish }}\n          body: ${{ needs.release_prepare.outputs.release_notes }}\n\n  pypi_publish:\n    name: PyPI publish\n    needs: [release_prepare, changelog_update]\n    runs-on: ubuntu-latest\n    permissions:\n      contents: write\n      id-token: write # Required for OIDC authentication.\n    environment:\n      name: pypi\n      url: https://pypi.org/project/crawlee\n    steps:\n      - name: Prepare distribution\n        uses: apify/workflows/prepare-pypi-distribution@main\n        with:\n          package_name: crawlee\n          is_prerelease: \"\"\n          version_number: ${{ needs.release_prepare.outputs.version_number }}\n          ref: ${{ needs.changelog_update.outputs.changelog_commitish }}\n      # Publishes the package to PyPI using PyPA official GitHub action with OIDC authentication.\n      - name: Publish package to PyPI\n        uses: pypa/gh-action-pypi-publish@release/v1\n\n      # TODO: add job for publish package to Conda\n      # https://github.com/apify/crawlee-python/issues/104\n\n  doc_release:\n    name: Doc release\n    needs: [changelog_update, pypi_publish]\n    permissions:\n      contents: write\n      pages: write\n      id-token: write\n    uses: ./.github/workflows/_release_docs.yaml\n    with:\n      # Use the ref from the changelog update to include the updated changelog.\n      ref: ${{ needs.changelog_update.outputs.changelog_commitish }}\n    secrets: inherit\n"
  },
  {
    "path": ".github/workflows/on_issue.yaml",
    "content": "name: CI (issue)\n\non:\n  # Runs when a new issue is opened.\n  issues:\n    types:\n      - opened\n\npermissions:\n  contents: read\n\njobs:\n  label_issues:\n    name: Add labels\n    runs-on: ubuntu-latest\n    permissions:\n      issues: write\n\n    steps:\n      # Add the \"t-tooling\" label to all new issues\n      - uses: actions/github-script@v8\n        with:\n          script: |\n            github.rest.issues.addLabels({\n              issue_number: context.issue.number,\n              owner: context.repo.owner,\n              repo: context.repo.repo,\n              labels: [\"t-tooling\"]\n            })\n"
  },
  {
    "path": ".github/workflows/on_master.yaml",
    "content": "name: CI (master)\n\non:\n  push:\n    branches:\n      - master\n    tags-ignore:\n      - \"**\" # Ignore all tags to avoid duplicate executions triggered by tag pushes.\n\nconcurrency:\n  group: release\n  cancel-in-progress: false\n\npermissions:\n  contents: read\n\njobs:\n  doc_checks:\n    name: Doc checks\n    uses: ./.github/workflows/_check_docs.yaml\n\n  doc_release:\n    # Skip this for non-\"docs\" commits.\n    if: startsWith(github.event.head_commit.message, 'docs')\n    name: Doc release\n    needs: [doc_checks]\n    permissions:\n      contents: write\n      pages: write\n      id-token: write\n    uses: ./.github/workflows/_release_docs.yaml\n    with:\n      # Use the same ref as the one that triggered the workflow.\n      ref: ${{ github.ref }}\n    secrets: inherit\n\n  code_checks:\n    name: Code checks\n    uses: ./.github/workflows/_check_code.yaml\n\n  tests:\n    # Skip this for \"docs\" commits.\n    if: \"!startsWith(github.event.head_commit.message, 'docs')\"\n    name: Tests\n    uses: ./.github/workflows/_tests.yaml\n    secrets: inherit\n\n  release_prepare:\n    # Run this only for \"feat\", \"fix\", \"perf\", \"refactor\" and \"style\" commits.\n    if: >-\n      startsWith(github.event.head_commit.message, 'feat') ||\n      startsWith(github.event.head_commit.message, 'fix') ||\n      startsWith(github.event.head_commit.message, 'perf') ||\n      startsWith(github.event.head_commit.message, 'refactor') ||\n      startsWith(github.event.head_commit.message, 'style')\n    name: Release prepare\n    needs: [code_checks, tests]\n    runs-on: ubuntu-latest\n    outputs:\n      version_number: ${{ steps.release_prepare.outputs.version_number }}\n      tag_name: ${{ steps.release_prepare.outputs.tag_name }}\n      changelog: ${{ steps.release_prepare.outputs.changelog }}\n    steps:\n      - uses: apify/workflows/git-cliff-release@main\n        id: release_prepare\n        name: Release prepare\n        with:\n          
release_type: prerelease\n          existing_changelog_path: CHANGELOG.md\n\n  changelog_update:\n    name: Changelog update\n    needs: [release_prepare]\n    permissions:\n      contents: write\n    uses: apify/workflows/.github/workflows/python_bump_and_update_changelog.yaml@main\n    with:\n      version_number: ${{ needs.release_prepare.outputs.version_number }}\n      changelog: ${{ needs.release_prepare.outputs.changelog }}\n    secrets: inherit\n\n  pypi_publish:\n    name: PyPI publish\n    needs: [release_prepare, changelog_update]\n    runs-on: ubuntu-latest\n    permissions:\n      contents: write\n      id-token: write # Required for OIDC authentication.\n    environment:\n      name: pypi\n      url: https://pypi.org/project/crawlee\n    steps:\n      - name: Prepare distribution\n        uses: apify/workflows/prepare-pypi-distribution@main\n        with:\n          package_name: crawlee\n          is_prerelease: \"yes\"\n          version_number: ${{ needs.release_prepare.outputs.version_number }}\n          ref: ${{ needs.changelog_update.outputs.changelog_commitish }}\n\n      - name: Publish package to PyPI\n        uses: pypa/gh-action-pypi-publish@release/v1\n\n  doc_release_post_publish:\n    name: Doc release post publish\n    needs: [changelog_update, pypi_publish]\n    permissions:\n      contents: write\n      pages: write\n      id-token: write\n    uses: ./.github/workflows/_release_docs.yaml\n    with:\n      # Use the ref from the changelog update to include the updated changelog.\n      ref: ${{ needs.changelog_update.outputs.changelog_commitish }}\n    secrets: inherit\n"
  },
  {
    "path": ".github/workflows/on_pull_request.yaml",
    "content": "name: CI (PR)\n\non:\n  # Runs whenever a pull request is opened or updated.\n  pull_request:\n\npermissions:\n  contents: read\n  pull-requests: read\n\njobs:\n  pr_title_check:\n    name: PR title check\n    runs-on: ubuntu-latest\n    steps:\n      - uses: amannn/action-semantic-pull-request@v6.1.1\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n\n  doc_checks:\n    name: Doc checks\n    uses: ./.github/workflows/_check_docs.yaml\n\n  code_checks:\n    name: Code checks\n    uses: ./.github/workflows/_check_code.yaml\n\n  tests:\n    name: Tests\n    uses: ./.github/workflows/_tests.yaml\n    secrets: inherit\n"
  },
  {
    "path": ".github/workflows/on_schedule_tests.yaml",
    "content": "name: Scheduled tests\n\non:\n  # Runs when manually triggered from the GitHub UI.\n  workflow_dispatch:\n\n  # Runs on a daily schedule at 06:00 UTC.\n  schedule:\n    - cron: '0 6 * * *'\n\nconcurrency:\n  group: scheduled-tests\n  cancel-in-progress: false\n\npermissions:\n  contents: read\n\nenv:\n  NODE_VERSION: 22\n  PYTHON_VERSION: 3.14\n  TESTS_CONCURRENCY: 1\n\njobs:\n  end_to_end_tests:\n    name: End-to-end tests\n    strategy:\n      fail-fast: false\n      max-parallel: 12\n      matrix:\n        crawler-type: [\"playwright_camoufox\", \"playwright_chrome\", \"playwright_firefox\", \"playwright_webkit\", \"playwright\", \"parsel\", \"beautifulsoup\"]\n        http-client: [\"httpx\", \"curl_impersonate\"]\n        package-manager: [\"pip\", \"uv\", \"poetry\"]\n\n    runs-on: \"ubuntu-latest\"\n\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n\n      - name: Setup node\n        uses: actions/setup-node@v6\n        with:\n          node-version: ${{ env.NODE_VERSION }}\n\n      - name: Install dependencies\n        run: npm install -g apify-cli\n\n      - name: Set up Python ${{ env.PYTHON_VERSION }}\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ env.PYTHON_VERSION }}\n\n      # Poetry is installed so that crawlee in poetry.lock can be patched with a custom wheel file for Poetry-based templates.\n      - name: Install poetry\n        run: pipx install poetry\n\n      - name: Set up uv package manager\n        uses: astral-sh/setup-uv@v7\n        with:\n          python-version: ${{ env.PYTHON_VERSION }}\n\n      # Sync the project; the browsers do not need to be installed in the test runner environment.\n      - name: Install Python dependencies\n        run: uv run poe install-sync\n\n      - name: Run templates end-to-end tests\n        run: uv run poe e2e-templates-tests -m \"${{ matrix.http-client }} and ${{ matrix.crawler-type }} and ${{ matrix.package-manager }}\"\n        env:\n          APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# AI assistant files\n.agent\n.agents\n.ai\n.aider\n.claude\n.codeium\n.continue\n.copilot\n.cursor\n.gemini\n.llm\n.llms\n.openai\n.serena\n.windsurf\n.zed-ai\nAGENTS.local.md\nCLAUDE.local.md\nGEMINI.local.md\n\n# Cache\n__pycache__\n.pytest_cache\n.ruff_cache\n.ty_cache\n.uv-cache\n\n# Virtual envs\n.direnv\n.env\n.envrc\n.python-version\n.venv\n\n# Other Python tools\n.ropeproject\n\n# Mise\nmise.toml\n.mise.toml\n\n# Egg and build artifacts\n*.egg-info/\n*.egg\ndist/\nbuild/\n\n# Coverage reports\n.coverage*\nhtmlcov\ncoverage-unit.xml\ncoverage-integration.xml\n\n# IDE, editors\n*~\n.DS_Store\n.idea\n.nvim.lua\n.vscode\n.zed\nSession.vim\n\n# Docs\ndocs/changelog.md\n\n# Website build artifacts, node dependencies\nwebsite/build\nwebsite/node_modules\nwebsite/.yarn\nwebsite/.docusaurus\nwebsite/api-typedoc-generated.json\nwebsite/apify-shared-docspec-dump.jsonl\nwebsite/docspec-dump.jsonl\nwebsite/module_shortcuts.json\nwebsite/typedoc-types*\n# npm lockfile (we use yarn)\nwebsite/package-lock.json\n\n# Default directory for memory storage\nstorage/\n\n# Tmp dir\ntmp/\n"
  },
  {
    "path": ".markdownlint.yaml",
    "content": "default: true\nline-length:\n  line_length: 120\nMD007:\n  indent: 4\nMD004:\n  style: dash\nno-inline-html: false\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: local\n    hooks:\n      - id: lint-check\n        name: Lint check\n        entry: uv run poe lint\n        language: system\n        pass_filenames: false\n\n      - id: type-check\n        name: Type check\n        entry: uv run poe type-check\n        language: system\n        pass_filenames: false\n"
  },
  {
    "path": ".rules.md",
    "content": "# Coding guidelines\n\nThis file provides guidance to programming agents when working with code in this repository.\n\n## Development Commands\n\nAll commands use `uv` (package manager) and `poe` (task runner):\n\n```bash\n# Install all dependencies (dev + extras + pre-commit + playwright)\nuv run poe install-dev\n\n# Run full check suite (lint + type-check + unit tests)\nuv run poe check-code\n\n# Linting (ruff format check + ruff check)\nuv run poe lint\n\n# Auto-fix formatting\nuv run poe format\n\n# Type checking (ty)\nuv run poe type-check\n\n# Run all unit tests\nuv run poe unit-tests\n\n# Run a single test file\nuv run pytest tests/unit/path/to/test_file.py\n\n# Run a single test by name\nuv run pytest tests/unit/path/to/test_file.py::test_name -v\n\n# Run tests with coverage XML report\nuv run poe unit-tests-cov\n\n# Build package\nuv run poe build\n\n# Clean build artifacts\nuv run poe clean\n```\n\nNote: `uv run poe unit-tests` first runs tests marked `@pytest.mark.run_alone` in isolation, then runs the rest with `-x` (fail-fast) and parallelism via `pytest-xdist`.\n\n## Code Style\n\n- **Linter/formatter**: Ruff with `select = [\"ALL\"]` and specific ignores\n- **Line length**: 120 characters\n- **Quotes**: Single quotes (double for docstrings)\n- **Docstrings**: Google format (enforced by Ruff)\n- **Type checker**: ty (Astral's type checker), target Python 3.10\n- **Async mode**: pytest-asyncio in `auto` mode (no need for `@pytest.mark.asyncio`)\n- **Commit format**: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, etc.)\n\n## Architecture\n\n### Crawler Hierarchy\n\n```\nBasicCrawler[TCrawlingContext, TStatisticsState]\n├── AbstractHttpCrawler  →  HttpCrawler, BeautifulSoupCrawler, ParselCrawler\n├── PlaywrightCrawler\n└── AdaptivePlaywrightCrawler (extends PlaywrightCrawler)\n```\n\n- **BasicCrawler** (`src/crawlee/crawlers/_basic/`): Core request lifecycle, autoscaling pool, retries, session management, router 
dispatch. Generic over `TCrawlingContext`.\n- **AbstractHttpCrawler** (`src/crawlee/crawlers/_abstract_http/`): Adds HTTP client integration, response parsing, pre-navigation hooks. Generic over parser result type.\n- **PlaywrightCrawler** (`src/crawlee/crawlers/_playwright/`): Browser-based crawling with Playwright.\n\n### Context Pipeline (Middleware Pattern)\n\nContexts are progressively enhanced through `ContextPipeline` middleware:\n\n```\nBasicCrawlingContext → HttpCrawlingContext → ParsedHttpCrawlingContext → BeautifulSoupCrawlingContext\n```\n\nEach middleware is an async generator that wraps the next handler, enabling setup/teardown around request processing.\n\n### Storage Layer\n\nThree-tier design:\n- **High-level**: `Dataset`, `KeyValueStore`, `RequestQueue` in `src/crawlee/storages/`\n- **Storage clients** (`src/crawlee/storage_clients/`): `FileSystemStorageClient` (default), `MemoryStorageClient`, `SqlStorageClient`, `RedisStorageClient`\n- **Instance caching**: `StorageInstanceManager` is a global singleton that caches storage instances by ID/name\n\n### Service Locator\n\n`src/crawlee/_service_locator.py` is a global singleton managing `Configuration`, `EventManager`, `StorageClient`, and `StorageInstanceManager`. Prevents double-initialization with `ServiceConflictError`.\n\n### HTTP Clients\n\nPluggable via `HttpClient` interface in `src/crawlee/http_clients/`:\n- `ImpitHttpClient` (default), `HttpxHttpClient`, `CurlImpersonateHttpClient`\n- Each provides `crawl()` (for crawler pipeline) and `send_request()` (for in-handler use)\n\n### Request Model\n\n`Request` (`src/crawlee/_request.py`) uses `unique_key` for deduplication. Lifecycle states: `UNPROCESSED → DONE`. 
Crawlee-specific metadata stored in `user_data['__crawlee']`.\n\n### Router\n\n```python\n@crawler.router.default_handler\nasync def handler(context: BeautifulSoupCrawlingContext): ...\n\n@crawler.router.handler(label='detail')\nasync def detail(context: BeautifulSoupCrawlingContext): ...\n```\n\nRequests are routed by their `label` field; unmatched requests go to the default handler.\n\n### Key Directories\n\n- `src/crawlee/crawlers/` - All crawler implementations\n- `src/crawlee/storages/` - Dataset, KVS, RequestQueue\n- `src/crawlee/storage_clients/` - Backend implementations\n- `src/crawlee/http_clients/` - HTTP client implementations\n- `src/crawlee/browsers/` - Playwright browser pool and plugins\n- `src/crawlee/sessions/` - Session management with cookie persistence\n- `src/crawlee/events/` - Event system (persist state, progress, aborting)\n- `src/crawlee/_autoscaling/` - Autoscaled pool for concurrency control\n- `src/crawlee/fingerprint_suite/` - Anti-bot fingerprint generation\n- `src/crawlee/project_template/` - CLI scaffolding template (excluded from linting)\n- `tests/unit/` - Unit tests\n- `tests/e2e/` - End-to-end tests (require `apify-cli` + API token)\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Changelog\n\nAll notable changes to this project will be documented in this file.\n\n## [1.6.0](https://github.com/apify/crawlee-python/releases/tag/v1.6.0) (2026-03-20)\n\n### 🚀 Features\n\n- Allow non-href links extract &amp; enqueue ([#1781](https://github.com/apify/crawlee-python/pull/1781)) ([6db365d](https://github.com/apify/crawlee-python/commit/6db365d1625206d8d691256c9cd4b44a821238bb)) by [@kozlice](https://github.com/kozlice)\n- Add `post_navigation_hooks` to crawlers ([#1795](https://github.com/apify/crawlee-python/pull/1795)) ([38ceda6](https://github.com/apify/crawlee-python/commit/38ceda635a18cb2f14efc7c8e8b67f3adb7e53fd)) by [@Mantisus](https://github.com/Mantisus)\n- Add page lifecycle hooks to `BrowserPool` ([#1791](https://github.com/apify/crawlee-python/pull/1791)) ([6f2ac13](https://github.com/apify/crawlee-python/commit/6f2ac13fea4cfa8a65e6e41430d3e8d28cc3a787)) by [@Mantisus](https://github.com/Mantisus)\n- Expose `BrowserType` and `CrawleePage` ([#1798](https://github.com/apify/crawlee-python/pull/1798)) ([b50b9f2](https://github.com/apify/crawlee-python/commit/b50b9f2a8396dcee2bd7eaf76c94d24912c2bc5f)) by [@Mantisus](https://github.com/Mantisus)\n- Expose `use_state` in `BasicCrawler` ([#1799](https://github.com/apify/crawlee-python/pull/1799)) ([d121873](https://github.com/apify/crawlee-python/commit/d121873a7f5902b911dd04b4aa9eaf75a8449323)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- **redis:** Do not remove handled request data from request queue ([#1787](https://github.com/apify/crawlee-python/pull/1787)) ([3008c61](https://github.com/apify/crawlee-python/commit/3008c61dcbe07ccdf3c43f198b37582cc1356c9a)) by [@kozlice](https://github.com/kozlice)\n- **redis:** Update actual `Request` state in request queue Redis storage client ([#1789](https://github.com/apify/crawlee-python/pull/1789)) ([787231c](https://github.com/apify/crawlee-python/commit/787231cebeb863ee2b4395964a79a37053dbec01)) by 
[@Mantisus](https://github.com/Mantisus)\n\n\n## [1.5.0](https://github.com/apify/crawlee-python/releases/tag/v1.5.0) (2026-03-06)\n\n### 🚀 Features\n\n- Use specialized Playwright docker images in templates ([#1757](https://github.com/apify/crawlee-python/pull/1757)) ([747c0cf](https://github.com/apify/crawlee-python/commit/747c0cf4a82296a2e3ea5cac5ef4c9578ea62a0c)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1756](https://github.com/apify/crawlee-python/issues/1756)\n- Add `discover_valid_sitemaps` utility ([#1777](https://github.com/apify/crawlee-python/pull/1777)) ([872447b](https://github.com/apify/crawlee-python/commit/872447b60bbdb3926068064a971492807b1bdfbb)) by [@Mantisus](https://github.com/Mantisus), closes [#1740](https://github.com/apify/crawlee-python/issues/1740)\n\n### 🐛 Bug Fixes\n\n- Prevent list modification during iteration in BrowserPool ([#1703](https://github.com/apify/crawlee-python/pull/1703)) ([70309d9](https://github.com/apify/crawlee-python/commit/70309d9bf568d268a26b3ba6392be2b6ff284c65)) by [@vdusek](https://github.com/vdusek)\n- Fix ` max_requests_per_crawl` excluding failed requests ([#1766](https://github.com/apify/crawlee-python/pull/1766)) ([d6bb0b4](https://github.com/apify/crawlee-python/commit/d6bb0b4a9dc5dd6668d076fbfa1b5e748deaee0d)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1765](https://github.com/apify/crawlee-python/issues/1765)\n- **playwright:** Dispose of `APIResponse` body for `send_request` ([#1771](https://github.com/apify/crawlee-python/pull/1771)) ([29d301b](https://github.com/apify/crawlee-python/commit/29d301bf9d7795f2fbaddb99235a7157b880f60c)) by [@kozlice](https://github.com/kozlice)\n- Return `None` from `add_request` when storage client fails to enqueue request ([#1775](https://github.com/apify/crawlee-python/pull/1775)) ([944753a](https://github.com/apify/crawlee-python/commit/944753a71956c30f3ce0896ffa24be7de5348933)) by [@Mantisus](https://github.com/Mantisus)\n- Re-use 
pre-existing browser context in `PlaywrightBrowserController` ([#1778](https://github.com/apify/crawlee-python/pull/1778)) ([4487543](https://github.com/apify/crawlee-python/commit/44875433df83d433aa69ada458b91df3ad569f5e)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1776](https://github.com/apify/crawlee-python/issues/1776)\n\n\n## [1.4.0](https://github.com/apify/crawlee-python/releases/tag/v1.4.0) (2026-02-17)\n\n### 🚀 Features\n\n- Dynamic memory snapshots  ([#1715](https://github.com/apify/crawlee-python/pull/1715)) ([568a7b1](https://github.com/apify/crawlee-python/commit/568a7b186dedda19ad814ee8af3cd8e256cc4ad9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1704](https://github.com/apify/crawlee-python/issues/1704)\n- Add `MySQL` and `MariaDB` support for `SqlStorageClient` ([#1749](https://github.com/apify/crawlee-python/pull/1749)) ([202b500](https://github.com/apify/crawlee-python/commit/202b5009ea5d35ea779eb5b8db1fc575f90ca7bb)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Make log levels consistent in ServiceLocator ([#1746](https://github.com/apify/crawlee-python/pull/1746)) ([4163413](https://github.com/apify/crawlee-python/commit/4163413049485b035c38efd6a4a7d41502a44cfc)) by [@janbuchar](https://github.com/janbuchar)\n- Fix `PlaywrightCrawler` unintentionally setting the global configuration ([#1747](https://github.com/apify/crawlee-python/pull/1747)) ([fa58438](https://github.com/apify/crawlee-python/commit/fa58438026eb72a6002c8d494725bf4e48b4407e)) by [@Pijukatel](https://github.com/Pijukatel)\n- Fix `Snapshotter` handling of out of order samples ([#1735](https://github.com/apify/crawlee-python/pull/1735)) ([387c712](https://github.com/apify/crawlee-python/commit/387c712306055d901b1c0df4a9666967f039aefd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1734](https://github.com/apify/crawlee-python/issues/1734)\n\n### ⚡ Performance\n\n- Optimize metadata records processing in `SqlStorageClient` 
([#1551](https://github.com/apify/crawlee-python/pull/1551)) ([df1347a](https://github.com/apify/crawlee-python/commit/df1347aacf05c05980000d15b36b65996119ea86)) by [@Mantisus](https://github.com/Mantisus), closes [#1533](https://github.com/apify/crawlee-python/issues/1533)\n\n\n## [1.3.2](https://github.com/apify/crawlee-python/releases/tag/v1.3.2) (2026-02-09)\n\n### 🐛 Bug Fixes\n\n- Use `max()` instead of `min()` for `request_max_duration` statistic ([#1701](https://github.com/apify/crawlee-python/pull/1701)) ([85c4335](https://github.com/apify/crawlee-python/commit/85c43351a05ada1369b720061f6f1a7e158340b6)) by [@vdusek](https://github.com/vdusek)\n- Prevent mutation of default URL patterns list in `block_requests` ([#1702](https://github.com/apify/crawlee-python/pull/1702)) ([fcf9adb](https://github.com/apify/crawlee-python/commit/fcf9adb6a0cfeaa87ca482372d4e066584eb28d6)) by [@vdusek](https://github.com/vdusek)\n- Keep None values for `user_data` in `Request` ([#1707](https://github.com/apify/crawlee-python/pull/1707)) ([3c575bc](https://github.com/apify/crawlee-python/commit/3c575bc2b0f1c89c99d134ad3a3fa7455ccc6910)) by [@Mantisus](https://github.com/Mantisus), closes [#1706](https://github.com/apify/crawlee-python/issues/1706)\n- Respect `max_open_pages_per_browser` limit for `PlaywrightBrowserController` on concurrent `new_page` calls ([#1712](https://github.com/apify/crawlee-python/pull/1712)) ([2e5534b](https://github.com/apify/crawlee-python/commit/2e5534b98913d5cbd6b721b2423d063772024417)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [1.3.1](https://github.com/apify/crawlee-python/releases/tag/v1.3.1) (2026-01-30)\n\n### 🐛 Bug Fixes\n\n- Reset all counter in metadata with `purge` for `RequestQueue` ([#1686](https://github.com/apify/crawlee-python/pull/1686)) ([ee09260](https://github.com/apify/crawlee-python/commit/ee0926084589f1b6e15840b6185ec5433be3b72f)) by [@Mantisus](https://github.com/Mantisus), closes 
[#1682](https://github.com/apify/crawlee-python/issues/1682)\n- Set default `http3=False` for `ImpitHttpClient` ([#1685](https://github.com/apify/crawlee-python/pull/1685)) ([3f390f6](https://github.com/apify/crawlee-python/commit/3f390f677540a3905038d7db6a6d1efad32fd045)) by [@Mantisus](https://github.com/Mantisus), closes [#1683](https://github.com/apify/crawlee-python/issues/1683)\n- Prevent get_request from permanently blocking requests ([#1684](https://github.com/apify/crawlee-python/pull/1684)) ([da416f9](https://github.com/apify/crawlee-python/commit/da416f98fb453904d62e7d29d8f24611ffb3ba8d)) by [@Mirza-Samad-Ahmed-Baig](https://github.com/Mirza-Samad-Ahmed-Baig)\n- Do not share state between different crawlers unless requested ([#1669](https://github.com/apify/crawlee-python/pull/1669)) ([64c246b](https://github.com/apify/crawlee-python/commit/64c246bedea14f86e607d23adc5bec644c578364)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1627](https://github.com/apify/crawlee-python/issues/1627)\n\n\n## [1.3.0](https://github.com/apify/crawlee-python/releases/tag/v1.3.0) (2026-01-20)\n\n### 🚀 Features\n\n- Expose `AdaptivePlaywrightCrawlerStatisticState` for `AdaptivePlaywrightCrawler` ([#1635](https://github.com/apify/crawlee-python/pull/1635)) ([1bb4bcb](https://github.com/apify/crawlee-python/commit/1bb4bcb4ccbec347ad9c14f70e9e946d48e3c38e)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Prevent race condition in concurrent storage creation ([#1626](https://github.com/apify/crawlee-python/pull/1626)) ([7f17a43](https://github.com/apify/crawlee-python/commit/7f17a4347d5884962767e757a92ec173688fed7b)) by [@Mantisus](https://github.com/Mantisus), closes [#1621](https://github.com/apify/crawlee-python/issues/1621)\n- Create correct statistics for `AdaptivePlaywrightCrawler` on initialization with a custom parser ([#1637](https://github.com/apify/crawlee-python/pull/1637)) 
([bff7260](https://github.com/apify/crawlee-python/commit/bff726055dd0d7e07a2c546b15cbee22abd85960)) by [@Mantisus](https://github.com/Mantisus), closes [#1630](https://github.com/apify/crawlee-python/issues/1630)\n- Fix adding extra link for `EnqueueLinksFunction` with `limit` ([#1674](https://github.com/apify/crawlee-python/pull/1674)) ([71d7867](https://github.com/apify/crawlee-python/commit/71d7867b14f7f07cac06899f5da006091af4a954)) by [@Mantisus](https://github.com/Mantisus), closes [#1673](https://github.com/apify/crawlee-python/issues/1673)\n\n\n## [1.2.1](https://github.com/apify/crawlee-python/releases/tag/v1.2.1) (2025-12-16)\n\n### 🐛 Bug Fixes\n\n- Fix short error summary ([#1605](https://github.com/apify/crawlee-python/pull/1605)) ([b751208](https://github.com/apify/crawlee-python/commit/b751208d9a56e9d923e4559baeba35e2eede0450)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1602](https://github.com/apify/crawlee-python/issues/1602)\n- Freeze core `Request` fields ([#1603](https://github.com/apify/crawlee-python/pull/1603)) ([ae6d86b](https://github.com/apify/crawlee-python/commit/ae6d86b8c82900116032596201d94cd7875aaadc)) by [@Mantisus](https://github.com/Mantisus)\n- Respect `enqueue_strategy` after redirects in `enqueue_links` ([#1607](https://github.com/apify/crawlee-python/pull/1607)) ([700df91](https://github.com/apify/crawlee-python/commit/700df91bc9be1299388030a3e48e4dbc6f5b85a0)) by [@Mantisus](https://github.com/Mantisus), closes [#1606](https://github.com/apify/crawlee-python/issues/1606)\n- Protect `Request` from partial mutations on request handler failure ([#1585](https://github.com/apify/crawlee-python/pull/1585)) ([a69caf8](https://github.com/apify/crawlee-python/commit/a69caf87edecc755287c53c8cc0ca4725af5d411)) by [@Mantisus](https://github.com/Mantisus), closes [#1514](https://github.com/apify/crawlee-python/issues/1514)\n\n\n\n## [1.2.0](https://github.com/apify/crawlee-python/releases/tag/v1.2.0) (2025-12-08)\n\n### 🚀 
Features\n\n- Add additional kwargs to Crawler&#x27;s export_data ([#1597](https://github.com/apify/crawlee-python/pull/1597)) ([5977f37](https://github.com/apify/crawlee-python/commit/5977f376b93a7c0d4dd53f0d331a4b04fedba2c6)) by [@vdusek](https://github.com/vdusek), closes [#526](https://github.com/apify/crawlee-python/issues/526)\n- Add `goto_options` for `PlaywrightCrawler` ([#1599](https://github.com/apify/crawlee-python/pull/1599)) ([0b82f3b](https://github.com/apify/crawlee-python/commit/0b82f3b6fb175223ea2aa5b348afcd5fdb767972)) by [@Mantisus](https://github.com/Mantisus), closes [#1576](https://github.com/apify/crawlee-python/issues/1576)\n\n### 🐛 Bug Fixes\n\n- Only apply requestHandlerTimeout to request handler ([#1474](https://github.com/apify/crawlee-python/pull/1474)) ([0dfb6c2](https://github.com/apify/crawlee-python/commit/0dfb6c2a13b6650736245fa39b3fbff397644df7)) by [@janbuchar](https://github.com/janbuchar)\n- Handle the case when `error_handler` returns `Request` ([#1595](https://github.com/apify/crawlee-python/pull/1595)) ([8a961a2](https://github.com/apify/crawlee-python/commit/8a961a2b07d0d33a7302dbb13c17f3d90999d390)) by [@Mantisus](https://github.com/Mantisus)\n- Align `Request.state` transitions with `Request` lifecycle ([#1601](https://github.com/apify/crawlee-python/pull/1601)) ([383225f](https://github.com/apify/crawlee-python/commit/383225f9f055d95ffb1302b8cf96f42ec264f1fc)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)\n\n### 🐛 Bug Fixes\n\n- Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)\n- Fix `same-domain` strategy ignoring public suffix  
([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)\n- Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)\n- Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)\n- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)\n\n\n## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)\n\n### 🚀 Features\n\n- Add `chrome` `BrowserType` for `PlaywrightCrawler` to use the Chrome browser ([#1487](https://github.com/apify/crawlee-python/pull/1487)) ([b06937b](https://github.com/apify/crawlee-python/commit/b06937bbc3afe3c936b554bfc503365c1b2c526b)) by [@Mantisus](https://github.com/Mantisus), closes [#1071](https://github.com/apify/crawlee-python/issues/1071)\n- Add `RedisStorageClient` based on Redis v8.0+ ([#1406](https://github.com/apify/crawlee-python/pull/1406)) 
([d08d13d](https://github.com/apify/crawlee-python/commit/d08d13d39203c24ab61fe254b0956d6744db3b5f)) by [@Mantisus](https://github.com/Mantisus)\n- Add support for Python 3.14 ([#1553](https://github.com/apify/crawlee-python/pull/1553)) ([89e9130](https://github.com/apify/crawlee-python/commit/89e9130cabee0fbc974b29c26483b7fa0edf627c)) by [@Mantisus](https://github.com/Mantisus)\n- Add `transform_request_function` parameter for `SitemapRequestLoader` ([#1525](https://github.com/apify/crawlee-python/pull/1525)) ([dc90127](https://github.com/apify/crawlee-python/commit/dc901271849b239ba2a947e8ebff8e1815e8c4fb)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Improve indexing of the `request_queue_records` table for `SqlRequestQueueClient` ([#1527](https://github.com/apify/crawlee-python/pull/1527)) ([6509534](https://github.com/apify/crawlee-python/commit/65095346a9d8b703b10c91e0510154c3c48a4176)) by [@Mantisus](https://github.com/Mantisus), closes [#1526](https://github.com/apify/crawlee-python/issues/1526)\n- Improve error handling for `RobotsTxtFile.load` ([#1524](https://github.com/apify/crawlee-python/pull/1524)) ([596a311](https://github.com/apify/crawlee-python/commit/596a31184914a254b3e7a81fd2f48ea8eda7db49)) by [@Mantisus](https://github.com/Mantisus)\n- Fix `crawler_runtime` not being updated during run and only in the end ([#1540](https://github.com/apify/crawlee-python/pull/1540)) ([0d6c3f6](https://github.com/apify/crawlee-python/commit/0d6c3f6d3337ddb6cab4873747c28cf95605d550)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1541](https://github.com/apify/crawlee-python/issues/1541)\n- Ensure persist state event emission when exiting `EventManager` context ([#1562](https://github.com/apify/crawlee-python/pull/1562)) ([6a44f17](https://github.com/apify/crawlee-python/commit/6a44f172600cbcacebab899082d6efc9105c4e03)) by [@Pijukatel](https://github.com/Pijukatel), closes 
[#1560](https://github.com/apify/crawlee-python/issues/1560)\n\n\n## [1.0.4](https://github.com/apify/crawlee-python/releases/tag/v1.0.4) (2025-10-24)\n\n### 🐛 Bug Fixes\n\n- Respect `enqueue_strategy` in `enqueue_links` ([#1505](https://github.com/apify/crawlee-python/pull/1505)) ([6ee04bc](https://github.com/apify/crawlee-python/commit/6ee04bc08c50a70f2e956a79d4ce5072a726c3a8)) by [@Mantisus](https://github.com/Mantisus), closes [#1504](https://github.com/apify/crawlee-python/issues/1504)\n- Exclude incorrect links before checking `robots.txt` ([#1502](https://github.com/apify/crawlee-python/pull/1502)) ([3273da5](https://github.com/apify/crawlee-python/commit/3273da5fee62ec9254666b376f382474c3532a56)) by [@Mantisus](https://github.com/Mantisus), closes [#1499](https://github.com/apify/crawlee-python/issues/1499)\n- Resolve compatibility issue between `SqlStorageClient` and `AdaptivePlaywrightCrawler` ([#1496](https://github.com/apify/crawlee-python/pull/1496)) ([ce172c4](https://github.com/apify/crawlee-python/commit/ce172c425a8643a1d4c919db4f5e5a6e47e91deb)) by [@Mantisus](https://github.com/Mantisus), closes [#1495](https://github.com/apify/crawlee-python/issues/1495)\n- Fix `BasicCrawler` statistics persistence ([#1490](https://github.com/apify/crawlee-python/pull/1490)) ([1eb1c19](https://github.com/apify/crawlee-python/commit/1eb1c19aa6f9dda4a0e3f7eda23f77a554f95076)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1501](https://github.com/apify/crawlee-python/issues/1501)\n- Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler` ([#1488](https://github.com/apify/crawlee-python/pull/1488)) ([62b7c70](https://github.com/apify/crawlee-python/commit/62b7c70b54085fc65a660062028014f4502beba9)) by [@Mantisus](https://github.com/Mantisus), closes [#1483](https://github.com/apify/crawlee-python/issues/1483)\n\n\n## [1.0.3](https://github.com/apify/crawlee-python/releases/tag/v1.0.3) (2025-10-17)\n\n### 🐛 Bug 
Fixes\n\n- Add support for Pydantic v2.12 ([#1471](https://github.com/apify/crawlee-python/pull/1471)) ([35c1108](https://github.com/apify/crawlee-python/commit/35c110878c2f445a2866be2522ea8703e9b371dd)) by [@Mantisus](https://github.com/Mantisus), closes [#1464](https://github.com/apify/crawlee-python/issues/1464)\n- Fix database version warning message ([#1485](https://github.com/apify/crawlee-python/pull/1485)) ([18a545e](https://github.com/apify/crawlee-python/commit/18a545ee8add92e844acd0068f9cb8580a82e1c9)) by [@Mantisus](https://github.com/Mantisus)\n- Fix `reclaim_request` in `SqlRequestQueueClient` to correctly update the request state ([#1486](https://github.com/apify/crawlee-python/pull/1486)) ([1502469](https://github.com/apify/crawlee-python/commit/150246957f8f7f1ceb77bb77e3a02a903c50cae1)) by [@Mantisus](https://github.com/Mantisus), closes [#1484](https://github.com/apify/crawlee-python/issues/1484)\n- Fix `KeyValueStore.auto_saved_value` failing in some scenarios ([#1438](https://github.com/apify/crawlee-python/pull/1438)) ([b35dee7](https://github.com/apify/crawlee-python/commit/b35dee78180e57161b826641d45a61b8d8f6ef51)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1354](https://github.com/apify/crawlee-python/issues/1354)\n\n\n## [1.0.2](https://github.com/apify/crawlee-python/releases/tag/v1.0.2) (2025-10-08)\n\n### 🐛 Bug Fixes\n\n- Use Self type in the open() method of storage clients ([#1462](https://github.com/apify/crawlee-python/pull/1462)) ([4ec6f6c](https://github.com/apify/crawlee-python/commit/4ec6f6c08f81632197f602ff99151338b3eba6e7)) by [@janbuchar](https://github.com/janbuchar)\n- Add storages name validation ([#1457](https://github.com/apify/crawlee-python/pull/1457)) ([84de11a](https://github.com/apify/crawlee-python/commit/84de11a3a603503076f5b7df487c9abab68a9015)) by [@Mantisus](https://github.com/Mantisus), closes [#1434](https://github.com/apify/crawlee-python/issues/1434)\n- Pin pydantic version to &lt;2.12.0 to 
avoid compatibility issues ([#1467](https://github.com/apify/crawlee-python/pull/1467)) ([f11b86f](https://github.com/apify/crawlee-python/commit/f11b86f7ed57f98e83dc1b52f15f2017a919bf59)) by [@vdusek](https://github.com/vdusek)\n\n\n## [1.0.1](https://github.com/apify/crawlee-python/releases/tag/v1.0.1) (2025-10-06)\n\n### 🐛 Bug Fixes\n\n- Fix memory leak in `PlaywrightCrawler` on browser context creation ([#1446](https://github.com/apify/crawlee-python/pull/1446)) ([bb181e5](https://github.com/apify/crawlee-python/commit/bb181e58d8070fba38e62d6e57fe981a00e5f035)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1443](https://github.com/apify/crawlee-python/issues/1443)\n- Update templates to handle optional httpx client ([#1440](https://github.com/apify/crawlee-python/pull/1440)) ([c087efd](https://github.com/apify/crawlee-python/commit/c087efd39baedf46ca3e5cae1ddc1acd6396e6c1)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29)\n\n- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v1) for more details.\n- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v1) to ensure a smooth update.\n\n### 🚀 Features\n\n- Add utility for load and parse Sitemap and `SitemapRequestLoader` ([#1169](https://github.com/apify/crawlee-python/pull/1169)) ([66599f8](https://github.com/apify/crawlee-python/commit/66599f8d085f3a8622e130019b6fdce2325737de)) by [@Mantisus](https://github.com/Mantisus), closes [#1161](https://github.com/apify/crawlee-python/issues/1161)\n- Add periodic status logging and `status_message_callback` parameter for customization ([#1265](https://github.com/apify/crawlee-python/pull/1265)) ([b992fb2](https://github.com/apify/crawlee-python/commit/b992fb2a457dedd20fc3014d7a4a8afe14602342)) by [@Mantisus](https://github.com/Mantisus), closes [#96](https://github.com/apify/crawlee-python/issues/96)\n- Add 
crawlee-cli option to skip project installation ([#1294](https://github.com/apify/crawlee-python/pull/1294)) ([4d5aef0](https://github.com/apify/crawlee-python/commit/4d5aef05613d10c1442fe449d1cf0f63392c98e3)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1122](https://github.com/apify/crawlee-python/issues/1122)\n- Improve `Crawlee` CLI help text ([#1297](https://github.com/apify/crawlee-python/pull/1297)) ([afbe10f](https://github.com/apify/crawlee-python/commit/afbe10f15d93353f5bc551bf9f193414179d0dd7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1295](https://github.com/apify/crawlee-python/issues/1295)\n- Add basic `OpenTelemetry` instrumentation ([#1255](https://github.com/apify/crawlee-python/pull/1255)) ([a92d8b3](https://github.com/apify/crawlee-python/commit/a92d8b3f843ee795bba7e14710bb1faa1fdbf292)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1254](https://github.com/apify/crawlee-python/issues/1254)\n- Add `ImpitHttpClient` http-client client using the `impit` library ([#1151](https://github.com/apify/crawlee-python/pull/1151)) ([0d0d268](https://github.com/apify/crawlee-python/commit/0d0d2681a4379c0e7ba54c49c86dabfef641610f)) by [@Mantisus](https://github.com/Mantisus)\n- Prevent overloading system memory when running locally ([#1270](https://github.com/apify/crawlee-python/pull/1270)) ([30de3bd](https://github.com/apify/crawlee-python/commit/30de3bd7722cbc34db9fc582b4bda7dc2dfa90ff)) by [@janbuchar](https://github.com/janbuchar), closes [#1232](https://github.com/apify/crawlee-python/issues/1232)\n- Expose `PlaywrightPersistentBrowser` class ([#1314](https://github.com/apify/crawlee-python/pull/1314)) ([b5fa955](https://github.com/apify/crawlee-python/commit/b5fa95508d7c099ff3a342577f338439283a975f)) by [@Mantisus](https://github.com/Mantisus)\n- Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) 
([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus)\n- Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99)\n- Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272)\n- Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269)\n- Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes [#1175](https://github.com/apify/crawlee-python/issues/1175)\n- Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402)\n- Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by 
[@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307)\n\n### 🐛 Bug Fixes\n\n- Fix memory estimation not working on MacOS ([#1330](https://github.com/apify/crawlee-python/pull/1330)) ([ab020eb](https://github.com/apify/crawlee-python/commit/ab020eb821a75723225b652d64babd84c368183f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)\n- Fix retry count to not count the original request ([#1328](https://github.com/apify/crawlee-python/pull/1328)) ([74fa1d9](https://github.com/apify/crawlee-python/commit/74fa1d936cb3c29cf62d87862a96b4266694af2f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)\n- [**breaking**] Remove unused &quot;stats&quot; field from RequestQueueMetadata ([#1331](https://github.com/apify/crawlee-python/pull/1331)) ([0a63bef](https://github.com/apify/crawlee-python/commit/0a63bef514b0bdcd3d6f208b386f706d0fe561e6)) by [@vdusek](https://github.com/vdusek)\n- Ignore unknown parameters passed in cookies ([#1336](https://github.com/apify/crawlee-python/pull/1336)) ([50d3ef7](https://github.com/apify/crawlee-python/commit/50d3ef7540551383d26d40f3404b435bde35b47d)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)\n- Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus)\n- Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318)\n- Improve crawler 
statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317)\n- Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383)\n- Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410)\n- Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek)\n\n### 🚜 Refactor\n\n- [**breaking**] Introduce new storage client system ([#1194](https://github.com/apify/crawlee-python/pull/1194)) ([de1c03f](https://github.com/apify/crawlee-python/commit/de1c03f70dbd4ae1773fd49c632b3cfcfab82c26)) by [@vdusek](https://github.com/vdusek), closes [#92](https://github.com/apify/crawlee-python/issues/92), [#147](https://github.com/apify/crawlee-python/issues/147), [#783](https://github.com/apify/crawlee-python/issues/783), [#1247](https://github.com/apify/crawlee-python/issues/1247)\n- [**breaking**] Split `BrowserType` literal into two different literals based on context ([#1070](https://github.com/apify/crawlee-python/pull/1070)) 
([72b5698](https://github.com/apify/crawlee-python/commit/72b5698fa0647ea02b08da5651736cc37c4c0f6a)) by [@Pijukatel](https://github.com/Pijukatel)\n- [**breaking**] Change method `HttpResponse.read` from sync to async ([#1296](https://github.com/apify/crawlee-python/pull/1296)) ([83fa8a4](https://github.com/apify/crawlee-python/commit/83fa8a416b6d2d4e27c678b9bf99bd1b8799f57b)) by [@Mantisus](https://github.com/Mantisus)\n- [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079)\n- [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1358](https://github.com/apify/crawlee-python/issues/1358)\n- [**breaking**] Refactor storage creation and caching, configuration and services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379)\n\n\n\n## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30)\n\n### 🚀 Features\n\n- Add `retire_browser_after_page_count` parameter for `BrowserPool` ([#1266](https://github.com/apify/crawlee-python/pull/1266)) 
([603aa2b](https://github.com/apify/crawlee-python/commit/603aa2b192ef4bc42d88244bd009fffdb0614c06)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Use `perf_counter_ns` for request duration tracking ([#1260](https://github.com/apify/crawlee-python/pull/1260)) ([9e92f6b](https://github.com/apify/crawlee-python/commit/9e92f6b54400ce5004fbab770e2e4ac42f73148f)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1256](https://github.com/apify/crawlee-python/issues/1256)\n- Fix memory estimation not working on MacOS (#1330) ([8558954](https://github.com/apify/crawlee-python/commit/8558954feeb7d5e91378186974a29851fedae9c8)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1329](https://github.com/apify/crawlee-python/issues/1329)\n- Fix retry count to not count the original request (#1328) ([1aff3aa](https://github.com/apify/crawlee-python/commit/1aff3aaf0cdbe452a3731192449a445e5b2d7a63)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1326](https://github.com/apify/crawlee-python/issues/1326)\n- Ignore unknown parameters passed in cookies (#1336) ([0f2610c](https://github.com/apify/crawlee-python/commit/0f2610c0ee1154dc004de60fc57fe7c9f478166a)) by [@Mantisus](https://github.com/Mantisus), closes [#1333](https://github.com/apify/crawlee-python/issues/1333)\n\n\n## [0.6.11](https://github.com/apify/crawlee-python/releases/tag/v0.6.11) (2025-06-23)\n\n### 🚀 Features\n\n- Add `stream` method for `HttpClient` ([#1241](https://github.com/apify/crawlee-python/pull/1241)) ([95c68b0](https://github.com/apify/crawlee-python/commit/95c68b0b2d0bf9e093c1d0ee1002625172f7a868)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Fix `ClientSnapshot` overload calculation ([#1228](https://github.com/apify/crawlee-python/pull/1228)) ([a4fc1b6](https://github.com/apify/crawlee-python/commit/a4fc1b6e83143650666108c289c084ea0463b80c)) by [@Pijukatel](https://github.com/Pijukatel), closes 
[#1207](https://github.com/apify/crawlee-python/issues/1207)\n- Use `PSS` instead of `RSS` to estimate children process memory usage on Linux ([#1210](https://github.com/apify/crawlee-python/pull/1210)) ([436032f](https://github.com/apify/crawlee-python/commit/436032f2de5f7d7fa1016033f1bb224159a8e6bf)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1206](https://github.com/apify/crawlee-python/issues/1206)\n- Do not raise an error to check &#x27;same-domain&#x27; if there is no hostname in the url ([#1251](https://github.com/apify/crawlee-python/pull/1251)) ([a6c3aab](https://github.com/apify/crawlee-python/commit/a6c3aabf5f8341f215275077b6768a56118bc656)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [0.6.10](https://github.com/apify/crawlee-python/releases/tag/v0.6.10) (2025-06-02)\n\n### 🐛 Bug Fixes\n\n- Allow config change on `PlaywrightCrawler` ([#1186](https://github.com/apify/crawlee-python/pull/1186)) ([f17bf31](https://github.com/apify/crawlee-python/commit/f17bf31456b702631aa7e0c26d4f07fd5eb7d1bd)) by [@mylank](https://github.com/mylank), closes [#1185](https://github.com/apify/crawlee-python/issues/1185)\n- Add `payload` to `SendRequestFunction` to support `POST` request ([#1202](https://github.com/apify/crawlee-python/pull/1202)) ([e7449f2](https://github.com/apify/crawlee-python/commit/e7449f206c580cb8383a66e4c9ff5f67c5ce8409)) by [@Mantisus](https://github.com/Mantisus)\n- Fix match check for specified enqueue strategy for requests with redirect ([#1199](https://github.com/apify/crawlee-python/pull/1199)) ([d84c30c](https://github.com/apify/crawlee-python/commit/d84c30cbd7c94d6525d3b6e8e86b379050454c0e)) by [@Mantisus](https://github.com/Mantisus), closes [#1198](https://github.com/apify/crawlee-python/issues/1198)\n- Set `WindowsSelectorEventLoopPolicy` only for curl-impersonate template without `playwright` ([#1209](https://github.com/apify/crawlee-python/pull/1209)) 
([f3b839f](https://github.com/apify/crawlee-python/commit/f3b839ffc2ccc1b889b6d5928f35f57b725e27f1)) by [@Mantisus](https://github.com/Mantisus), closes [#1204](https://github.com/apify/crawlee-python/issues/1204)\n- Add support for non-GET requests for `PlaywrightCrawler` ([#1208](https://github.com/apify/crawlee-python/pull/1208)) ([dbb9f44](https://github.com/apify/crawlee-python/commit/dbb9f44c71af15e1f86766fa0ba68281dd85fd9e)) by [@Mantisus](https://github.com/Mantisus), closes [#1201](https://github.com/apify/crawlee-python/issues/1201)\n- Respect `EnqueueLinksKwargs` for `extract_links` function ([#1213](https://github.com/apify/crawlee-python/pull/1213)) ([c9907d6](https://github.com/apify/crawlee-python/commit/c9907d6ff4c3a4a719b279cea77694c00a5a963d)) by [@Mantisus](https://github.com/Mantisus), closes [#1212](https://github.com/apify/crawlee-python/issues/1212)\n\n\n## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02)\n\n### 🚀 Features\n\n- Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928)\n- Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158)\n- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by 
[@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160)\n\n### 🐛 Bug Fixes\n\n- Fix handling of errors without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179)\n- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25)\n\n### 🚀 Features\n\n- Handle unprocessed requests in `add_requests_batched` ([#1159](https://github.com/apify/crawlee-python/pull/1159)) ([7851175](https://github.com/apify/crawlee-python/commit/7851175304d63e455223b25853021cfbe15d68bd)) by [@Pijukatel](https://github.com/Pijukatel), closes [#456](https://github.com/apify/crawlee-python/issues/456)\n- Add `respect_robots_txt_file` option ([#1162](https://github.com/apify/crawlee-python/pull/1162)) ([c23f365](https://github.com/apify/crawlee-python/commit/c23f365bfd263b4357edf82c14a7c6ff8dee45e4)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Update `UnprocessedRequest` to match actual data ([#1155](https://github.com/apify/crawlee-python/pull/1155)) ([a15a1f3](https://github.com/apify/crawlee-python/commit/a15a1f3528c7cbcf78d3bda5a236bcee1d492764)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1150](https://github.com/apify/crawlee-python/issues/1150)\n- Fix the order in which cookies are saved to the `SessionCookies` and the handler is executed for `PlaywrightCrawler` ([#1163](https://github.com/apify/crawlee-python/pull/1163)) 
([82ff69a](https://github.com/apify/crawlee-python/commit/82ff69acd8e409f56be56dd061aae0f854ec25b4)) by [@Mantisus](https://github.com/Mantisus)\n- Call `failed_request_handler` for `SessionError` when session rotation count exceeds maximum ([#1147](https://github.com/apify/crawlee-python/pull/1147)) ([b3637b6](https://github.com/apify/crawlee-python/commit/b3637b68ec7eae9de7f1b923fa2f68885da64b90)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [0.6.7](https://github.com/apify/crawlee-python/releases/tag/v0.6.7) (2025-04-17)\n\n### 🚀 Features\n\n- Add `ErrorSnapshotter` to `ErrorTracker` ([#1125](https://github.com/apify/crawlee-python/pull/1125)) ([9666092](https://github.com/apify/crawlee-python/commit/9666092c6a59ac4d34409038d5476e5b6fb58a26)) by [@Pijukatel](https://github.com/Pijukatel), closes [#151](https://github.com/apify/crawlee-python/issues/151)\n\n### 🐛 Bug Fixes\n\n- Improve validation errors in Crawlee CLI ([#1140](https://github.com/apify/crawlee-python/pull/1140)) ([f2d33df](https://github.com/apify/crawlee-python/commit/f2d33dff178a3d3079eb3807feb9645a25cc7a93)) by [@vdusek](https://github.com/vdusek), closes [#1138](https://github.com/apify/crawlee-python/issues/1138)\n- Disable logger propagation to prevent duplicate logs ([#1156](https://github.com/apify/crawlee-python/pull/1156)) ([0b3648d](https://github.com/apify/crawlee-python/commit/0b3648d5d399f0af23520f7fb8ee635d38b512c4)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.6.6](https://github.com/apify/crawlee-python/releases/tag/v0.6.6) (2025-04-03)\n\n### 🚀 Features\n\n- Add `statistics_log_format` parameter to `BasicCrawler` ([#1061](https://github.com/apify/crawlee-python/pull/1061)) ([635ae4a](https://github.com/apify/crawlee-python/commit/635ae4a56c65e434783ca721f4164203f465abf0)) by [@Mantisus](https://github.com/Mantisus), closes [#700](https://github.com/apify/crawlee-python/issues/700)\n- Add Session binding capability via `session_id` in `Request` 
([#1086](https://github.com/apify/crawlee-python/pull/1086)) ([cda7b31](https://github.com/apify/crawlee-python/commit/cda7b314ffda3104e4fd28a5e85c9e238d8866a4)) by [@Mantisus](https://github.com/Mantisus), closes [#1076](https://github.com/apify/crawlee-python/issues/1076)\n- Add `requests` argument to `EnqueueLinksFunction` ([#1024](https://github.com/apify/crawlee-python/pull/1024)) ([fc8444c](https://github.com/apify/crawlee-python/commit/fc8444c245c7607d3e378a4835d7d3355c4059be)) by [@Pijukatel](https://github.com/Pijukatel)\n\n### 🐛 Bug Fixes\n\n- Add port for `same-origin` strategy check ([#1096](https://github.com/apify/crawlee-python/pull/1096)) ([9e24598](https://github.com/apify/crawlee-python/commit/9e245987d0aab0ba9c763689f12958b5a332db46)) by [@Mantisus](https://github.com/Mantisus)\n- Fix handling of loading empty `metadata` file for queue ([#1042](https://github.com/apify/crawlee-python/pull/1042)) ([b00876e](https://github.com/apify/crawlee-python/commit/b00876e8dcb30a12d3737bd31237da9daada46bb)) by [@Mantisus](https://github.com/Mantisus), closes [#1029](https://github.com/apify/crawlee-python/issues/1029)\n- Update favicon ([#1114](https://github.com/apify/crawlee-python/pull/1114)) ([eba900f](https://github.com/apify/crawlee-python/commit/eba900fc1e8d918c6fc464657c53004a3e0fe668)) by [@baldasseva](https://github.com/baldasseva)\n- **website:** Use correct image source ([#1115](https://github.com/apify/crawlee-python/pull/1115)) ([ee7806f](https://github.com/apify/crawlee-python/commit/ee7806fc2f9b7b590d9668cc9f86009a898a3da6)) by [@baldasseva](https://github.com/baldasseva)\n\n\n## [0.6.5](https://github.com/apify/crawlee-python/releases/tag/v0.6.5) (2025-03-13)\n\n### 🐛 Bug Fixes\n\n- Update to `browserforge` workaround ([#1075](https://github.com/apify/crawlee-python/pull/1075)) ([2378cf8](https://github.com/apify/crawlee-python/commit/2378cf84ab1ed06473049a9ddfca2ba6f166306d)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## 
[0.6.4](https://github.com/apify/crawlee-python/releases/tag/v0.6.4) (2025-03-12)\n\n### 🐛 Bug Fixes\n\n- Add a check thread before set `add_signal_handler` ([#1068](https://github.com/apify/crawlee-python/pull/1068)) ([6983bda](https://github.com/apify/crawlee-python/commit/6983bda2dbc202b3ecbf7db62b11deee007b4b5f)) by [@Mantisus](https://github.com/Mantisus)\n- Temporary workaround for `browserforge` import time code execution ([#1073](https://github.com/apify/crawlee-python/pull/1073)) ([17d914f](https://github.com/apify/crawlee-python/commit/17d914f78242078f88c07d686a567d1091255eb1)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.6.3](https://github.com/apify/crawlee-python/releases/tag/v0.6.3) (2025-03-07)\n\n### 🚀 Features\n\n- Add project template with `uv` package manager ([#1057](https://github.com/apify/crawlee-python/pull/1057)) ([9ec06e5](https://github.com/apify/crawlee-python/commit/9ec06e58032aa11af46ac9cd1ea7bb002a18eb13)) by [@Mantisus](https://github.com/Mantisus), closes [#1053](https://github.com/apify/crawlee-python/issues/1053)\n- Use fingerprint generator in `PlaywrightCrawler` by default  ([#1060](https://github.com/apify/crawlee-python/pull/1060)) ([09cec53](https://github.com/apify/crawlee-python/commit/09cec532911043623eeb475aa8552c70bd94f8b7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1054](https://github.com/apify/crawlee-python/issues/1054)\n\n### 🐛 Bug Fixes\n\n- Update project templates for Poetry v2.x compatibility ([#1049](https://github.com/apify/crawlee-python/pull/1049)) ([96dc2f9](https://github.com/apify/crawlee-python/commit/96dc2f9b53b0a2d0f1d0c73d10e5244114e849ff)) by [@Mantisus](https://github.com/Mantisus), closes [#954](https://github.com/apify/crawlee-python/issues/954)\n- Remove tmp folder for PlaywrightCrawler in non-headless mode ([#1046](https://github.com/apify/crawlee-python/pull/1046)) ([3a7f444](https://github.com/apify/crawlee-python/commit/3a7f444fb7ee9a0ab1867c8c9586b15aab1e7df2)) by 
[@Mantisus](https://github.com/Mantisus)\n\n\n## [0.6.2](https://github.com/apify/crawlee-python/releases/tag/v0.6.2) (2025-03-05)\n\n### 🚀 Features\n\n- Extend ErrorTracker with error grouping ([#1014](https://github.com/apify/crawlee-python/pull/1014)) ([561de5c](https://github.com/apify/crawlee-python/commit/561de5c6b76af386cad5ac804a22fb7af227e460)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.6.1](https://github.com/apify/crawlee-python/releases/tag/v0.6.1) (2025-03-03)\n\n### 🐛 Bug Fixes\n\n- Add `browserforge` to mandatory dependencies ([#1044](https://github.com/apify/crawlee-python/pull/1044)) ([ddfbde8](https://github.com/apify/crawlee-python/commit/ddfbde89dd3e3cbef0f3954936f4a41c3d6df909)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.6.0](https://github.com/apify/crawlee-python/releases/tag/v0.6.0) (2025-03-03)\n\n- Check out the [Release blog post](https://crawlee.dev/blog/crawlee-for-python-v06) for more details.\n- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v06) to ensure a smooth update.\n\n### 🚀 Features\n\n- Integrate browserforge fingerprints ([#829](https://github.com/apify/crawlee-python/pull/829)) ([2b156b4](https://github.com/apify/crawlee-python/commit/2b156b4ba688f9111195422e6058dff30eb1f782)) by [@Pijukatel](https://github.com/Pijukatel), closes [#549](https://github.com/apify/crawlee-python/issues/549)\n- Add AdaptivePlaywrightCrawler ([#872](https://github.com/apify/crawlee-python/pull/872)) ([5ba70b6](https://github.com/apify/crawlee-python/commit/5ba70b6e846a908a55db461ab0c85e3946f2bc7c)) by [@Pijukatel](https://github.com/Pijukatel)\n- Implement `_snapshot_client` for `Snapshotter` ([#957](https://github.com/apify/crawlee-python/pull/957)) ([ba4d384](https://github.com/apify/crawlee-python/commit/ba4d384228d030c20c580ed01fae0e78af3a9543)) by [@Mantisus](https://github.com/Mantisus), closes [#60](https://github.com/apify/crawlee-python/issues/60)\n- 
Add adaptive context helpers ([#964](https://github.com/apify/crawlee-python/pull/964)) ([e248f17](https://github.com/apify/crawlee-python/commit/e248f17fad7b6d1fc5e23a0a1e961db66068a411)) by [@Pijukatel](https://github.com/Pijukatel), closes [#249](https://github.com/apify/crawlee-python/issues/249)\n- [**breaking**] Enable additional status codes arguments to PlaywrightCrawler ([#959](https://github.com/apify/crawlee-python/pull/959)) ([87cf446](https://github.com/apify/crawlee-python/commit/87cf446a7cbaa900e28abd93d4c8a2e0d1747059)) by [@Pijukatel](https://github.com/Pijukatel), closes [#953](https://github.com/apify/crawlee-python/issues/953)\n- Replace `HeaderGenerator` implementation by `browserforge` implementation ([#960](https://github.com/apify/crawlee-python/pull/960)) ([c2f8c93](https://github.com/apify/crawlee-python/commit/c2f8c93a4ad57c4ede354545bf925bf3707899c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#937](https://github.com/apify/crawlee-python/issues/937)\n\n### 🐛 Bug Fixes\n\n- Fix playwright template and dockerfile ([#972](https://github.com/apify/crawlee-python/pull/972)) ([c33b34d](https://github.com/apify/crawlee-python/commit/c33b34dd6e253b1261c700857bb5c4bbec6d5c14)) by [@janbuchar](https://github.com/janbuchar), closes [#969](https://github.com/apify/crawlee-python/issues/969)\n- Fix installing dependencies via pip in project template ([#977](https://github.com/apify/crawlee-python/pull/977)) ([1e3b8eb](https://github.com/apify/crawlee-python/commit/1e3b8eb1cdb57bf2f7256e8ae5f0706b0afc3ba9)) by [@janbuchar](https://github.com/janbuchar), closes [#975](https://github.com/apify/crawlee-python/issues/975)\n- Fix default migration storage ([#1018](https://github.com/apify/crawlee-python/pull/1018)) ([6a0c4d9](https://github.com/apify/crawlee-python/commit/6a0c4d94593f7e94f24eee8a97fc7bc83c4d02e1)) by [@Pijukatel](https://github.com/Pijukatel), closes [#991](https://github.com/apify/crawlee-python/issues/991)\n- Fix logger name 
for http based loggers ([#1023](https://github.com/apify/crawlee-python/pull/1023)) ([bfb3944](https://github.com/apify/crawlee-python/commit/bfb394446351c8f3b9879a9905607f7c929f2542)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1021](https://github.com/apify/crawlee-python/issues/1021)\n- Remove allow_redirects override in CurlImpersonateHttpClient ([#1017](https://github.com/apify/crawlee-python/pull/1017)) ([01d855a](https://github.com/apify/crawlee-python/commit/01d855a43389a6b4b16ec74767624fa7eb13151f)) by [@2tunnels](https://github.com/2tunnels), closes [#1016](https://github.com/apify/crawlee-python/issues/1016)\n- Remove follow_redirects override in HttpxHttpClient ([#1015](https://github.com/apify/crawlee-python/pull/1015)) ([88afda3](https://github.com/apify/crawlee-python/commit/88afda33e77be84bc91ad1239740b8e661bef2a2)) by [@2tunnels](https://github.com/2tunnels), closes [#1013](https://github.com/apify/crawlee-python/issues/1013)\n- Fix flaky test_common_headers_and_user_agent ([#1030](https://github.com/apify/crawlee-python/pull/1030)) ([58aa70e](https://github.com/apify/crawlee-python/commit/58aa70e9600d313b823a1376ab9b36fb416c1c4a)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1027](https://github.com/apify/crawlee-python/issues/1027)\n\n### 🚜 Refactor\n\n- [**breaking**] Remove unused config properties ([#978](https://github.com/apify/crawlee-python/pull/978)) ([4b7fe29](https://github.com/apify/crawlee-python/commit/4b7fe2930540a5fbd753135e3ce29dc80f80c543)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Remove Base prefix from abstract class names ([#980](https://github.com/apify/crawlee-python/pull/980)) ([8ccb5d4](https://github.com/apify/crawlee-python/commit/8ccb5d41a1dae9b02088b433266ac89bd089561a)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Change default `incognito context` to `persistent context` for `Playwright` ([#985](https://github.com/apify/crawlee-python/pull/985)) 
([f01520d](https://github.com/apify/crawlee-python/commit/f01520d22b31af9f0f13ca162cc47e6aa9744c6d)) by [@Mantisus](https://github.com/Mantisus), closes [#721](https://github.com/apify/crawlee-python/issues/721), [#963](https://github.com/apify/crawlee-python/issues/963)\n- [**breaking**] Change `Session` cookies from `dict` to `SessionCookies` with `CookieJar` ([#984](https://github.com/apify/crawlee-python/pull/984)) ([6523b3a](https://github.com/apify/crawlee-python/commit/6523b3ade0eed53b0363ddce250c557024339b5e)) by [@Mantisus](https://github.com/Mantisus), closes [#710](https://github.com/apify/crawlee-python/issues/710), [#933](https://github.com/apify/crawlee-python/issues/933)\n- [**breaking**] Replace enum with literal for `EnqueueStrategy` ([#1019](https://github.com/apify/crawlee-python/pull/1019)) ([d2481ef](https://github.com/apify/crawlee-python/commit/d2481ef71d3539979c5b1129387e72b4126fe366)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Update status code handling ([#1028](https://github.com/apify/crawlee-python/pull/1028)) ([6b59471](https://github.com/apify/crawlee-python/commit/6b5947125e63abdfff481b0669398fc9a7293e55)) by [@Mantisus](https://github.com/Mantisus), closes [#830](https://github.com/apify/crawlee-python/issues/830), [#998](https://github.com/apify/crawlee-python/issues/998)\n- [**breaking**] Move `cli` dependencies to optional dependencies ([#1011](https://github.com/apify/crawlee-python/pull/1011)) ([4382959](https://github.com/apify/crawlee-python/commit/43829590c6b4efd1dc9b833373f82a842a0a1a8e)) by [@Mantisus](https://github.com/Mantisus), closes [#703](https://github.com/apify/crawlee-python/issues/703), [#1010](https://github.com/apify/crawlee-python/issues/1010)\n\n\n## [0.5.4](https://github.com/apify/crawlee-python/releases/tag/v0.5.4) (2025-02-05)\n\n### 🚀 Features\n\n- Add support `use_incognito_pages` for `browser_launch_options` in `PlaywrightCrawler` 
([#941](https://github.com/apify/crawlee-python/pull/941)) ([eae3a33](https://github.com/apify/crawlee-python/commit/eae3a33a1842ebbdac5f9c51866a4be4bcf1ae2c)) by [@Mantisus](https://github.com/Mantisus)\n\n### 🐛 Bug Fixes\n\n- Fix session management with retire ([#947](https://github.com/apify/crawlee-python/pull/947)) ([caee03f](https://github.com/apify/crawlee-python/commit/caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa)) by [@Mantisus](https://github.com/Mantisus)\n- Fix templates - poetry-plugin-export version and camoufox template name ([#952](https://github.com/apify/crawlee-python/pull/952)) ([7addea6](https://github.com/apify/crawlee-python/commit/7addea6605359cceba208e16ec9131724bdb3e9b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#951](https://github.com/apify/crawlee-python/issues/951)\n- Fix convert relative link to absolute in `enqueue_links` for response with redirect ([#956](https://github.com/apify/crawlee-python/pull/956)) ([694102e](https://github.com/apify/crawlee-python/commit/694102e163bb9021a4830d2545d153f6f8f3de90)) by [@Mantisus](https://github.com/Mantisus), closes [#955](https://github.com/apify/crawlee-python/issues/955)\n- Fix `CurlImpersonateHttpClient` cookies handler ([#946](https://github.com/apify/crawlee-python/pull/946)) ([ed415c4](https://github.com/apify/crawlee-python/commit/ed415c433da2a40b0ee62534f0730d0737e991b8)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [0.5.3](https://github.com/apify/crawlee-python/releases/tag/v0.5.3) (2025-01-31)\n\n### 🚀 Features\n\n- Add keep_alive flag to `crawler.__init__` ([#921](https://github.com/apify/crawlee-python/pull/921)) ([7a82d0c](https://github.com/apify/crawlee-python/commit/7a82d0cbdbe6c8739d4bf6a9b014e31f07e5a520)) by [@Pijukatel](https://github.com/Pijukatel), closes [#891](https://github.com/apify/crawlee-python/issues/891)\n- Add `block_requests` helper for `PlaywrightCrawler` ([#919](https://github.com/apify/crawlee-python/pull/919)) 
([1030459](https://github.com/apify/crawlee-python/commit/103045994908f80cffee5ccfff91a040e0042f48)) by [@Mantisus](https://github.com/Mantisus), closes [#848](https://github.com/apify/crawlee-python/issues/848)\n- Return request handlers from decorator methods to allow further decoration ([#934](https://github.com/apify/crawlee-python/pull/934)) ([9ec0aae](https://github.com/apify/crawlee-python/commit/9ec0aae54e2a340d29c893567ae80bf8bd4510a9)) by [@mylank](https://github.com/mylank)\n- Add `transform_request_function` for `enqueue_links` ([#923](https://github.com/apify/crawlee-python/pull/923)) ([6b15957](https://github.com/apify/crawlee-python/commit/6b159578f612251e6d2253a72b6521430f4f9b09)) by [@Mantisus](https://github.com/Mantisus), closes [#894](https://github.com/apify/crawlee-python/issues/894)\n- Add `time_remaining_secs` property to `MIGRATING` event data ([#940](https://github.com/apify/crawlee-python/pull/940)) ([b44501b](https://github.com/apify/crawlee-python/commit/b44501bcadbd12673a8f47aa92f12da8e404f60b)) by [@fnesveda](https://github.com/fnesveda)\n- Add LogisticalRegressionPredictor - rendering type predictor for adaptive crawling ([#930](https://github.com/apify/crawlee-python/pull/930)) ([8440499](https://github.com/apify/crawlee-python/commit/8440499468db115a4c478e9bcdb692554d1655c5)) by [@Pijukatel](https://github.com/Pijukatel)\n\n### 🐛 Bug Fixes\n\n- Fix crawler not retrying user handler if there was timeout in the handler ([#909](https://github.com/apify/crawlee-python/pull/909)) ([f4090ef](https://github.com/apify/crawlee-python/commit/f4090ef0ea0281d53dab16a77ceea2ef6ac43d76)) by [@Pijukatel](https://github.com/Pijukatel), closes [#907](https://github.com/apify/crawlee-python/issues/907)\n- Optimize memory consumption for `HttpxHttpClient`, fix proxy handling ([#905](https://github.com/apify/crawlee-python/pull/905)) ([d7ad480](https://github.com/apify/crawlee-python/commit/d7ad480834263ae0480049cb0a8db4dfc3946d8d)) by 
[@Mantisus](https://github.com/Mantisus), closes [#895](https://github.com/apify/crawlee-python/issues/895)\n- Fix `BrowserPool` and `PlaywrightBrowserPlugin` closure ([#932](https://github.com/apify/crawlee-python/pull/932)) ([997543d](https://github.com/apify/crawlee-python/commit/997543d2fa5afba49929f4407ee95d7a4933a50d)) by [@Mantisus](https://github.com/Mantisus)\n\n\n## [0.5.2](https://github.com/apify/crawlee-python/releases/tag/v0.5.2) (2025-01-17)\n\n### 🐛 Bug Fixes\n\n- Avoid `use_state` race conditions. Remove key argument to `use_state` ([#868](https://github.com/apify/crawlee-python/pull/868)) ([000b976](https://github.com/apify/crawlee-python/commit/000b9761211502d86a893a31e3ca21998a6e3b99)) by [@Pijukatel](https://github.com/Pijukatel), closes [#856](https://github.com/apify/crawlee-python/issues/856)\n- Restore proxy functionality for PlaywrightCrawler broken in v0.5 ([#889](https://github.com/apify/crawlee-python/pull/889)) ([908c944](https://github.com/apify/crawlee-python/commit/908c944ff9b1fc8ed7eb35f0078a1de71e34d5c5)) by [@Mantisus](https://github.com/Mantisus), closes [#887](https://github.com/apify/crawlee-python/issues/887)\n- Fix the usage of Configuration ([#899](https://github.com/apify/crawlee-python/pull/899)) ([0f1cf6f](https://github.com/apify/crawlee-python/commit/0f1cf6f0b52c92ca4e465a2a01f8111cd9ab42ec)) by [@vdusek](https://github.com/vdusek), closes [#670](https://github.com/apify/crawlee-python/issues/670)\n\n\n## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07)\n\n### 🐛 Bug Fixes\n\n- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar)\n\n\n## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02)\n\n- Check out the [Release blog 
post](https://crawlee.dev/blog/crawlee-for-python-v05) for more details.\n- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) to ensure a smooth update.\n\n### 🚀 Features\n\n- Add possibility to use None as no proxy in tiered proxies ([#760](https://github.com/apify/crawlee-python/pull/760)) ([0fbd017](https://github.com/apify/crawlee-python/commit/0fbd01723b9fe2e3410e0f358cab2f22848b08d0)) by [@Pijukatel](https://github.com/Pijukatel), closes [#687](https://github.com/apify/crawlee-python/issues/687)\n- Add `use_state` context method ([#682](https://github.com/apify/crawlee-python/pull/682)) ([868b41e](https://github.com/apify/crawlee-python/commit/868b41ebd4c8003fa60ab07887577d0fb85b6ecc)) by [@Mantisus](https://github.com/Mantisus), closes [#191](https://github.com/apify/crawlee-python/issues/191)\n- Add pre-navigation hooks router to AbstractHttpCrawler ([#791](https://github.com/apify/crawlee-python/pull/791)) ([0f23205](https://github.com/apify/crawlee-python/commit/0f23205923065074c522b3de9d47218a204dfa78)) by [@Pijukatel](https://github.com/Pijukatel), closes [#635](https://github.com/apify/crawlee-python/issues/635)\n- Add example of how to integrate Camoufox into PlaywrightCrawler ([#789](https://github.com/apify/crawlee-python/pull/789)) ([246cfc4](https://github.com/apify/crawlee-python/commit/246cfc4ebc8bce1d15e1dddd62d652bd65869328)) by [@Pijukatel](https://github.com/Pijukatel), closes [#684](https://github.com/apify/crawlee-python/issues/684)\n- Expose event types, improve on/emit signature, allow parameterless listeners ([#800](https://github.com/apify/crawlee-python/pull/800)) ([c102c4c](https://github.com/apify/crawlee-python/commit/c102c4c894a00b09adfd5f4911563c81cf3e98b4)) by [@janbuchar](https://github.com/janbuchar), closes [#561](https://github.com/apify/crawlee-python/issues/561)\n- Add stop method to BasicCrawler ([#807](https://github.com/apify/crawlee-python/pull/807)) 
([6d01af4](https://github.com/apify/crawlee-python/commit/6d01af4231d02b4349a8719f5ed18d812843fde5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#651](https://github.com/apify/crawlee-python/issues/651)\n- Add `html_to_text` helper function ([#792](https://github.com/apify/crawlee-python/pull/792)) ([2b9d970](https://github.com/apify/crawlee-python/commit/2b9d97009dd653870681bb3cadbb46b214ff1a73)) by [@Pijukatel](https://github.com/Pijukatel), closes [#659](https://github.com/apify/crawlee-python/issues/659)\n- [**breaking**] Implement `RequestManagerTandem`, remove `add_request` from `RequestList`, accept any iterable in `RequestList` constructor ([#777](https://github.com/apify/crawlee-python/pull/777)) ([4172652](https://github.com/apify/crawlee-python/commit/4172652079e5e91190c1cc5e2138fd41a7c84a6b)) by [@janbuchar](https://github.com/janbuchar)\n\n### 🐛 Bug Fixes\n\n- Fix circular import in `KeyValueStore` ([#805](https://github.com/apify/crawlee-python/pull/805)) ([8bdf49d](https://github.com/apify/crawlee-python/commit/8bdf49d1cb2a94b66f69fd1b77063a4113517fae)) by [@Mantisus](https://github.com/Mantisus), closes [#804](https://github.com/apify/crawlee-python/issues/804)\n- [**breaking**] Refactor service usage to rely on `service_locator` ([#691](https://github.com/apify/crawlee-python/pull/691)) ([1d31c6c](https://github.com/apify/crawlee-python/commit/1d31c6c7e7a9ec7cee5b2de900568d9f77db65ba)) by [@vdusek](https://github.com/vdusek), closes [#369](https://github.com/apify/crawlee-python/issues/369), [#539](https://github.com/apify/crawlee-python/issues/539), [#699](https://github.com/apify/crawlee-python/issues/699)\n- Pass `verify` in httpx client ([#802](https://github.com/apify/crawlee-python/pull/802)) ([074d083](https://github.com/apify/crawlee-python/commit/074d0836b55e52f13726e7cd1c21602623fda4fc)) by [@Mantisus](https://github.com/Mantisus), closes [#798](https://github.com/apify/crawlee-python/issues/798)\n- Fix `page_options` for 
`PlaywrightBrowserPlugin` ([#796](https://github.com/apify/crawlee-python/pull/796)) ([bd3bdd4](https://github.com/apify/crawlee-python/commit/bd3bdd4046c2ddea62feb77322033cad50f382dd)) by [@Mantisus](https://github.com/Mantisus), closes [#755](https://github.com/apify/crawlee-python/issues/755)\n- Fix event migrating handler in `RequestQueue` ([#825](https://github.com/apify/crawlee-python/pull/825)) ([fd6663f](https://github.com/apify/crawlee-python/commit/fd6663f903bc7eecd1000da89e06197b43dfb962)) by [@Mantisus](https://github.com/Mantisus), closes [#815](https://github.com/apify/crawlee-python/issues/815)\n- Respect user configuration for work with status codes ([#812](https://github.com/apify/crawlee-python/pull/812)) ([8daf4bd](https://github.com/apify/crawlee-python/commit/8daf4bd49c1b09a0924f827daedebf7600ac609b)) by [@Mantisus](https://github.com/Mantisus), closes [#708](https://github.com/apify/crawlee-python/issues/708), [#756](https://github.com/apify/crawlee-python/issues/756)\n- `abort-on-error` for successive runs ([#834](https://github.com/apify/crawlee-python/pull/834)) ([0cea673](https://github.com/apify/crawlee-python/commit/0cea67387bf366800b447de784af580159b199ee)) by [@Mantisus](https://github.com/Mantisus)\n- Relax ServiceLocator restrictions ([#837](https://github.com/apify/crawlee-python/pull/837)) ([aa3667f](https://github.com/apify/crawlee-python/commit/aa3667f344d78945df3eca77431e1409f43f8bb5)) by [@janbuchar](https://github.com/janbuchar), closes [#806](https://github.com/apify/crawlee-python/issues/806)\n- Fix typo in exports ([#841](https://github.com/apify/crawlee-python/pull/841)) ([8fa6ac9](https://github.com/apify/crawlee-python/commit/8fa6ac994fe4f3f6430cb796a0c6a732c93c672b)) by [@janbuchar](https://github.com/janbuchar)\n\n### 🚜 Refactor\n\n- [**breaking**] Refactor HttpCrawler, BeautifulSoupCrawler, ParselCrawler inheritance ([#746](https://github.com/apify/crawlee-python/pull/746)) 
([9d3c269](https://github.com/apify/crawlee-python/commit/9d3c2697c91ce93028ca86a91d85d465d36c1ad7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#350](https://github.com/apify/crawlee-python/issues/350)\n- [**breaking**] Remove `json_` and `order_no` from `Request` ([#788](https://github.com/apify/crawlee-python/pull/788)) ([5381d13](https://github.com/apify/crawlee-python/commit/5381d13aa51a757fc1906f400788555df090a1af)) by [@Mantisus](https://github.com/Mantisus), closes [#94](https://github.com/apify/crawlee-python/issues/94)\n- [**breaking**] Rename PwPreNavContext to PwPreNavCrawlingContext ([#827](https://github.com/apify/crawlee-python/pull/827)) ([84b61a3](https://github.com/apify/crawlee-python/commit/84b61a3d25bee42faed4e81cd156663f251b3d3d)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Rename PlaywrightCrawler kwargs: browser_options, page_options ([#831](https://github.com/apify/crawlee-python/pull/831)) ([ffc6048](https://github.com/apify/crawlee-python/commit/ffc6048e9dc5c5e862271fa50c48bb0fb6f0a18f)) by [@Pijukatel](https://github.com/Pijukatel)\n- [**breaking**] Update the crawlers &amp; storage clients structure ([#828](https://github.com/apify/crawlee-python/pull/828)) ([0ba04d1](https://github.com/apify/crawlee-python/commit/0ba04d1633881043928a408678932c46fb90e21f)) by [@vdusek](https://github.com/vdusek), closes [#764](https://github.com/apify/crawlee-python/issues/764)\n\n\n## [0.4.5](https://github.com/apify/crawlee-python/releases/tag/v0.4.5) (2024-12-06)\n\n### 🚀 Features\n\n- Improve project bootstrapping ([#538](https://github.com/apify/crawlee-python/pull/538)) ([367899c](https://github.com/apify/crawlee-python/commit/367899cbad5021674f6e41c4dd7eb2266fe043aa)) by [@janbuchar](https://github.com/janbuchar), closes [#317](https://github.com/apify/crawlee-python/issues/317), [#414](https://github.com/apify/crawlee-python/issues/414), [#495](https://github.com/apify/crawlee-python/issues/495), 
[#511](https://github.com/apify/crawlee-python/issues/511)\n\n### 🐛 Bug Fixes\n\n- Add upper bound of HTTPX version ([#775](https://github.com/apify/crawlee-python/pull/775)) ([b59e34d](https://github.com/apify/crawlee-python/commit/b59e34d6301e26825d88608152ffb337ef602a9f)) by [@vdusek](https://github.com/vdusek)\n- Fix incorrect use of desired concurrency ratio ([#780](https://github.com/apify/crawlee-python/pull/780)) ([d1f8bfb](https://github.com/apify/crawlee-python/commit/d1f8bfb68ce2ef13b550ce415a3689858112a4c7)) by [@Pijukatel](https://github.com/Pijukatel), closes [#759](https://github.com/apify/crawlee-python/issues/759)\n- Remove pydantic constraint &lt;2.10.0 and update timedelta validator, serializer type hints ([#757](https://github.com/apify/crawlee-python/pull/757)) ([c0050c0](https://github.com/apify/crawlee-python/commit/c0050c0ee76e5deb28f174ecf276b0e6abf68b9d)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.4.4](https://github.com/apify/crawlee-python/releases/tag/v0.4.4) (2024-11-29)\n\n### 🚀 Features\n\n- Expose browser_options and page_options to PlaywrightCrawler ([#730](https://github.com/apify/crawlee-python/pull/730)) ([dbe85b9](https://github.com/apify/crawlee-python/commit/dbe85b90e59def281cfc6617a0eb869a4adf2fc0)) by [@vdusek](https://github.com/vdusek), closes [#719](https://github.com/apify/crawlee-python/issues/719)\n- Add `abort_on_error` property ([#731](https://github.com/apify/crawlee-python/pull/731)) ([6dae03a](https://github.com/apify/crawlee-python/commit/6dae03a68a2d23c68c78d8d44611d43e40eb9404)) by [@Mantisus](https://github.com/Mantisus), closes [#704](https://github.com/apify/crawlee-python/issues/704)\n\n### 🐛 Bug Fixes\n\n- Fix init of context managers and context handling in `BasicCrawler` ([#714](https://github.com/apify/crawlee-python/pull/714)) ([486fe6d](https://github.com/apify/crawlee-python/commit/486fe6d6cd56cb560ab51a32ec0286d9e32267cb)) by [@vdusek](https://github.com/vdusek)\n\n\n## 
[0.4.3](https://github.com/apify/crawlee-python/releases/tag/v0.4.3) (2024-11-21)\n\n### 🐛 Bug Fixes\n\n- Pydantic 2.10.0 issues ([#716](https://github.com/apify/crawlee-python/pull/716)) ([8d8b3fc](https://github.com/apify/crawlee-python/commit/8d8b3fcff8be10edf5351f5324c7ba112c1d2ba0)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.4.2](https://github.com/apify/crawlee-python/releases/tag/v0.4.2) (2024-11-20)\n\n### 🐛 Bug Fixes\n\n- Respect custom HTTP headers in `PlaywrightCrawler` ([#685](https://github.com/apify/crawlee-python/pull/685)) ([a84125f](https://github.com/apify/crawlee-python/commit/a84125f031347426de44b8f015c87882c8f96f72)) by [@Mantisus](https://github.com/Mantisus)\n- Fix serialization payload in Request. Fix Docs for Post Request ([#683](https://github.com/apify/crawlee-python/pull/683)) ([e8b4d2d](https://github.com/apify/crawlee-python/commit/e8b4d2d4989fd9967403b828c914cb7ae2ef9b8b)) by [@Mantisus](https://github.com/Mantisus), closes [#668](https://github.com/apify/crawlee-python/issues/668)\n- Accept string payload in the Request constructor ([#697](https://github.com/apify/crawlee-python/pull/697)) ([19f5add](https://github.com/apify/crawlee-python/commit/19f5addc0223d68389eea47864830c709335ab6e)) by [@vdusek](https://github.com/vdusek)\n- Fix snapshots handling ([#692](https://github.com/apify/crawlee-python/pull/692)) ([4016c0d](https://github.com/apify/crawlee-python/commit/4016c0d8121a8950ab1df22188eac838a011c39f)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.4.1](https://github.com/apify/crawlee-python/releases/tag/v0.4.1) (2024-11-11)\n\n### 🚀 Features\n\n- Add `max_crawl_depth` option to `BasicCrawler` ([#637](https://github.com/apify/crawlee-python/pull/637)) ([77deaa9](https://github.com/apify/crawlee-python/commit/77deaa964e2c1e74af1c5117a13d8d8257f0e27e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#460](https://github.com/apify/crawlee-python/issues/460)\n- Add BeautifulSoupParser type 
alias ([#674](https://github.com/apify/crawlee-python/pull/674)) ([b2cf88f](https://github.com/apify/crawlee-python/commit/b2cf88ffea8d75808c9210850a03fcc70b0b9e3d)) by [@Pijukatel](https://github.com/Pijukatel)\n\n### 🐛 Bug Fixes\n\n- Fix total_size usage in memory size monitoring ([#661](https://github.com/apify/crawlee-python/pull/661)) ([c2a3239](https://github.com/apify/crawlee-python/commit/c2a32397eecd5cc7f412c2af7269b004a8b2eaf2)) by [@janbuchar](https://github.com/janbuchar)\n- Add HttpHeaders to module exports ([#664](https://github.com/apify/crawlee-python/pull/664)) ([f0c5ca7](https://github.com/apify/crawlee-python/commit/f0c5ca717d9f9e304d375da2c23552c26ca870da)) by [@vdusek](https://github.com/vdusek), closes [#663](https://github.com/apify/crawlee-python/issues/663)\n- Fix unhandled ValueError in request handler result processing ([#666](https://github.com/apify/crawlee-python/pull/666)) ([0a99d7f](https://github.com/apify/crawlee-python/commit/0a99d7f693245eb9a065016fb6f2d268f6956805)) by [@janbuchar](https://github.com/janbuchar)\n- Fix BaseDatasetClient.iter_items type hints ([#680](https://github.com/apify/crawlee-python/pull/680)) ([a968b1b](https://github.com/apify/crawlee-python/commit/a968b1be6fceb56676b0198a044c8fceac7c92a6)) by [@Pijukatel](https://github.com/Pijukatel)\n\n\n## [0.4.0](https://github.com/apify/crawlee-python/releases/tag/v0.4.0) (2024-11-01)\n\n- Check out the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v04) to ensure a smooth update.\n\n### 🚀 Features\n\n- [**breaking**] Add headers in unique key computation ([#609](https://github.com/apify/crawlee-python/pull/609)) ([6c4746f](https://github.com/apify/crawlee-python/commit/6c4746fa8ff86952a812b32a1d70dc910e76b43e)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#548](https://github.com/apify/crawlee-python/issues/548)\n- Add `pre_navigation_hooks` to `PlaywrightCrawler` 
([#631](https://github.com/apify/crawlee-python/pull/631)) ([5dd5b60](https://github.com/apify/crawlee-python/commit/5dd5b60e2a44d5bd3748b613790e1bee3232d6f3)) by [@Prathamesh010](https://github.com/Prathamesh010), closes [#427](https://github.com/apify/crawlee-python/issues/427)\n- Add `always_enqueue` option to bypass URL deduplication ([#621](https://github.com/apify/crawlee-python/pull/621)) ([4e59fa4](https://github.com/apify/crawlee-python/commit/4e59fa46daaec05e52262cf62c26f28ddcd772af)) by [@Rutam21](https://github.com/Rutam21), closes [#547](https://github.com/apify/crawlee-python/issues/547)\n- Split and add extra configuration to export_data method ([#580](https://github.com/apify/crawlee-python/pull/580)) ([6751635](https://github.com/apify/crawlee-python/commit/6751635e1785a4a27f60092c82f5dd0c40193d52)) by [@deshansh](https://github.com/deshansh), closes [#526](https://github.com/apify/crawlee-python/issues/526)\n\n### 🐛 Bug Fixes\n\n- Use strip in headers normalization ([#614](https://github.com/apify/crawlee-python/pull/614)) ([a15b21e](https://github.com/apify/crawlee-python/commit/a15b21e51deaf2b67738f95bc2b15c1c16d1775f)) by [@vdusek](https://github.com/vdusek)\n- [**breaking**] Merge payload and data fields of Request ([#542](https://github.com/apify/crawlee-python/pull/542)) ([d06fcef](https://github.com/apify/crawlee-python/commit/d06fcef3fee44616ded5f587b9c7313b82a57cc7)) by [@vdusek](https://github.com/vdusek), closes [#560](https://github.com/apify/crawlee-python/issues/560)\n- Default ProxyInfo port if httpx.URL port is None ([#619](https://github.com/apify/crawlee-python/pull/619)) ([8107a6f](https://github.com/apify/crawlee-python/commit/8107a6f97e8f16a330e7d02d3fc6ea34c5f78d77)) by [@steffansafey](https://github.com/steffansafey), closes [#618](https://github.com/apify/crawlee-python/issues/618)\n\n### ⚙️ Miscellaneous Tasks\n\n- [**breaking**] Remove Request.query_params field ([#639](https://github.com/apify/crawlee-python/pull/639)) 
([6ec0ec4](https://github.com/apify/crawlee-python/commit/6ec0ec4fa0cef9b8bf893e70d99f068675c9c54c)) by [@vdusek](https://github.com/vdusek), closes [#615](https://github.com/apify/crawlee-python/issues/615)\n\n\n## [0.3.9](https://github.com/apify/crawlee-python/releases/tag/v0.3.9) (2024-10-23)\n\n### 🚀 Features\n\n- Key-value store context helpers ([#584](https://github.com/apify/crawlee-python/pull/584)) ([fc15622](https://github.com/apify/crawlee-python/commit/fc156222c3747fc4cc7bd7666a21769845c7d0d5)) by [@janbuchar](https://github.com/janbuchar)\n- Added get_public_url method to KeyValueStore ([#572](https://github.com/apify/crawlee-python/pull/572)) ([3a4ba8f](https://github.com/apify/crawlee-python/commit/3a4ba8f459903b6288aec40de2c3ca862e36abec)) by [@akshay11298](https://github.com/akshay11298), closes [#514](https://github.com/apify/crawlee-python/issues/514)\n\n### 🐛 Bug Fixes\n\n- Workaround for JSON value typing problems ([#581](https://github.com/apify/crawlee-python/pull/581)) ([403496a](https://github.com/apify/crawlee-python/commit/403496a53c12810351139a6e073238143ecc5930)) by [@janbuchar](https://github.com/janbuchar), closes [#563](https://github.com/apify/crawlee-python/issues/563)\n\n\n## [0.3.8](https://github.com/apify/crawlee-python/releases/tag/v0.3.8) (2024-10-02)\n\n### 🚀 Features\n\n- Mask Playwright's \"headless\" headers ([#545](https://github.com/apify/crawlee-python/pull/545)) ([d1445e4](https://github.com/apify/crawlee-python/commit/d1445e4858fd804bb4a2e35efa1d2f5254d8df6b)) by [@vdusek](https://github.com/vdusek), closes [#401](https://github.com/apify/crawlee-python/issues/401)\n- Add new model for `HttpHeaders` ([#544](https://github.com/apify/crawlee-python/pull/544)) ([854f2c1](https://github.com/apify/crawlee-python/commit/854f2c1e2e09cf398e04b1e153534282add1247e)) by [@vdusek](https://github.com/vdusek)\n\n### 🐛 Bug Fixes\n\n- Call `error_handler` for `SessionError` ([#557](https://github.com/apify/crawlee-python/pull/557)) 
([e75ac4b](https://github.com/apify/crawlee-python/commit/e75ac4b70cd48a4ca9f8245cea3c5f3c188b8824)) by [@vdusek](https://github.com/vdusek), closes [#546](https://github.com/apify/crawlee-python/issues/546)\n- Extend from `StrEnum` in `RequestState` to fix serialization ([#556](https://github.com/apify/crawlee-python/pull/556)) ([6bf35ba](https://github.com/apify/crawlee-python/commit/6bf35ba4a6913819706ebd1d2c1156a4c62f944e)) by [@vdusek](https://github.com/vdusek), closes [#551](https://github.com/apify/crawlee-python/issues/551)\n- Add equality check to UserData model ([#562](https://github.com/apify/crawlee-python/pull/562)) ([899a25c](https://github.com/apify/crawlee-python/commit/899a25ca63f570b3c4d8d56c85a838b371fd3924)) by [@janbuchar](https://github.com/janbuchar)\n\n\n## [0.3.7](https://github.com/apify/crawlee-python/releases/tag/v0.3.7) (2024-09-25)\n\n### 🐛 Bug Fixes\n\n- Improve `Request.user_data` serialization ([#540](https://github.com/apify/crawlee-python/pull/540)) ([de29c0e](https://github.com/apify/crawlee-python/commit/de29c0e6b737a9d2544c5382472618dde76eb2a5)) by [@janbuchar](https://github.com/janbuchar), closes [#524](https://github.com/apify/crawlee-python/issues/524)\n- Adopt new version of curl-cffi ([#543](https://github.com/apify/crawlee-python/pull/543)) ([f6fcf48](https://github.com/apify/crawlee-python/commit/f6fcf48d99bfcb4b8e75c5c9c38dc8c265164a10)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.3.6](https://github.com/apify/crawlee-python/releases/tag/v0.3.6) (2024-09-19)\n\n### 🚀 Features\n\n- Add HTTP/2 support for HTTPX client ([#513](https://github.com/apify/crawlee-python/pull/513)) ([0eb0a33](https://github.com/apify/crawlee-python/commit/0eb0a33411096011198e52c393f35730f1a0b6ac)) by [@vdusek](https://github.com/vdusek), closes [#512](https://github.com/apify/crawlee-python/issues/512)\n- Expose extended unique key when creating a new Request ([#515](https://github.com/apify/crawlee-python/pull/515)) 
([1807f41](https://github.com/apify/crawlee-python/commit/1807f419e47a815dd706d09acb0f3b3af8cfc691)) by [@vdusek](https://github.com/vdusek)\n- Add header generator and integrate it into HTTPX client ([#530](https://github.com/apify/crawlee-python/pull/530)) ([b63f9f9](https://github.com/apify/crawlee-python/commit/b63f9f98c6613e095546ef544eab271d433e3379)) by [@vdusek](https://github.com/vdusek), closes [#402](https://github.com/apify/crawlee-python/issues/402)\n\n### 🐛 Bug Fixes\n\n- Use explicitly UTF-8 encoding in local storage ([#533](https://github.com/apify/crawlee-python/pull/533)) ([a3a0ab2](https://github.com/apify/crawlee-python/commit/a3a0ab2f6809b7a06319a77dfbf289df78638dea)) by [@vdusek](https://github.com/vdusek), closes [#532](https://github.com/apify/crawlee-python/issues/532)\n\n\n## [0.3.5](https://github.com/apify/crawlee-python/releases/tag/v0.3.5) (2024-09-10)\n\n### 🚀 Features\n\n- Memory usage limit configuration via environment variables ([#502](https://github.com/apify/crawlee-python/pull/502)) ([c62e554](https://github.com/apify/crawlee-python/commit/c62e5545de6a1836f0514ebd3dd695e4fd856844)) by [@janbuchar](https://github.com/janbuchar)\n\n### 🐛 Bug Fixes\n\n- Http clients detect 4xx as errors by default ([#498](https://github.com/apify/crawlee-python/pull/498)) ([1895dca](https://github.com/apify/crawlee-python/commit/1895dca538f415feca37b4a030525c7c0d32f114)) by [@vdusek](https://github.com/vdusek), closes [#496](https://github.com/apify/crawlee-python/issues/496)\n- Correctly handle log level configuration ([#508](https://github.com/apify/crawlee-python/pull/508)) ([7ea8fe6](https://github.com/apify/crawlee-python/commit/7ea8fe69f4a6146a1e417bebff60c08a85e2ca27)) by [@janbuchar](https://github.com/janbuchar)\n\n\n## [0.3.4](https://github.com/apify/crawlee-python/releases/tag/v0.3.4) (2024-09-05)\n\n### 🐛 Bug Fixes\n\n- Expose basic crawling context ([#501](https://github.com/apify/crawlee-python/pull/501)) 
([b484535](https://github.com/apify/crawlee-python/commit/b484535dbacc5d206a026f55a1d3e58edd375e91)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.3.3](https://github.com/apify/crawlee-python/releases/tag/v0.3.3) (2024-09-05)\n\n### 🐛 Bug Fixes\n\n- Deduplicate requests by unique key before submitting them to the queue ([#499](https://github.com/apify/crawlee-python/pull/499)) ([6a3e0e7](https://github.com/apify/crawlee-python/commit/6a3e0e78490851c43cefb0497ce34ca52a31a25c)) by [@janbuchar](https://github.com/janbuchar)\n\n\n## [0.3.2](https://github.com/apify/crawlee-python/releases/tag/v0.3.2) (2024-09-02)\n\n### 🐛 Bug Fixes\n\n- Double incrementation of `item_count` ([#443](https://github.com/apify/crawlee-python/pull/443)) ([cd9adf1](https://github.com/apify/crawlee-python/commit/cd9adf15731e8c4a39cb142b6d1a62909cafdc51)) by [@cadlagtrader](https://github.com/cadlagtrader), closes [#442](https://github.com/apify/crawlee-python/issues/442)\n- Field alias in `BatchRequestsOperationResponse` ([#485](https://github.com/apify/crawlee-python/pull/485)) ([126a862](https://github.com/apify/crawlee-python/commit/126a8629cb5b989a0f9fe22156fb09731a34acd2)) by [@janbuchar](https://github.com/janbuchar)\n- JSON handling with Parsel ([#490](https://github.com/apify/crawlee-python/pull/490)) ([ebf5755](https://github.com/apify/crawlee-python/commit/ebf575539ffb631ae131a1b801cec8f21dd0cf4c)) by [@janbuchar](https://github.com/janbuchar), closes [#488](https://github.com/apify/crawlee-python/issues/488)\n\n\n## [0.3.1](https://github.com/apify/crawlee-python/releases/tag/v0.3.1) (2024-08-30)\n\n### 🚀 Features\n\n- Curl http client selects chrome impersonation by default ([#473](https://github.com/apify/crawlee-python/pull/473)) ([82dc939](https://github.com/apify/crawlee-python/commit/82dc93957b1a380ea975564dea5c6ba4639be548)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.3.0](https://github.com/apify/crawlee-python/releases/tag/v0.3.0) (2024-08-27)\n\n- Check out 
the [Upgrading guide](https://crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v03) to ensure a smooth update.\n\n### 🚀 Features\n\n- Implement ParselCrawler that adds support for Parsel ([#348](https://github.com/apify/crawlee-python/pull/348)) ([a3832e5](https://github.com/apify/crawlee-python/commit/a3832e527f022f32cce4a80055da3b7967b74522)) by [@asymness](https://github.com/asymness), closes [#335](https://github.com/apify/crawlee-python/issues/335)\n- Add support for filling a web form ([#453](https://github.com/apify/crawlee-python/pull/453)) ([5a125b4](https://github.com/apify/crawlee-python/commit/5a125b464b2619000b92dacad4c3a7faa1869f29)) by [@vdusek](https://github.com/vdusek), closes [#305](https://github.com/apify/crawlee-python/issues/305)\n\n### 🐛 Bug Fixes\n\n- Remove indentation from statistics logging and print the data in tables ([#322](https://github.com/apify/crawlee-python/pull/322)) ([359b515](https://github.com/apify/crawlee-python/commit/359b515d647f064886f91441c2c01d3099e21035)) by [@TymeeK](https://github.com/TymeeK), closes [#306](https://github.com/apify/crawlee-python/issues/306)\n- Remove redundant log, fix format ([#408](https://github.com/apify/crawlee-python/pull/408)) ([8d27e39](https://github.com/apify/crawlee-python/commit/8d27e3928c605d6eceb51a948453a15024fa2aa2)) by [@janbuchar](https://github.com/janbuchar)\n- Dequeue items from RequestQueue in the correct order ([#411](https://github.com/apify/crawlee-python/pull/411)) ([96fc33e](https://github.com/apify/crawlee-python/commit/96fc33e2cc4631cae3c50dad9eace6407103a2a9)) by [@janbuchar](https://github.com/janbuchar)\n- Relative URLS supports & If not a URL, pass #417 ([#431](https://github.com/apify/crawlee-python/pull/431)) ([ccd8145](https://github.com/apify/crawlee-python/commit/ccd81454166ece68391cdffedb8efe9e663361d9)) by [@black7375](https://github.com/black7375), closes [#417](https://github.com/apify/crawlee-python/issues/417)\n- Typo in 
ProlongRequestLockResponse ([#458](https://github.com/apify/crawlee-python/pull/458)) ([30ccc3a](https://github.com/apify/crawlee-python/commit/30ccc3a4763bc3706a3bbeaedc95f9648f5ba09a)) by [@janbuchar](https://github.com/janbuchar)\n- Add missing __all__ to top-level __init__.py file ([#463](https://github.com/apify/crawlee-python/pull/463)) ([353a1ce](https://github.com/apify/crawlee-python/commit/353a1ce28cd38c97ffb36dc1e6b0e86d3aef1a48)) by [@janbuchar](https://github.com/janbuchar)\n\n### 🚜 Refactor\n\n- [**breaking**] RequestQueue and service management rehaul ([#429](https://github.com/apify/crawlee-python/pull/429)) ([b155a9f](https://github.com/apify/crawlee-python/commit/b155a9f602a163e891777bef5608072fb5d0156f)) by [@janbuchar](https://github.com/janbuchar), closes [#83](https://github.com/apify/crawlee-python/issues/83), [#174](https://github.com/apify/crawlee-python/issues/174), [#203](https://github.com/apify/crawlee-python/issues/203), [#423](https://github.com/apify/crawlee-python/issues/423)\n- [**breaking**] Declare private and public interface ([#456](https://github.com/apify/crawlee-python/pull/456)) ([d6738df](https://github.com/apify/crawlee-python/commit/d6738df30586934e8d1aba50b9cd437a0ea40400)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.2.1](https://github.com/apify/crawlee-python/releases/tag/v0.2.1) (2024-08-05)\n\n### 🐛 Bug Fixes\n\n- Do not import curl impersonate in http clients init ([#396](https://github.com/apify/crawlee-python/pull/396)) ([3bb8009](https://github.com/apify/crawlee-python/commit/3bb80093e61c1615f869ecd5ab80b061e0e5db36)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.2.0](https://github.com/apify/crawlee-python/releases/tag/v0.2.0) (2024-08-05)\n\n### 🚀 Features\n\n- Add new curl impersonate HTTP client ([#387](https://github.com/apify/crawlee-python/pull/387)) ([9c06260](https://github.com/apify/crawlee-python/commit/9c06260c0ee958522caa9322001a3186e9e43af4)) by [@vdusek](https://github.com/vdusek), 
closes [#292](https://github.com/apify/crawlee-python/issues/292)\n- **playwright:** `infinite_scroll` helper ([#393](https://github.com/apify/crawlee-python/pull/393)) ([34f74bd](https://github.com/apify/crawlee-python/commit/34f74bdcffb42a6c876a856e1c89923d9b3e60bd)) by [@janbuchar](https://github.com/janbuchar)\n\n\n## [0.1.2](https://github.com/apify/crawlee-python/releases/tag/v0.1.2) (2024-07-30)\n\n### 🚀 Features\n\n- Add URL validation ([#343](https://github.com/apify/crawlee-python/pull/343)) ([1514538](https://github.com/apify/crawlee-python/commit/15145388009c85ab54dc72ea8f2d07efd78f80fd)) by [@vdusek](https://github.com/vdusek), closes [#300](https://github.com/apify/crawlee-python/issues/300)\n\n### 🐛 Bug Fixes\n\n- Minor log fix ([#341](https://github.com/apify/crawlee-python/pull/341)) ([0688bf1](https://github.com/apify/crawlee-python/commit/0688bf1860534ab6b2a85dc850bf3d56507ab154)) by [@souravjain540](https://github.com/souravjain540)\n- Also use error_handler for context pipeline errors ([#331](https://github.com/apify/crawlee-python/pull/331)) ([7a66445](https://github.com/apify/crawlee-python/commit/7a664456b45c7e429b4c90aaf1c09d5796b93e3d)) by [@janbuchar](https://github.com/janbuchar), closes [#296](https://github.com/apify/crawlee-python/issues/296)\n- Strip whitespace from href in enqueue_links ([#346](https://github.com/apify/crawlee-python/pull/346)) ([8a3174a](https://github.com/apify/crawlee-python/commit/8a3174aed24f9eb4f9ac415a79a58685a081cde2)) by [@janbuchar](https://github.com/janbuchar), closes [#337](https://github.com/apify/crawlee-python/issues/337)\n- Warn instead of crashing when an empty dataset is being exported ([#342](https://github.com/apify/crawlee-python/pull/342)) ([22b95d1](https://github.com/apify/crawlee-python/commit/22b95d1948d4acd23a010898fa6af2f491e7f514)) by [@janbuchar](https://github.com/janbuchar), closes [#334](https://github.com/apify/crawlee-python/issues/334)\n- Avoid Github rate limiting in project 
bootstrapping test ([#364](https://github.com/apify/crawlee-python/pull/364)) ([992f07f](https://github.com/apify/crawlee-python/commit/992f07f266f7b8433d99e9a179f277995f81eb17)) by [@janbuchar](https://github.com/janbuchar)\n- Pass crawler configuration to storages ([#375](https://github.com/apify/crawlee-python/pull/375)) ([b2d3a52](https://github.com/apify/crawlee-python/commit/b2d3a52712abe21f4a4a5db4e20c80afe72c27de)) by [@janbuchar](https://github.com/janbuchar)\n- Purge request queue on repeated crawler runs ([#377](https://github.com/apify/crawlee-python/pull/377)) ([7ad3d69](https://github.com/apify/crawlee-python/commit/7ad3d6908e153c590bff72478af7ee3239a249bc)) by [@janbuchar](https://github.com/janbuchar), closes [#152](https://github.com/apify/crawlee-python/issues/152)\n\n\n## [0.1.1](https://github.com/apify/crawlee-python/releases/tag/v0.1.1) (2024-07-19)\n\n### 🚀 Features\n\n- Expose crawler log ([#316](https://github.com/apify/crawlee-python/pull/316)) ([ae475fa](https://github.com/apify/crawlee-python/commit/ae475fa450c4fe053620d7b7eb475f3d58804674)) by [@vdusek](https://github.com/vdusek), closes [#303](https://github.com/apify/crawlee-python/issues/303)\n- Integrate proxies into `PlaywrightCrawler` ([#325](https://github.com/apify/crawlee-python/pull/325)) ([2e072b6](https://github.com/apify/crawlee-python/commit/2e072b6ad7d5d82d96a7b489cafb87e7bfaf6e83)) by [@vdusek](https://github.com/vdusek)\n- Blocking detection for playwright crawler ([#328](https://github.com/apify/crawlee-python/pull/328)) ([49ff6e2](https://github.com/apify/crawlee-python/commit/49ff6e25c12a97550eee718d64bb4130f9990189)) by [@vdusek](https://github.com/vdusek), closes [#239](https://github.com/apify/crawlee-python/issues/239)\n\n### 🐛 Bug Fixes\n\n- Pylance reportPrivateImportUsage errors ([#313](https://github.com/apify/crawlee-python/pull/313)) ([09d7203](https://github.com/apify/crawlee-python/commit/09d72034d5db8c47f461111ec093761935a3e2ef)) by 
[@vdusek](https://github.com/vdusek), closes [#283](https://github.com/apify/crawlee-python/issues/283)\n- Set httpx logging to warning ([#314](https://github.com/apify/crawlee-python/pull/314)) ([1585def](https://github.com/apify/crawlee-python/commit/1585defffb2c0c844fab39bbc0e0b793d6169cbf)) by [@vdusek](https://github.com/vdusek), closes [#302](https://github.com/apify/crawlee-python/issues/302)\n- Byte size serialization in MemoryInfo ([#245](https://github.com/apify/crawlee-python/pull/245)) ([a030174](https://github.com/apify/crawlee-python/commit/a0301746c2df076d281708344fb906e1c42e0790)) by [@janbuchar](https://github.com/janbuchar)\n- Project bootstrapping in existing folder ([#318](https://github.com/apify/crawlee-python/pull/318)) ([c630818](https://github.com/apify/crawlee-python/commit/c630818538e0c37217ab73f6c6da05505ed8b364)) by [@janbuchar](https://github.com/janbuchar), closes [#301](https://github.com/apify/crawlee-python/issues/301)\n\n\n## [0.1.0](https://github.com/apify/crawlee-python/releases/tag/v0.1.0) (2024-07-08)\n\n### 🚀 Features\n\n- Project templates ([#237](https://github.com/apify/crawlee-python/pull/237)) ([c23c12c](https://github.com/apify/crawlee-python/commit/c23c12c66688f825f74deb39702f07cc6c6bbc46)) by [@janbuchar](https://github.com/janbuchar), closes [#215](https://github.com/apify/crawlee-python/issues/215)\n\n### 🐛 Bug Fixes\n\n- CLI UX improvements ([#271](https://github.com/apify/crawlee-python/pull/271)) ([123d515](https://github.com/apify/crawlee-python/commit/123d515b224c663577bfe0fab387d0aa11e5e4d4)) by [@janbuchar](https://github.com/janbuchar), closes [#267](https://github.com/apify/crawlee-python/issues/267)\n- Error handling in CLI and templates documentation ([#273](https://github.com/apify/crawlee-python/pull/273)) ([61083c3](https://github.com/apify/crawlee-python/commit/61083c33434d431a118538f15bfa9a68c312ab03)) by [@vdusek](https://github.com/vdusek), closes 
[#268](https://github.com/apify/crawlee-python/issues/268)\n\n\n## [0.0.7](https://github.com/apify/crawlee-python/releases/tag/v0.0.7) (2024-06-27)\n\n### 🐛 Bug Fixes\n\n- Do not wait for consistency in request queue ([#235](https://github.com/apify/crawlee-python/pull/235)) ([03ff138](https://github.com/apify/crawlee-python/commit/03ff138aadaf8e915abc7fafb854fe12947b9696)) by [@vdusek](https://github.com/vdusek)\n- Selector handling in BeautifulSoupCrawler enqueue_links ([#231](https://github.com/apify/crawlee-python/pull/231)) ([896501e](https://github.com/apify/crawlee-python/commit/896501edb44f801409fec95cb3e5f2bcfcb4188d)) by [@janbuchar](https://github.com/janbuchar), closes [#230](https://github.com/apify/crawlee-python/issues/230)\n- Handle blocked request ([#234](https://github.com/apify/crawlee-python/pull/234)) ([f8ef79f](https://github.com/apify/crawlee-python/commit/f8ef79ffcb7410713182af716d37dbbaad66fdbc)) by [@Mantisus](https://github.com/Mantisus)\n- Improve AutoscaledPool state management ([#241](https://github.com/apify/crawlee-python/pull/241)) ([fdea3d1](https://github.com/apify/crawlee-python/commit/fdea3d16b13afe70039d864de861486c760aa0ba)) by [@janbuchar](https://github.com/janbuchar), closes [#236](https://github.com/apify/crawlee-python/issues/236)\n\n\n## [0.0.6](https://github.com/apify/crawlee-python/releases/tag/v0.0.6) (2024-06-25)\n\n### 🚀 Features\n\n- Maintain a global configuration instance ([#207](https://github.com/apify/crawlee-python/pull/207)) ([e003aa6](https://github.com/apify/crawlee-python/commit/e003aa63d859bec8199d0c890b5c9604f163ccd3)) by [@janbuchar](https://github.com/janbuchar)\n- Add max requests per crawl to `BasicCrawler` ([#198](https://github.com/apify/crawlee-python/pull/198)) ([b5b3053](https://github.com/apify/crawlee-python/commit/b5b3053f43381601274e4034d07b4bf41720c7c2)) by [@vdusek](https://github.com/vdusek)\n- Add support decompress *br* response content 
([#226](https://github.com/apify/crawlee-python/pull/226)) ([a3547b9](https://github.com/apify/crawlee-python/commit/a3547b9c882dc5333a4fcd1223687ef85e79138d)) by [@Mantisus](https://github.com/Mantisus)\n- BasicCrawler.export_data helper ([#222](https://github.com/apify/crawlee-python/pull/222)) ([237ec78](https://github.com/apify/crawlee-python/commit/237ec789b7dccc17cc57ef47ec56bcf73c6ca006)) by [@janbuchar](https://github.com/janbuchar), closes [#211](https://github.com/apify/crawlee-python/issues/211)\n- Automatic logging setup ([#229](https://github.com/apify/crawlee-python/pull/229)) ([a67b72f](https://github.com/apify/crawlee-python/commit/a67b72faacd75674071bae496d59e1c60636350c)) by [@janbuchar](https://github.com/janbuchar), closes [#214](https://github.com/apify/crawlee-python/issues/214)\n\n### 🐛 Bug Fixes\n\n- Handling of relative URLs in add_requests ([#213](https://github.com/apify/crawlee-python/pull/213)) ([8aa8c57](https://github.com/apify/crawlee-python/commit/8aa8c57f44149caa0e01950a5d773726f261699a)) by [@janbuchar](https://github.com/janbuchar), closes [#202](https://github.com/apify/crawlee-python/issues/202), [#204](https://github.com/apify/crawlee-python/issues/204)\n- Graceful exit in BasicCrawler.run ([#224](https://github.com/apify/crawlee-python/pull/224)) ([337286e](https://github.com/apify/crawlee-python/commit/337286e1b721cf61f57bc0ff3ead08df1f4f5448)) by [@janbuchar](https://github.com/janbuchar), closes [#212](https://github.com/apify/crawlee-python/issues/212)\n\n\n## [0.0.5](https://github.com/apify/crawlee-python/releases/tag/v0.0.5) (2024-06-21)\n\n### 🚀 Features\n\n- Browser rotation and better browser abstraction ([#177](https://github.com/apify/crawlee-python/pull/177)) ([a42ae6f](https://github.com/apify/crawlee-python/commit/a42ae6f53c5e24678f04011c3684290b68684016)) by [@vdusek](https://github.com/vdusek), closes [#131](https://github.com/apify/crawlee-python/issues/131)\n- Add emit persist state event to event manager 
([#181](https://github.com/apify/crawlee-python/pull/181)) ([97f6c68](https://github.com/apify/crawlee-python/commit/97f6c68275b65f76c62b6d16d94354fc7f00d336)) by [@vdusek](https://github.com/vdusek)\n- Batched request addition in RequestQueue ([#186](https://github.com/apify/crawlee-python/pull/186)) ([f48c806](https://github.com/apify/crawlee-python/commit/f48c8068fe16ce3dd4c46fc248733346c0621411)) by [@vdusek](https://github.com/vdusek)\n- Add storage helpers to crawler & context ([#192](https://github.com/apify/crawlee-python/pull/192)) ([f8f4066](https://github.com/apify/crawlee-python/commit/f8f4066d8b32d6e7dc0d999a5aa8db75f99b43b8)) by [@vdusek](https://github.com/vdusek), closes [#98](https://github.com/apify/crawlee-python/issues/98), [#100](https://github.com/apify/crawlee-python/issues/100), [#172](https://github.com/apify/crawlee-python/issues/172)\n- Handle all supported configuration options ([#199](https://github.com/apify/crawlee-python/pull/199)) ([23c901c](https://github.com/apify/crawlee-python/commit/23c901cd68cf14b4041ee03568622ee32822e94b)) by [@janbuchar](https://github.com/janbuchar), closes [#84](https://github.com/apify/crawlee-python/issues/84)\n- Add Playwright's enqueue links helper ([#196](https://github.com/apify/crawlee-python/pull/196)) ([849d73c](https://github.com/apify/crawlee-python/commit/849d73cc7d137171b98f9f2ab85374e8beec0dad)) by [@vdusek](https://github.com/vdusek)\n\n### 🐛 Bug Fixes\n\n- Tmp path in tests is working ([#164](https://github.com/apify/crawlee-python/pull/164)) ([382b6f4](https://github.com/apify/crawlee-python/commit/382b6f48174bdac3931cc379eaf770ab06f826dc)) by [@vdusek](https://github.com/vdusek), closes [#159](https://github.com/apify/crawlee-python/issues/159)\n- Add explicit err msgs for missing pckg extras during import ([#165](https://github.com/apify/crawlee-python/pull/165)) ([200ebfa](https://github.com/apify/crawlee-python/commit/200ebfa63d6e20e17c8ca29544ef7229ed0df308)) by 
[@vdusek](https://github.com/vdusek), closes [#155](https://github.com/apify/crawlee-python/issues/155)\n- Make timedelta_ms accept string-encoded numbers ([#190](https://github.com/apify/crawlee-python/pull/190)) ([d8426ff](https://github.com/apify/crawlee-python/commit/d8426ff41e36f701af459ad17552fee39637674d)) by [@janbuchar](https://github.com/janbuchar)\n- **deps:** Update dependency psutil to v6 ([#193](https://github.com/apify/crawlee-python/pull/193)) ([eb91f51](https://github.com/apify/crawlee-python/commit/eb91f51e19da406e3f9293e5336c1f85fc7885a4)) by [@renovate[bot]](https://github.com/renovate[bot])\n- Improve compatibility between ProxyConfiguration and its SDK counterpart ([#201](https://github.com/apify/crawlee-python/pull/201)) ([1a76124](https://github.com/apify/crawlee-python/commit/1a76124080d561e0153a4dda0bdb0d9863c3aab6)) by [@janbuchar](https://github.com/janbuchar)\n- Correct return type of storage get_info methods ([#200](https://github.com/apify/crawlee-python/pull/200)) ([332673c](https://github.com/apify/crawlee-python/commit/332673c4fb519b80846df7fb8cd8bb521538a8a4)) by [@janbuchar](https://github.com/janbuchar)\n- Type error in statistics persist state ([#206](https://github.com/apify/crawlee-python/pull/206)) ([96ceef6](https://github.com/apify/crawlee-python/commit/96ceef697769cd57bd1a50b6615cf1e70549bd2d)) by [@vdusek](https://github.com/vdusek), closes [#194](https://github.com/apify/crawlee-python/issues/194)\n\n\n## [0.0.4](https://github.com/apify/crawlee-python/releases/tag/v0.0.4) (2024-05-30)\n\n### 🚀 Features\n\n- Capture statistics about the crawler run ([#142](https://github.com/apify/crawlee-python/pull/142)) ([eeebe9b](https://github.com/apify/crawlee-python/commit/eeebe9b1e24338d68a0a55228bbfc717f4d9d295)) by [@janbuchar](https://github.com/janbuchar), closes [#97](https://github.com/apify/crawlee-python/issues/97)\n- Proxy configuration ([#156](https://github.com/apify/crawlee-python/pull/156)) 
([5c3753a](https://github.com/apify/crawlee-python/commit/5c3753a5527b1d01f7260b9e4c566e43f956a5e8)) by [@janbuchar](https://github.com/janbuchar), closes [#136](https://github.com/apify/crawlee-python/issues/136)\n- Add first version of browser pool and playwright crawler ([#161](https://github.com/apify/crawlee-python/pull/161)) ([2d2a050](https://github.com/apify/crawlee-python/commit/2d2a0505b1c2b1529a8835163ca97d1ec2a6e44a)) by [@vdusek](https://github.com/vdusek)\n\n\n## [0.0.3](https://github.com/apify/crawlee-python/releases/tag/v0.0.3) (2024-05-13)\n\n### 🚀 Features\n\n- AutoscaledPool implementation ([#55](https://github.com/apify/crawlee-python/pull/55)) ([621ada2](https://github.com/apify/crawlee-python/commit/621ada2bd1ba4e2346fb948dc02686e2b37e3856)) by [@janbuchar](https://github.com/janbuchar), closes [#19](https://github.com/apify/crawlee-python/issues/19)\n- Add Snapshotter ([#20](https://github.com/apify/crawlee-python/pull/20)) ([492ee38](https://github.com/apify/crawlee-python/commit/492ee38c893b8f54e9583dd492576c5106e29881)) by [@vdusek](https://github.com/vdusek)\n- Implement BasicCrawler ([#56](https://github.com/apify/crawlee-python/pull/56)) ([6da971f](https://github.com/apify/crawlee-python/commit/6da971fcddbf8b6795346c88e295dada28e7b1d3)) by [@janbuchar](https://github.com/janbuchar), closes [#30](https://github.com/apify/crawlee-python/issues/30)\n- BeautifulSoupCrawler ([#107](https://github.com/apify/crawlee-python/pull/107)) ([4974dfa](https://github.com/apify/crawlee-python/commit/4974dfa20c7911ee073438fd388e60ba4b2c07db)) by [@janbuchar](https://github.com/janbuchar), closes [#31](https://github.com/apify/crawlee-python/issues/31)\n- Add_requests and enqueue_links context helpers ([#120](https://github.com/apify/crawlee-python/pull/120)) ([dc850a5](https://github.com/apify/crawlee-python/commit/dc850a5778b105ff09e19eaecbb0a12d94798a62)) by [@janbuchar](https://github.com/janbuchar), closes 
[#5](https://github.com/apify/crawlee-python/issues/5)\n- Use SessionPool in BasicCrawler ([#128](https://github.com/apify/crawlee-python/pull/128)) ([9fc4648](https://github.com/apify/crawlee-python/commit/9fc464837e596b3b5a7cd818b6d617550e249352)) by [@janbuchar](https://github.com/janbuchar), closes [#110](https://github.com/apify/crawlee-python/issues/110)\n- Add base storage client and resource subclients ([#138](https://github.com/apify/crawlee-python/pull/138)) ([44d6597](https://github.com/apify/crawlee-python/commit/44d65974e4837576918069d7e63f8b804964971a)) by [@vdusek](https://github.com/vdusek)\n\n### 🐛 Bug Fixes\n\n- **deps:** Update dependency docutils to ^0.21.0 ([#101](https://github.com/apify/crawlee-python/pull/101)) ([534b613](https://github.com/apify/crawlee-python/commit/534b613f7cdfe7adf38b548ee48537db3167d1ec)) by [@renovate[bot]](https://github.com/renovate[bot])\n- **deps:** Update dependency eval-type-backport to ^0.2.0 ([#124](https://github.com/apify/crawlee-python/pull/124)) ([c9e69a8](https://github.com/apify/crawlee-python/commit/c9e69a8534f4d82d9a6314947d76a86bcb744607)) by [@renovate[bot]](https://github.com/renovate[bot])\n- Fire local SystemInfo events every second ([#144](https://github.com/apify/crawlee-python/pull/144)) ([f1359fa](https://github.com/apify/crawlee-python/commit/f1359fa7eea23f8153ad711287c073e45d498401)) by [@vdusek](https://github.com/vdusek)\n- Storage manager & purging the defaults ([#150](https://github.com/apify/crawlee-python/pull/150)) ([851042f](https://github.com/apify/crawlee-python/commit/851042f25ad07e25651768e476f098ef0ed21914)) by [@vdusek](https://github.com/vdusek)\n\n\n<!-- generated by git-cliff -->"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Development\n\nHere you'll find a contributing guide to get started with development.\n\n## Environment\n\nFor local development, it is required to have Python 3.10 (or a later version) installed.\n\nWe use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly.\n\nWe use [Poe the Poet](https://poethepoet.natn.io/) as a task runner, similar to npm scripts in `package.json`.\nAll tasks are defined in `pyproject.toml` under `[tool.poe.tasks]` and can be run with `uv run poe <task>`.\n\n### Available tasks\n\n| Task | Description |\n| ---- | ----------- |\n| `install-dev` | Install development dependencies |\n| `check-code` | Run lint, type-check, and unit-tests |\n| `lint` | Run linter |\n| `format` | Fix lint issues and format code |\n| `type-check` | Run type checker |\n| `unit-tests` | Run unit tests |\n| `unit-tests-cov` | Run unit tests with coverage |\n| `e2e-templates-tests` | Run end-to-end template tests |\n| `build-docs` | Build documentation website |\n| `run-docs` | Run documentation website locally |\n| `build` | Build package |\n| `clean` | Remove build artifacts and clean caches |\n\n## Dependencies\n\nTo install this package and its development dependencies, run:\n\n```sh\nuv run poe install-dev\n```\n\n## Code checking\n\nTo execute all code checking tools together, run:\n\n```sh\nuv run poe check-code\n```\n\n### Linting\n\nWe utilize [ruff](https://docs.astral.sh/ruff/) for linting, which analyzes code for potential issues and enforces consistent style. Refer to `pyproject.toml` for configuration details.\n\nTo run linting:\n\n```sh\nuv run poe lint\n```\n\n### Formatting\n\nOur automated code formatting also leverages [ruff](https://docs.astral.sh/ruff/), ensuring uniform style and addressing fixable linting issues. 
Configuration specifics are outlined in `pyproject.toml`.\n\nTo run formatting:\n\n```sh\nuv run poe format\n```\n\n### Type checking\n\nType checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.\n\nTo run type checking:\n\n```sh\nuv run poe type-check\n```\n\n### Unit tests\n\nWe use [pytest](https://docs.pytest.org/) as a testing framework with many plugins. Check `pyproject.toml` for configuration details and installed plugins.\n\nTo run unit tests:\n\n```sh\nuv run poe unit-tests\n```\n\nTo run unit tests with coverage report:\n\n```sh\nuv run poe unit-tests-cov\n```\n\n## End-to-end tests\n\nPrerequisites:\n\n- [apify-cli](https://docs.apify.com/cli/docs/installation) installed and available in `PATH`\n- Set `APIFY_TEST_USER_API_TOKEN` to your [Apify API token](https://docs.apify.com/platform/integrations/api#api-token)\n\nTo run end-to-end tests:\n\n```sh\nuv run poe e2e-templates-tests\n```\n\n## Documentation\n\nWe follow the [Google docstring format](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for code documentation. All user-facing classes and functions must be documented. Documentation standards are enforced using [Ruff](https://docs.astral.sh/ruff/).\n\nOur API documentation is generated from these docstrings using [pydoc-markdown](https://pypi.org/project/pydoc-markdown/) with custom post-processing. Additional content is provided through markdown files in the `docs/` directory. The final documentation is rendered using [Docusaurus](https://docusaurus.io/) and published to GitHub Pages.\n\nTo run the documentation locally, ensure you have `Node.js` 20+ installed, then run:\n\n```sh\nuv run poe run-docs\n```\n\n## Commits\n\nWe use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. 
This convention is used to automatically determine version bumps during the release process.\n\n### Available commit types\n\n| Type | Description |\n| ---- | ----------- |\n| `feat` | A new feature |\n| `fix` | A bug fix |\n| `docs` | Documentation only changes |\n| `style` | Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) |\n| `refactor` | A code change that neither fixes a bug nor adds a feature |\n| `perf` | A code change that improves performance |\n| `test` | Adding missing tests or correcting existing tests |\n| `build` | Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm) |\n| `ci` | Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs) |\n| `chore` | Other changes that don't modify src or test files |\n| `revert` | Reverts a previous commit |\n\n## Release process\n\nPublishing new versions to [PyPI](https://pypi.org/project/crawlee) is automated through GitHub Actions.\n\n- **Beta releases**: On each commit to the master branch, a new beta release is automatically published. The version number is determined based on the latest release and conventional commits. The beta version suffix is incremented by 1 from the last beta release on PyPI.\n- **Stable releases**: A stable version release may be created by triggering the `release` GitHub Actions workflow. The version number is determined based on the latest release and conventional commits (`auto` release type), or it may be overridden using the `custom` release type.\n\n### Publishing to PyPI manually\n\n1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.\n2. **Make sure you know what you're doing.**\n\n3. Update the version number:\n\n- Modify the `version` field under `project` in `pyproject.toml`.\n\n```toml\n[project]\nname = \"crawlee\"\nversion = \"x.z.y\"\n```\n\n4. 
Build the package:\n\n```sh\nuv run poe build\n```\n\n5. Upload to PyPI:\n\n```sh\nuv publish --token YOUR_API_TOKEN\n```\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"{}\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2023 Apify Technologies s.r.o.\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "<h1 align=\"center\">\n    <a href=\"https://crawlee.dev\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-dark.svg?sanitize=true\">\n          <img alt=\"Crawlee\" src=\"https://raw.githubusercontent.com/apify/crawlee-python/master/website/static/img/crawlee-light.svg?sanitize=true\" width=\"500\">\n        </picture>\n    </a>\n    <br>\n    <small>A web scraping and browser automation library</small>\n</h1>\n\n<p align=center>\n    <a href=\"https://trendshift.io/repositories/11169\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/11169\" alt=\"apify%2Fcrawlee-python | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/></a>\n</p>\n\n<p align=\"center\">\n  <a href=\"https://badge.fury.io/py/crawlee\" rel=\"nofollow\"><img src=\"https://badge.fury.io/py/crawlee.svg\" alt=\"PyPI package version\"></a>\n  <a href=\"https://pypi.org/project/crawlee/\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/crawlee\" alt=\"PyPI package downloads\"></a>\n  <a href=\"https://codecov.io/gh/apify/crawlee-python\"><img src=\"https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG\" alt=\"Codecov report\"></a>\n  <a href=\"https://pypi.org/project/crawlee/\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/pyversions/crawlee\" alt=\"PyPI Python version\"></a>\n  <a href=\"https://discord.gg/jyEM2PRvMU\" rel=\"nofollow\"><img src=\"https://img.shields.io/discord/801163717915574323?label=discord\" alt=\"Chat on Discord\"></a>\n</p>\n\nCrawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**\n\nYour crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. 
Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.\n\n> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈\n\nWe also have a TypeScript implementation of the Crawlee, which you can explore and utilize for your projects. Visit our GitHub repository for more information [Crawlee for JS/TS on GitHub](https://github.com/apify/crawlee).\n\n## Installation\n\nWe recommend visiting the [Introduction tutorial](https://crawlee.dev/python/docs/introduction) in Crawlee documentation for more information.\n\nCrawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.\n\nTo install Crawlee with all features, run the following command:\n\n```sh\npython -m pip install 'crawlee[all]'\n```\n\nThen, install the [Playwright](https://playwright.dev/) dependencies:\n\n```sh\nplaywright install\n```\n\nVerify that Crawlee is successfully installed:\n\n```sh\npython -c 'import crawlee; print(crawlee.__version__)'\n```\n\nFor detailed installation instructions see the [Setting up](https://crawlee.dev/python/docs/introduction/setting-up) documentation page.\n\n### With Crawlee CLI\n\nThe quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared templates. 
First, ensure you have [uv](https://pypi.org/project/uv/) installed:\n\n```sh\nuv --help\n```\n\nIf [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/).\n\nThen, run the CLI and choose from the available templates:\n\n```sh\nuvx 'crawlee[cli]' create my-crawler\n```\n\nIf you already have `crawlee` installed, you can spin it up by running:\n\n```sh\ncrawlee create my-crawler\n```\n\n## Examples\n\nHere are some practical examples to help you get started with different types of crawlers in Crawlee. Each example demonstrates how to set up and run a crawler for specific use cases, whether you need to handle simple HTML pages or interact with JavaScript-heavy sites. A crawler run will create a `storage/` directory in your current working directory.\n\n### BeautifulSoupCrawler\n\nThe [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler) downloads web pages using an HTTP library and provides HTML-parsed content to the user. By default it uses [`HttpxHttpClient`](https://crawlee.dev/python/api/class/HttpxHttpClient) for HTTP communication and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) for parsing HTML. It is ideal for projects that require efficient extraction of data from HTML content. This crawler has very good performance since it does not use a browser. However, if you need to execute client-side JavaScript, to get your content, this is not going to be enough and you will need to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler). Also if you want to use this crawler, make sure you install `crawlee` with `beautifulsoup` extra.\n\n```python\nimport asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. 
Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n```\n\n### PlaywrightCrawler\n\nThe [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) uses a headless browser to download web pages and provides an API for data extraction. It is built on [Playwright](https://playwright.dev/), an automation library designed for managing headless browsers. It excels at retrieving web pages that rely on client-side JavaScript for content generation, or tasks requiring interaction with JavaScript-driven content. For scenarios where JavaScript execution is unnecessary or higher performance is required, consider using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler). Also if you want to use this crawler, make sure you install `crawlee` with `playwright` extra.\n\n```python\nimport asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. 
Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n```\n\n### More examples\n\nExplore our [Examples](https://crawlee.dev/python/docs/examples) page in the Crawlee documentation for a wide range of additional use cases and demonstrations.\n\n## Features\n\nWhy Crawlee is the preferred choice for web scraping and crawling?\n\n### Why use Crawlee instead of just a random HTTP library with an HTML parser?\n\n- Unified interface for **HTTP & headless browser** crawling.\n- Automatic **parallel crawling** based on available system resources.\n- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).\n- Automatic **retries** on errors or when you’re getting blocked.\n- Integrated **proxy rotation** and session management.\n- Configurable **request routing** - direct URLs to the appropriate handlers.\n- Persistent **queue for URLs** to crawl.\n- Pluggable **storage** of both tabular data and files.\n- Robust **error handling**.\n\n### Why to use Crawlee rather than Scrapy?\n\n- **Asyncio-based** – Leveraging the standard [Asyncio](https://docs.python.org/3/library/asyncio.html) library, Crawlee delivers better 
performance and seamless compatibility with other modern asynchronous libraries.\n- **Type hints** – Newer project built with modern Python, and complete type hint coverage for a better developer experience.\n- **Simple integration** – Crawlee crawlers are regular Python scripts, requiring no additional launcher executor. This flexibility allows you to integrate a crawler directly into other applications.\n- **State persistence** – Supports state persistence during interruptions, saving time and costs by avoiding the need to restart scraping pipelines from scratch after an issue.\n- **Organized data storages** – Allows saving of multiple types of results in a single scraping run. Offers several storing options (see [datasets](https://crawlee.dev/python/api/class/Dataset) & [key-value stores](https://crawlee.dev/python/api/class/KeyValueStore)).\n\n## Running on the Apify platform\n\nCrawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://docs.apify.com/sdk/python/) to learn more about deploying Crawlee to the Apify platform.\n\n## Support\n\nIf you find any bug or issue with Crawlee, please [submit an issue on GitHub](https://github.com/apify/crawlee-python/issues). For questions, you can ask on [Stack Overflow](https://stackoverflow.com/questions/tagged/apify), in GitHub Discussions or you can join our [Discord server](https://discord.com/invite/jyEM2PRvMU).\n\n## Contributing\n\nYour code contributions are welcome, and you'll be praised for eternity! If you have any ideas for improvements, either submit an issue or create a pull request. 
For contribution guidelines and the code of conduct, see [CONTRIBUTING.md](https://github.com/apify/crawlee-python/blob/master/CONTRIBUTING.md).\n\n## License\n\nThis project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/apify/crawlee-python/blob/master/LICENSE) file for details.\n"
  },
  {
    "path": "codecov.yaml",
    "content": "coverage:\n  status:\n    project:\n      default:\n        target: auto\n        threshold: 0.10%   # tolerate up to 0.10% decrease\n        informational: true # CI check reports status but never fails\n    patch:\n      default:\n        target: 50%         # error only if patch coverage drops below 50%\n        informational: true  # CI check reports status but never fails\n"
  },
  {
    "path": "docs/deployment/apify_platform.mdx",
    "content": "---\nid: apify-platform\ntitle: Apify platform\ndescription: Apify platform - large-scale and high-performance web scraping\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport CodeBlock from '@theme/CodeBlock';\n\nimport LogWithConfigExample from '!!raw-loader!./code_examples/apify/log_with_config_example.py';\nimport CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as_actor_example.py';\nimport ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py';\nimport ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py';\n\nApify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).\n\nWhile we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure.\n\n:::note\n\nWe do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry PI. We strive to make it work, but there are no guarantees.\n\n:::\n\n## Requirements\n\nTo run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up).\n\nAdditionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. 
For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation).\n\nFinally, ensure that the [Apify SDK](https://docs.apify.com/sdk/python/) is installed in your project. You can install it using `pip`:\n\n```bash\npip install apify\n```\n\n## Logging into Apify platform from Crawlee\n\nTo access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that either by utilizing [Apify CLI](https://docs.apify.com/cli/) or with environment variables.\n\nOnce you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on.\n\n### Log in with CLI\n\nApify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will automatically be added.\n\n```bash\nnpm install -g apify-cli\napify login -t YOUR_API_TOKEN\n```\n\n### Log in with environment variables\n\nAlternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token.\n\n> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password)\n> environment variable. Actor automatically infers that from your token, but it can be useful\n> when you need to access proxies from a different account than your token represents.\n\n### Log in with Configuration\n\nAnother option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there.\n\n<CodeBlock className=\"language-python\">\n    {LogWithConfigExample}\n</CodeBlock>\n\n## What is an Actor\n\nWhen you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). 
An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset.\n\nActors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours.\n\n**Related links**\n\n- [Store of existing Actors](https://apify.com/store)\n- [Documentation](https://docs.apify.com/actors)\n- [View Actors in Apify Console](https://console.apify.com/actors)\n- [API reference](https://apify.com/docs/api/v2#/reference/actors)\n\n## Running an Actor locally\n\nFirst let's create a boilerplate of the new Actor. You could use Apify CLI and just run:\n\n```bash\napify create my-hello-world\n```\n\nThe CLI will prompt you to select a project boilerplate template - let's pick \"Crawlee + BeautifulSoup\". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows:\n\n```bash\ncd my-hello-world\napify run\n```\n\n## Running Crawlee code as an Actor\n\nFor running Crawlee code as an Actor on [Apify platform](https://apify.com/actors) you need to wrap the body of the main function of your crawler with `async with Actor`.\n\n:::info NOTE\nAdding `async with Actor` is the only important thing needed to run it on Apify platform as an Actor. It is needed to initialize your Actor (e.g. 
to set the correct storage implementation) and to correctly handle exiting the process.\n:::\n\nLet's look at the `BeautifulSoupCrawler` example from the [Quick start](../quick-start) guide:\n\n<CodeBlock className=\"language-python\">\n    {CrawlerAsActorExample}\n</CodeBlock>\n\nNote that you could also run your Actor (that is using Crawlee) locally with Apify CLI. You could start it via the following command in your project folder:\n\n```bash\napify run\n```\n\n## Deploying an Actor to Apify platform\n\nNow (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running:\n\n```bash\napify push\n```\n\nYour script will be uploaded to and built on the Apify platform so that it can be run there. For more information, view the\n[Apify Actor](https://docs.apify.com/cli) documentation.\n\n## Usage on Apify platform\n\nYou can also develop your Actor in an online code editor directly on the platform (you'll need an Apify Account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section.\n\n## Storages\n\nThere are several things worth mentioning here.\n\n### Helper functions for default Key-Value Store and Dataset\n\nTo simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use:\n- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)\n- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for 
[`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)\n\n### Using platform storage in a local Actor\n\nWhen you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage.\n\nEach of these methods accepts the `force_cloud` keyword argument. If set to `True`, cloud storage will be used instead of the folder on the local disk.\n\n:::note\nIf you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants <ApiLink to=\"class/KeyValueStore#open\">`KeyValueStore.open()`</ApiLink>, <ApiLink to=\"class/Dataset#open\">`Dataset.open()`</ApiLink> and <ApiLink to=\"class/RequestQueue#open\">`RequestQueue.open()`</ApiLink> will work the same.\n:::\n\n{/*\n### Getting public url of an item in the platform storage\n\nIf you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.\n\n<CodeBlock language=\"python\">\n    {GetPublicUrlSource}\n</CodeBlock>\n\n*/}\n\n### Exporting dataset data\n\nWhen the <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. 
The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results.\n\n**Related links**\n\n- [Apify platform storage documentation](https://docs.apify.com/storage)\n- [View storage in Apify Console](https://console.apify.com/storage)\n- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores)\n- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets)\n- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues)\n\n## Environment variables\n\nThe following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation.\n\n:::note\n\nIt's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables.\n\n:::\n\n### `APIFY_TOKEN`\n\nThe API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage\nor to run an Actor on the Apify platform. 
You can find your API token on the\n[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page.\n\n### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`\n\nBy combining the env vars in various ways, you can greatly influence the Actor's behavior.\n\n| Env Vars                                | API | Storages         |\n| --------------------------------------- | --- | ---------------- |\n|  none OR `CRAWLEE_STORAGE_DIR`          | no  | local            |\n| `APIFY_TOKEN`                           | yes | Apify platform   |\n| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform |\n\nWhen using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform\nfeatures and your data will be stored locally by default. If you want to access platform storages,\nyou can use the `force_cloud=True` option in their respective functions.\n\n### `APIFY_PROXY_PASSWORD`\n\nOptional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation.\nAssuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy)\nin the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var,\nso in most cases, you don't need to touch it. 
You should use it when, for some reason,\nyou need access to Apify Proxy, but not access to Apify API, or when you need access to\nproxy from a different account than your token represents.\n\n## Proxy management\n\nIn addition to your own proxy servers and proxy servers acquired from\nthird-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy)\nfor your scraping needs.\n\n### Apify proxy\n\nIf you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage you first should be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).\n\n<CodeBlock className=\"language-python\">\n    {ProxyExample}\n</CodeBlock>\n\nNote that unlike using your own proxies in Crawlee, you shouldn't use the constructor to create <ApiLink to=\"class/ProxyConfiguration\">`ProxyConfiguration`</ApiLink> instances. For using the Apify Proxy you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead.\n\n### Advanced Apify proxy configuration\n\nWith Apify Proxy, you can select specific proxy groups to use, or countries to connect from.\nThis allows you to get better proxy performance after some initial research.\n\n<CodeBlock className=\"language-python\">\n    {ProxyAdvancedExample}\n</CodeBlock>\n\nNow your crawlers will use only Residential proxies from the US. Note that you must first get access\nto a proxy group before you are able to use it. You can check proxy groups available to you\nin the [proxy dashboard](https://console.apify.com/proxy).\n\n### Apify proxy vs. own proxies\n\nThe [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. 
However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.\n\nThe difference is easy to remember.\n- If you're using your own proxies - you should create a <ApiLink to=\"class/ProxyConfiguration\">`ProxyConfiguration`</ApiLink> instance directly.\n- If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy.\n\n**Related links**\n\n- [Apify Proxy docs](https://docs.apify.com/proxy)\n"
  },
  {
    "path": "docs/deployment/aws_lambda.mdx",
    "content": "---\nid: aws-lambda\ntitle: Deploy on AWS Lambda\ndescription: Prepare your crawler to run on AWS Lambda.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport CodeBlock from '@theme/CodeBlock';\n\nimport BeautifulSoupCrawlerLambda from '!!raw-loader!./code_examples/aws/beautifulsoup_crawler_lambda.py';\nimport PlaywrightCrawlerLambda from '!!raw-loader!./code_examples/aws/playwright_crawler_lambda.py';\nimport PlaywrightCrawlerDockerfile from '!!raw-loader!./code_examples/aws/playwright_dockerfile';\n\n[AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) is a serverless compute service that lets you run code without provisioning or managing servers. This guide covers deploying <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>.\n\nThe code examples are based on the [BeautifulSoupCrawler example](../examples/beautifulsoup-crawler).\n\n## BeautifulSoupCrawler on AWS Lambda\n\nFor simple crawlers that don't require browser rendering, you can deploy using a ZIP archive.\n\n### Updating the code\n\nWhen instantiating a crawler, use <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink>. By default, Crawlee uses file-based storage, but the Lambda filesystem is read-only (except for `/tmp`). Using `MemoryStorageClient` tells Crawlee to use in-memory storage instead.\n\nWrap the crawler logic in a `lambda_handler` function. This is the entry point that AWS will execute.\n\n:::important\n\nMake sure to always instantiate a new crawler for every Lambda invocation. 
AWS keeps the environment running for some time after the first execution (to reduce cold-start times), so subsequent calls may access an already-used crawler instance.\n\n**TL;DR: Keep your Lambda stateless.**\n\n:::\n\nFinally, return the scraped data from the Lambda when the crawler run ends.\n\n<CodeBlock language=\"python\" title=\"lambda_function.py\">\n    {BeautifulSoupCrawlerLambda}\n</CodeBlock>\n\n### Preparing the environment\n\nLambda requires all dependencies to be included in the deployment package. Create a virtual environment and install dependencies:\n\n```bash\npython3.14 -m venv .venv\nsource .venv/bin/activate\npip install 'crawlee[beautifulsoup]' 'boto3' 'aws-lambda-powertools'\n```\n\n[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Including it in your dependencies is recommended to avoid version misalignment issues with the Lambda runtime.\n\n### Creating the ZIP archive\n\nCreate a ZIP archive from your project, including dependencies from the virtual environment:\n\n```bash\ncd .venv/lib/python3.14/site-packages\nzip -r ../../../../package.zip .\ncd ../../../../\nzip package.zip lambda_function.py\n```\n\n:::note Large dependencies?\n\nAWS has a limit of 50 MB for direct upload and 250 MB for unzipped deployment package size.\n\nA better way to manage dependencies is by using Lambda Layers. With Layers, you can share files between multiple Lambda functions and keep the actual code as slim as possible.\n\nTo create a Lambda Layer:\n\n1. Create a `python/` folder and copy dependencies from `site-packages` into it\n2. Create a zip archive: `zip -r layer.zip python/`\n3. Create a new Lambda Layer from the archive (you may need to upload it to S3 first)\n4. Attach the Layer to your Lambda function\n\n:::\n\n### Creating the Lambda function\n\nCreate the Lambda function in the AWS Lambda Console:\n\n1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).\n2. 
Click **Create function**.\n3. Select **Author from scratch**.\n4. Enter a **Function name**, for example `BeautifulSoupTest`.\n5. Choose a **Python runtime** that matches the version used in your virtual environment (for example, Python 3.14).\n6. Click **Create function** to finish.\n\nOnce created, upload `package.zip` as the code source in the AWS Lambda Console using the \"Upload from\" button.\n\nIn Lambda Runtime Settings, set the handler. Since the file is named `lambda_function.py` and the function is `lambda_handler`, you can use the default value `lambda_function.lambda_handler`.\n\n:::tip Configuration\n\nIn the Configuration tab, you can adjust:\n\n- **Memory**: Memory size can greatly affect execution speed. A minimum of 256-512 MB is recommended.\n- **Timeout**: Set according to the size of the website you are scraping (1 minute for the example code).\n- **Ephemeral storage**: Size of the `/tmp` directory.\n\nSee the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.\n\n:::\n\nAfter the Lambda deploys, you can test it by clicking the \"Test\" button. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.\n\n## PlaywrightCrawler on AWS Lambda\n\nFor crawlers that require browser rendering, you need to deploy using Docker container images because Playwright and browser binaries exceed Lambda's ZIP deployment size limits.\n\n### Updating the code\n\nAs with <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, use <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink> and wrap the logic in a `lambda_handler` function. Additionally, configure `browser_launch_options` with flags optimized for serverless environments. 
These flags disable sandboxing and GPU features that aren't available in Lambda's containerized runtime.\n\n<CodeBlock language=\"python\" title=\"main.py\">\n    {PlaywrightCrawlerLambda}\n</CodeBlock>\n\n### Installing and configuring AWS CLI\n\nInstall AWS CLI following the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) according to your operating system.\n\nAuthenticate by running:\n\n```bash\naws login\n```\n\n### Preparing the project\n\nInitialize the project by running `uvx 'crawlee[cli]' create`.\n\nOr use a single command if you don't need interactive mode:\n\n```bash\nuvx 'crawlee[cli]' create aws_playwright --crawler-type playwright --http-client impit --package-manager uv --no-apify --start-url 'https://crawlee.dev' --install\n```\n\nAdd the following dependencies:\n\n```bash\nuv add awslambdaric aws-lambda-powertools boto3\n```\n\n[`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) is the AWS SDK for Python. Use it if your function integrates with any other AWS services.\n\nThe project is created with a Dockerfile that needs to be modified for AWS Lambda by adding `ENTRYPOINT` and updating `CMD`:\n\n<CodeBlock language=\"dockerfile\" title=\"Dockerfile\">\n    {PlaywrightCrawlerDockerfile}\n</CodeBlock>\n\n### Building and pushing the Docker image\n\nCreate a repository `lambda/aws-playwright` in [Amazon Elastic Container Registry](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html) in the same region where your Lambda functions will run. To learn more, refer to the [official documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-cli.html).\n\nNavigate to the created repository and click the \"View push commands\" button. This will open a window with console commands for uploading the Docker image to your repository. 
Execute them.\n\nExample:\n```bash\naws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data}\ndocker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright .\ndocker tag lambda/aws-playwright:latest {user-specific-data}/lambda/aws-playwright:latest\ndocker push {user-specific-data}/lambda/aws-playwright:latest\n```\n\n### Creating the Lambda function\n\n1. Navigate to `Lambda` in [AWS Management Console](https://aws.amazon.com/console/).\n2. Click **Create function**.\n3. Select **Container image**.\n4. Browse and select your ECR image.\n5. Click **Create function** to finish.\n\n:::tip Configuration\n\nIn the Configuration tab, you can adjust resources. Playwright crawlers require more resources than BeautifulSoup crawlers:\n\n- **Memory**: Minimum 1024 MB recommended. Browser operations are memory-intensive, so 2048 MB or more may be needed for complex pages.\n- **Timeout**: Set according to crawl size. Browser startup adds overhead, so allow at least 5 minutes even for simple crawls.\n- **Ephemeral storage**: Default 512 MB is usually sufficient unless downloading large files.\n\nSee the [official documentation](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) to learn how performance and cost scale with memory.\n\n:::\n\nAfter the Lambda deploys, click the \"Test\" button to invoke it. The event contents don't matter for a basic test, but you can parameterize your crawler by parsing the event object that AWS passes as the first argument to the handler.\n"
  },
  {
    "path": "docs/deployment/code_examples/apify/crawler_as_actor_example.py",
    "content": "import asyncio\n\nfrom apify import Actor\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # Wrap the crawler code in an Actor context manager.\n    async with Actor:\n        crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n        @crawler.router.default_handler\n        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n            context.log.info(f'Processing {context.request.url} ...')\n            data = {\n                'url': context.request.url,\n                'title': context.soup.title.string if context.soup.title else None,\n            }\n            await context.push_data(data)\n            await context.enqueue_links()\n\n        await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/deployment/code_examples/apify/get_public_url.py",
    "content": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n    async with Actor:\n        store = await Actor.open_key_value_store()\n        await store.set_value('your-file', {'foo': 'bar'})\n        url = store.get_public_url('your-file')\n        Actor.log.info(f'KVS public URL: {url}')\n        # https://api.apify.com/v2/key-value-stores/<your-store-id>/records/your-file\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/deployment/code_examples/apify/log_with_config_example.py",
    "content": "import asyncio\n\nfrom apify import Actor, Configuration\n\n\nasync def main() -> None:\n    # Create a new configuration with your API key. You can find it at\n    # https://console.apify.com/settings/integrations. It can be provided either\n    # as a parameter \"token\" or as an environment variable \"APIFY_TOKEN\".\n    config = Configuration(\n        token='apify_api_YOUR_TOKEN',\n    )\n\n    async with Actor(config):\n        Actor.log.info('Hello from Apify platform!')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/deployment/code_examples/apify/proxy_advanced_example.py",
    "content": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n    async with Actor:\n        proxy_configuration = await Actor.create_proxy_configuration(\n            password='apify_proxy_YOUR_PASSWORD',\n            # Specify the proxy group to use.\n            groups=['RESIDENTIAL'],\n            # Set the country code for the proxy.\n            country_code='US',\n        )\n\n        # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/deployment/code_examples/apify/proxy_example.py",
    "content": "import asyncio\n\nfrom apify import Actor\n\n\nasync def main() -> None:\n    async with Actor:\n        # Create a new Apify Proxy configuration. The password can be found at\n        # https://console.apify.com/proxy/http-settings and should be provided either\n        # as a parameter \"password\" or as an environment variable \"APIFY_PROXY_PASSWORD\".\n        proxy_configuration = await Actor.create_proxy_configuration(\n            password='apify_proxy_YOUR_PASSWORD',\n        )\n\n        if not proxy_configuration:\n            Actor.log.warning('Failed to create proxy configuration.')\n            return\n\n        proxy_url = await proxy_configuration.new_url()\n        Actor.log.info(f'Proxy URL: {proxy_url}')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/deployment/code_examples/aws/beautifulsoup_crawler_lambda.py",
    "content": "import asyncio\nimport json\nfrom datetime import timedelta\nfrom typing import Any\n\nfrom aws_lambda_powertools.utilities.typing import LambdaContext\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset, RequestQueue\n\n\nasync def main() -> str:\n    # highlight-start\n    # Disable writing storage data to the file system\n    storage_client = MemoryStorageClient()\n    # highlight-end\n\n    # Initialize storages\n    dataset = await Dataset.open(storage_client=storage_client)\n    request_queue = await RequestQueue.open(storage_client=storage_client)\n\n    crawler = BeautifulSoupCrawler(\n        storage_client=storage_client,\n        max_request_retries=1,\n        request_handler_timeout=timedelta(seconds=30),\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n            'h1s': [h1.text for h1 in context.soup.find_all('h1')],\n            'h2s': [h2.text for h2 in context.soup.find_all('h2')],\n            'h3s': [h3.text for h3 in context.soup.find_all('h3')],\n        }\n\n        await context.push_data(data)\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev'])\n\n    # Extract data saved in `Dataset`\n    data = await crawler.get_data()\n\n    # Clean up storages after the crawl\n    await dataset.drop()\n    await request_queue.drop()\n\n    # Serialize the list of scraped items to JSON string\n    return json.dumps(data.items)\n\n\ndef lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:\n    result = asyncio.run(main())\n    # 
Return the response with results\n    return {'statusCode': 200, 'body': result}\n"
  },
  {
    "path": "docs/deployment/code_examples/aws/playwright_crawler_lambda.py",
    "content": "import asyncio\nimport json\nfrom datetime import timedelta\nfrom typing import Any\n\nfrom aws_lambda_powertools.utilities.typing import LambdaContext\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset, RequestQueue\n\n\nasync def main() -> str:\n    # highlight-start\n    # Disable writing storage data to the file system\n    storage_client = MemoryStorageClient()\n    # highlight-end\n\n    # Initialize storages\n    dataset = await Dataset.open(storage_client=storage_client)\n    request_queue = await RequestQueue.open(storage_client=storage_client)\n\n    crawler = PlaywrightCrawler(\n        storage_client=storage_client,\n        max_request_retries=1,\n        request_handler_timeout=timedelta(seconds=30),\n        max_requests_per_crawl=10,\n        # highlight-start\n        # Configure Playwright to run in AWS Lambda environment\n        browser_launch_options={\n            'args': [\n                '--no-sandbox',\n                '--disable-setuid-sandbox',\n                '--disable-dev-shm-usage',\n                '--disable-gpu',\n                '--single-process',\n            ]\n        },\n        # highlight-end\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n            'h1s': await context.page.locator('h1').all_text_contents(),\n            'h2s': await context.page.locator('h2').all_text_contents(),\n            'h3s': await context.page.locator('h3').all_text_contents(),\n        }\n\n        await context.push_data(data)\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev'])\n\n    # Extract data saved in `Dataset`\n    
data = await crawler.get_data()\n\n    # Clean up storages after the crawl\n    await dataset.drop()\n    await request_queue.drop()\n\n    # Serialize the list of scraped items to JSON string\n    return json.dumps(data.items)\n\n\ndef lambda_handler(_event: dict[str, Any], _context: LambdaContext) -> dict[str, Any]:\n    result = asyncio.run(main())\n    # Return the response with results\n    return {'statusCode': 200, 'body': result}\n"
  },
  {
    "path": "docs/deployment/code_examples/aws/playwright_dockerfile",
    "content": "FROM apify/actor-python-playwright:3.14\n\nRUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*\n\nRUN pip install -U pip setuptools \\\n    && pip install 'uv<1'\n\nENV UV_PROJECT_ENVIRONMENT=\"/usr/local\"\n\nCOPY pyproject.toml uv.lock ./\n\nRUN echo \"Python version:\" \\\n    && python --version \\\n    && echo \"Installing dependencies:\" \\\n    && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo \"true\" || echo \"false\") \\\n    && if [ \"$PLAYWRIGHT_INSTALLED\" = \"true\" ]; then \\\n        echo \"Playwright already installed, excluding from uv sync\" \\\n        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \\\n    else \\\n        echo \"Playwright not found, installing all dependencies\" \\\n        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \\\n    fi \\\n    && echo \"All installed Python packages:\" \\\n    && pip freeze\n\nCOPY . ./\n\nRUN python -m compileall -q .\n\n# highlight-start\n# AWS Lambda entrypoint\nENTRYPOINT [ \"/usr/local/bin/python3\", \"-m\", \"awslambdaric\" ]\n\n# Lambda handler function\nCMD [ \"aws_playwright.main.lambda_handler\" ]\n# highlight-end\n"
  },
  {
    "path": "docs/deployment/code_examples/google/cloud_run_example.py",
    "content": "import json\nimport os\n\nimport uvicorn\nfrom litestar import Litestar, get\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\n@get('/')\nasync def main() -> str:\n    \"\"\"The crawler entry point that will be called when the HTTP endpoint is accessed.\"\"\"\n    # highlight-start\n    # Disable writing storage data to the file system\n    storage_client = MemoryStorageClient()\n    # highlight-end\n\n    crawler = PlaywrightCrawler(\n        headless=True,\n        max_requests_per_crawl=10,\n        browser_type='firefox',\n        storage_client=storage_client,\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        \"\"\"Default request handler that processes each page during crawling.\"\"\"\n        context.log.info(f'Processing {context.request.url} ...')\n        title = await context.page.query_selector('title')\n        await context.push_data(\n            {\n                'url': context.request.loaded_url,\n                'title': await title.inner_text() if title else None,\n            }\n        )\n\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev'])\n\n    data = await crawler.get_data()\n\n    # Return the results as JSON to the client\n    return json.dumps(data.items)\n\n\n# Initialize the Litestar app with our route handler\napp = Litestar(route_handlers=[main])\n\n# Start the Uvicorn server using the `PORT` environment variable provided by GCP\n# This is crucial - Cloud Run expects your app to listen on this specific port\nuvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080')))  # noqa: S104 # Use all interfaces in a container, safely\n"
  },
  {
    "path": "docs/deployment/code_examples/google/google_example.py",
    "content": "import asyncio\nimport json\nfrom datetime import timedelta\n\nimport functions_framework\nfrom flask import Request, Response\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\nasync def main() -> str:\n    # highlight-start\n    # Disable writing storage data to the file system\n    storage_client = MemoryStorageClient()\n    # highlight-end\n\n    crawler = BeautifulSoupCrawler(\n        storage_client=storage_client,\n        max_request_retries=1,\n        request_handler_timeout=timedelta(seconds=30),\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n            'h1s': [h1.text for h1 in context.soup.find_all('h1')],\n            'h2s': [h2.text for h2 in context.soup.find_all('h2')],\n            'h3s': [h3.text for h3 in context.soup.find_all('h3')],\n        }\n\n        await context.push_data(data)\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev'])\n\n    # highlight-start\n    # Extract data saved in `Dataset`\n    data = await crawler.get_data()\n    # Serialize to json string and return\n    return json.dumps(data.items)\n    # highlight-end\n\n\n@functions_framework.http\ndef crawlee_run(request: Request) -> Response:\n    # You can pass data to your crawler using `request`\n    function_id = request.headers['Function-Execution-Id']\n    response_str = asyncio.run(main())\n\n    # Return a response with the crawling results\n    return Response(response=response_str, status=200)\n"
  },
  {
    "path": "docs/deployment/google_cloud.mdx",
    "content": "---\nid: gcp-cloud-run-functions\ntitle: Cloud Run functions\ndescription: Prepare your crawler to run in Cloud Run functions on Google Cloud Platform.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport CodeBlock from '@theme/CodeBlock';\n\nimport GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py';\n\n[Google Cloud Run Functions](https://cloud.google.com/functions) is a serverless execution environment for running simple HTTP-based web scrapers. This service is best suited for lightweight crawlers that don't require browser rendering capabilities and can be executed via HTTP requests.\n\n## Updating the project\n\nFor the project foundation, use <ApiLink to=\"class/BeautifulSoupCrawler\">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler).\n\nAdd [`functions-framework`](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`.\n\nUpdate the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project.\n\n<CodeBlock className=\"language-python\">\n    {GoogleFunctions.replace(/^.*?\\n/, '')}\n</CodeBlock>\n\nYou can test your project locally. Start the server by running:\n\n```bash\nfunctions-framework --target=crawlee_run\n```\n\nThen make a GET request to `http://127.0.0.1:8080/`, for example in your browser.\n\n## Deploying to Google Cloud Platform\n\nIn the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout.\n\nWhen deploying, select **\"Use an inline editor to create a function\"**. This allows you to configure the project using only the Google Cloud Console dashboard.\n\nUsing the `inline editor`, update the function files according to your project. 
**Make sure** to update the `requirements.txt` file to match your project's dependencies.\n\nAlso, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`.\n\nAfter the Function deploys, you can test it by clicking the \"Test\" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.\n"
  },
  {
    "path": "docs/deployment/google_cloud_run.mdx",
    "content": "---\nid: gcp-cloud-run\ntitle: Cloud Run\ndescription: Prepare your crawler to run in Cloud Run on Google Cloud Platform.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport CodeBlock from '@theme/CodeBlock';\n\nimport GoogleCloudRun from '!!raw-loader!./code_examples/google/cloud_run_example.py';\n\n\n[Google Cloud Run](https://cloud.google.com/run)  is a container-based serverless platform that allows you to run web crawlers with headless browsers. This service is recommended when your Crawlee applications need browser rendering capabilities, require more granular control, or have complex dependencies that aren't supported by [Cloud Functions](./gcp-cloud-run-functions).\n\nGCP Cloud Run allows you to deploy using Docker containers, giving you full control over your environment and the flexibility to use any web server framework of your choice, unlike Cloud Functions which are limited to [Flask](https://flask.palletsprojects.com/en/stable/).\n\n## Preparing the project\n\nWe'll prepare our project using [Litestar](https://litestar.dev/) and the [Uvicorn](https://www.uvicorn.org/) web server. The HTTP server handler will wrap the crawler to communicate with clients. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves.\n\n:::info\n\nGCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world).\n\n:::\n\n<CodeBlock className=\"language-python\">\n    {GoogleCloudRun.replace(/^.*?\\n/, '')}\n</CodeBlock>\n\n\n:::tip\n\nAlways make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.**\n\n:::\n\n## Deploying to Google Cloud Platform\n\nNow, we’re ready to deploy! 
If you have initialized your project using `uvx crawlee create`, the initialization script has prepared a Dockerfile for you.\n\nAll you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private.\n\nAfter answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there.\n\n:::tip\n\nIn case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping.\n\n:::\n"
  },
  {
    "path": "docs/examples/add_data_to_dataset.mdx",
    "content": "---\nid: add-data-to-dataset\ntitle: Add data to dataset\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_bs.py';\nimport PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_pw.py';\nimport DatasetExample from '!!raw-loader!roa-loader!./code_examples/add_data_to_dataset_dataset.py';\n\nThis example demonstrates how to store extracted data into datasets using the <ApiLink to=\"class/PushDataFunction#open\">`context.push_data`</ApiLink> helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the <ApiLink to=\"class/PushDataFunction#open\">`push_data`</ApiLink> function.\n\n<Tabs groupId=\"main\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\nEach item in the dataset will be stored in its own file within the following directory:\n\n```text\n{PROJECT_FOLDER}/storage/datasets/default/\n```\n\nFor more control, you can also open a dataset manually using the asynchronous constructor <ApiLink to=\"class/Dataset#open\">`Dataset.open`</ApiLink>\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {DatasetExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/beautifulsoup_crawler.mdx",
    "content": "---\nid: beautifulsoup-crawler\ntitle: BeautifulSoup crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler.py';\n\nThis example demonstrates how to use <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/capture_screenshot_using_playwright.mdx",
    "content": "---\nid: capture-screenshots-using-playwright\ntitle: Capture screenshots using Playwright\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport CaptureScreenshotExample from '!!raw-loader!roa-loader!./code_examples/capture_screenshot_using_playwright.py';\n\nThis example demonstrates how to capture screenshots of web pages using <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and store them in the key-value store.\n\nThe <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.\n\nThe captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CaptureScreenshotExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx",
    "content": "---\nid: capturing-page-snapshots-with-error-snapshotter\ntitle: Capturing page snapshots with ErrorSnapshotter\ndescription: How to capture page snapshots on errors.\n---\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\nimport ApiLink from '@site/src/components/ApiLink';\nimport ParselCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_with_error_snapshotter.py';\nimport PlaywrightCrawlerWithErrorSnapshotter from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_error_snapshotter.py';\n\n\nThis example demonstrates how to capture page snapshots on first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's <ApiLink to=\"class/Statistics\">`Statistics`</ApiLink>. The error snapshot can contain `html` file and `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and [HTTP crawlers](../guides/http-crawlers) are capable of capturing the html file, but only <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> is able to capture page screenshot as well.\n\n<Tabs>\n    <TabItem value=\"ParselCrawler\" label=\"ParselCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            { ParselCrawlerWithErrorSnapshotter }\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            { PlaywrightCrawlerWithErrorSnapshotter }\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/code_examples/adaptive_playwright_crawler.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom playwright.async_api import Route\n\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    AdaptivePlaywrightCrawlingContext,\n    AdaptivePlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    # Crawler created by following factory method will use `beautifulsoup`\n    # for parsing static content.\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n        playwright_crawler_specific_kwargs={'headless': False},\n    )\n\n    @crawler.router.default_handler\n    async def request_handler_for_label(\n        context: AdaptivePlaywrightCrawlingContext,\n    ) -> None:\n        # Do some processing using `parsed_content`\n        context.log.info(context.parsed_content.title)\n\n        # Locate element h2 within 5 seconds\n        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n        # Do stuff with element found by the selector\n        context.log.info(h2)\n\n        # Find more links and enqueue them.\n        await context.enqueue_links()\n        # Save some data.\n        await context.push_data({'Visited url': context.request.url})\n\n    @crawler.pre_navigation_hook\n    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        \"\"\"Hook executed both in static sub crawler and playwright sub crawler.\n\n        Trying to access `context.page` in this hook would raise `AdaptiveContextError`\n        for pages crawled without playwright.\"\"\"\n        context.log.info(f'pre navigation hook for: {context.request.url} ...')\n\n    @crawler.pre_navigation_hook(playwright_only=True)\n    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        \"\"\"Hook executed only in playwright sub crawler.\n\n        It is safe to access `page` object.\n        \"\"\"\n\n        async def 
some_routing_function(route: Route) -> None:\n            await route.continue_()\n\n        await context.page.route('*/**', some_routing_function)\n        context.log.info(\n            f'Playwright only pre navigation hook for: {context.request.url} ...'\n        )\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/add_data_to_dataset_bs.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n            'html': str(context.soup)[:1000],\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(\n        [\n            'https://crawlee.dev',\n            'https://apify.com',\n            'https://example.com',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/add_data_to_dataset_dataset.py",
    "content": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Open dataset manually using asynchronous constructor open().\n    dataset = await Dataset.open()\n\n    # Interact with dataset directly.\n    await dataset.push_data({'key': 'value'})\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/add_data_to_dataset_pw.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n            'html': str(await context.page.content())[:1000],\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(\n        [\n            'https://crawlee.dev',\n            'https://apify.com',\n            'https://example.com',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/beautifulsoup_crawler.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import (\n    BasicCrawlingContext,\n    BeautifulSoupCrawler,\n    BeautifulSoupCrawlingContext,\n)\n\n\nasync def main() -> None:\n    # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically\n    # loads the URLs and parses their HTML using the BeautifulSoup library.\n    crawler = BeautifulSoupCrawler(\n        # On error, retry each page at most once.\n        max_request_retries=1,\n        # Increase the timeout for processing each page to 30 seconds.\n        request_handler_timeout=timedelta(seconds=30),\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    # The handler receives a context parameter, providing various properties and\n    # helper methods. Here are a few key ones we use for demonstration:\n    # - request: an instance of the Request class containing details such as the URL\n    #   being crawled and the HTTP method used.\n    # - soup: the BeautifulSoup object containing the parsed HTML of the response.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n            'h1s': [h1.text for h1 in context.soup.find_all('h1')],\n            'h2s': [h2.text for h2 in context.soup.find_all('h2')],\n            'h3s': [h3.text for h3 in context.soup.find_all('h3')],\n        }\n\n        # Push the extracted data to the default dataset. 
In local configuration,\n        # the data will be stored as JSON files in ./storage/datasets/default.\n        await context.push_data(data)\n\n    # Register pre navigation hook which will be called before each request.\n    # This hook is optional and does not need to be defined at all.\n    @crawler.pre_navigation_hook\n    async def some_hook(context: BasicCrawlingContext) -> None:\n        pass\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py",
    "content": "import asyncio\n\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Keep the crawler alive even when there are no requests to be processed now.\n        keep_alive=True,\n    )\n\n    def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:\n        \"\"\"Stop crawler once specific url is visited.\n\n        Example of guard condition to stop the crawler.\"\"\"\n        if context.request.url == 'https://crawlee.dev/docs/examples':\n            crawler.stop(\n                'Stop crawler that was in keep_alive state after specific url was visite'\n            )\n        else:\n            context.log.info('keep_alive=True, waiting for more requests to come.')\n\n    async def add_request_later(url: str, after_s: int) -> None:\n        \"\"\"Add requests to the queue after some time. Can be done by external code.\"\"\"\n        # Just an example of request being added to the crawler later,\n        # when it is waiting due to `keep_alive=True`.\n        await asyncio.sleep(after_s)\n        await crawler.add_requests([url])\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BasicCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Stop crawler if some guard condition has been met.\n        stop_crawler_if_url_visited(context)\n\n    # Start some tasks that will add some requests later to simulate real situation,\n    # where requests are added later by external code.\n    add_request_later_task1 = asyncio.create_task(\n        add_request_later(url='https://crawlee.dev', after_s=1)\n    )\n    add_request_later_task2 = asyncio.create_task(\n        add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)\n    )\n\n    # Run the crawler 
without the initial list of requests.\n    # Wait for more requests to be added to the queue later due to `keep_alive=True`.\n    await crawler.run()\n\n    await asyncio.gather(add_request_later_task1, add_request_later_task2)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/beautifulsoup_crawler_stop.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically\n    # loads the URLs and parses their HTML using the BeautifulSoup library.\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    # The handler receives a context parameter, providing various properties and\n    # helper methods. Here are a few key ones we use for demonstration:\n    # - request: an instance of the Request class containing details such as the URL\n    #   being crawled and the HTTP method used.\n    # - soup: the BeautifulSoup object containing the parsed HTML of the response.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Create custom condition to stop crawler once it finds what it is looking for.\n        if 'crawlee' in context.request.url:\n            crawler.stop(\n                reason='Manual stop of crawler after finding `crawlee` in the url.'\n            )\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n        }\n\n        # Push the extracted data to the default dataset. In local configuration,\n        # the data will be stored as JSON files in ./storage/datasets/default.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/capture_screenshot_using_playwright.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storages import KeyValueStore\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n        # Headless mode, set to False to see the browser in action.\n        headless=False,\n        # Browser types supported by Playwright.\n        browser_type='chromium',\n    )\n\n    # Open the default key-value store.\n    kvs = await KeyValueStore.open()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Capture the screenshot of the page using Playwright's API.\n        screenshot = await context.page.screenshot()\n        name = context.request.url.split('/')[-1]\n\n        # Store the screenshot in the key-value store.\n        await kvs.set_value(\n            key=f'screenshot-{name}',\n            value=screenshot,\n            content_type='image/png',\n        )\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(\n        [\n            'https://crawlee.dev',\n            'https://apify.com',\n            'https://example.com',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/configure_json_logging.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport inspect\nimport logging\nimport sys\nfrom typing import TYPE_CHECKING\n\nfrom loguru import logger\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\nif TYPE_CHECKING:\n    from loguru import Record\n\n\n# Configure loguru interceptor to capture standard logging output\nclass InterceptHandler(logging.Handler):\n    def emit(self, record: logging.LogRecord) -> None:\n        # Get corresponding Loguru level if it exists\n        try:\n            level: str | int = logger.level(record.levelname).name\n        except ValueError:\n            level = record.levelno\n\n        # Find caller from where originated the logged message\n        frame, depth = inspect.currentframe(), 0\n        while frame:\n            filename = frame.f_code.co_filename\n            is_logging = filename == logging.__file__\n            is_frozen = 'importlib' in filename and '_bootstrap' in filename\n            if depth > 0 and not (is_logging | is_frozen):\n                break\n            frame = frame.f_back\n            depth += 1\n\n        dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)\n        standard_attrs = set(dummy_record.__dict__.keys())\n        extra_dict = {\n            key: value\n            for key, value in record.__dict__.items()\n            if key not in standard_attrs\n        }\n\n        (\n            logger.bind(**extra_dict)\n            .opt(depth=depth, exception=record.exc_info)\n            .patch(lambda loguru_record: loguru_record.update({'name': record.name}))\n            .log(level, record.getMessage())\n        )\n\n\n# Configure loguru formatter\ndef formatter(record: Record) -> str:\n    basic_format = '[{name}] | <level>{level: ^8}</level> | - {message}'\n    if record['extra']:\n        basic_format = basic_format + ' {extra}'\n    return f'{basic_format}\\n'\n\n\n# Remove default loguru logger\nlogger.remove()\n\n# Set 
up loguru with JSONL serialization in file `crawler.log`\nlogger.add('crawler.log', format=formatter, serialize=True, level='INFO')\n\n# Set up loguru logger for console\nlogger.add(sys.stderr, format=formatter, colorize=True, level='INFO')\n\n# Configure standard logging to use our interceptor\nlogging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)\n\n\nasync def main() -> None:\n    # Initialize crawler with disabled table logs\n    crawler = HttpCrawler(\n        configure_logging=False,  # Disable default logging configuration\n        statistics_log_format='inline',  # Set inline formatting for statistics logs\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # Run the crawler\n    await crawler.run(['https://www.crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_all_links_on_website_bs.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_all_links_on_website_pw.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_multiple_urls_bs.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(\n        [\n            'https://crawlee.dev',\n            'https://apify.com',\n            'https://example.com',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_multiple_urls_pw.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(\n        [\n            'https://crawlee.dev',\n            'https://apify.com',\n            'https://example.com',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_specific_links_on_website_bs.py",
    "content": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all the documentation links found on the page, except for the examples.\n        await context.enqueue_links(\n            include=[Glob('https://crawlee.dev/docs/**')],\n            exclude=[Glob('https://crawlee.dev/docs/examples')],\n        )\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_specific_links_on_website_pw.py",
    "content": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all the documentation links found on the page, except for the examples.\n        await context.enqueue_links(\n            include=[Glob('https://crawlee.dev/docs/**')],\n            exclude=[Glob('https://crawlee.dev/docs/examples')],\n        )\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_website_with_relative_links_all_links.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links found on the page. Any URLs found will be matched by\n        # this strategy, even if they go off the site you are currently crawling.\n        await context.enqueue_links(strategy='all')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Setting the strategy to same domain will enqueue all links found that\n        # are on the same hostname as request.loaded_url or request.url.\n        await context.enqueue_links(strategy='same-domain')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Setting the strategy to same hostname will enqueue all links found that are on\n        # the same hostname (including subdomains) as request.loaded_url or request.url.\n        await context.enqueue_links(strategy='same-hostname')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Setting the strategy to same origin will enqueue all links found that are on\n        # the same origin as request.loaded_url or request.url.\n        await context.enqueue_links(strategy='same-origin')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/export_entire_dataset_to_file_csv.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n    # Export the entire dataset to a CSV file.\n    # Use semicolon as delimiter and always quote strings.\n    await crawler.export_data(path='results.csv', delimiter=';', quoting='all')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/export_entire_dataset_to_file_json.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n    # Export the entire dataset to a JSON file.\n    # Set ensure_ascii=False to allow Unicode characters in the output.\n    await crawler.export_data(path='results.json', ensure_ascii=False)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py",
    "content": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract all the documentation links found on the page, except for the examples.\n        extracted_links = await context.extract_links(\n            include=[Glob('https://crawlee.dev/docs/**')],\n            exclude=[Glob('https://crawlee.dev/docs/examples')],\n        )\n        # Some very custom filtering which can't be achieved by `extract_links` arguments.\n        max_link_length = 30\n        filtered_links = [\n            link for link in extracted_links if len(link.url) < max_link_length\n        ]\n        # Add filtered links to the request queue.\n        await context.add_requests(filtered_links)\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py",
    "content": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract all the documentation links found on the page, except for the examples.\n        extracted_links = await context.extract_links(\n            include=[Glob('https://crawlee.dev/docs/**')],\n            exclude=[Glob('https://crawlee.dev/docs/examples')],\n        )\n        # Some very custom filtering which can't be achieved by `extract_links` arguments.\n        max_link_length = 30\n        filtered_links = [\n            link for link in extracted_links if len(link.url) < max_link_length\n        ]\n        # Add filtered links to the request queue.\n        await context.add_requests(filtered_links)\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/fill_and_submit_web_form_crawler.py",
    "content": "import asyncio\nfrom urllib.parse import urlencode\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        response = (await context.http_response.read()).decode('utf-8')\n        context.log.info(f'Response: {response}')  # To see the response in the logs.\n\n    # Prepare a POST request to the form endpoint.\n    request = Request.from_url(\n        url='https://httpbin.org/post',\n        method='POST',\n        headers={'content-type': 'application/x-www-form-urlencoded'},\n        payload=urlencode(\n            {\n                'custname': 'John Doe',\n                'custtel': '1234567890',\n                'custemail': 'johndoe@example.com',\n                'size': 'large',\n                'topping': ['bacon', 'cheese', 'mushroom'],\n                'delivery': '13:00',\n                'comments': 'Please ring the doorbell upon arrival.',\n            }\n        ).encode(),\n    )\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run([request])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/fill_and_submit_web_form_request.py",
    "content": "import asyncio\nfrom urllib.parse import urlencode\n\nfrom crawlee import Request\n\n\nasync def main() -> None:\n    # Prepare a POST request to the form endpoint.\n    request = Request.from_url(\n        url='https://httpbin.org/post',\n        method='POST',\n        headers={'content-type': 'application/x-www-form-urlencoded'},\n        payload=urlencode(\n            {\n                'custname': 'John Doe',\n                'custtel': '1234567890',\n                'custemail': 'johndoe@example.com',\n                'size': 'large',\n                'topping': ['bacon', 'cheese', 'mushroom'],\n                'delivery': '13:00',\n                'comments': 'Please ring the doorbell upon arrival.',\n            }\n        ).encode(),\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/parsel_crawler.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\n\n# Regex for identifying email addresses on a webpage.\nEMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}'\n\n\nasync def main() -> None:\n    crawler = ParselCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n            'email_address_list': context.selector.re(EMAIL_REGEX),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n        # Enqueue all links found on the page.\n        await context.enqueue_links()\n\n    # Register pre navigation hook which will be called before each request.\n    # This hook is optional and does not need to be defined at all.\n    @crawler.pre_navigation_hook\n    async def some_hook(context: BasicCrawlingContext) -> None:\n        pass\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://github.com'])\n\n    # Export the entire dataset to a JSON file.\n    await crawler.export_data(path='results.json')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py",
    "content": "import asyncio\nfrom random import choice\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.statistics import Statistics\n\n\nasync def main() -> None:\n    crawler = ParselCrawler(\n        statistics=Statistics.with_default_state(save_error_snapshots=True)\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Simulate various errors to demonstrate `ErrorSnapshotter`\n        # saving only the first occurrence of unique error.\n        await context.enqueue_links()\n        random_number = choice(range(10))\n        if random_number == 1:\n            raise KeyError('Some KeyError')\n        if random_number == 2:\n            raise ValueError('Some ValueError')\n        if random_number == 3:\n            raise RuntimeError('Some RuntimeError')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/playwright_block_requests.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        await context.enqueue_links()\n\n    # Define the hook, which will be called before every request.\n    @crawler.pre_navigation_hook\n    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        context.log.info(f'Navigating to {context.request.url} ...')\n\n        # Block all requests to URLs that include `adsbygoogle.js` and also all defaults.\n        await context.block_requests(extra_url_patterns=['adsbygoogle.js'])\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/playwright_crawler.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n        # Headless mode, set to False to see the browser in action.\n        headless=False,\n        # Browser types supported by Playwright.\n        browser_type='chromium',\n    )\n\n    # Define the default request handler, which will be called for every request.\n    # The handler receives a context parameter, providing various properties and\n    # helper methods. Here are a few key ones we use for demonstration:\n    # - request: an instance of the Request class containing details such as the URL\n    #   being crawled and the HTTP method used.\n    # - page: Playwright's Page object, which allows interaction with the web page\n    #   (see https://playwright.dev/python/docs/api/class-page for more details).\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page using Playwright's API.\n        posts = await context.page.query_selector_all('.athing')\n        data = []\n\n        for post in posts:\n            # Get the HTML elements for the title and rank within each post.\n            title_element = await post.query_selector('.title a')\n            rank_element = await post.query_selector('.rank')\n\n            # Extract the data we want from the elements.\n            title = await title_element.inner_text() if title_element else None\n            rank = await rank_element.inner_text() if rank_element else None\n            href = await title_element.get_attribute('href') if title_element else None\n\n            data.append({'title': title, 
'rank': rank, 'href': href})\n\n        # Push the extracted data to the default dataset. In local configuration,\n        # the data will be stored as JSON files in ./storage/datasets/default.\n        await context.push_data(data)\n\n        # Find a link to the next page and enqueue it if it exists.\n        await context.enqueue_links(selector='.morelink')\n\n    # Define a hook that will be called each time before navigating to a new URL.\n    # The hook receives a context parameter, providing access to the request and\n    # browser page among other things. In this example, we log the URL being\n    # navigated to.\n    @crawler.pre_navigation_hook\n    async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:\n        context.log.info(f'Navigating to {context.request.url} ...')\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://news.ycombinator.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/playwright_crawler_with_camoufox.py",
    "content": "import asyncio\n\n#  Camoufox is external package and needs to be installed. It is not included in crawlee.\nfrom camoufox import AsyncNewBrowser\nfrom typing_extensions import override\n\nfrom crawlee.browsers import (\n    BrowserPool,\n    PlaywrightBrowserController,\n    PlaywrightBrowserPlugin,\n)\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nclass CamoufoxPlugin(PlaywrightBrowserPlugin):\n    \"\"\"Example browser plugin that uses Camoufox browser,\n    but otherwise keeps the functionality of PlaywrightBrowserPlugin.\n    \"\"\"\n\n    @override\n    async def new_browser(self) -> PlaywrightBrowserController:\n        if not self._playwright:\n            raise RuntimeError('Playwright browser plugin is not initialized.')\n\n        return PlaywrightBrowserController(\n            browser=await AsyncNewBrowser(\n                self._playwright, **self._browser_launch_options\n            ),\n            # Increase, if camoufox can handle it in your use case.\n            max_open_pages_per_browser=1,\n            # This turns off the crawlee header_generation. Camoufox has its own.\n            header_generator=None,\n        )\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n        # Custom browser pool. 
Gives users full control over browsers used by the crawler.\n        browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]),\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract some data from the page using Playwright's API.\n        posts = await context.page.query_selector_all('.athing')\n        for post in posts:\n            # Get the HTML elements for the title and rank within each post.\n            title_element = await post.query_selector('.title a')\n\n            # Extract the data we want from the elements.\n            title = await title_element.inner_text() if title_element else None\n\n        # Push the extracted data to the default dataset.\n        await context.push_data({'title': title})\n\n        # Find a link to the next page and enqueue it if it exists.\n        await context.enqueue_links(selector='.morelink')\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://news.ycombinator.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py",
    "content": "import asyncio\nfrom random import choice\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.statistics import Statistics\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        statistics=Statistics.with_default_state(save_error_snapshots=True)\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Simulate various errors to demonstrate `ErrorSnapshotter`\n        # saving only the first occurrence of unique error.\n        await context.enqueue_links()\n        random_number = choice(range(10))\n        if random_number == 1:\n            raise KeyError('Some KeyError')\n        if random_number == 2:\n            raise ValueError('Some ValueError')\n        if random_number == 3:\n            raise RuntimeError('Some RuntimeError')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.fingerprint_suite import (\n    DefaultFingerprintGenerator,\n    HeaderGeneratorOptions,\n    ScreenOptions,\n)\n\n\nasync def main() -> None:\n    # Use default fingerprint generator with desired fingerprint options.\n    # Generator will generate real looking browser fingerprint based on the options.\n    # Unspecified fingerprint options will be automatically selected by the generator.\n    fingerprint_generator = DefaultFingerprintGenerator(\n        header_options=HeaderGeneratorOptions(browsers=['chrome']),\n        screen_options=ScreenOptions(min_width=400),\n    )\n\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n        # Headless mode, set to False to see the browser in action.\n        headless=False,\n        # Browser types supported by Playwright.\n        browser_type='chromium',\n        # Fingerprint generator to be used. By default no fingerprint generation is done.\n        fingerprint_generator=fingerprint_generator,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Find a link to the next page and enqueue it if it exists.\n        await context.enqueue_links(selector='.morelink')\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://news.ycombinator.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/respect_robots_on_skipped_request.py",
    "content": "import asyncio\n\nfrom crawlee import SkippedReason\nfrom crawlee.crawlers import (\n    BeautifulSoupCrawler,\n    BeautifulSoupCrawlingContext,\n)\n\n\nasync def main() -> None:\n    # Initialize the crawler with robots.txt compliance enabled\n    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # highlight-start\n    # This handler is called when a request is skipped\n    @crawler.on_skipped_request\n    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:\n        # Check if the request was skipped due to robots.txt rules\n        if reason == 'robots_txt':\n            crawler.log.info(f'Skipped {url} due to robots.txt rules.')\n\n    # highlight-end\n\n    # Start the crawler with the specified URLs\n    # The login URL will be skipped and handled by the skipped_request_handler\n    await crawler.run(\n        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/respect_robots_txt_file.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    BeautifulSoupCrawler,\n    BeautifulSoupCrawlingContext,\n)\n\n\nasync def main() -> None:\n    # Initialize the crawler with robots.txt compliance enabled\n    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # Start the crawler with the specified URLs\n    # The crawler will check the robots.txt file before making requests\n    # In this example, 'https://news.ycombinator.com/login' will be skipped\n    # because it's disallowed in the site's robots.txt file\n    await crawler.run(\n        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/resuming_paused_crawl.py",
    "content": "import asyncio\n\nfrom crawlee import ConcurrencySettings, service_locator\nfrom crawlee.crawlers import (\n    BeautifulSoupCrawler,\n    BeautifulSoupCrawlingContext,\n)\n\n# Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run.\n# This makes the scraper continue from where it left off in the previous run.\n# The recommended way to achieve this behavior is setting the environment variable\n# `CRAWLEE_PURGE_ON_START=0`\nconfiguration = service_locator.get_configuration()\nconfiguration.purge_on_start = False\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(\n        # Let's slow down the crawler for a demonstration\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=20)\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # List of links for crawl\n    requests = [\n        'https://crawlee.dev',\n        'https://crawlee.dev/python/docs',\n        'https://crawlee.dev/python/docs/examples',\n        'https://crawlee.dev/python/docs/guides',\n        'https://crawlee.dev/python/docs/quick-start',\n    ]\n\n    await crawler.run(requests)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/run_parallel_crawlers.py",
    "content": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import (\n    ParselCrawler,\n    ParselCrawlingContext,\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n)\nfrom crawlee.sessions import SessionPool\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # Open request queues for both crawlers with different aliases\n    playwright_rq = await RequestQueue.open(alias='playwright-requests')\n    parsel_rq = await RequestQueue.open(alias='parsel-requests')\n\n    # Use a shared session pool between both crawlers\n    async with SessionPool() as session_pool:\n        playwright_crawler = PlaywrightCrawler(\n            # Set the request queue for Playwright crawler\n            request_manager=playwright_rq,\n            session_pool=session_pool,\n            # Configure concurrency settings for Playwright crawler\n            concurrency_settings=ConcurrencySettings(\n                max_concurrency=5, desired_concurrency=5\n            ),\n            # Set `keep_alive`` so that the crawler does not stop working when there are\n            # no requests in the queue.\n            keep_alive=True,\n        )\n\n        parsel_crawler = ParselCrawler(\n            # Set the request queue for Parsel crawler\n            request_manager=parsel_rq,\n            session_pool=session_pool,\n            # Configure concurrency settings for Parsel crawler\n            concurrency_settings=ConcurrencySettings(\n                max_concurrency=10, desired_concurrency=10\n            ),\n            # Set maximum requests per crawl for Parsel crawler\n            max_requests_per_crawl=50,\n        )\n\n        @playwright_crawler.router.default_handler\n        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:\n            context.log.info(f'Playwright Processing {context.request.url}...')\n\n            title = await context.page.title()\n            # Push the extracted 
data to the dataset for Playwright crawler\n            await context.push_data(\n                {'title': title, 'url': context.request.url, 'source': 'playwright'},\n                dataset_name='playwright-data',\n            )\n\n        @parsel_crawler.router.default_handler\n        async def handle_parsel(context: ParselCrawlingContext) -> None:\n            context.log.info(f'Parsel Processing {context.request.url}...')\n\n            title = context.parsed_content.css('title::text').get()\n            # Push the extracted data to the dataset for Parsel crawler\n            await context.push_data(\n                {'title': title, 'url': context.request.url, 'source': 'parsel'},\n                dataset_name='parsel-data',\n            )\n\n            # Enqueue links to the Playwright request queue for blog pages\n            await context.enqueue_links(\n                selector='a[href*=\"/blog/\"]', rq_alias='playwright-requests'\n            )\n            # Enqueue other links to the Parsel request queue\n            await context.enqueue_links(selector='a:not([href*=\"/blog/\"])')\n\n        # Start the Playwright crawler in the background\n        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))\n\n        # Run the Parsel crawler with the initial URL and wait for it to finish\n        await parsel_crawler.run(['https://crawlee.dev/blog'])\n\n        # Wait for the Playwright crawler to finish processing all requests\n        while not await playwright_rq.is_empty():\n            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')\n            await asyncio.sleep(5)\n\n        # Stop the Playwright crawler after all requests are processed\n        playwright_crawler.stop()\n\n        # Wait for the background Playwright crawler task to complete\n        await background_crawler_task\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/using_browser_profiles_chrome.py",
    "content": "import asyncio\nimport shutil\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n# Profile name to use (usually 'Default' for single profile setups)\nPROFILE_NAME = 'Default'\n\n# Paths to Chrome profiles in your system (example for Windows)\n# Use `chrome://version/` to find your profile path\nPROFILE_PATH = Path(Path.home(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data')\n\n\nasync def main() -> None:\n    # Create a temporary folder to copy the profile to\n    with TemporaryDirectory(prefix='crawlee-') as tmpdirname:\n        tmp_profile_dir = Path(tmpdirname)\n\n        # Copy the profile to a temporary folder\n        shutil.copytree(\n            PROFILE_PATH / PROFILE_NAME,\n            tmp_profile_dir / PROFILE_NAME,\n            dirs_exist_ok=True,\n        )\n\n        crawler = PlaywrightCrawler(\n            headless=False,\n            # Use the installed Chrome browser\n            browser_type='chrome',\n            # Disable fingerprints to preserve profile identity\n            fingerprint_generator=None,\n            # Set user data directory to temp folder\n            user_data_dir=tmp_profile_dir,\n            browser_launch_options={\n                # Slow down actions to mimic human behavior\n                'slow_mo': 200,\n                'args': [\n                    # Use the specified profile\n                    f'--profile-directory={PROFILE_NAME}',\n                ],\n            },\n        )\n\n        @crawler.router.default_handler\n        async def default_handler(context: PlaywrightCrawlingContext) -> None:\n            context.log.info(f'Visiting {context.request.url}')\n\n        await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/using_browser_profiles_firefox.py",
    "content": "import asyncio\nfrom pathlib import Path\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n# Replace this with your actual Firefox profile name\n# Find it at about:profiles in Firefox\nPROFILE_NAME = 'your-profile-name-here'\n\n# Paths to Firefox profiles in your system (example for Windows)\n# Use `about:profiles` to find your profile path\nPROFILE_PATH = Path(\n    Path.home(), 'AppData', 'Roaming', 'Mozilla', 'Firefox', 'Profiles', PROFILE_NAME\n)\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Use Firefox browser type\n        browser_type='firefox',\n        # Disable fingerprints to use the profile as is\n        fingerprint_generator=None,\n        headless=False,\n        # Path to your Firefox profile\n        user_data_dir=PROFILE_PATH,\n        browser_launch_options={\n            'args': [\n                # Required to avoid version conflicts\n                '--allow-downgrade'\n            ]\n        },\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Visiting {context.request.url}')\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/code_examples/using_sitemap_request_loader.py",
    "content": "import asyncio\nfrom collections.abc import Callable\n\nfrom yarl import URL\n\nfrom crawlee import RequestOptions, RequestTransformAction\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import SitemapRequestLoader\n\n\n# Create a transform_request_function that maps request options based on the host in\n# the URL\ndef create_transform_request(\n    data_mapper: dict[str, dict],\n) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:\n    def transform_request(\n        request_options: RequestOptions,\n    ) -> RequestOptions | RequestTransformAction:\n        # According to the Sitemap protocol, all URLs in a Sitemap must be from a single\n        # host.\n        request_host = URL(request_options['url']).host\n\n        if request_host and (mapping_data := data_mapper.get(request_host)):\n            # Set properties from the mapping data\n            if 'label' in mapping_data:\n                request_options['label'] = mapping_data['label']\n            if 'user_data' in mapping_data:\n                request_options['user_data'] = mapping_data['user_data']\n\n            return request_options\n\n        return 'unchanged'\n\n    return transform_request\n\n\nasync def main() -> None:\n    # Prepare data mapping for hosts\n    apify_host = URL('https://apify.com/sitemap.xml').host\n    crawlee_host = URL('https://crawlee.dev/sitemap.xml').host\n\n    if not apify_host or not crawlee_host:\n        raise ValueError('Unable to extract host from URLs')\n\n    data_map = {\n        apify_host: {\n            'label': 'apify',\n            'user_data': {'source': 'apify'},\n        },\n        crawlee_host: {\n            'label': 'crawlee',\n            'user_data': {'source': 'crawlee'},\n        },\n    }\n\n    # Initialize the SitemapRequestLoader with the transform function\n    async with 
SitemapRequestLoader(\n        # Set the sitemap URLs and the HTTP client\n        sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'],\n        http_client=ImpitHttpClient(),\n        transform_request_function=create_transform_request(data_map),\n    ) as sitemap_loader:\n        # Convert the sitemap loader to a request manager\n        request_manager = await sitemap_loader.to_tandem()\n\n        # Create and configure the crawler\n        crawler = BeautifulSoupCrawler(\n            request_manager=request_manager,\n            max_requests_per_crawl=10,\n        )\n\n        # Create default handler for requests without a specific label\n        @crawler.router.default_handler\n        async def handler(context: BeautifulSoupCrawlingContext) -> None:\n            source = context.request.user_data.get('source', 'unknown')\n            context.log.info(\n                f'Processing request: {context.request.url} from source: {source}'\n            )\n\n        # Create handler for requests labeled 'apify'\n        @crawler.router.handler('apify')\n        async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:\n            source = context.request.user_data.get('source', 'unknown')\n            context.log.info(\n                f'Apify handler processing: {context.request.url} from source: {source}'\n            )\n\n        # Create handler for requests labeled 'crawlee'\n        @crawler.router.handler('crawlee')\n        async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:\n            source = context.request.user_data.get('source', 'unknown')\n            context.log.info(\n                f'Crawlee handler processing: {context.request.url} from source: {source}'\n            )\n\n        await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/examples/crawl_all_links_on_website.mdx",
    "content": "---\nid: crawl-all-links-on-website\ntitle: Crawl all links on website\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_bs.py';\nimport PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_all_links_on_website_pw.py';\n\nThis example uses the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> helper to add new links to the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.\n\n:::tip\n\nIf no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option, which is an instance of the `EnqueueStrategy` type alias. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example.\n\n:::\n\n<Tabs groupId=\"main\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/crawl_multiple_urls.mdx",
    "content": "---\nid: crawl-multiple-urls\ntitle: Crawl multiple URLs\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_bs.py';\nimport PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_multiple_urls_pw.py';\n\nThis example demonstrates how to crawl a specified list of URLs using different crawlers. You'll learn how to set up the crawler, define a request handler, and run the crawler with multiple URLs. This setup is useful for scraping data from multiple pages or websites concurrently.\n\n<Tabs groupId=\"main\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/crawl_specific_links_on_website.mdx",
    "content": "---\nid: crawl-specific-links-on-website\ntitle: Crawl specific links on website\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py';\nimport PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py';\n\nimport BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py';\nimport PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py';\n\nThis example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. 
This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.\n\n<Tabs groupId=\"first-example\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n## Even more control over the enqueued links\n\n<ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> is a convenience helper that internally calls <ApiLink to=\"class/ExtractLinksFunction\">`extract_links`</ApiLink> to find the links and <ApiLink to=\"class/AddRequestsFunction\">`add_requests`</ApiLink> to add them to the queue. If you need some additional custom filtering of the extracted links before enqueuing them, then consider using <ApiLink to=\"class/ExtractLinksFunction\">`extract_links`</ApiLink> and <ApiLink to=\"class/AddRequestsFunction\">`add_requests`</ApiLink> instead of <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink>.\n\n<Tabs groupId=\"second-example\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupExampleExtractAndAdd}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightExampleExtractAndAdd}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/crawl_website_with_relative_links.mdx",
    "content": "---\nid: crawl-website-with-relative-links\ntitle: Crawl website with relative links\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport AllLinksExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_all_links.py';\nimport SameDomainExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_domain.py';\nimport SameHostnameExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_hostname.py';\nimport SameOriginExample from '!!raw-loader!roa-loader!./code_examples/crawl_website_with_relative_links_same_origin.py';\n\nWhen crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> method on the crawler context, which will automatically find and add these links to the crawler's <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.\n\n:::note\n\nFor these examples, we are using the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way.\n\n:::\n\n`EnqueueStrategy` type alias provides four distinct strategies for crawling relative links:\n\n- `all` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.\n- `same-domain` - Enqueues all links found that share the same domain name, including any possible subdomains. 
This strategy ensures that all links within the same top-level and base domain are included.\n- `same-hostname` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.\n- `same-origin` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.\n\n<Tabs groupId=\"main\">\n    <TabItem value=\"all_links\" label=\"All links\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {AllLinksExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"same-domain\" label=\"Same domain\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {SameDomainExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"same-hostname\" label=\"Same hostname\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {SameHostnameExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"same-origin\" label=\"Same origin\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {SameOriginExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/crawler_keep_alive.mdx",
    "content": "---\nid: crawler-keep-alive\ntitle: Keep a Crawler alive waiting for more requests\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_keep_alive.py';\n\nThis example demonstrates how to keep crawler alive even when there are no requests at the moment by using `keep_alive=True` argument of <ApiLink to=\"class/BasicCrawler#__init__\">`BasicCrawler.__init__`</ApiLink>. This is available to all crawlers that inherit from <ApiLink to=\"class/BasicCrawler\">`BasicCrawler`</ApiLink> and in the example below it is shown on <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>. To stop the crawler that was started with `keep_alive=True` you can call `crawler.stop()`.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/crawler_stop.mdx",
    "content": "---\nid: crawler-stop\ntitle: Stopping a Crawler with stop method\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_stop.py';\n\nThis example demonstrates how to use `stop` method of <ApiLink to=\"class/BasicCrawler\">`BasicCrawler`</ApiLink> to stop crawler once the crawler finds what it is looking for. This method is available to all crawlers that inherit from <ApiLink to=\"class/BasicCrawler\">`BasicCrawler`</ApiLink> and in the example below it is shown on <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>. Simply call `crawler.stop()` to stop the crawler. It will not continue to crawl through new requests. Requests that are already being concurrently processed are going to get finished. It is possible to call `stop` method with optional argument `reason` that is a string that will be used in logs and it can improve logs readability especially if you have multiple different conditions for triggering `stop`.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/export_entire_dataset_to_file.mdx",
    "content": "---\nid: export-entire-dataset-to-file\ntitle: Export entire dataset to file\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py';\nimport CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py';\n\nThis example demonstrates how to use the <ApiLink to=\"class/BasicCrawler#export_data\">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior.\n\n:::note\n\nFor these examples, we are using the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way.\n\n:::\n\n<Tabs groupId=\"main\">\n    <TabItem value=\"json\" label=\"JSON\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {JsonExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"csv\" label=\"CSV\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {CsvExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/examples/fill_and_submit_web_form.mdx",
    "content": "---\nid: fill-and-submit-web-form\ntitle: Fill and submit web form\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport RequestExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_request.py';\nimport CrawlerExample from '!!raw-loader!roa-loader!./code_examples/fill_and_submit_web_form_crawler.py';\n\nThis example demonstrates how to fill and submit a web form using the <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> crawler. The same approach applies to any crawler that inherits from it, such as the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> or <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>.\n\nWe are going to use the [httpbin.org](https://httpbin.org) website to demonstrate how it works.\n\n## Investigate the form fields\n\nFirst, we need to examine the form fields and the form's action URL. You can do this by opening the [httpbin.org/forms/post](https://httpbin.org/forms/post) page in a browser and inspecting the form fields.\n\nIn Chrome, right-click on the page and select \"Inspect\" or press `Ctrl+Shift+I`.\nUse the element selector (`Ctrl+Shift+C`) to click on the form element you want to inspect.\n\n![HTML input element name](/img/fill-and-submit-web-form/00.jpg 'HTML input element name.')\n\nIdentify the field names. For example, the customer name field is `custname`, the email field is `custemail`, and the phone field is `custtel`.\n\nNow navigate to the \"Network\" tab in developer tools and submit the form by clicking the \"Submit order\" button.\n\n![Submitting the form](/img/fill-and-submit-web-form/01.jpg 'Submitting the form.')\n\nFind the form submission request and examine its details. 
The \"Headers\" tab will show the submission URL, in this case, it is `https://httpbin.org/post`.\n\n![Network request investigation](/img/fill-and-submit-web-form/02.jpg 'Network request investigation.')\n\nThe \"Payload\" tab will display the form fields and their submitted values. This method could be an alternative to inspecting the HTML source code directly.\n\n![Network payload investigation](/img/fill-and-submit-web-form/03.jpg 'Network payload investigation.')\n\n## Preparing a POST request\n\nNow, let's create a POST request with the form fields and their values using the <ApiLink to=\"class/Request\">`Request`</ApiLink> class, specifically its <ApiLink to=\"class/Request#from_url\">`Request.from_url`</ApiLink> constructor:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RequestExample}\n</RunnableCodeBlock>\n\nAlternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach.\n\n## Implementing the crawler\n\nFinally, let's implement the crawler and run it with the prepared request. Although we are using the <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>, the process is the same for any crawler that inherits from it.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CrawlerExample}\n</RunnableCodeBlock>\n\n## Running the crawler\n\nFinally, run your crawler. 
Your logs should show something like this:\n\n```plaintext\n...\n[crawlee.http_crawler._http_crawler] INFO  Processing https://httpbin.org/post ...\n[crawlee.http_crawler._http_crawler] INFO  Response: {\n  \"args\": {},\n  \"data\": \"\",\n  \"files\": {},\n  \"form\": {\n    \"comments\": \"Please ring the doorbell upon arrival.\",\n    \"custemail\": \"johndoe@example.com\",\n    \"custname\": \"John Doe\",\n    \"custtel\": \"1234567890\",\n    \"delivery\": \"13:00\",\n    \"size\": \"large\",\n    \"topping\": [\n      \"bacon\",\n      \"cheese\",\n      \"mushroom\"\n    ]\n  },\n  \"headers\": {\n    \"Accept\": \"*/*\",\n    \"Accept-Encoding\": \"gzip, deflate, br\",\n    \"Content-Length\": \"190\",\n    \"Content-Type\": \"application/x-www-form-urlencoded\",\n    \"Host\": \"httpbin.org\",\n    \"User-Agent\": \"python-httpx/0.27.0\",\n    \"X-Amzn-Trace-Id\": \"Root=1-66c849d6-1ae432fb7b4156e6149ff37f\"\n  },\n  \"json\": null,\n  \"origin\": \"78.80.81.196\",\n  \"url\": \"https://httpbin.org/post\"\n}\n\n[crawlee._autoscaling.autoscaled_pool] INFO  Waiting for remaining tasks to finish\n[crawlee.http_crawler._http_crawler] INFO  Final request statistics:\n┌───────────────────────────────┬──────────┐\n│ requests_finished             │ 1        │\n│ requests_failed               │ 0        │\n│ retry_histogram               │ [1]      │\n│ request_avg_failed_duration   │ None     │\n│ request_avg_finished_duration │ 0.678442 │\n│ requests_finished_per_minute  │ 85       │\n│ requests_failed_per_minute    │ 0        │\n│ request_total_duration        │ 0.678442 │\n│ requests_total                │ 1        │\n│ crawler_runtime               │ 0.707666 │\n└───────────────────────────────┴──────────┘\n```\n\nThis log output confirms that the crawler successfully submitted the form and processed the response. Congratulations! You have successfully filled and submitted a web form using the <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>.\n"
  },
  {
    "path": "docs/examples/json_logging.mdx",
    "content": "---\nid: configure-json-logging\ntitle: Сonfigure JSON logging\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport JsonLoggingExample from '!!raw-loader!roa-loader!./code_examples/configure_json_logging.py';\n\nThis example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By using the `use_table_logs=False` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.\n\nThe example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {JsonLoggingExample}\n</RunnableCodeBlock>\n\nHere's an example of what a crawler statistics log entry in JSONL format.\n\n```json\n{\n    \"text\": \"[HttpCrawler] |   INFO   | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\\n\",\n    \"record\": {\n        \"elapsed\": { \"repr\": \"0:00:05.604568\", \"seconds\": 5.604568 },\n        \"exception\": null,\n        \"extra\": {\n            \"requests_finished\": 1,\n            \"requests_failed\": 0,\n            \"retry_histogram\": [1],\n            \"request_avg_failed_duration\": null,\n            \"request_avg_finished_duration\": 3.57098,\n            \"requests_finished_per_minute\": 17,\n            
\"requests_failed_per_minute\": 0,\n            \"request_total_duration\": 3.57098,\n            \"requests_total\": 1,\n            \"crawler_runtime\": 3.59165\n        },\n        \"file\": {\n            \"name\": \"_basic_crawler.py\",\n            \"path\": \"/crawlers/_basic/_basic_crawler.py\"\n        },\n        \"function\": \"run\",\n        \"level\": { \"icon\": \"ℹ️\", \"name\": \"INFO\", \"no\": 20 },\n        \"line\": 583,\n        \"message\": \"Final request statistics:\",\n        \"module\": \"_basic_crawler\",\n        \"name\": \"HttpCrawler\",\n        \"process\": { \"id\": 198383, \"name\": \"MainProcess\" },\n        \"thread\": { \"id\": 135312814966592, \"name\": \"MainThread\" },\n        \"time\": {\n            \"repr\": \"2025-03-17 17:14:45.339150+00:00\",\n            \"timestamp\": 1742231685.33915\n        }\n    }\n}\n```\n"
  },
  {
    "path": "docs/examples/parsel_crawler.mdx",
    "content": "---\nid: parsel-crawler\ntitle: Parsel crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler.py';\n\nThis example shows how to use <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping.  It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ParselCrawlerExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/playwright_crawler.mdx",
    "content": "---\nid: playwright-crawler\ntitle: Playwright crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler.py';\n\nThis example demonstrates how to use <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> to recursively scrape the Hacker news website using headless Chromium and Playwright.\n\nThe <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content.\n\nA **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightCrawlerExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/playwright_crawler_adaptive.mdx",
    "content": "---\nid: adaptive-playwright-crawler\ntitle: Adaptive Playwright crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport AdaptivePlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/adaptive_playwright_crawler.py';\n\nThis example demonstrates how to use <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink>. An <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> or <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>.\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.\n\nA [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for playwright sub crawler.\n\nFor more detailed description please see [Adaptive Playwright crawler guide](/python/docs/guides/adaptive-playwright-crawler)\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {AdaptivePlaywrightCrawlerExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/playwright_crawler_with_block_requests.mdx",
    "content": "---\nid: playwright-crawler-with-block-requests\ntitle: Playwright crawler with block requests\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightBlockRequests from '!!raw-loader!roa-loader!./code_examples/playwright_block_requests.py';\n\nThis example demonstrates how to optimize your <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> performance by blocking unnecessary network requests.\n\nThe primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.\n\nThe <ApiLink to=\"class/BlockRequestsFunction\">`block_requests`</ApiLink> helper provides the most efficient way to block requests as it operates directly in the browser.\n\nBy default, <ApiLink to=\"class/BlockRequestsFunction\">`block_requests`</ApiLink> will block all URLs including the following patterns:\n\n```python\n['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']\n```\n\nYou can also replace the default patterns list with your own by providing `url_patterns`, or extend it by passing additional patterns in `extra_url_patterns`.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightBlockRequests}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/playwright_crawler_with_camoufox.mdx",
    "content": "---\nid: playwright-crawler-with-camoufox\ntitle: Playwright crawler with Camoufox\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightCrawlerExampleWithCamoufox from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_camoufox.py';\n\nThis example demonstrates how to integrate Camoufox into <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> using <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> with custom <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink>.\n\nCamoufox is a stealthy minimalistic build of Firefox. For details please visit its homepage https://camoufox.com/ .\nTo be able to run this example you will need to install camoufox, as it is external tool, and it is not part of the crawlee. For installation please see https://pypi.org/project/camoufox/.\n\n**Warning!** Camoufox is using custom build of firefox. This build can be hundreds of MB large.\nYou can either pre-download this file using following command `python3 -m camoufox fetch` or camoufox will download it automatically once you try to run it, and it does not find existing binary.\nFor more details please refer to: https://github.com/daijro/camoufox/tree/main/pythonlib#camoufox-python-interface\n\n**Project template -** It is possible to generate project with Python code which includes Camoufox integration into crawlee through crawlee cli. Call `crawlee create` and pick `Playwright-camoufox` when asked for Crawler type.\n\nThe example code after PlayWrightCrawler instantiation is similar to example describing the use of Playwright Crawler. The main difference is that in this example Camoufox will be used as the browser through BrowserPool.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightCrawlerExampleWithCamoufox}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/playwright_crawler_with_fingerprint_generator.mdx",
    "content": "---\nid: playwright-crawler-with-fingerprint-generator\ntitle: Playwright crawler with fingerprint generator\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_with_fingerprint_generator.py';\n\nThis example demonstrates how to use <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> together with <ApiLink to=\"class/FingerprintGenerator\">`FingerprintGenerator`</ApiLink> that will populate several browser attributes to mimic real browser fingerprint. To read more about fingerprints please see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting.\n\nYou can implement your own fingerprint generator or use <ApiLink to=\"class/BrowserforgeFingerprintGenerator\">`DefaultFingerprintGenerator`</ApiLink>. To use the generator initialize it with the desired fingerprint options. The generator will try to create fingerprint based on those options. Unspecified options will be automatically selected by the generator from the set of reasonable values. If some option is important for you, do not rely on the default and explicitly define it.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightCrawlerExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/respect_robots_txt_file.mdx",
    "content": "---\nid: respect-robots-txt-file\ntitle: Respect robots.txt file\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';\nimport OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';\n\nThis example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.\n\nTo configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to=\"class/BasicCrawlerOptions\">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file.\n\nAs an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped.\n\nThe code below demonstrates this behavior using the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RespectRobotsTxt}\n</RunnableCodeBlock>\n\n## Handle with `on_skipped_request`\n\nIf you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to=\"class/BasicCrawler#on_skipped_request\">`BasicCrawler`</ApiLink>.\n\nLet's update the code by adding the `on_skipped_request` handler:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {OnSkippedRequest}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/resuming_paused_crawl.mdx",
    "content": "---\nid: resuming-paused-crawl\ntitle: Resuming a paused crawl\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport ResumeCrawl from '!!raw-loader!roa-loader!./code_examples/resuming_paused_crawl.py';\n\nThis example demonstrates how to resume crawling from its last state when running locally, if for some reason it was unexpectedly terminated.\n\nIf each run should continue crawling from the previous state, you can configure this using `purge_on_start` in <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink>.\n\nUse the code below and perform 2 sequential runs. During the 1st run, stop the crawler by pressing `CTRL+C`, and the 2nd run will resume crawling from where it stopped.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ResumeCrawl}\n</RunnableCodeBlock>\n\nPerform the 1st run, interrupting the crawler with `CTRL+C` after 2 links have been processed.\n\n![Run with interruption](/img/resuming-paused-crawl/00.webp 'Run with interruption.')\n\nNow resume crawling after the pause to process the remaining 3 links.\n\n![Resuming crawling](/img/resuming-paused-crawl/01.webp 'Resuming crawling.')\n\nAlternatively, use the environment variable `CRAWLEE_PURGE_ON_START=0` instead of using `configuration.purge_on_start = False`.\n\nFor example, when running code:\n\n```bash\nCRAWLEE_PURGE_ON_START=0 python -m best_crawler\n```\n"
  },
  {
    "path": "docs/examples/run_parallel_crawlers.mdx",
    "content": "---\nid: run-parallel-crawlers\ntitle: Run parallel crawlers\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py';\n\nThis example demonstrates how to run two parallel crawlers where one crawler processes links discovered by another crawler.\n\nIn some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> for navigating JavaScript-heavy pages and a faster, more lightweight <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> for processing static pages. One way to solve this is to use <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink>, see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more.\n\nThe code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients).\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RunParallelCrawlersExample}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/examples/using_browser_profile.mdx",
    "content": "---\nid: using_browser_profile\ntitle: Using browser profile\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport CodeBlock from '@theme/CodeBlock';\n\nimport ChromeProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_chrome.py';\nimport FirefoxProfileExample from '!!raw-loader!./code_examples/using_browser_profiles_firefox.py';\n\nThis example demonstrates how to run <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> using your local browser profile from [Chrome](https://www.google.com/intl/us/chrome/) or [Firefox](https://www.firefox.com/).\n\nUsing browser profiles allows you to leverage existing login sessions, saved passwords, bookmarks, and other personalized browser data during crawling. This can be particularly useful for testing scenarios or when you need to access content that requires authentication.\n\n## Chrome browser\n\nTo run <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`.\n\n:::warning Profile access limitation\nDue to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround.\n:::\n\nMake sure you don't have any running Chrome browser processes before running this code:\n\n<CodeBlock className=\"language-python\" language=\"python\">\n    {ChromeProfileExample}\n</CodeBlock>\n\n## Firefox browser\n\nTo find the path to your Firefox profile, enter `about:profiles` as a URL in your Firefox browser. 
Unlike Chrome, you can use your standard profile path directly without copying it first.\n\nMake sure you don't have any running Firefox browser processes before running this code:\n\n<CodeBlock className=\"language-python\" language=\"python\">\n    {FirefoxProfileExample}\n</CodeBlock>\n"
  },
  {
    "path": "docs/examples/using_sitemap_request_loader.mdx",
    "content": "---\nid: using-sitemap-request-loader\ntitle: Using sitemap request loader\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';\n\nThis example demonstrates how to use <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps.\n\nThe example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections.\n\nThe following code example implements processing of sitemaps from two different domains (Apify and Crawlee), with different labels assigned to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers based on the assigned labels.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SitemapRequestLoaderExample}\n</RunnableCodeBlock>\n\nFor more information about request loaders, see the [Request loaders guide](../guides/request-loaders).\n"
  },
  {
    "path": "docs/guides/architecture_overview.mdx",
    "content": "---\nid: architecture-overview\ntitle: Architecture overview\ndescription: An overview of the core components of the Crawlee library and its architecture.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nCrawlee is a modern and modular web scraping framework. It is designed for both HTTP-only and browser-based scraping. In this guide, we will provide a high-level overview of its architecture and the main components that make up the system.\n\n## Crawler\n\nThe main user-facing component of Crawlee is the crawler, which orchestrates the crawling process and takes care of all other components. It manages storages, executes user-defined request handlers, handles retries, manages concurrency, and coordinates all other components. All crawlers inherit from the <ApiLink to=\"class/BasicCrawler\">`BasicCrawler`</ApiLink> class, which provides the basic functionality. There are two main groups of specialized crawlers: HTTP crawlers and browser crawlers.\n\n:::info\n\nYou will learn more about the request handlers in the request router section.\n\n:::\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass BasicCrawler {\n    <<abstract>>\n}\n\nclass AbstractHttpCrawler {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass HttpCrawler\n\nclass ParselCrawler\n\nclass BeautifulSoupCrawler\n\nclass PlaywrightCrawler\n\nclass AdaptivePlaywrightCrawler\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nBasicCrawler --|> AbstractHttpCrawler\nBasicCrawler --|> PlaywrightCrawler\nBasicCrawler --|> AdaptivePlaywrightCrawler\nAbstractHttpCrawler --|> HttpCrawler\nAbstractHttpCrawler --|> ParselCrawler\nAbstractHttpCrawler --|> BeautifulSoupCrawler\n```\n\n### HTTP crawlers\n\nHTTP crawlers use HTTP clients to fetch pages and parse 
them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).\n\nHTTP crawlers inherit from <ApiLink to=\"class/AbstractHttpCrawler\">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category:\n\n- <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.\n- <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.\n- <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required.\n\nYou can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).\n\n### Browser crawlers\n\nBrowser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler).\n\n### Adaptive crawler\n\nThe <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> sits between HTTP and browser crawlers. 
It can automatically decide whether to use HTTP or browser crawling for each request based on heuristics or user configuration. This allows for optimal performance and compatibility. It also provides a uniform interface for both crawling types (modes). You can learn more about adaptive crawling in the [Adaptive Playwright crawler guide](./adaptive-playwright-crawler).\n\n## Crawling contexts\n\nCrawling contexts are objects that encapsulate the state and data for each request being processed by the crawler. They provide access to the request, response, session, and helper methods for handling the request. Crawling contexts are used to pass data between different parts of the crawler and to manage the lifecycle of each request. These contexts are provided to user-defined request handlers, which can then use them to access request data, response data, or use helper methods to interact with storages, and extract and enqueue new requests.\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Classes\n%% ========================\n\nclass BasicCrawlingContext\n\nclass HttpCrawlingContext\n\nclass HttpCrawlingResult\n\nclass ParsedHttpCrawlingContext\n\nclass ParselCrawlingContext\n\nclass BeautifulSoupCrawlingContext\n\nclass PlaywrightPreNavCrawlingContext\n\nclass PlaywrightCrawlingContext\n\nclass AdaptivePlaywrightPreNavCrawlingContext\n\nclass AdaptivePlaywrightCrawlingContext\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nBasicCrawlingContext --|> HttpCrawlingContext\n\nHttpCrawlingResult --|> HttpCrawlingContext\n\nHttpCrawlingContext --|> ParsedHttpCrawlingContext\n\nParsedHttpCrawlingContext --|> ParselCrawlingContext\n\nParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext\n\nBasicCrawlingContext --|> PlaywrightPreNavCrawlingContext\n\nPlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext\n\nBasicCrawlingContext --|> 
AdaptivePlaywrightPreNavCrawlingContext\n\nParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext\n```\n\nThey have a similar inheritance structure as the crawlers, with the base class being <ApiLink to=\"class/BasicCrawlingContext\">`BasicCrawlingContext`</ApiLink>. The specific crawling contexts are:\n- <ApiLink to=\"class/HttpCrawlingContext\">`HttpCrawlingContext`</ApiLink> for HTTP crawlers.\n- <ApiLink to=\"class/ParsedHttpCrawlingContext\">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.\n- <ApiLink to=\"class/ParselCrawlingContext\">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.\n- <ApiLink to=\"class/BeautifulSoupCrawlingContext\">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.\n- <ApiLink to=\"class/PlaywrightPreNavCrawlingContext\">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated.\n- <ApiLink to=\"class/PlaywrightCrawlingContext\">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers.\n- <ApiLink to=\"class/AdaptivePlaywrightPreNavCrawlingContext\">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated.\n- <ApiLink to=\"class/AdaptivePlaywrightCrawlingContext\">`AdaptivePlaywrightCrawlingContext`</ApiLink> for Adaptive Playwright crawlers.\n\n## Storages\n\nStorages are the components that manage data in Crawlee. They provide a way to store and retrieve data during the crawling process. 
Crawlee's storage system consists of two main layers:\n\n- **Storages**: High-level interfaces for interacting with different storage types\n- **Storage clients**: Backend implementations that handle the actual data persistence and management (you will learn more about them in the next section)\n\nCrawlee provides three built-in storage types for managing data:\n\n- <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> - Append-only, tabular storage for structured data. It is ideal for storing scraping results.\n- <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink> - Storage for arbitrary data like JSON documents, images or configs. It supports get and set operations with key-value pairs; updates are only possible by replacement.\n- <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> - A managed queue for pending and completed requests, with automatic deduplication and dynamic addition of new items. It is used to track URLs for crawling.\n\nSee the [Storages guide](./storages) for more details.\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass Storage {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass Dataset\n\nclass KeyValueStore\n\nclass RequestQueue\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nStorage --|> Dataset\nStorage --|> KeyValueStore\nStorage --|> RequestQueue\n```\n\n## Storage clients\n\nStorage clients are the backend implementations for storages that handle interactions with different storage systems. 
They provide a unified interface for <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, and <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, regardless of the underlying storage implementation.\n\nCrawlee provides several built-in storage client implementations:\n\n- <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence (ideal for testing and fast operations).\n- <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with caching (default client).\n- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com/) (cloud-based). It is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You can find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction).\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass StorageClient {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass MemoryStorageClient\n\nclass FileSystemStorageClient\n\nclass ApifyStorageClient\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nStorageClient --|> MemoryStorageClient\nStorageClient --|> FileSystemStorageClient\nStorageClient --|> ApifyStorageClient\n```\n\nStorage clients can be registered globally with the <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> (you will learn more about the <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> in the next section), passed directly to crawlers, or specified when opening individual storage instances. 
You can also create custom storage clients by implementing the <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> interface.\n\nSee the [Storage clients guide](./storage-clients) for more details.\n\n## Request router\n\nThe request <ApiLink to=\"class/Router\">`Router`</ApiLink> is a central component that manages the flow of requests and responses in Crawlee. It is responsible for routing requests to the appropriate request handlers, managing the crawling context, and coordinating the execution of user-defined logic.\n\n### Request handlers\n\nRequest handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the <ApiLink to=\"class/Router\">`Router`</ApiLink>.\n\nThe request routing in Crawlee supports:\n- Default handlers - Fallback handlers for requests without specific labels.\n- Label-based routing - Handlers for specific request types based on labels.\n- Error handlers - Handle errors during request processing.\n- Failed request handlers - Handle requests that exceed retry limits.\n- Pre-navigation hooks - Execute logic before navigating to URLs.\n\nSee the [Request router guide](./request-router) for detailed information and examples.\n\n## Service locator\n\nThe <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> is a central registry for global services in Crawlee. It manages and provides access to core services throughout the framework, ensuring consistent configuration across all components. 
The service locator coordinates these three services:\n\n- <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> - Application-wide settings and parameters that control various aspects of Crawlee behavior.\n- <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> - Backend implementation for data storage across datasets, key-value stores, and request queues.\n- <ApiLink to=\"class/EventManager\">`EventManager`</ApiLink> - Event coordination system for internal framework events and custom user hooks.\n\nServices can be registered globally through the `service_locator` singleton instance, passed to crawler constructors, or provided when opening individual storage instances. The service locator includes conflict prevention mechanisms to ensure configuration consistency and prevent accidental service conflicts during runtime.\n\nSee the [Service locator guide](./service-locator) for detailed information about service registration and configuration options.\n\n## Request loaders\n\nRequest loaders provide a subset of <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> functionality, focusing specifically on reading and accessing streams of requests from various sources. They define how requests are fetched and processed, enabling use cases such as reading URLs from files, external APIs, sitemaps, or combining multiple sources together. 
Unlike request queues, they do not handle storage or persistence—they only provide request reading capabilities.\n\n- <ApiLink to=\"class/RequestLoader\">`RequestLoader`</ApiLink> - Base interface for read-only access to a stream of requests, with capabilities like fetching the next request, marking as handled, and status checking.\n- <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> - Lightweight in-memory implementation of `RequestLoader` for managing static lists of URLs.\n- <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> - A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.\n\n### Request managers\n\n<ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> extends <ApiLink to=\"class/RequestLoader\">`RequestLoader`</ApiLink> with write capabilities for adding and reclaiming requests, providing full request management functionality. <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> is the primary concrete implementation of <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink>.\n\n<ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink> combines a read-only `RequestLoader` with a writable <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink>, transferring requests from the loader to the manager for hybrid scenarios. This is useful when you want to start with a predefined set of URLs (from a file or sitemap) but also need to add new requests dynamically during crawling. The tandem first processes all requests from the loader, then handles any additional requests added to the manager.\n\nRequest loaders are useful when you need to start with a predefined set of URLs. 
The tandem approach allows processing requests from static sources (like files or sitemaps) while maintaining the ability to add new requests dynamically.\n\nSee the [Request loaders guide](./request-loaders) for detailed information.\n\n## Event manager\n\nThe <ApiLink to=\"class/EventManager\">`EventManager`</ApiLink> is responsible for coordinating internal events throughout Crawlee and enabling custom hooks. It provides a system for registering event listeners, emitting events, and managing their execution lifecycle.\n\nCrawlee provides several implementations of the event manager:\n\n- <ApiLink to=\"class/EventManager\">`EventManager`</ApiLink> is the base class for event management in Crawlee.\n- <ApiLink to=\"class/LocalEventManager\">`LocalEventManager`</ApiLink> extends the base event manager for local environments by automatically emitting `SYSTEM_INFO` events at regular intervals. This provides real-time system metrics including CPU usage and memory consumption, which are essential for internal components like the <ApiLink to=\"class/Snapshotter\">`Snapshotter`</ApiLink> and <ApiLink to=\"class/AutoscaledPool\">`AutoscaledPool`</ApiLink>.\n- [`ApifyEventManager`](https://docs.apify.com/sdk/python/reference/class/PlatformEventManager) - Manages events on the [Apify platform](https://apify.com/) (cloud-based). 
It is implemented in the [Apify SDK](https://docs.apify.com/sdk/python/).\n\n:::info\n\nYou can learn more about <ApiLink to=\"class/Snapshotter\">`Snapshotter`</ApiLink> and <ApiLink to=\"class/AutoscaledPool\">`AutoscaledPool`</ApiLink> and their configuration in the [Scaling crawlers guide](./scaling-crawlers).\n\n:::\n\nCrawlee defines several built-in event types:\n\n- `PERSIST_STATE` - Emitted periodically to trigger state persistence.\n- `SYSTEM_INFO` - Contains CPU and memory usage information.\n- `MIGRATING` - Signals that the crawler is migrating to a different environment.\n- `ABORTING` - Indicates the crawler is aborting execution.\n- `EXIT` - Emitted when the crawler is exiting.\n- `CRAWLER_STATUS` - Provides status updates from crawlers.\n\nAdditional specialized events for browser and session management are also available.\n\nThe event manager operates as an async context manager, automatically starting periodic tasks when entered and ensuring all listeners complete before exiting. Event listeners can be either synchronous or asynchronous functions and are executed safely without blocking the main event loop.\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass EventManager {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass LocalEventManager\n\nclass ApifyEventManager\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nEventManager --|> LocalEventManager\nEventManager --|> ApifyEventManager\n```\n\n## Session management\n\nThe core component of session management in Crawlee is <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink>. It manages a collection of sessions that simulate individual users with unique attributes like cookies, IP addresses (via proxies), and browser fingerprints. 
Sessions help avoid blocking by rotating user identities and maintaining realistic browsing patterns.\n\n:::info\n\nYou can learn more about fingerprints and how to avoid getting blocked in the [Avoid blocking guide](./avoid-blocking).\n\n:::\n\n### Session\n\nA session is represented as a <ApiLink to=\"class/Session\">`Session`</ApiLink> object, which contains components like cookies, error tracking, usage limits, and expiration handling. Sessions can be marked as good (<ApiLink to=\"class/Session#mark_good\">`Session.mark_good`</ApiLink>), bad (<ApiLink to=\"class/Session#mark_bad\">`Session.mark_bad`</ApiLink>), or retired (<ApiLink to=\"class/Session#retire\">`Session.retire`</ApiLink>) based on their performance, and they automatically become unusable when they exceed error thresholds or usage limits.\n\n### Session pool\n\nThe session pool provides automated session lifecycle management:\n\n- Automatic rotation - Retrieves random sessions from the pool and creates new ones as needed.\n- Pool maintenance - Removes retired sessions and maintains the pool at maximum capacity.\n- State persistence - Persists session state to enable recovery across restarts.\n- Configurable limits - Supports custom pool sizes, session settings, and creation functions.\n\nThe pool operates as an async context manager, automatically initializing with sessions and cleaning up on exit. It ensures proper session management by rotating sessions based on usage count, expiration time, and custom rules while maintaining optimal pool size.\n\nSee the [Session management guide](./session-management) for more information.\n\n## Statistics\n\nThe <ApiLink to=\"class/Statistics\">`Statistics`</ApiLink> class provides runtime monitoring for crawler operations, tracking performance metrics like request counts, processing times, retry attempts, and error patterns. 
It operates as an async context manager, automatically persisting data across crawler restarts and migrations using <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>.\n\nThe system includes error tracking through the <ApiLink to=\"class/ErrorTracker\">`ErrorTracker`</ApiLink> class, which groups similar errors by type and message patterns using wildcard matching. It can capture HTML snapshots and screenshots for debugging and separately track retry-specific errors.\n\nStatistics are logged at configurable intervals in both table and inline formats, with final summary data returned from the `crawler.run` method available through <ApiLink to=\"class/FinalStatistics\">`FinalStatistics`</ApiLink>.\n\n## Conclusion\n\nIn this guide, we provided a high-level overview of the core components of the Crawlee library and its architecture. We covered the main components like crawlers, crawling contexts, storages, request routers, service locator, request loaders, event manager, session management, and statistics. Check out other guides, the [API reference](https://crawlee.dev/python/api), and [Examples](../examples) for more details on how to use these components in your own projects.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/avoid_blocking.mdx",
    "content": "---\nid: avoid-blocking\ntitle: Avoid getting blocked\ndescription: How to avoid getting blocked when scraping\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py';\nimport PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py';\n\nimport PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py';\n\nA scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks.\n\nBrowser fingerprint is a collection of browser attributes and significant features that can show if our browser is a bot or a real user. Moreover, most browsers have these unique features that allow the website to track the browser even within different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, it should significantly reduce the blocking.\n\n## Using browser fingerprints\n\nChanging browser fingerprints can be a tedious job. 
Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the <ApiLink to=\"class/PlaywrightCrawler#__init__\">`PlaywrightCrawler.__init__`</ApiLink>, either pass your own implementation of <ApiLink to=\"class/FingerprintGenerator\">`FingerprintGenerator`</ApiLink> or use <ApiLink to=\"class/BrowserforgeFingerprintGenerator\">`DefaultFingerprintGenerator`</ApiLink>.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightDefaultFingerprintGenerator}\n</RunnableCodeBlock>\n\nIn certain cases we want to narrow down the fingerprints used - e.g. specify a certain operating system, locale or browser. This is also possible with Crawlee - the crawler can have the generation algorithm customized to reflect the particular browser version and many more. For a description of the fingerprint generation options, please see <ApiLink to=\"class/HeaderGeneratorOptions\">`HeaderGeneratorOptions`</ApiLink>, <ApiLink to=\"class/ScreenOptions\">`ScreenOptions`</ApiLink> and <ApiLink to=\"class/BrowserforgeFingerprintGenerator#__init__\">`DefaultFingerprintGenerator.__init__`</ApiLink>. See the example below:\n\n<CodeBlock className=\"language-python\">\n    {PlaywrightDefaultFingerprintGeneratorWithArgs}\n</CodeBlock>\n\nIf you do not want to use fingerprints, then pass the `fingerprint_generator=None` argument to the <ApiLink to=\"class/PlaywrightCrawler#__init__\">`PlaywrightCrawler.__init__`</ApiLink>.\n\n## Using Camoufox\n\nIn some cases even <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> with fingerprints is not enough. You can try using <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> together with [Camoufox](https://camoufox.com/). See the example integration below:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightWithCamoufox}\n</RunnableCodeBlock>\n\n**Related links**\n\n- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)\n- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping)\n"
  },
  {
    "path": "docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py",
    "content": "import asyncio\n\nfrom crawlee.fingerprint_suite import (\n    DefaultFingerprintGenerator,\n    HeaderGeneratorOptions,\n    ScreenOptions,\n)\n\n\nasync def main() -> None:\n    fingerprint_generator = DefaultFingerprintGenerator(\n        header_options=HeaderGeneratorOptions(browsers=['chrome']),\n        screen_options=ScreenOptions(min_width=400),\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    # Fingerprint generator is used by default.\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Find a link to the next page and enqueue it if it exists.\n        await context.enqueue_links(selector='.morelink')\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://news.ycombinator.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py",
    "content": "import asyncio\nimport io\nfrom pathlib import Path\n\nfrom warcio.statusandheaders import StatusAndHeaders\nfrom warcio.warcwriter import WARCWriter\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:\n    \"\"\"Helper function for archiving response in WARC format.\"\"\"\n    # Create WARC records for response\n    response_body = await context.http_response.read()\n    response_payload_stream = io.BytesIO(response_body)\n\n    response_headers = StatusAndHeaders(\n        str(context.http_response.status_code),\n        context.http_response.headers,\n        protocol='HTTP/1.1',\n    )\n    response_record = writer.create_warc_record(\n        context.request.url,\n        'response',\n        payload=response_payload_stream,\n        length=len(response_body),\n        http_headers=response_headers,\n    )\n    writer.write_record(response_record)\n\n\nasync def main() -> None:\n    crawler = ParselCrawler(\n        max_requests_per_crawl=10,\n    )\n\n    # Create a WARC archive file and prepare the writer.\n    archive = Path('example.warc.gz')\n    with archive.open('wb') as output:\n        writer = WARCWriter(output, gzip=True)\n\n        # Create a WARC info record to store metadata about the archive.\n        warcinfo_payload = {\n            'software': 'Crawlee',\n            'format': 'WARC/1.1',\n            'description': 'Example archive created with ParselCrawler',\n        }\n        writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload))\n\n        # Define the default request handler, which will be called for every request.\n        @crawler.router.default_handler\n        async def request_handler(context: ParselCrawlingContext) -> None:\n            context.log.info(f'Archiving {context.request.url} ...')\n            await archive_response(context=context, writer=writer)\n            await context.enqueue_links(strategy='same-domain')\n\n        await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py",
    "content": "import asyncio\nimport io\nimport logging\nfrom functools import partial\nfrom pathlib import Path\n\nfrom playwright.async_api import Request\nfrom warcio.statusandheaders import StatusAndHeaders\nfrom warcio.warcwriter import WARCWriter\n\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\n\n\nasync def archive_response(\n    request: Request, writer: WARCWriter, logger: logging.Logger\n) -> None:\n    \"\"\"Helper function for archiving response in WARC format.\"\"\"\n    response = await request.response()\n    if not response:\n        logger.warning(f'Could not get response {request.url}')\n        return\n    try:\n        response_body = await response.body()\n    except Exception as e:\n        logger.warning(f'Could not get response body for {response.url}: {e}')\n        return\n    logger.info(f'Archiving resource {response.url}')\n    response_payload_stream = io.BytesIO(response_body)\n    response_headers = StatusAndHeaders(\n        str(response.status), response.headers, protocol='HTTP/1.1'\n    )\n    response_record = writer.create_warc_record(\n        response.url,\n        'response',\n        payload=response_payload_stream,\n        length=len(response_body),\n        http_headers=response_headers,\n    )\n    writer.write_record(response_record)\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        max_requests_per_crawl=1,\n        headless=False,\n    )\n\n    # Create a WARC archive file and prepare the writer.\n    archive = Path('example.warc.gz')\n    with archive.open('wb') as output:\n        writer = WARCWriter(output, gzip=True)\n\n        # Create a WARC info record to store metadata about the archive.\n        warcinfo_payload = {\n            'software': 'Crawlee',\n            'format': 'WARC/1.1',\n            'description': 'Example archive created with PlaywrightCrawler',\n        }\n        writer.write_record(writer.create_warcinfo_record(archive.name, warcinfo_payload))\n\n        @crawler.pre_navigation_hook\n        async def archiving_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n            # Ensure that all responses with additional resources are archived\n            context.page.on(\n                'requestfinished',\n                partial(archive_response, logger=context.log, writer=writer),\n            )\n\n        @crawler.router.default_handler\n        async def request_handler(context: PlaywrightCrawlingContext) -> None:\n            # For some sites, where the content loads dynamically,\n            # it is needed to scroll the page to load all content.\n            # It slows down the crawling, but ensures that all content is loaded.\n            await context.infinite_scroll()\n            await context.enqueue_links(strategy='same-domain')\n\n        await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Use the local wayback server as a proxy\n        proxy_configuration=ProxyConfiguration(proxy_urls=['http://localhost:8080/']),\n        # Ignore the HTTPS errors if you have not followed pywb CA setup instructions\n        browser_launch_options={'ignore_https_errors': True},\n        max_requests_per_crawl=10,\n        headless=False,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Archiving {context.request.url} ...')\n        # For some sites, where the content loads dynamically,\n        # it is needed to scroll the page to load all content.\n        # It slows down the crawling, but ensures that all content is loaded.\n        await context.infinite_scroll()\n        await context.enqueue_links(strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/error_handling/change_handle_error_status.py",
    "content": "import asyncio\nimport json\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors import HttpStatusCodeError\nfrom crawlee.sessions import SessionPool\n\n# Using a placeholder refresh token for this example\nREFRESH_TOKEN = 'PLACEHOLDER'\nUNAUTHORIZED_CODE = 401\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=2,\n        # Only treat 403 as a blocking status code, not 401\n        session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),\n        # Don't treat 401 responses as errors\n        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Now we can handle 401 responses ourselves\n        if context.http_response.status_code == UNAUTHORIZED_CODE:\n            # Get a fresh access token\n            headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}\n            response = await context.send_request(\n                'https://placeholder.org/refresh', headers=headers\n            )\n            data = json.loads(await response.read())\n            # Add the new token to our `Request` headers\n            context.request.headers |= HttpHeaders(\n                {'authorization': f'Bearer {data[\"access_token\"]}'},\n            )\n            # Trigger a retry with our updated headers\n            raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)\n\n    await crawler.run(['http://httpbingo.org/status/401'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/error_handling/disable_retry.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors import HttpStatusCodeError, SessionError\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(max_request_retries=5)\n\n    # Create a parsing error for demonstration\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        raise ValueError('Simulated parsing error')\n\n    # This handler runs before any retry attempts\n    @crawler.error_handler\n    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        context.log.error(f'Failed request {context.request.url}')\n        # Only allow retries for network-related errors\n        if not isinstance(error, (SessionError, HttpStatusCodeError)):\n            context.log.error('Non-network error detected')\n            # Stop further retry attempts for this `Request`\n            context.request.no_retry = True\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/error_handling/handle_proxy_error.py",
    "content": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors import ProxyError\n\n\nasync def main() -> None:\n    # Set how many session rotations will happen before calling the error handler\n    # when ProxyError occurs\n    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)\n\n    # For this example, we'll create a proxy error in our handler\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        raise ProxyError('Simulated proxy error')\n\n    # This handler runs after all retry attempts are exhausted\n    @crawler.failed_request_handler\n    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        context.log.error(f'Failed request {context.request.url}, after 5 rotations')\n        request = context.request\n        # For proxy errors, we can add a new `Request` to try again\n        if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'):\n            context.log.info(f'Retrying {request.url} ...')\n            # Create a new `Request` with a modified key to avoid deduplication\n            new_request = Request.from_url(\n                request.url, unique_key=f'retry{request.unique_key}'\n            )\n\n            # Add the new `Request` to the `Queue`\n            rq = await crawler.get_request_manager()\n            await rq.add_request(new_request)\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n\n\nasync def main() -> None:\n    http_client = CurlImpersonateHttpClient(\n        # Optional additional keyword arguments for `curl_cffi.requests.AsyncSession`.\n        timeout=10,\n        impersonate='chrome131',\n    )\n\n    crawler = ParselCrawler(\n        http_client=http_client,\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links from the page.\n        await context.enqueue_links()\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.css('title::text').get(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_clients/parsel_httpx_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import HttpxHttpClient\n\n\nasync def main() -> None:\n    http_client = HttpxHttpClient(\n        # Optional additional keyword arguments for `httpx.AsyncClient`.\n        timeout=10,\n        follow_redirects=True,\n    )\n\n    crawler = ParselCrawler(\n        http_client=http_client,\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links from the page.\n        await context.enqueue_links()\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.css('title::text').get(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_clients/parsel_impit_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import ImpitHttpClient\n\n\nasync def main() -> None:\n    http_client = ImpitHttpClient(\n        # Optional additional keyword arguments for `impit.AsyncClient`.\n        http3=True,\n        browser='firefox',\n        verify=True,\n    )\n\n    crawler = ParselCrawler(\n        http_client=http_client,\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Enqueue all links from the page.\n        await context.enqueue_links()\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.css('title::text').get(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/__init__.py",
    "content": ""
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/beautifulsoup_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # Create a BeautifulSoupCrawler instance\n    crawler = BeautifulSoupCrawler(\n        # Limit the crawl to 10 requests\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract data using BeautifulSoup\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Push extracted data to the dataset\n        await context.push_data(data)\n\n        # Enqueue links found on the page for further crawling\n        await context.enqueue_links()\n\n    # Run the crawler\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/custom_crawler_example.py",
    "content": ""
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/http_example.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    # Create an HttpCrawler instance - no automatic parsing\n    crawler = HttpCrawler(\n        # Limit the crawl to 10 requests\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Get the raw response content\n        response_body = await context.http_response.read()\n        response_text = response_body.decode('utf-8')\n\n        # Extract title manually using regex (since we don't have a parser)\n        title_match = re.search(\n            r'<title[^>]*>([^<]+)</title>', response_text, re.IGNORECASE\n        )\n        title = title_match.group(1).strip() if title_match else None\n\n        # Extract basic information\n        data = {\n            'url': context.request.url,\n            'title': title,\n        }\n\n        # Push extracted data to the dataset\n        await context.push_data(data)\n\n        # Simple link extraction for further crawling\n        href_pattern = r'href=[\"\\']([^\"\\']+)[\"\\']'\n        matches = re.findall(href_pattern, response_text, re.IGNORECASE)\n\n        # Enqueue first few links found (limit to avoid too many requests)\n        for href in matches[:3]:\n            if href.startswith('http') and 'crawlee.dev' in href:\n                await context.add_requests([href])\n\n    # Run the crawler\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/lexbor_parser.py",
    "content": "import asyncio\n\nfrom pydantic import ValidationError\nfrom selectolax.lexbor import LexborHTMLParser\nfrom yarl import URL\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=1,\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Parse the HTML content using Selectolax with Lexbor backend.\n        parsed_html = LexborHTMLParser(await context.http_response.read())\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': parsed_html.css_first('title').text(),\n            'h1s': [h1.text() for h1 in parsed_html.css('h1')],\n            'h2s': [h2.text() for h2 in parsed_html.css('h2')],\n            'h3s': [h3.text() for h3 in parsed_html.css('h3')],\n        }\n        await context.push_data(data)\n\n        # Css selector to extract valid href attributes.\n        links_selector = (\n            'a[href]:not([href^=\"#\"]):not([href^=\"javascript:\"]):not([href^=\"mailto:\"])'\n        )\n        base_url = URL(context.request.url)\n        extracted_requests = []\n\n        # Extract links.\n        for item in parsed_html.css(links_selector):\n            href = item.attributes.get('href')\n            if not href:\n                continue\n\n            # Convert relative URLs to absolute if needed.\n            url = str(base_url.join(URL(href)))\n            try:\n                request = Request.from_url(url)\n            except ValidationError as exc:\n                context.log.warning(f'Skipping invalid URL \"{url}\": {exc}')\n                continue\n            extracted_requests.append(request)\n\n        # Add extracted requests to the queue with the 
same-domain strategy.\n        await context.add_requests(extracted_requests, strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/lxml_parser.py",
    "content": "import asyncio\n\nfrom lxml import html\nfrom pydantic import ValidationError\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=1,\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Parse the HTML content using lxml.\n        parsed_html = html.fromstring(await context.http_response.read())\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': parsed_html.findtext('.//title'),\n            'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')],\n            'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')],\n            'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')],\n        }\n        await context.push_data(data)\n\n        # Convert relative URLs to absolute before extracting links.\n        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)\n\n        # Xpath 1.0 selector for extracting valid href attributes.\n        links_xpath = (\n            '//a/@href[not(starts-with(., \"#\")) '\n            'and not(starts-with(., \"javascript:\")) '\n            'and not(starts-with(., \"mailto:\"))]'\n        )\n\n        extracted_requests = []\n\n        # Extract links.\n        for url in parsed_html.xpath(links_xpath):\n            try:\n                request = Request.from_url(url)\n            except ValidationError as exc:\n                context.log.warning(f'Skipping invalid URL \"{url}\": {exc}')\n                continue\n            extracted_requests.append(request)\n\n        # Add extracted requests to the queue with the same-domain strategy.\n        await 
context.add_requests(extracted_requests, strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py",
    "content": "import asyncio\n\nfrom lxml import html\nfrom pydantic import ValidationError\nfrom saxonche import PySaxonProcessor\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=1,\n        max_requests_per_crawl=10,\n    )\n\n    # Create Saxon processor once and reuse across requests.\n    saxon_proc = PySaxonProcessor(license=False)\n    xpath_proc = saxon_proc.new_xpath_processor()\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Parse HTML with lxml.\n        parsed_html = html.fromstring(await context.http_response.read())\n        # Convert relative URLs to absolute before extracting links.\n        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)\n        # Convert parsed HTML to XML for Saxon processing.\n        xml = html.tostring(parsed_html, encoding='unicode', method='xml')\n        # Parse XML with Saxon.\n        parsed_xml = saxon_proc.parse_xml(xml_text=xml)\n        # Set the parsed context for XPath evaluation.\n        xpath_proc.set_context(xdm_item=parsed_xml)\n\n        # Extract data using XPath 2.0 string() function.\n        data = {\n            'url': context.request.url,\n            'title': xpath_proc.evaluate_single('.//title/string()'),\n            'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],\n            'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],\n            'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],\n        }\n        await context.push_data(data)\n\n        # XPath 2.0 with distinct-values() to get unique links and remove fragments.\n        links_xpath = \"\"\"\n            distinct-values(\n                for $href in //a/@href[\n              
      not(starts-with(., \"#\"))\n                    and not(starts-with(., \"javascript:\"))\n                    and not(starts-with(., \"mailto:\"))\n                ]\n                return replace($href, \"#.*$\", \"\")\n            )\n        \"\"\"\n\n        extracted_requests = []\n\n        # Extract links.\n        for item in xpath_proc.evaluate(links_xpath) or []:\n            url = item.string_value\n            try:\n                request = Request.from_url(url)\n            except ValidationError as exc:\n                context.log.warning(f'Skipping invalid URL \"{url}\": {exc}')\n                continue\n            extracted_requests.append(request)\n\n        # Add extracted requests to the queue with the same-domain strategy.\n        await context.add_requests(extracted_requests, strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/parsel_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n    # Create a ParselCrawler instance\n    crawler = ParselCrawler(\n        # Limit the crawl to 10 requests\n        max_requests_per_crawl=10,\n    )\n\n    # Define the default request handler\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract data using Parsel's XPath and CSS selectors\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n\n        # Push extracted data to the dataset\n        await context.push_data(data)\n\n        # Enqueue links found on the page for further crawling\n        await context.enqueue_links()\n\n    # Run the crawler\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/pyquery_parser.py",
    "content": "import asyncio\n\nfrom pydantic import ValidationError\nfrom pyquery import PyQuery\nfrom yarl import URL\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=1,\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Parse the HTML content using PyQuery.\n        parsed_html = PyQuery(await context.http_response.read())\n\n        # Extract data using jQuery-style selectors.\n        data = {\n            'url': context.request.url,\n            'title': parsed_html('title').text(),\n            'h1s': [h1.text() for h1 in parsed_html('h1').items()],\n            'h2s': [h2.text() for h2 in parsed_html('h2').items()],\n            'h3s': [h3.text() for h3 in parsed_html('h3').items()],\n        }\n        await context.push_data(data)\n\n        # Css selector to extract valid href attributes.\n        links_selector = (\n            'a[href]:not([href^=\"#\"]):not([href^=\"javascript:\"]):not([href^=\"mailto:\"])'\n        )\n        base_url = URL(context.request.url)\n\n        extracted_requests = []\n\n        # Extract links.\n        for item in parsed_html(links_selector).items():\n            href = item.attr('href')\n            if not href:\n                continue\n\n            # Convert relative URLs to absolute if needed.\n            url = str(base_url.join(URL(str(href))))\n            try:\n                request = Request.from_url(url)\n            except ValidationError as exc:\n                context.log.warning(f'Skipping invalid URL \"{url}\": {exc}')\n                continue\n            extracted_requests.append(request)\n\n        # Add extracted requests to the queue with the same-domain strategy.\n        await 
context.add_requests(extracted_requests, strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/scrapling_parser.py",
    "content": "import asyncio\n\nfrom pydantic import ValidationError\nfrom scrapling.parser import Selector\nfrom yarl import URL\n\nfrom crawlee import Request\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_request_retries=1,\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Parse the HTML content using Scrapling.\n        page = Selector(await context.http_response.read(), url=context.request.url)\n\n        # Extract data using Xpath selectors with .get_all_text method for full text\n        # content.\n        title_el = page.xpath_first('//title')\n        data = {\n            'url': context.request.url,\n            'title': title_el.text if isinstance(title_el, Selector) else title_el,\n            'h1s': [\n                h1.get_all_text() if isinstance(h1, Selector) else h1\n                for h1 in page.xpath('//h1')\n            ],\n            'h2s': [\n                h2.get_all_text() if isinstance(h2, Selector) else h2\n                for h2 in page.xpath('//h2')\n            ],\n            'h3s': [\n                h3.get_all_text() if isinstance(h3, Selector) else h3\n                for h3 in page.xpath('//h3')\n            ],\n        }\n        await context.push_data(data)\n\n        # Css selector to extract valid href attributes.\n        links_selector = (\n            'a[href]:not([href^=\"#\"]):not([href^=\"javascript:\"]):not([href^=\"mailto:\"])'\n        )\n        base_url = URL(context.request.url)\n        extracted_requests = []\n\n        # Extract links.\n        for item in page.css(links_selector):\n            href = item.attrib.get('href') if isinstance(item, Selector) else None\n            if not href:\n                continue\n\n            # Convert 
relative URLs to absolute if needed.\n            url = str(base_url.join(URL(href)))\n            try:\n                request = Request.from_url(url)\n            except ValidationError as exc:\n                context.log.warning(f'Skipping invalid URL \"{url}\": {exc}')\n                continue\n            extracted_requests.append(request)\n\n        # Add extracted requests to the queue with the same-domain strategy.\n        await context.add_requests(extracted_requests, strategy='same-domain')\n\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    AdaptivePlaywrightCrawlingContext,\n)\n\nfrom .selectolax_parser import SelectolaxLexborParser\n\n\nasync def main() -> None:\n    crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler(\n        max_requests_per_crawl=10,\n        # Use custom Selectolax parser for static content parsing.\n        static_parser=SelectolaxLexborParser(),\n    )\n\n    @crawler.router.default_handler\n    async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        data = {\n            'url': context.request.url,\n            'title': await context.query_selector_one('title'),\n        }\n\n        await context.push_data(data)\n\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/selectolax_context.py",
    "content": "from dataclasses import dataclass, fields\n\nfrom selectolax.lexbor import LexborHTMLParser\nfrom typing_extensions import Self\n\nfrom crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext\n\n\n# Custom context for the Selectolax parser. You can add your own methods here\n# to facilitate working with the parsed document.\n@dataclass(frozen=True)\nclass SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):\n    \"\"\"Crawling context providing access to the parsed page.\n\n    This context is passed to request handlers and includes all standard\n    context methods (push_data, enqueue_links, etc.) plus custom helpers.\n    \"\"\"\n\n    @property\n    def parser(self) -> LexborHTMLParser:\n        \"\"\"Convenient alias for accessing the parsed document.\"\"\"\n        return self.parsed_content\n\n    @classmethod\n    def from_parsed_http_crawling_context(\n        cls, context: ParsedHttpCrawlingContext[LexborHTMLParser]\n    ) -> Self:\n        \"\"\"Create a custom context from the base context.\n\n        Copies all fields from the base context to preserve framework\n        functionality while adding a custom interface.\n        \"\"\"\n        return cls(\n            **{field.name: getattr(context, field.name) for field in fields(context)}\n        )\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/selectolax_crawler.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom selectolax.lexbor import LexborHTMLParser, LexborNode\n\nfrom crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions\n\nfrom .selectolax_context import SelectolaxLexborContext\nfrom .selectolax_parser import SelectolaxLexborParser\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from typing_extensions import Unpack\n\n    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext\n\n\n# Custom crawler using custom context. It is optional and you can use\n# AbstractHttpCrawler directly with SelectolaxLexborParser if you don't need\n# any custom context methods.\nclass SelectolaxLexborCrawler(\n    AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode]\n):\n    \"\"\"Custom crawler using Selectolax Lexbor for HTML parsing.\"\"\"\n\n    def __init__(\n        self,\n        **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]],\n    ) -> None:\n        # Final step converts the base context to custom context type.\n        async def final_step(\n            context: ParsedHttpCrawlingContext[LexborHTMLParser],\n        ) -> AsyncGenerator[SelectolaxLexborContext, None]:\n            # Yield the custom context, which wraps the base context with additional\n            # functionality.\n            yield SelectolaxLexborContext.from_parsed_http_crawling_context(context)\n\n        # Build context pipeline: HTTP request -> parsing -> custom context.\n        kwargs['_context_pipeline'] = (\n            self._create_static_content_crawler_pipeline().compose(final_step)\n        )\n        super().__init__(\n            parser=SelectolaxLexborParser(),\n            **kwargs,\n        )\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py",
    "content": "import asyncio\n\nfrom .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler\n\n\nasync def main() -> None:\n    crawler = SelectolaxLexborCrawler(\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def handle_request(context: SelectolaxLexborContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        data = {\n            'url': context.request.url,\n            'title': context.parser.css_first('title').text(),\n        }\n\n        await context.push_data(data)\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/http_crawlers/selectolax_parser.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nfrom selectolax.lexbor import LexborHTMLParser, LexborNode\nfrom typing_extensions import override\n\nfrom crawlee.crawlers._abstract_http import AbstractHttpParser\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Sequence\n\n    from crawlee.http_clients import HttpResponse\n\n\nclass SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]):\n    \"\"\"Parser for parsing HTTP response using Selectolax Lexbor.\"\"\"\n\n    @override\n    async def parse(self, response: HttpResponse) -> LexborHTMLParser:\n        \"\"\"Parse HTTP response body into a document object.\"\"\"\n        response_body = await response.read()\n        # Run parsing in a thread to avoid blocking the event loop.\n        return await asyncio.to_thread(LexborHTMLParser, response_body)\n\n    @override\n    async def parse_text(self, text: str) -> LexborHTMLParser:\n        \"\"\"Parse raw HTML string into a document object.\"\"\"\n        return LexborHTMLParser(text)\n\n    @override\n    async def select(\n        self, parsed_content: LexborHTMLParser, selector: str\n    ) -> Sequence[LexborNode]:\n        \"\"\"Select elements matching a CSS selector.\"\"\"\n        return tuple(item for item in parsed_content.css(selector))\n\n    @override\n    def is_matching_selector(\n        self, parsed_content: LexborHTMLParser, selector: str\n    ) -> bool:\n        \"\"\"Check if any element matches the selector.\"\"\"\n        return parsed_content.css_first(selector) is not None\n\n    @override\n    def find_links(\n        self, parsed_content: LexborHTMLParser, selector: str, attribute: str\n    ) -> Iterable[str]:\n        \"\"\"Extract href attributes from elements matching the selector.\n\n        Used by `enqueue_links` helper to discover URLs.\n        \"\"\"\n        link: LexborNode\n        urls: list[str] = []\n        for link in 
parsed_content.css(selector):\n            url = link.attributes.get(attribute)\n            if url:\n                urls.append(url.strip())\n        return urls\n"
  },
  {
    "path": "docs/guides/code_examples/login_crawler/http_login.py",
    "content": "import asyncio\nimport json\nfrom datetime import datetime, timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers import (\n    HttpCrawler,\n    HttpCrawlingContext,\n)\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        max_requests_per_crawl=10,\n        # Configure to use a single persistent session throughout the crawl\n        max_session_rotations=0,\n        # Limit request rate to avoid triggering anti-scraping measures\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30),\n        session_pool=SessionPool(\n            max_pool_size=1,\n            create_session_settings={\n                # Set high value to ensure the session isn't replaced during crawling\n                'max_usage_count': 999_999,\n                # Set high value to prevent session expiration during crawling\n                'max_age': timedelta(hours=999_999),\n                # Higher error tolerance before the session is considered blocked\n                # Make sure you implement proper error handling in your code\n                'max_error_score': 100,\n            },\n        ),\n    )\n\n    # Default request handler for normal page processing\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # Specialized handler for the login API request\n    @crawler.router.handler('login')\n    async def login_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing login at {context.request.url} ...')\n\n        # Verify that a session is available before proceeding\n        if not context.session:\n            raise RuntimeError('Session not found')\n\n        # Parse the API response containing authentication tokens and user data\n        data = json.loads(await context.http_response.read())\n\n       
 # Extract authentication data from the response\n        token = data['token']\n        expires = data['expires'].replace('Z', '+00:00')\n        expires_int = int(datetime.fromisoformat(expires).timestamp())\n        user_id = data['userId']\n        username = data['username']\n\n        # Set authentication cookies in the session that will be used\n        # for subsequent requests\n        context.session.cookies.set(name='token', value=token, expires=expires_int)\n        context.session.cookies.set(name='userID', value=user_id)\n        context.session.cookies.set(name='userName', value=username)\n\n        # After successful authentication, continue crawling with the\n        # authenticated session\n        await context.add_requests(['https://demoqa.com/BookStore/v1/Books'])\n\n    # Create a POST request to the authentication API endpoint\n    # This will trigger the login_handler when executed\n    request = Request.from_url(\n        'https://demoqa.com/Account/v1/Login',\n        label='login',\n        method='POST',\n        payload=json.dumps(\n            {'userName': 'crawlee_test', 'password': 'Test1234!'}\n        ).encode(),\n        headers={'Content-Type': 'application/json'},\n    )\n\n    # Start the crawling process with the login request\n    await crawler.run([request])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/login_crawler/playwright_login.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n)\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        max_requests_per_crawl=10,\n        headless=True,\n        browser_type='chromium',\n        # We only have one session and it shouldn't rotate\n        max_session_rotations=0,\n        # Limit crawling intensity to avoid blocking\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=30),\n        session_pool=SessionPool(\n            # Limit the pool to one session\n            max_pool_size=1,\n            create_session_settings={\n                # High value for session usage limit\n                'max_usage_count': 999_999,\n                # High value for session lifetime\n                'max_age': timedelta(hours=999_999),\n                # High score allows the session to encounter more errors\n                # before crawlee decides the session is blocked\n                # Make sure you know how to handle these errors\n                'max_error_score': 100,\n            },\n        ),\n    )\n\n    # The main handler for processing requests\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # A handler for the login page\n    @crawler.router.handler('login')\n    async def login_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing login {context.request.url} ...')\n\n        # Check if the session is available\n        if not context.session:\n            raise RuntimeError('Session not found')\n\n        # Entering data into the form, `delay` to simulate human typing\n        # Without this, the data will be entered instantly\n        await 
context.page.type('#userName', 'crawlee_test', delay=100)\n        await context.page.type('#password', 'Test1234!', delay=100)\n        await context.page.click('#login', delay=100)\n\n        # Wait for an element confirming that we have successfully\n        # logged in to the site\n        await context.page.locator('#userName-value').first.wait_for(state='visible')\n        context.log.info('Login successful!')\n\n        # Moving on to the basic flow of crawling\n        await context.add_requests(['https://demoqa.com/books'])\n\n    # We start crawling with login. This is necessary to access the rest of the pages\n    await crawler.run([Request.from_url('https://demoqa.com/login', label='login')])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler/browser_configuration_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        headless=False,\n        browser_type='chromium',\n        # Browser launch options\n        browser_launch_options={\n            # To use the `msedge` channel, you need to install it first:\n            # `playwright install msedge`\n            'channel': 'msedge',\n            'slow_mo': 200,\n        },\n        # Context launch options, applied to each page as it is created\n        browser_new_context_options={\n            'color_scheme': 'dark',\n            # Set headers\n            'extra_http_headers': {\n                'Custom-Header': 'my-header',\n                'Accept-Language': 'en',\n            },\n            # Set only User Agent\n            'user_agent': 'My-User-Agent',\n        },\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom typing import TYPE_CHECKING, Any\n\nfrom crawlee.browsers import BrowserPool\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storages import KeyValueStore\n\nif TYPE_CHECKING:\n    from crawlee.browsers._browser_controller import BrowserController\n    from crawlee.browsers._types import CrawleePage\n    from crawlee.proxy_configuration import ProxyInfo\n\nlogger = logging.getLogger(__name__)\n\n\nasync def main() -> None:\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.pre_page_create_hook\n        async def log_page_init(\n            page_id: str,\n            _browser_controller: BrowserController,\n            _browser_new_context_options: dict[str, Any],\n            _proxy_info: ProxyInfo | None,\n        ) -> None:\n            \"\"\"Log when a new page is about to be created.\"\"\"\n            logger.info(f'Creating page {page_id}...')\n\n        @browser_pool.post_page_create_hook\n        async def set_viewport(\n            crawlee_page: CrawleePage, _browser_controller: BrowserController\n        ) -> None:\n            \"\"\"Set a fixed viewport size on each newly created page.\"\"\"\n            await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024})\n\n        @browser_pool.pre_page_close_hook\n        async def save_screenshot(\n            crawlee_page: CrawleePage, _browser_controller: BrowserController\n        ) -> None:\n            \"\"\"Save a screenshot to KeyValueStore before each page is closed.\"\"\"\n            kvs = await KeyValueStore.open()\n\n            screenshot = await crawlee_page.page.screenshot()\n            await kvs.set_value(\n                key=f'screenshot-{crawlee_page.id}',\n                value=screenshot,\n                content_type='image/png',\n            )\n            logger.info(f'Saved screenshot for page {crawlee_page.id}.')\n\n   
     @browser_pool.post_page_close_hook\n        async def log_page_closed(\n            page_id: str, _browser_controller: BrowserController\n        ) -> None:\n            \"\"\"Log after each page is closed.\"\"\"\n            logger.info(f'Page {page_id} closed successfully.')\n\n        crawler = PlaywrightCrawler(\n            browser_pool=browser_pool,\n            max_requests_per_crawl=5,\n        )\n\n        @crawler.router.default_handler\n        async def request_handler(context: PlaywrightCrawlingContext) -> None:\n            context.log.info(f'Processing {context.request.url} ...')\n\n            await context.enqueue_links()\n\n        # Run the crawler with the initial list of URLs.\n        await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler/multiple_launch_example.py",
    "content": "import asyncio\n\nfrom crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    # Create a plugin for each required browser.\n    plugin_chromium = PlaywrightBrowserPlugin(\n        browser_type='chromium', max_open_pages_per_browser=1\n    )\n    plugin_firefox = PlaywrightBrowserPlugin(\n        browser_type='firefox', max_open_pages_per_browser=1\n    )\n\n    crawler = PlaywrightCrawler(\n        browser_pool=BrowserPool(plugins=[plugin_chromium, plugin_firefox]),\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        browser_name = (\n            context.page.context.browser.browser_type.name\n            if context.page.context.browser\n            else 'undefined'\n        )\n        context.log.info(f'Processing {context.request.url} with {browser_name} ...')\n\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev', 'https://apify.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler/navigation_hooks_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPostNavCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\nfrom crawlee.errors import SessionError\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(max_requests_per_crawl=10)\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        await context.enqueue_links()\n\n    @crawler.pre_navigation_hook\n    async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:\n        context.log.info(f'Navigating to {context.request.url} ...')\n\n        # block stylesheets, images, fonts and other static assets\n        # to speed up page loading\n        await context.block_requests()\n\n    @crawler.post_navigation_hook\n    async def custom_captcha_check(context: PlaywrightPostNavCrawlingContext) -> None:\n        # check if the page contains a captcha\n        captcha_element = context.page.locator('input[name=\"captcha\"]').first\n        if await captcha_element.is_visible():\n            context.log.warning('Captcha detected! Skipping the page.')\n            raise SessionError('Captcha detected')\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py",
    "content": "import asyncio\n\nfrom crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin\nfrom crawlee.crawlers import PlaywrightCrawler\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        browser_pool=BrowserPool(\n            plugins=[\n                PlaywrightBrowserPlugin(\n                    browser_type='chromium',\n                    browser_launch_options={\n                        'headless': False,\n                        'channel': 'msedge',\n                        'slow_mo': 200,\n                    },\n                    browser_new_context_options={\n                        'color_scheme': 'dark',\n                        'extra_http_headers': {\n                            'Custom-Header': 'my-header',\n                            'Accept-Language': 'en',\n                        },\n                        'user_agent': 'My-User-Agent',\n                    },\n                )\n            ]\n        )\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_adaptive/handler.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        # Locate element h2 within 5 seconds\n        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n        # Do stuff with element found by the selector\n        context.log.info(h2)\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        # Arguments relevant only for PlaywrightCrawler\n        playwright_crawler_specific_kwargs={\n            'headless': False,\n            'browser_type': 'chromium',\n        },\n        # Common arguments relevant to all crawlers\n        max_crawl_depth=5,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import AdaptivePlaywrightCrawler\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        # Arguments relevant only for PlaywrightCrawler\n        playwright_crawler_specific_kwargs={\n            'headless': False,\n            'browser_type': 'chromium',\n        },\n        # Common arguments relevant to all crawlers\n        max_crawl_depth=5,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py",
    "content": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee._types import RequestHandlerRunResult\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    RenderingType,\n    RenderingTypePrediction,\n    RenderingTypePredictor,\n)\n\n\nclass CustomRenderingTypePredictor(RenderingTypePredictor):\n    def __init__(self) -> None:\n        super().__init__()\n\n        self._learning_data = list[tuple[Request, RenderingType]]()\n\n    def predict(self, request: Request) -> RenderingTypePrediction:\n        # Some custom logic that produces some `RenderingTypePrediction`\n        # based on the `request` input.\n        rendering_type: RenderingType = (\n            'static' if 'abc' in request.url else 'client only'\n        )\n\n        return RenderingTypePrediction(\n            #  Recommends `static` rendering type -> HTTP-based sub crawler will be used.\n            rendering_type=rendering_type,\n            # Recommends that both sub crawlers should run with 20% chance. When both sub\n            # crawlers are running, the predictor can compare results and learn.\n            # High number means that predictor is not very confident about the\n            # `rendering_type`, low number means that predictor is very confident.\n            detection_probability_recommendation=0.2,\n        )\n\n    def store_result(self, request: Request, rendering_type: RenderingType) -> None:\n        # This function allows predictor to store new learning data and retrain itself\n        # if needed. 
`request` is input for prediction and `rendering_type` is the correct\n        # prediction.\n        self._learning_data.append((request, rendering_type))\n        # retrain\n\n\ndef result_checker(result: RequestHandlerRunResult) -> bool:\n    # Some function that inspects produced `result` and returns `True` if the result\n    # is correct.\n    return bool(result)  # Check something on result\n\n\ndef result_comparator(\n    result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult\n) -> bool:\n    # Some function that inspects two results and returns `True` if they are\n    # considered equivalent. It is used when comparing results produced by HTTP-based\n    # sub crawler and playwright based sub crawler.\n    return (\n        result_1.push_data_calls == result_2.push_data_calls\n    )  #  For example compare `push_data` calls.\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        rendering_type_predictor=CustomRenderingTypePredictor(),\n        result_checker=result_checker,\n        result_comparator=result_comparator,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py",
    "content": "import asyncio\n\nfrom playwright.async_api import Route\n\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    AdaptivePlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()\n\n    @crawler.pre_navigation_hook\n    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        \"\"\"Hook executed both in static sub crawler and playwright sub crawler.\n\n        Trying to access `context.page` in this hook would raise `AdaptiveContextError`\n        for pages crawled without playwright.\n        \"\"\"\n        context.log.info(f'pre navigation hook for: {context.request.url}')\n\n    @crawler.pre_navigation_hook(playwright_only=True)\n    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        \"\"\"Hook executed only in playwright sub crawler.\"\"\"\n\n        async def some_routing_function(route: Route) -> None:\n            await route.continue_()\n\n        await context.page.route('*/**', some_routing_function)\n        context.log.info(\n            f'Playwright only pre navigation hook for: {context.request.url}'\n        )\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_stagehand/__init__.py",
    "content": ""
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom stagehand.context import StagehandContext\nfrom typing_extensions import override\n\nfrom crawlee.browsers import (\n    PlaywrightBrowserController,\n    PlaywrightBrowserPlugin,\n    PlaywrightPersistentBrowser,\n)\n\nfrom .support_classes import CrawleeStagehandPage\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n\n    from playwright.async_api import Page\n    from stagehand import Stagehand\n\n    from crawlee.proxy_configuration import ProxyInfo\n\n\nclass StagehandBrowserController(PlaywrightBrowserController):\n    @override\n    def __init__(\n        self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any\n    ) -> None:\n        # Initialize with browser context instead of browser instance\n        super().__init__(browser, **kwargs)\n\n        self._stagehand = stagehand\n        self._stagehand_context: StagehandContext | None = None\n\n    @override\n    async def new_page(\n        self,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n        proxy_info: ProxyInfo | None = None,\n    ) -> Page:\n        # Initialize browser context if not already done\n        if not self._browser_context:\n            self._browser_context = await self._create_browser_context(\n                browser_new_context_options=browser_new_context_options,\n                proxy_info=proxy_info,\n            )\n\n        # Initialize Stagehand context if not already done\n        if not self._stagehand_context:\n            self._stagehand_context = await StagehandContext.init(\n                self._browser_context, self._stagehand\n            )\n\n        # Create a new page using Stagehand context\n        page = await self._stagehand_context.new_page()\n\n        pw_page = page._page  # noqa: SLF001\n\n        # Handle page close event\n        
pw_page.on(event='close', f=self._on_page_close)\n\n        # Update internal state\n        self._pages.append(pw_page)\n        self._last_page_opened_at = datetime.now(timezone.utc)\n\n        self._total_opened_pages += 1\n\n        # Wrap StagehandPage to provide Playwright Page interface\n        return cast('Page', CrawleeStagehandPage(page))\n\n\nclass StagehandPlugin(PlaywrightBrowserPlugin):\n    \"\"\"Browser plugin that integrates Stagehand with Crawlee's browser management.\"\"\"\n\n    @override\n    def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None:\n        super().__init__(**kwargs)\n\n        self._stagehand = stagehand\n\n    @override\n    async def new_browser(self) -> StagehandBrowserController:\n        if not self._playwright:\n            raise RuntimeError('Playwright browser plugin is not initialized.')\n\n        browser = PlaywrightPersistentBrowser(\n            # Stagehand can run only on a Chromium-based browser.\n            self._playwright.chromium,\n            self._user_data_dir,\n            self._browser_launch_options,\n        )\n\n        # Return custom controller with Stagehand\n        return StagehandBrowserController(\n            browser=browser,\n            stagehand=self._stagehand,\n            header_generator=None,\n            fingerprint_generator=None,\n        )\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport os\nfrom typing import cast\n\nfrom stagehand import StagehandConfig, StagehandPage\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.browsers import BrowserPool\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\nfrom .browser_classes import StagehandPlugin\nfrom .support_classes import CrawleeStagehand\n\n\nasync def main() -> None:\n    # Configure local Stagehand with Gemini model\n    config = StagehandConfig(\n        env='LOCAL',\n        model_name='google/gemini-2.5-flash-preview-05-20',\n        model_api_key=os.getenv('GEMINI_API_KEY'),\n    )\n\n    # Create Stagehand instance\n    stagehand = CrawleeStagehand(config)\n\n    # Create crawler with custom browser pool using Stagehand\n    crawler = PlaywrightCrawler(\n        # Limit the crawl to max requests. Remove or increase it for crawling all links.\n        max_requests_per_crawl=10,\n        # Custom browser pool. 
Gives users full control over browsers used by the crawler.\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10),\n        browser_pool=BrowserPool(\n            plugins=[\n                StagehandPlugin(stagehand, browser_launch_options={'headless': True})\n            ],\n        ),\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Cast to StagehandPage for proper type hints in IDE\n        page = cast('StagehandPage', context.page)\n\n        # Use regular Playwright method\n        playwright_title = await page.title()\n        context.log.info(f'Playwright page title: {playwright_title}')\n\n        # highlight-start\n        # Use AI-powered extraction with natural language\n        gemini_title = await page.extract('Extract page title')\n        context.log.info(f'Gemini page title: {gemini_title}')\n        # highlight-end\n\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any\n\nfrom stagehand import Stagehand, StagehandPage\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n\nclass CrawleeStagehandPage:\n    \"\"\"StagehandPage wrapper for Crawlee.\"\"\"\n\n    def __init__(self, page: StagehandPage) -> None:\n        self._page = page\n\n    async def goto(\n        self,\n        url: str,\n        *,\n        referer: str | None = None,\n        timeout: int | None = None,\n        wait_until: str | None = None,\n    ) -> Any:\n        \"\"\"Navigate to the specified URL.\"\"\"\n        # Override goto to return navigation result that `PlaywrightCrawler` expects\n        return await self._page._page.goto(  # noqa: SLF001\n            url,\n            referer=referer,\n            timeout=timeout,\n            wait_until=wait_until,\n        )\n\n    def __getattr__(self, name: str) -> Any:\n        \"\"\"Delegate all other methods to the underlying StagehandPage.\"\"\"\n        return getattr(self._page, name)\n\n    async def __aenter__(self) -> CrawleeStagehandPage:\n        \"\"\"Enter the context manager.\"\"\"\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        await self._page.close()\n\n\nclass CrawleeStagehand(Stagehand):\n    \"\"\"Stagehand wrapper for Crawlee to disable the launch of Playwright.\"\"\"\n\n    async def init(self) -> None:\n        # Skip Stagehand's own Playwright initialization\n        # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle\n        self._initialized = True\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/inspecting_bs_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:\n        # Log the proxy used for the current request.\n        context.log.info(f'Proxy for the current request: {context.proxy_info}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/inspecting_pw_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        # Log the proxy used for the current request.\n        context.log.info(f'Proxy for the current request: {context.proxy_info}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/integration_bs_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n        context.log.info(f'Extracted data: {data}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/integration_pw_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n        }\n        context.log.info(f'Extracted data: {data}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/quick_start_example.py",
    "content": "import asyncio\n\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n\n    # The proxy URLs are rotated in a round-robin.\n    proxy_url_1 = await proxy_configuration.new_url()  # http://proxy-1.com/\n    proxy_url_2 = await proxy_configuration.new_url()  # http://proxy-2.com/\n    proxy_url_3 = await proxy_configuration.new_url()  # http://proxy-1.com/\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/session_bs_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = BeautifulSoupCrawler(\n        proxy_configuration=proxy_configuration,\n        use_session_pool=True,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/session_pw_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        proxy_urls=[\n            'http://proxy-1.com/',\n            'http://proxy-2.com/',\n        ]\n    )\n    crawler = PlaywrightCrawler(\n        proxy_configuration=proxy_configuration,\n        use_session_pool=True,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/tiers_bs_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        tiered_proxy_urls=[\n            # No proxy tier.\n            # Optional in case you do not want to use any proxy on lowest tier.\n            [None],\n            # lower tier, cheaper, preferred as long as they work\n            [\n                'http://cheap-datacenter-proxy-1.com/',\n                'http://cheap-datacenter-proxy-2.com/',\n            ],\n            # higher tier, more expensive, used as a fallback\n            [\n                'http://expensive-residential-proxy-1.com/',\n                'http://expensive-residential-proxy-2.com/',\n            ],\n        ]\n    )\n    crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:\n        # Log the proxy used for the current request.\n        context.log.info(f'Proxy for the current request: {context.proxy_info}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/proxy_management/tiers_pw_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def main() -> None:\n    # Create a ProxyConfiguration object and pass it to the crawler.\n    proxy_configuration = ProxyConfiguration(\n        tiered_proxy_urls=[\n            # No proxy tier.\n            # Optional in case you do not want to use any proxy on lowest tier.\n            [None],\n            # lower tier, cheaper, preferred as long as they work\n            [\n                'http://cheap-datacenter-proxy-1.com/',\n                'http://cheap-datacenter-proxy-2.com/',\n            ],\n            # higher tier, more expensive, used as a fallback\n            [\n                'http://expensive-residential-proxy-1.com/',\n                'http://expensive-residential-proxy-2.com/',\n            ],\n        ]\n    )\n    crawler = PlaywrightCrawler(proxy_configuration=proxy_configuration)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        # Log the proxy used for the current request.\n        context.log.info(f'Proxy for the current request: {context.proxy_info}')\n\n    # Run the crawler with the initial list of requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/rl_basic_example.py",
    "content": "import asyncio\n\nfrom crawlee.request_loaders import RequestList\n\n\nasync def main() -> None:\n    # Open the request list, if it does not exist, it will be created.\n    # Leave name empty to use the default request list.\n    request_list = RequestList(\n        name='my-request-list',\n        requests=[\n            'https://apify.com/',\n            'https://crawlee.dev/',\n            'https://crawlee.dev/python/',\n        ],\n    )\n\n    # Fetch and process requests from the request list.\n    while request := await request_list.fetch_next_request():\n        # Do something with it...\n        print(f'Processing {request.url}')\n\n        # And mark it as handled.\n        await request_list.mark_request_as_handled(request)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py",
    "content": "import asyncio\nimport logging\n\nfrom crawlee import service_locator\nfrom crawlee.request_loaders import RequestList\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')\nlogger = logging.getLogger(__name__)\n\n\n# Disable clearing the `KeyValueStore` on each run.\n# This is necessary so that the state keys are not cleared at startup.\n# The recommended way to achieve this behavior is setting the environment variable\n# `CRAWLEE_PURGE_ON_START=0`\nconfiguration = service_locator.get_configuration()\nconfiguration.purge_on_start = False\n\n\nasync def main() -> None:\n    # Open the request list, if it does not exist, it will be created.\n    # Leave name empty to use the default request list.\n    request_list = RequestList(\n        name='my-request-list',\n        requests=[\n            'https://apify.com/',\n            'https://crawlee.dev/',\n            'https://crawlee.dev/python/',\n        ],\n        # Enable persistence\n        persist_state_key='my-persist-state',\n        persist_requests_key='my-persist-requests',\n    )\n\n    # We receive only one request.\n    # Each time you run it, it will be a new request until you exhaust the `RequestList`.\n    request = await request_list.fetch_next_request()\n    if request:\n        logger.info(f'Processing request: {request.url}')\n        # Do something with it...\n\n        # And mark it as handled.\n        await request_list.mark_request_as_handled(request)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/rl_tandem_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.request_loaders import RequestList\n\n\nasync def main() -> None:\n    # Create a static request list.\n    request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])\n\n    # highlight-start\n    # Convert the request list to a request manager using the to_tandem method.\n    # It is a tandem with the default request queue.\n    request_manager = await request_list.to_tandem()\n    # highlight-end\n\n    # Create a crawler and pass the request manager to it.\n    crawler = ParselCrawler(\n        request_manager=request_manager,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # New links will be enqueued directly to the queue.\n        await context.enqueue_links()\n\n        # Extract data using Parsel's XPath and CSS selectors.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n\n        # Push extracted data to the dataset.\n        await context.push_data(data)\n\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.request_loaders import RequestList, RequestManagerTandem\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # Create a static request list.\n    request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])\n\n    # Open the default request queue.\n    request_queue = await RequestQueue.open()\n\n    # And combine them into a single request manager.\n    request_manager = RequestManagerTandem(request_list, request_queue)\n\n    # Create a crawler and pass the request manager to it.\n    crawler = ParselCrawler(\n        request_manager=request_manager,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # New links will be enqueued directly to the queue.\n        await context.enqueue_links()\n\n        # Extract data using Parsel's XPath and CSS selectors.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n\n        # Push extracted data to the dataset.\n        await context.push_data(data)\n\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/sitemap_basic_example.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import SitemapRequestLoader\n\n\nasync def main() -> None:\n    # Create an HTTP client for fetching the sitemap.\n    http_client = ImpitHttpClient()\n\n    # Create a sitemap request loader with filtering rules.\n    sitemap_loader = SitemapRequestLoader(\n        sitemap_urls=['https://crawlee.dev/sitemap.xml'],\n        http_client=http_client,\n        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.\n        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.\n    )\n\n    # We work with the loader until we process all relevant links from the sitemap.\n    while request := await sitemap_loader.fetch_next_request():\n        # Do something with it...\n        print(f'Processing {request.url}')\n\n        # And mark it as handled.\n        await sitemap_loader.mark_request_as_handled(request)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py",
    "content": "import asyncio\nimport logging\n\nfrom crawlee import service_locator\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import SitemapRequestLoader\n\nlogging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')\nlogger = logging.getLogger(__name__)\n\n\n# Disable clearing the `KeyValueStore` on each run.\n# This is necessary so that the state keys are not cleared at startup.\n# The recommended way to achieve this behavior is setting the environment variable\n# `CRAWLEE_PURGE_ON_START=0`\nconfiguration = service_locator.get_configuration()\nconfiguration.purge_on_start = False\n\n\nasync def main() -> None:\n    # Create an HTTP client for fetching sitemaps\n    # Use the context manager for `SitemapRequestLoader` to correctly save the state when\n    # the work is completed.\n    async with (\n        ImpitHttpClient() as http_client,\n        SitemapRequestLoader(\n            sitemap_urls=['https://crawlee.dev/sitemap.xml'],\n            http_client=http_client,\n            # Enable persistence\n            persist_state_key='my-persist-state',\n        ) as sitemap_loader,\n    ):\n        # We receive only one request.\n        # Each time you run it, it will be a new request until you exhaust the sitemap.\n        request = await sitemap_loader.fetch_next_request()\n        if request:\n            logger.info(f'Processing request: {request.url}')\n            # Do something with it...\n\n            # And mark it as handled.\n            await sitemap_loader.mark_request_as_handled(request)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/sitemap_tandem_example.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import SitemapRequestLoader\n\n\nasync def main() -> None:\n    # Create an HTTP client for fetching the sitemap.\n    http_client = ImpitHttpClient()\n\n    # Create a sitemap request loader with filtering rules.\n    sitemap_loader = SitemapRequestLoader(\n        sitemap_urls=['https://crawlee.dev/sitemap.xml'],\n        http_client=http_client,\n        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.\n        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.\n    )\n\n    # highlight-start\n    # Convert the sitemap loader into a request manager linked\n    # to the default request queue.\n    request_manager = await sitemap_loader.to_tandem()\n    # highlight-end\n\n    # Create a crawler and pass the request manager to it.\n    crawler = ParselCrawler(\n        request_manager=request_manager,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # New links will be enqueued directly to the queue.\n        await context.enqueue_links()\n\n        # Extract data using Parsel's XPath and CSS selectors.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n\n        # Push extracted data to the dataset.\n        await context.push_data(data)\n\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # Create an HTTP client for fetching the sitemap.\n    http_client = ImpitHttpClient()\n\n    # Create a sitemap request loader with filtering rules.\n    sitemap_loader = SitemapRequestLoader(\n        sitemap_urls=['https://crawlee.dev/sitemap.xml'],\n        http_client=http_client,\n        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.\n        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.\n    )\n\n    # Open the default request queue.\n    request_queue = await RequestQueue.open()\n\n    # And combine them together to a single request manager.\n    request_manager = RequestManagerTandem(sitemap_loader, request_queue)\n\n    # Create a crawler and pass the request manager to it.\n    crawler = ParselCrawler(\n        request_manager=request_manager,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # New links will be enqueued directly to the queue.\n        await context.enqueue_links()\n\n        # Extract data using Parsel's XPath and CSS selectors.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n\n        # Push extracted data to the dataset.\n        await context.push_data(data)\n\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/adaptive_crawler_handlers.py",
    "content": "import asyncio\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    AdaptivePlaywrightCrawlingContext,\n    AdaptivePlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.pre_navigation_hook\n    async def common_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        # Common pre-navigation hook - runs for both HTTP and browser requests.\n        context.request.headers |= HttpHeaders(\n            {'Accept': 'text/html,application/xhtml+xml'},\n        )\n\n    @crawler.pre_navigation_hook(playwright_only=True)\n    async def browser_setup(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        # Playwright-specific pre-navigation hook - runs only when browser is used.\n        await context.page.set_viewport_size({'width': 1280, 'height': 720})\n        if context.block_requests:\n            await context.block_requests(extra_url_patterns=['*.css', '*.js'])\n\n    @crawler.router.default_handler\n    async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        # Extract title using the unified context interface.\n        title_tag = context.parsed_content.find('title')\n        title = title_tag.get_text() if title_tag else None\n\n        # Extract other data consistently across both modes.\n        links = [a.get('href') for a in context.parsed_content.find_all('a', href=True)]\n\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n                'links': links,\n            }\n        )\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/basic_request_handlers.py",
    "content": "import asyncio\n\nfrom crawlee import Request\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.router import Router\n\n\nasync def main() -> None:\n    # Create a custom router instance\n    router = Router[ParselCrawlingContext]()\n\n    # Define the default handler (fallback for requests without specific labels)\n    @router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing homepage: {context.request.url}')\n\n        # Extract page title\n        title = context.selector.css('title::text').get() or 'No title found'\n\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n                'page_type': 'homepage',\n            }\n        )\n\n        # Find and enqueue collection/category links\n        await context.enqueue_links(selector='a[href*=\"/collections/\"]', label='CATEGORY')\n\n    # Define a handler for category pages\n    @router.handler('CATEGORY')\n    async def category_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing category page: {context.request.url}')\n\n        # Extract category information\n        category_title = context.selector.css('h1::text').get() or 'Unknown Category'\n        product_count = len(context.selector.css('.product-item').getall())\n\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'type': 'category',\n                'category_title': category_title,\n                'product_count': product_count,\n                'handler': 'category',\n            }\n        )\n\n        # Enqueue product links from this category\n        await context.enqueue_links(selector='a[href*=\"/products/\"]', label='PRODUCT')\n\n    # Define a handler for product detail pages\n    @router.handler('PRODUCT')\n    async def product_handler(context: 
ParselCrawlingContext) -> None:\n        context.log.info(f'Processing product page: {context.request.url}')\n\n        # Extract detailed product information\n        product_data = {\n            'url': context.request.url,\n            'name': context.selector.css('h1::text').get(),\n            'price': context.selector.css('.price::text').get(),\n            'description': context.selector.css('.product-description p::text').get(),\n            'images': context.selector.css('.product-gallery img::attr(src)').getall(),\n            'in_stock': bool(context.selector.css('.add-to-cart-button').get()),\n            'handler': 'product',\n        }\n\n        await context.push_data(product_data)\n\n    # Create crawler with the router\n    crawler = ParselCrawler(\n        request_handler=router,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    # Start crawling with some initial requests\n    await crawler.run(\n        [\n            # Will use default handler\n            'https://warehouse-theme-metal.myshopify.com/',\n            # Will use category handler\n            Request.from_url(\n                'https://warehouse-theme-metal.myshopify.com/collections/all',\n                label='CATEGORY',\n            ),\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/custom_router_default_only.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.router import Router\n\n\nasync def main() -> None:\n    # Create a custom router instance\n    router = Router[ParselCrawlingContext]()\n\n    # Define only a default handler\n    @router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract page title\n        title = context.selector.css('title::text').get() or 'No title found'\n\n        # Extract and save basic page data\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n            }\n        )\n\n        # Find and enqueue product links for further crawling\n        await context.enqueue_links(\n            selector='a[href*=\"/products/\"]',\n            label='PRODUCT',  # Note: no handler for this label, will use default\n        )\n\n    # Create crawler with the custom router\n    crawler = ParselCrawler(\n        request_handler=router,\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    # Start crawling\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/error_handler.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\nfrom crawlee.errors import HttpStatusCodeError\n\n# HTTP status code constants\nTOO_MANY_REQUESTS = 429\n\n\nasync def main() -> None:\n    # Create a crawler instance\n    crawler = ParselCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract product information (might fail for some pages)\n        product_name = context.selector.css('h1[data-testid=\"product-title\"]::text').get()\n        if not product_name:\n            raise ValueError('Product name not found - might be a non-product page')\n\n        price = context.selector.css('.price::text').get()\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'product_name': product_name,\n                'price': price,\n            }\n        )\n\n    # Error handler - called when an error occurs during request processing\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        error_name = type(error).__name__\n        context.log.warning(f'Error occurred for {context.request.url}: {error_name}')\n\n        # You can modify the request or context here before retry\n        if (\n            isinstance(error, HttpStatusCodeError)\n            and error.status_code == TOO_MANY_REQUESTS\n        ):\n            context.log.info('Rate limited - will retry with delay')\n            # You could modify headers, add delay, etc.\n        elif isinstance(error, ValueError):\n            context.log.info('Parse error - marking request as no retry')\n            context.request.no_retry = True\n\n    # Start crawling\n    await crawler.run(\n        [\n    
        'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens',\n            # Might cause parse error\n            'https://warehouse-theme-metal.myshopify.com/collections/mens-running',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/failed_request_handler.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n    # Create a crawler instance with retry settings\n    crawler = ParselCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n        max_request_retries=2,  # Allow 2 retries before failing\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract product information\n        product_name = context.selector.css('h1[data-testid=\"product-title\"]::text').get()\n        if not product_name:\n            product_name = context.selector.css('h1::text').get() or 'Unknown Product'\n\n        price = context.selector.css('.price::text').get() or 'Price not available'\n\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'product_name': product_name,\n                'price': price,\n                'status': 'success',\n            }\n        )\n\n    # Failed request handler - called when request has exhausted all retries\n    @crawler.failed_request_handler\n    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        context.log.error(\n            f'Failed to process {context.request.url} after all retries: {error}'\n        )\n\n        # Save failed request information for analysis\n        await context.push_data(\n            {\n                'failed_url': context.request.url,\n                'label': context.request.label,\n                'error_type': type(error).__name__,\n                'error_message': str(error),\n                'retry_count': context.request.retry_count,\n                'status': 'failed',\n            }\n        )\n\n    # Start crawling with some URLs that might fail\n    await crawler.run(\n        [\n            
'https://warehouse-theme-metal.myshopify.com/products/on-running-cloudmonster-2-mens',\n            # This will likely fail\n            'https://warehouse-theme-metal.myshopify.com/invalid-url',\n            'https://warehouse-theme-metal.myshopify.com/products/valid-product',\n        ]\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/http_pre_navigation.py",
    "content": "import asyncio\n\nfrom crawlee import HttpHeaders\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n    crawler = ParselCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.pre_navigation_hook\n    async def setup_request(context: BasicCrawlingContext) -> None:\n        # Add custom headers before making the request\n        context.request.headers |= HttpHeaders(\n            {\n                'User-Agent': 'Crawlee Bot 1.0',\n                'Accept': 'text/html,application/xhtml+xml',\n            },\n        )\n\n    @crawler.router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        # Extract basic page information\n        title = context.selector.css('title::text').get()\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n            }\n        )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/playwright_pre_navigation.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import (\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    @crawler.pre_navigation_hook\n    async def setup_page(context: PlaywrightPreNavCrawlingContext) -> None:\n        # Set viewport size for consistent rendering\n        await context.page.set_viewport_size({'width': 1280, 'height': 720})\n\n        # Block unnecessary resources to speed up crawling\n        await context.block_requests(\n            extra_url_patterns=[\n                '*.png',\n                '*.jpg',\n                '*.jpeg',\n                '*.gif',\n                '*.svg',\n                '*.css',\n                '*.woff',\n                '*.woff2',\n                '*.ttf',\n                '*google-analytics*',\n                '*facebook*',\n                '*twitter*',\n            ]\n        )\n\n        # Set custom user agent\n        await context.page.set_extra_http_headers(\n            {\n                'User-Agent': 'Mozilla/5.0 (compatible; Crawlee Bot)',\n            }\n        )\n\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        title = await context.page.title()\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n            }\n        )\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/request_router/simple_default_handler.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n    # Create a crawler instance\n    crawler = ParselCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n    )\n\n    # Use the crawler's built-in router to define a default handler\n    @crawler.router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Extract page title\n        title = context.selector.css('title::text').get() or 'No title found'\n\n        # Extract and save basic page data\n        await context.push_data(\n            {\n                'url': context.request.url,\n                'title': title,\n            }\n        )\n\n        # Find and enqueue product links for further crawling\n        await context.enqueue_links(selector='a[href*=\"/products/\"]', label='PRODUCT')\n\n    # Start crawling\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/running_in_web_server/__init__.py",
    "content": ""
  },
  {
    "path": "docs/guides/code_examples/running_in_web_server/crawler.py",
    "content": "import asyncio\nfrom collections.abc import AsyncIterator\nfrom contextlib import asynccontextmanager\nfrom typing import TypedDict\n\nfrom fastapi import FastAPI\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nclass State(TypedDict):\n    \"\"\"State available in the app.\"\"\"\n\n    crawler: ParselCrawler\n    requests_to_results: dict[str, asyncio.Future[dict[str, str]]]\n\n\n@asynccontextmanager\nasync def lifespan(app: FastAPI) -> AsyncIterator[State]:\n    # Start up code that runs once when the app starts\n\n    # Results will be stored in this dictionary\n    requests_to_results = dict[str, asyncio.Future[dict[str, str]]]()\n\n    crawler = ParselCrawler(\n        # Keep the crawler alive even when there are no more requests to process now.\n        # This makes the crawler wait for more requests to be added later.\n        keep_alive=True\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        title = context.selector.xpath('//title/text()').get() or ''\n\n        # Extract data from the page and save it to the result dictionary.\n        requests_to_results[context.request.unique_key].set_result(\n            {\n                'title': title,\n            }\n        )\n\n    # Start the crawler without awaiting it to finish\n    crawler.log.info(f'Starting crawler for the {app.title}')\n    run_task = asyncio.create_task(crawler.run([]))\n\n    # Make the crawler and the result dictionary available in the app state\n    yield {'crawler': crawler, 'requests_to_results': requests_to_results}\n\n    # Cleanup code that runs once when the app shuts down\n    crawler.stop()\n    # Wait for the crawler to finish\n    await run_task\n"
  },
  {
    "path": "docs/guides/code_examples/running_in_web_server/server.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom uuid import uuid4\n\nfrom fastapi import FastAPI\nfrom starlette.requests import Request\nfrom starlette.responses import HTMLResponse\n\nimport crawlee\n\nfrom .crawler import lifespan\n\napp = FastAPI(lifespan=lifespan, title='Crawler app')\n\n\n@app.get('/', response_class=HTMLResponse)\ndef index() -> str:\n    return \"\"\"\n<!DOCTYPE html>\n<html>\n<body>\n    <h1>Scraper server</h1>\n        <p>To scrape some page, visit \"scrape\" endpoint with url parameter.\n            For example:\n            <a href=\"/scrape?url=https://www.example.com\">\n                /scrape?url=https://www.example.com\n            </a>\n        </p>\n</body>\n</html>\n\"\"\"\n\n\n@app.get('/scrape')\nasync def scrape_url(request: Request, url: str | None = None) -> dict:\n    if not url:\n        return {'url': 'missing', 'scrape result': 'no results'}\n\n    # Generate random unique key for the request\n    unique_key = str(uuid4())\n\n    # Set the result future in the result dictionary so that it can be awaited\n    request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]()\n\n    # Add the request to the crawler queue\n    await request.state.crawler.add_requests(\n        [crawlee.Request.from_url(url, unique_key=unique_key)]\n    )\n\n    # Wait for the result future to be finished\n    result = await request.state.requests_to_results[unique_key]\n\n    # Clean the result from the result dictionary to free up memory\n    request.state.requests_to_results.pop(unique_key)\n\n    # Return the result\n    return {'url': url, 'scrape result': result}\n"
  },
  {
    "path": "docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py",
    "content": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasync def main() -> None:\n    concurrency_settings = ConcurrencySettings(\n        # Set the maximum number of concurrent requests the crawler can run to 100.\n        max_concurrency=100,\n        # Limit the total number of requests to 10 per minute to avoid overwhelming\n        # the target website.\n        max_tasks_per_minute=10,\n    )\n\n    crawler = BeautifulSoupCrawler(\n        # Apply the defined concurrency settings to the crawler.\n        concurrency_settings=concurrency_settings,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py",
    "content": "import asyncio\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import BeautifulSoupCrawler\n\n\nasync def main() -> None:\n    concurrency_settings = ConcurrencySettings(\n        # Start with 8 concurrent tasks, as long as resources are available.\n        desired_concurrency=8,\n        # Maintain a minimum of 5 concurrent tasks to ensure steady crawling.\n        min_concurrency=5,\n        # Limit the maximum number of concurrent tasks to 10 to prevent\n        # overloading the system.\n        max_concurrency=10,\n    )\n\n    crawler = BeautifulSoupCrawler(\n        # Use the configured concurrency settings for the crawler.\n        concurrency_settings=concurrency_settings,\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_conflicts.py",
    "content": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\n\n\nasync def main() -> None:\n    # Register the storage client via service locator.\n    memory_storage_client = MemoryStorageClient()\n    service_locator.set_storage_client(memory_storage_client)\n\n    # Retrieve the storage client.\n    current_storage_client = service_locator.get_storage_client()\n\n    # Try to set a different storage client, which will raise ServiceConflictError\n    # if storage client was already retrieved.\n    file_system_storage_client = FileSystemStorageClient()\n    service_locator.set_storage_client(file_system_storage_client)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_crawler_configuration.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCrawler\n\n\nasync def main() -> None:\n    configuration = Configuration(\n        log_level='DEBUG',\n        headless=False,\n        persist_state_interval=timedelta(seconds=30),\n    )\n\n    # Register configuration via crawler.\n    crawler = ParselCrawler(\n        configuration=configuration,\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_crawler_event_manager.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.events import LocalEventManager\n\n\nasync def main() -> None:\n    event_manager = LocalEventManager(\n        system_info_interval=timedelta(seconds=5),\n    )\n\n    # Register event manager via crawler.\n    crawler = ParselCrawler(\n        event_manager=event_manager,\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_crawler_storage_client.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\nasync def main() -> None:\n    storage_client = MemoryStorageClient()\n\n    # Register storage client via crawler.\n    crawler = ParselCrawler(\n        storage_client=storage_client,\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_locator_configuration.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\n\n\nasync def main() -> None:\n    configuration = Configuration(\n        log_level='DEBUG',\n        headless=False,\n        persist_state_interval=timedelta(seconds=30),\n    )\n\n    # Register configuration via service locator.\n    service_locator.set_configuration(configuration)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_locator_event_manager.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.events import LocalEventManager\n\n\nasync def main() -> None:\n    event_manager = LocalEventManager(\n        system_info_interval=timedelta(seconds=5),\n    )\n\n    # Register event manager via service locator.\n    service_locator.set_event_manager(event_manager)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_locator_storage_client.py",
    "content": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import MemoryStorageClient\n\n\nasync def main() -> None:\n    storage_client = MemoryStorageClient()\n\n    # Register storage client via service locator.\n    service_locator.set_storage_client(storage_client)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_storage_configuration.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    configuration = Configuration(\n        log_level='DEBUG',\n        headless=False,\n        persist_state_interval=timedelta(seconds=30),\n    )\n    # Set the custom configuration as the global default configuration.\n    service_locator.set_configuration(configuration)\n\n    # Use the global defaults when creating the dataset (or other storage).\n    dataset_1 = await Dataset.open()\n\n    # Or set explicitly specific configuration if\n    # you do not want to rely on global defaults.\n    dataset_2 = await Dataset.open(\n        storage_client=MemoryStorageClient(), configuration=configuration\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/service_locator/service_storage_storage_client.py",
    "content": "import asyncio\n\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    storage_client = MemoryStorageClient()\n\n    # Pass the storage client to the dataset (or other storage) when opening it.\n    dataset = await Dataset.open(\n        storage_client=storage_client,\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/multi_sessions_http.py",
    "content": "import asyncio\nfrom collections.abc import Callable\nfrom datetime import timedelta\nfrom itertools import count\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors import RequestCollisionError\nfrom crawlee.sessions import Session, SessionPool\n\n\n# Define a function for creating sessions with simple logic for unique `id` generation.\n# This is necessary if you need to specify a particular session for the first request,\n# for example during authentication\ndef create_session_function() -> Callable[[], Session]:\n    counter = count()\n\n    def create_session() -> Session:\n        return Session(\n            id=str(next(counter)),\n            max_usage_count=999_999,\n            max_age=timedelta(hours=999_999),\n            max_error_score=100,\n            blocked_status_codes=[403],\n        )\n\n    return create_session\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        # Adjust request limits according to your pool size\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=500),\n        # Requests are bound to specific sessions, no rotation needed\n        max_session_rotations=0,\n        session_pool=SessionPool(\n            max_pool_size=10, create_session_function=create_session_function()\n        ),\n    )\n\n    @crawler.router.default_handler\n    async def basic_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n    # Initialize the session and bind the next request to this session if needed\n    @crawler.router.handler(label='session_init')\n    async def session_init(context: HttpCrawlingContext) -> None:\n        next_requests = []\n        if context.session:\n            context.log.info(f'Init session {context.session.id}')\n            next_request = Request.from_url(\n                'https://a.placeholder.com', 
session_id=context.session.id\n            )\n            next_requests.append(next_request)\n\n        await context.add_requests(next_requests)\n\n    # Handle errors when a session is blocked and no longer available in the pool\n    # when attempting to execute requests bound to it\n    @crawler.failed_request_handler\n    async def error_processing(context: BasicCrawlingContext, error: Exception) -> None:\n        if isinstance(error, RequestCollisionError) and context.session:\n            context.log.error(\n                f'Request {context.request.url} failed, because the bound '\n                'session is unavailable'\n            )\n\n    # Create a pool of requests bound to their respective sessions\n    # Use `always_enqueue=True` if session initialization happens on a non-unique address,\n    # such as the site's main page\n    init_requests = [\n        Request.from_url(\n            'https://example.org/',\n            label='session_init',\n            session_id=str(session_id),\n            use_extended_unique_key=True,\n        )\n        for session_id in range(1, 11)\n    ]\n\n    await crawler.run(init_requests)\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/one_session_http.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nfrom crawlee import ConcurrencySettings, Request\nfrom crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext\nfrom crawlee.errors import SessionError\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    crawler = HttpCrawler(\n        # Limit requests per minute to reduce the chance of being blocked\n        concurrency_settings=ConcurrencySettings(max_tasks_per_minute=50),\n        # Disable session rotation\n        max_session_rotations=0,\n        session_pool=SessionPool(\n            # Only one session in the pool\n            max_pool_size=1,\n            create_session_settings={\n                # High value for session usage limit\n                'max_usage_count': 999_999,\n                # High value for session lifetime\n                'max_age': timedelta(hours=999_999),\n                # High score allows the session to encounter more errors\n                # before crawlee decides the session is blocked\n                # Make sure you know how to handle these errors\n                'max_error_score': 100,\n                # 403 status usually indicates you're already blocked\n                'blocked_status_codes': [403],\n            },\n        ),\n    )\n\n    # Basic request handling logic\n    @crawler.router.default_handler\n    async def basic_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n    # Handler for session initialization (authentication, initial cookies, etc.)\n    @crawler.router.handler(label='session_init')\n    async def session_init(context: HttpCrawlingContext) -> None:\n        if context.session:\n            context.log.info(f'Init session {context.session.id}')\n\n    # Monitor if our session gets blocked and explicitly stop the crawler\n    @crawler.error_handler\n    async def error_processing(context: BasicCrawlingContext, error: 
Exception) -> None:\n        if isinstance(error, SessionError) and context.session:\n            context.log.info(f'Session {context.session.id} blocked')\n            crawler.stop()\n\n    await crawler.run([Request.from_url('https://example.org/', label='session_init')])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_basic.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.crawlers import BasicCrawler, BasicCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # To use the proxy IP session rotation logic, you must turn the proxy usage on.\n    proxy_configuration = ProxyConfiguration(\n        # options\n    )\n\n    # Initialize crawler with a custom SessionPool configuration\n    # to manage concurrent sessions and proxy rotation\n    crawler = BasicCrawler(\n        proxy_configuration=proxy_configuration,\n        # Activates the Session pool (default is true).\n        use_session_pool=True,\n        # Overrides default Session pool configuration.\n        session_pool=SessionPool(max_pool_size=100),\n    )\n\n    # Define the default request handler that manages session states\n    @crawler.router.default_handler\n    async def default_handler(context: BasicCrawlingContext) -> None:\n        # Send request, BasicCrawler automatically selects a session from the pool\n        # and sets a proxy for it. You can check it with `context.session`\n        # and `context.proxy_info`.\n        response = await context.send_request(context.request.url)\n\n        page_content = (await response.read()).decode()\n        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)\n\n        if context.session and (title := title_match.group(1) if title_match else None):\n            if title == 'Blocked':\n                context.session.retire()\n            elif title == 'Not sure if blocked, might also be a connection error':\n                context.session.mark_bad()\n            else:\n                context.session.mark_good()  # BasicCrawler handles this automatically.\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_beautifulsoup.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # To use the proxy IP session rotation logic, you must turn the proxy usage on.\n    proxy_configuration = ProxyConfiguration(\n        # options\n    )\n\n    # Initialize crawler with a custom SessionPool configuration\n    # to manage concurrent sessions and proxy rotation\n    crawler = BeautifulSoupCrawler(\n        proxy_configuration=proxy_configuration,\n        # Activates the Session pool (default is true).\n        use_session_pool=True,\n        # Overrides default Session pool configuration.\n        session_pool=SessionPool(max_pool_size=100),\n    )\n\n    # Define the default request handler that manages session states\n    # based on the response content and potential blocking\n    @crawler.router.default_handler\n    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:\n        title = context.soup.title.get_text() if context.soup.title else None\n\n        if context.session:\n            if title == 'Blocked':\n                context.session.retire()\n            elif title == 'Not sure if blocked, might also be a connection error':\n                context.session.mark_bad()\n            else:\n                context.session.mark_good()  # BasicCrawler handles this automatically.\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_http.py",
    "content": "import asyncio\nimport re\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # To use the proxy IP session rotation logic, you must turn the proxy usage on.\n    proxy_configuration = ProxyConfiguration(\n        # options\n    )\n\n    # Initialize crawler with a custom SessionPool configuration\n    # to manage concurrent sessions and proxy rotation\n    crawler = HttpCrawler(\n        proxy_configuration=proxy_configuration,\n        # Activates the Session pool (default is true).\n        use_session_pool=True,\n        # Overrides default Session pool configuration.\n        session_pool=SessionPool(max_pool_size=100),\n    )\n\n    # Define the default request handler that manages session states\n    # based on the response content and potential blocking\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        page_content = (await context.http_response.read()).decode()\n        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)\n\n        if context.session and (title := title_match.group(1) if title_match else None):\n            if title == 'Blocked':\n                context.session.retire()\n            elif title == 'Not sure if blocked, might also be a connection error':\n                context.session.mark_bad()\n            else:\n                context.session.mark_good()  # BasicCrawler handles this automatically.\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_parsel.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # To use the proxy IP session rotation logic, you must turn the proxy usage on.\n    proxy_configuration = ProxyConfiguration(\n        # options\n    )\n\n    # Initialize crawler with a custom SessionPool configuration\n    # to manage concurrent sessions and proxy rotation\n    crawler = ParselCrawler(\n        proxy_configuration=proxy_configuration,\n        # Activates the Session pool (default is true).\n        use_session_pool=True,\n        # Overrides default Session pool configuration.\n        session_pool=SessionPool(max_pool_size=100),\n    )\n\n    # Define the default request handler that manages session states\n    # based on the response content and potential blocking\n    @crawler.router.default_handler\n    async def default_handler(context: ParselCrawlingContext) -> None:\n        title = context.selector.css('title::text').get()\n\n        if context.session:\n            if title == 'Blocked':\n                context.session.retire()\n            elif title == 'Not sure if blocked, might also be a connection error':\n                context.session.mark_bad()\n            else:\n                context.session.mark_good()  # BasicCrawler handles this automatically.\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_playwright.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # To use the proxy IP session rotation logic, you must turn the proxy usage on.\n    proxy_configuration = ProxyConfiguration(\n        # options\n    )\n\n    # Initialize crawler with a custom SessionPool configuration\n    # to manage concurrent sessions and proxy rotation\n    crawler = PlaywrightCrawler(\n        proxy_configuration=proxy_configuration,\n        # Activates the Session pool (default is true).\n        use_session_pool=True,\n        # Overrides default Session pool configuration.\n        session_pool=SessionPool(max_pool_size=100),\n    )\n\n    # Define the default request handler that manages session states\n    # based on the response content and potential blocking\n    @crawler.router.default_handler\n    async def default_handler(context: PlaywrightCrawlingContext) -> None:\n        title = await context.page.title()\n\n        if context.session:\n            if title == 'Blocked':\n                context.session.retire()\n            elif title == 'Not sure if blocked, might also be a connection error':\n                context.session.mark_bad()\n            else:\n                context.session.mark_good()  # BasicCrawler handles this automatically.\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/session_management/sm_standalone.py",
    "content": "import asyncio\n\nfrom crawlee.sessions import SessionPool\n\n\nasync def main() -> None:\n    # Override the default Session pool configuration.\n    async with SessionPool(\n        max_pool_size=100,\n        create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]},\n    ) as session_pool:\n        session = await session_pool.get_session()\n\n        # Increase the error_score.\n        session.mark_bad()\n\n        # Throw away the session.\n        session.retire()\n\n        # Lower the error_score and mark the session good.\n        session.mark_good()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom crawlee.storage_clients import StorageClient\nfrom crawlee.storage_clients._base import (\n    DatasetClient,\n    KeyValueStoreClient,\n    RequestQueueClient,\n)\n\nif TYPE_CHECKING:\n    from crawlee.configuration import Configuration\n\n# Implement the storage type clients with your backend logic.\n\n\nclass CustomDatasetClient(DatasetClient):\n    # Implement methods like push_data, get_data, iterate_items, etc.\n    pass\n\n\nclass CustomKeyValueStoreClient(KeyValueStoreClient):\n    # Implement methods like get_value, set_value, delete, etc.\n    pass\n\n\nclass CustomRequestQueueClient(RequestQueueClient):\n    # Implement methods like add_request, fetch_next_request, etc.\n    pass\n\n\n# Implement the storage client factory.\n\n\nclass CustomStorageClient(StorageClient):\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> CustomDatasetClient:\n        # Create and return your custom dataset client.\n        pass\n\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> CustomKeyValueStoreClient:\n        # Create and return your custom key-value store client.\n        pass\n\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> CustomRequestQueueClient:\n        # Create and return your custom request queue client.\n        pass\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py",
    "content": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import FileSystemStorageClient\n\n# Create a new instance of storage client.\nstorage_client = FileSystemStorageClient()\n\n# And pass it to the crawler.\ncrawler = ParselCrawler(storage_client=storage_client)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py",
    "content": "from crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import FileSystemStorageClient\n\n# Create a new instance of storage client.\nstorage_client = FileSystemStorageClient()\n\n# Create a configuration with custom settings.\nconfiguration = Configuration(\n    storage_dir='./my_storage',\n    purge_on_start=False,\n)\n\n# And pass them to the crawler.\ncrawler = ParselCrawler(\n    storage_client=storage_client,\n    configuration=configuration,\n)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py",
    "content": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\n\n# Create a new instance of storage client.\nstorage_client = MemoryStorageClient()\n\n# And pass it to the crawler.\ncrawler = ParselCrawler(storage_client=storage_client)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py",
    "content": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import RedisStorageClient\n\n# Create a new instance of storage client using connection string.\n# 'redis://localhost:6379' is the just placeholder, replace it with your actual\n# connection string.\nstorage_client = RedisStorageClient(connection_string='redis://localhost:6379')\n\n# And pass it to the crawler.\ncrawler = ParselCrawler(storage_client=storage_client)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py",
    "content": "from redis.asyncio import Redis\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import RedisStorageClient\n\n# Create a new instance of storage client using a Redis client with custom settings.\n# Replace host and port with your actual Redis server configuration.\n# Other Redis client settings can be adjusted as needed.\nstorage_client = RedisStorageClient(\n    redis=Redis(\n        host='localhost',\n        port=6379,\n        retry_on_timeout=True,\n        socket_keepalive=True,\n        socket_connect_timeout=10,\n    )\n)\n\n# Create a configuration with custom settings.\nconfiguration = Configuration(purge_on_start=False)\n\n# And pass them to the crawler.\ncrawler = ParselCrawler(\n    storage_client=storage_client,\n    configuration=configuration,\n)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/registering_storage_clients_example.py",
    "content": "import asyncio\n\nfrom crawlee import service_locator\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Create custom storage client, MemoryStorageClient for example.\n    storage_client = MemoryStorageClient()\n\n    # Register it globally via the service locator.\n    service_locator.set_storage_client(storage_client)\n\n    # Or pass it directly to the crawler, it will be registered globally\n    # to the service locator under the hood.\n    crawler = ParselCrawler(storage_client=storage_client)\n\n    # Or just provide it when opening a storage (e.g. dataset), it will be used\n    # for this storage only, not globally.\n    dataset = await Dataset.open(\n        name='my-dataset',\n        storage_client=storage_client,\n    )\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py",
    "content": "from crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import SqlStorageClient\n\n\nasync def main() -> None:\n    # Create a new instance of storage client.\n    # This will create an SQLite database file crawlee.db or created tables in your\n    # database if you pass `connection_string` or `engine`\n    # Use the context manager to ensure that connections are properly cleaned up.\n    async with SqlStorageClient() as storage_client:\n        # And pass it to the crawler.\n        crawler = ParselCrawler(storage_client=storage_client)\n"
  },
  {
    "path": "docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py",
    "content": "from sqlalchemy.ext.asyncio import create_async_engine\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import SqlStorageClient\n\n\nasync def main() -> None:\n    # Create a new instance of storage client.\n    # On first run, also creates tables in your PostgreSQL database.\n    # Use the context manager to ensure that connections are properly cleaned up.\n    async with SqlStorageClient(\n        # Create an `engine` with the desired configuration\n        engine=create_async_engine(\n            'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres',\n            future=True,\n            pool_size=5,\n            max_overflow=10,\n            pool_recycle=3600,\n            pool_pre_ping=True,\n            echo=False,\n        )\n    ) as storage_client:\n        # Create a configuration with custom settings.\n        configuration = Configuration(\n            purge_on_start=False,\n        )\n\n        # And pass them to the crawler.\n        crawler = ParselCrawler(\n            storage_client=storage_client,\n            configuration=configuration,\n        )\n"
  },
  {
    "path": "docs/guides/code_examples/storages/cleaning_do_not_purge_example.py",
    "content": "import asyncio\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    # Set the purge_on_start field to False to avoid purging the storage on start.\n    # highlight-next-line\n    configuration = Configuration(purge_on_start=False)\n\n    # Pass the configuration to the crawler.\n    crawler = HttpCrawler(configuration=configuration)\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py",
    "content": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Create storage client with configuration\n    dataset = await Dataset.open(name='my-dataset')\n\n    # Purge the dataset explicitly - purging will remove all items from the dataset.\n    # But keeps the dataset itself and its metadata.\n    await dataset.purge()\n\n    # Or you can drop the dataset completely, which will remove the dataset\n    # and all its items.\n    await dataset.drop()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/dataset_basic_example.py",
    "content": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Open the dataset, if it does not exist, it will be created.\n    # Leave name empty to use the default dataset.\n    dataset = await Dataset.open(name='my-dataset')\n\n    # Push a single row of data.\n    await dataset.push_data({'foo': 'bar'})\n\n    # Push multiple rows of data (anything JSON-serializable can be pushed).\n    await dataset.push_data([{'foo': 'bar2', 'col2': 'val2'}, {'col3': 123}])\n\n    # Fetch all data from the dataset.\n    data = await dataset.get_data()\n    # Do something with it...\n\n    # Remove the dataset.\n    await dataset.drop()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/dataset_with_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # Create a new crawler (it can be any subclass of BasicCrawler).\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Push the extracted data to the (default) dataset.\n        await context.push_data(data)\n\n    # Run the crawler with the initial URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n    # Export the dataset to a file.\n    await crawler.export_data(path='dataset.csv')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Open the dataset, if it does not exist, it will be created.\n    # Leave name empty to use the default dataset.\n    dataset = await Dataset.open(name='my-dataset')\n\n    # Create a new crawler (it can be any subclass of BasicCrawler).\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Push the extracted data to the dataset.\n        await dataset.push_data(data)\n\n    # Run the crawler with the initial URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n    # Export the dataset to the key-value store.\n    await dataset.export_to(key='dataset', content_type='csv')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/helper_add_requests_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # highlight-next-line\n        await context.add_requests(['https://apify.com/'])\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/helper_enqueue_links_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # highlight-next-line\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/kvs_basic_example.py",
    "content": "import asyncio\n\nfrom crawlee.storages import KeyValueStore\n\n\nasync def main() -> None:\n    # Open the key-value store, if it does not exist, it will be created.\n    # Leave name empty to use the default KVS.\n    kvs = await KeyValueStore.open(name='my-key-value-store')\n\n    # Set a value associated with 'some-key'.\n    await kvs.set_value(key='some-key', value={'foo': 'bar'})\n\n    # Get the value associated with 'some-key'.\n    value = kvs.get_value('some-key')\n    # Do something with it...\n\n    # Delete the value associated with 'some-key' by setting it to None.\n    await kvs.set_value(key='some-key', value=None)\n\n    # Remove the key-value store.\n    await kvs.drop()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/kvs_with_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    # Create a new Playwright crawler.\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Capture the screenshot of the page using Playwright's API.\n        screenshot = await context.page.screenshot()\n        name = context.request.url.split('/')[-1]\n\n        # Get the key-value store from the context. # If it does not exist,\n        # it will be created. Leave name empty to use the default KVS.\n        kvs = await context.get_key_value_store()\n\n        # Store the screenshot in the key-value store.\n        await kvs.set_value(\n            key=f'screenshot-{name}',\n            value=screenshot,\n            content_type='image/png',\n        )\n\n    # Run the crawler with the initial URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storages import KeyValueStore\n\n\nasync def main() -> None:\n    # Open the key-value store, if it does not exist, it will be created.\n    # Leave name empty to use the default KVS.\n    kvs = await KeyValueStore.open(name='my-key-value-store')\n\n    # Create a new Playwright crawler.\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Capture the screenshot of the page using Playwright's API.\n        screenshot = await context.page.screenshot()\n        name = context.request.url.split('/')[-1]\n\n        # Store the screenshot in the key-value store.\n        await kvs.set_value(\n            key=f'screenshot-{name}',\n            value=screenshot,\n            content_type='image/png',\n        )\n\n    # Run the crawler with the initial URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/opening.py",
    "content": "import asyncio\n\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    # Named storage (persists across runs)\n    dataset_named = await Dataset.open(name='my-persistent-dataset')\n\n    # Unnamed storage with alias (purged on start)\n    dataset_unnamed = await Dataset.open(alias='temporary-results')\n\n    # Default unnamed storage (purged on start)\n    dataset_default = await Dataset.open()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/rq_basic_example.py",
    "content": "import asyncio\n\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # Open the request queue, if it does not exist, it will be created.\n    # Leave name empty to use the default request queue.\n    request_queue = await RequestQueue.open(name='my-request-queue')\n\n    # Add a single request.\n    await request_queue.add_request('https://apify.com/')\n\n    # Add multiple requests as a batch.\n    await request_queue.add_requests(\n        ['https://crawlee.dev/', 'https://crawlee.dev/python/']\n    )\n\n    # Fetch and process requests from the queue.\n    while request := await request_queue.fetch_next_request():\n        # Do something with it...\n\n        # And mark it as handled.\n        await request_queue.mark_request_as_handled(request)\n\n    # Remove the request queue.\n    await request_queue.drop()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/rq_with_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n\nasync def main() -> None:\n    # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is\n    # a default request manager, it will be opened, and fully managed if not specified.\n    crawler = HttpCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Use context's add_requests method helper to add new requests from the handler.\n        await context.add_requests(['https://crawlee.dev/python/'])\n\n    # Use crawler's add_requests method helper to add new requests.\n    await crawler.add_requests(['https://apify.com/'])\n\n    # Run the crawler. You can optionally pass the list of initial requests.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # Open the request queue, if it does not exist, it will be created.\n    # Leave name empty to use the default request queue.\n    request_queue = await RequestQueue.open(name='my-request-queue')\n\n    # Interact with the request queue directly, e.g. add a batch of requests.\n    await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])\n\n    # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request\n    # queue as request manager to it. It will be managed by the crawler.\n    crawler = HttpCrawler(request_manager=request_queue)\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n    # And execute the crawler.\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py",
    "content": "import asyncio\n\nfrom opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter\nfrom opentelemetry.sdk.resources import Resource\nfrom opentelemetry.sdk.trace import TracerProvider\nfrom opentelemetry.sdk.trace.export import SimpleSpanProcessor\nfrom opentelemetry.trace import set_tracer_provider\n\nfrom crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext\nfrom crawlee.otel import CrawlerInstrumentor\nfrom crawlee.storages import Dataset, KeyValueStore, RequestQueue\n\n\ndef instrument_crawler() -> None:\n    \"\"\"Add instrumentation to the crawler.\"\"\"\n    resource = Resource.create(\n        {\n            'service.name': 'ExampleCrawler',\n            'service.version': '1.0.0',\n            'environment': 'development',\n        }\n    )\n\n    # Set up the OpenTelemetry tracer provider and exporter\n    provider = TracerProvider(resource=resource)\n    otlp_exporter = OTLPSpanExporter(endpoint='localhost:4317', insecure=True)\n    provider.add_span_processor(SimpleSpanProcessor(otlp_exporter))\n    set_tracer_provider(provider)\n    # Instrument the crawler with OpenTelemetry\n    CrawlerInstrumentor(\n        instrument_classes=[RequestQueue, KeyValueStore, Dataset]\n    ).instrument()\n\n\nasync def main() -> None:\n    \"\"\"Run the crawler.\"\"\"\n    instrument_crawler()\n\n    crawler = ParselCrawler(max_requests_per_crawl=100)\n    kvs = await KeyValueStore.open()\n\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(_: BasicCrawlingContext) -> None:\n        # Simulate some pre-navigation processing\n        await asyncio.sleep(0.01)\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        await context.push_data({'url': context.request.url})\n        await kvs.set_value(key='url', value=context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == 
'__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/guides/crawler_login.mdx",
    "content": "---\nid: logging-in-with-a-crawler\ntitle: Logging in with a crawler\ndescription: How to log in to websites with Crawlee.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport PlaywrightLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/playwright_login.py';\nimport HttpLogin from '!!raw-loader!roa-loader!./code_examples/login_crawler/http_login.py';\n\nMany websites require authentication to access their content. This guide demonstrates how to implement login functionality using both <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>.\n\n## Session management for authentication\n\nWhen implementing authentication, you'll typically want to maintain the same <ApiLink to=\"class/Session\">`Session`</ApiLink> throughout your crawl to preserve login state. This requires proper configuration of the <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink>. 
For more details, see our [session management guide](./session-management).\n\nIf your use case requires multiple authenticated sessions with different credentials, you can:\n- Use the `new_session_function` parameter in <ApiLink to=\"class/SessionPool#__init__\">`SessionPool`</ApiLink> to customize session creation.\n- Specify the `session_id` parameter in <ApiLink to=\"class/Request#from_url\">`Request`</ApiLink> to bind specific requests to particular sessions.\n\nFor this guide, we'll use [demoqa.com](https://demoqa.com/login), a testing site designed for automation practice that provides a login form and protected content.\n\n## Login with Playwright crawler\n\nThe following example demonstrates how to authenticate on a website using <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, which provides browser automation capabilities for filling out login forms.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightLogin}\n</RunnableCodeBlock>\n\n## Login with HTTP crawler\n\nYou can also use <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> (or its more specific variants like <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> or <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>) to authenticate by sending a POST <ApiLink to=\"class/Request\">`Request`</ApiLink> with your credentials directly to the authentication endpoint.\n\nHTTP-based authentication often varies significantly between websites. Using browser [DevTools](https://developer.chrome.com/docs/devtools/overview) to analyze the `Network` tab during manual login can help you understand the specific authentication flow, required headers, and body parameters for your target website.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {HttpLogin}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/guides/creating_web_archive.mdx",
    "content": "---\nid: creating-web-archive\ntitle: Creating web archive\ndescription: How to create a Web ARChive (WARC) with Crawlee\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport PlaywrightCrawlerRecordThroughProxy from '!!raw-loader!./code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py';\nimport ParselCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_parsel_crawler.py';\nimport PlaywrightCrawlerRecordManual from '!!raw-loader!./code_examples/creating_web_archive/manual_archiving_playwright_crawler.py';\n\nArchiving webpages is one of the tasks that a web crawler can be used for. There are various use cases, such as archiving for future reference, speeding up web crawler development, creating top-level regression tests for web crawlers and so on.\n\nThere are various existing libraries of web archives with massive amount of data stored during their years of existence, for example [Wayback Machine](https://web.archive.org/) or [Common Crawl](https://commoncrawl.org/). There are also dedicated tools for archiving web pages, to name some: simple browser extensions such as [Archive Webpage](https://archiveweb.page/), open source tools such as [pywb](https://pypi.org/project/pywb/) or [warcio](https://pypi.org/project/warcio/), or even web crawlers specialized in archiving such as [Browsertrix](https://webrecorder.net/browsertrix/).\n\nThe common file format used for archiving is [WARC](https://www.iso.org/standard/68004.html). 
Crawlee does not offer any out-of-the-box functionality to create WARC files, but in this guide, we will show examples of approaches that can be easily used in your use case to create WARC files with Crawlee.\n\n## Crawling through proxy recording server\n\nThis approach can be especially attractive as it requires almost no code changes to the crawler itself, and the correct WARC creation is done by code from the well-maintained [pywb](https://pypi.org/project/pywb/) package. The trick is to run a properly configured [wayback proxy server](https://pywb.readthedocs.io/en/latest/manual/usage.html#using-pywb-recorder), use it as a proxy for the crawler and record any traffic. Another advantage of this approach is that it is language agnostic. This way, you can record both your Python-based crawler and your JavaScript-based crawler. This is very straightforward and a good place to start.\n\nThis approach expects that you have already created your crawler, and that you just want to archive all the pages it is visiting during its crawl.\n\nInstall [pywb](https://pypi.org/project/pywb/) which will allow you to use `wb-manager` and `wayback` commands.\nCreate a new collection that will be used for this archiving session and start the wayback server:\n```bash\nwb-manager init example-collection\nwayback --record --live -a --auto-interval 10 --proxy example-collection --proxy-record\n```\nInstead of passing many configuration arguments to the `wayback` command, you can configure the server by adding configuration options to `config.yaml`. See the details in the [documentation](https://pywb.readthedocs.io/en/latest/manual/configuring.html#configuring-the-web-archive).\n\n### Configure the crawler\n\nNow you should use this locally hosted server as a proxy in your crawler. 
There are two more steps before starting the crawler:\n - Make the crawler use the proxy server.\n - Deal with the [pywb Certificate Authority](https://pywb.readthedocs.io/en/latest/manual/configuring.html#https-proxy-and-pywb-certificate-authority).\n\nFor example, in <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, this is the simplest setup, which takes the shortcut and ignores the CA-related errors:\n\n<CodeBlock className=\"language-python\">\n    {PlaywrightCrawlerRecordThroughProxy}\n</CodeBlock>\n\nAfter you run the crawler, you will be able to see the archived data in the wayback collection directory, for example `.../collections/example-collection/archive`. You can then access the recorded pages directly in the proxy recording server or use it with any other WARC-compatible tool.\n\n## Manual WARC creation\n\nA different approach is to create WARC files manually in the crawler, which gives you full control over the WARC files. This is a far more complex and low-level approach, as you have to ensure that all the relevant data is collected and correctly stored, and that the archiving functions are called at the right time. This is by no means a trivial task and the example archiving functions below are just the most simple examples that will be insufficient for many real-world use cases. You will need to extend and improve them to properly fit your specific needs.\n\n### Simple crawlers\n\nWith non-browser crawlers such as <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> you will not be able to create a high-fidelity archive of the page as you will be missing all the JavaScript dynamic content. However, you can still create a WARC file with the HTML content of the page, which can be sufficient for some use cases. 
Let's take a look at the example below:\n<CodeBlock className=\"language-python\">\n    {ParselCrawlerRecordManual}\n</CodeBlock>\n\nThe example above is calling an archiving function on each request using the `request_handler`.\n\n### Browser-based crawlers\n\nWith browser crawlers such as <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> you should be able to create high fidelity archive of a web page. Let's take a look at the example below:\n\n<CodeBlock className=\"language-python\">\n    {PlaywrightCrawlerRecordManual}\n</CodeBlock>\n\nThe example above is adding an archiving callback on each response in the pre_navigation `archiving_hook`. This ensures that additional resources requested by the browser are also archived.\n\n## Using the archived data\n\nIn the following section, we will describe an example use case how you can use the recorded WARC files to speed up the development of your web crawler. The idea is to use the archived data as a source of responses for your crawler so that you can test it against the real data without having to crawl the web again.\n\nIt is assumed that you already have the WARC files. If not, please read the previous sections on how to create them first.\n\nLet's use pywb again. This time we will not use it as a recording server, but as a proxy server that will serve the previously archived pages to your crawler in development.\n\n```bash\nwb-manager init example-collection\nwb-manager add example-collection /your_path_to_warc_file/example.warc.gz\nwayback --proxy example-collection\n```\n\nPrevious commands start the wayback server that allows crawler requests to be served from the archived pages in the `example-collection` instead of sending requests to the real website. This is again [proxy mode of the wayback server](https://pywb.readthedocs.io/en/latest/manual/usage.html#http-s-proxy-mode-access), but without recording capability. 
Now you need to [configure your crawler](#configure-the-crawler) to use this proxy server, which was already described above. Once everything is finished, you can just run your crawler, and it will crawl the offline archived version of the website from your WARC file.\n\nYou can also manually browse the archived pages in the wayback server by going to the locally hosted server and entering the collection and URL of the archived page, for example: `http://localhost:8080/example-collection/https://crawlee.dev/`. The wayback server will serve the page from the WARC file if it exists, or it will return a 404 error if it does not. For more details about the server, please refer to the [pywb documentation](https://pywb.readthedocs.io/en/latest/manual/usage.html#getting-started).\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord](https://discord.com/invite/jyEM2PRvMU) community.\n"
  },
  {
    "path": "docs/guides/error_handling.mdx",
    "content": "---\nid: error-handling\ntitle: Error handling\ndescription: How to handle errors that occur during web crawling.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';\nimport ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';\nimport DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';\n\nThis guide demonstrates techniques for handling common errors encountered during web crawling operations.\n\n## Handling proxy errors\n\nLow-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in <ApiLink to=\"class/BasicCrawlerOptions\">`BasicCrawlerOptions`</ApiLink>. If you can't get data because of proxy errors, you might want to try again. You can do this using <ApiLink to=\"class/BasicCrawler#failed_request_handler\">`failed_request_handler`</ApiLink>:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {HandleProxyError}\n</RunnableCodeBlock>\n\nYou can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many.\n\n## Changing how error status codes are handled\n\nBy default, when <ApiLink to=\"class/Session\">`Sessions`</ApiLink> get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the <ApiLink to=\"class/Session\">`Session`</ApiLink> as `retire` and switches to a new one. 
This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).\n\nHere's an example of how to change this behavior:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ChangeHandleErrorStatus}\n</RunnableCodeBlock>\n\n## Turning off retries for non-network errors\n\nSometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.\n\nHere's how to turn off retries for non-network errors using <ApiLink to=\"class/BasicCrawler#error_handler\">`error_handler`</ApiLink>, which runs before Crawlee tries again:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {DisableRetry}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/guides/http_clients.mdx",
    "content": "---\nid: http-clients\ntitle: HTTP clients\ndescription: Learn about Crawlee's HTTP client architecture, how to switch between different implementations, and create custom HTTP clients for specialized web scraping needs.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport ParselHttpxExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_httpx_example.py';\nimport ParselCurlImpersonateExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_curl_impersonate_example.py';\nimport ParselImpitExample from '!!raw-loader!roa-loader!./code_examples/http_clients/parsel_impit_example.py';\n\nHTTP clients are utilized by HTTP-based crawlers (e.g., <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> and <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>) to communicate with web servers. They use external HTTP libraries for communication rather than a browser. Examples of such libraries include [httpx](https://pypi.org/project/httpx/), [aiohttp](https://pypi.org/project/aiohttp/), [curl-cffi](https://pypi.org/project/curl-cffi/), and [impit](https://apify.github.io/impit/). After retrieving page content, an HTML parsing library is typically used to facilitate data extraction. Examples of such libraries include [beautifulsoup](https://pypi.org/project/beautifulsoup4/), [parsel](https://pypi.org/project/parsel/), [selectolax](https://pypi.org/project/selectolax/), and [pyquery](https://pypi.org/project/pyquery/). 
These crawlers are faster than browser-based crawlers but cannot execute client-side JavaScript.\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass HttpClient {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass ImpitHttpClient\n\nclass HttpxHttpClient\n\nclass CurlImpersonateHttpClient\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nHttpClient --|> ImpitHttpClient\nHttpClient --|> HttpxHttpClient\nHttpClient --|> CurlImpersonateHttpClient\n```\n\n## Switching between HTTP clients\n\nCrawlee currently provides three main HTTP clients: <ApiLink to=\"class/ImpitHttpClient\">`ImpitHttpClient`</ApiLink>, which uses the `impit` library, <ApiLink to=\"class/HttpxHttpClient\">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and <ApiLink to=\"class/CurlImpersonateHttpClient\">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to=\"class/ImpitHttpClient\">`ImpitHttpClient`</ApiLink>. 
For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking).\n\nBelow are examples of how to configure the HTTP client for the <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>:\n\n<Tabs>\n    <TabItem value=\"ParselHttpxExample\" label=\"ParselCrawler with HTTPX\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ParselHttpxExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"ParselCurlImpersonateExample\" label=\"ParselCrawler with curl-cffi\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ParselCurlImpersonateExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"ParselImpitExample\" label=\"ParselCrawler with impit\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ParselImpitExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n## Installation requirements\n\nSince <ApiLink to=\"class/ImpitHttpClient\">`ImpitHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.\n\nFor <ApiLink to=\"class/CurlImpersonateHttpClient\">`CurlImpersonateHttpClient`</ApiLink>, you need to install Crawlee with the `curl-impersonate` extra:\n\n```sh\npython -m pip install 'crawlee[curl-impersonate]'\n```\n\nFor <ApiLink to=\"class/HttpxHttpClient\">`HttpxHttpClient`</ApiLink>, you need to install Crawlee with the `httpx` extra:\n\n```sh\npython -m pip install 'crawlee[httpx]'\n```\n\nAlternatively, you can install all available extras to get access to all HTTP clients and features:\n\n```sh\npython -m pip install 'crawlee[all]'\n```\n\n## Creating custom HTTP clients\n\nCrawlee provides an abstract base class, <ApiLink to=\"class/HttpClient\">`HttpClient`</ApiLink>, which defines the interface that all HTTP clients must implement. 
This allows you to create custom HTTP clients tailored to your specific requirements.\n\nHTTP clients are responsible for several key operations:\n\n- sending HTTP requests and receiving responses,\n- managing cookies and sessions,\n- handling headers and authentication,\n- managing proxy configurations,\n- connection pooling with timeout management.\n\nTo create a custom HTTP client, you need to inherit from the <ApiLink to=\"class/HttpClient\">`HttpClient`</ApiLink> base class and implement all required abstract methods. Your implementation must be async-compatible and include proper cleanup and resource management to work seamlessly with Crawlee's concurrent processing model.\n\n## Conclusion\n\nThis guide introduced you to the HTTP clients available in Crawlee and demonstrated how to switch between them, including their installation requirements and usage examples. You also learned about the responsibilities of HTTP clients and how to implement your own custom HTTP client by inheriting from the <ApiLink to=\"class/HttpClient\">`HttpClient`</ApiLink> base class.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/http_crawlers.mdx",
    "content": "---\nid: http-crawlers\ntitle: HTTP crawlers\ndescription: Learn about Crawlee's HTTP crawlers including BeautifulSoup, Parsel, and raw HTTP crawlers for efficient server-rendered content extraction without JavaScript execution.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py';\nimport ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py';\nimport HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py';\n\nimport LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py';\nimport LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py';\nimport LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py';\nimport PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py';\nimport ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py';\n\nimport SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py';\nimport SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py';\nimport SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py';\nimport SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py';\nimport AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py';\n\nHTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. 
These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript consider using [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead.\n\n## Overview\n\nAll HTTP crawlers share a common architecture built around the <ApiLink to=\"class/AbstractHttpCrawler\">`AbstractHttpCrawler`</ApiLink> base class. The main differences lie in the parsing strategy and the context provided to request handlers. There are <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>, and <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>. It can also be extended to create custom crawlers with specialized parsing requirements. They use HTTP clients to fetch page content and parsing libraries to extract data from the HTML, check out the [HTTP clients guide](./http-clients) to learn about the HTTP clients used by these crawlers, how to switch between them, and how to create custom HTTP clients tailored to your specific requirements.\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass BasicCrawler {\n    <<abstract>>\n}\n\nclass AbstractHttpCrawler {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass HttpCrawler\n\nclass ParselCrawler\n\nclass BeautifulSoupCrawler\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nBasicCrawler --|> AbstractHttpCrawler\nAbstractHttpCrawler --|> HttpCrawler\nAbstractHttpCrawler --|> ParselCrawler\nAbstractHttpCrawler --|> BeautifulSoupCrawler\n```\n\n## BeautifulSoupCrawler\n\nThe <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> uses the 
[BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library for HTML parsing. It provides fault-tolerant parsing that handles malformed HTML, automatic character encoding detection, and supports CSS selectors, tag navigation, and custom search functions. Use this crawler when working with imperfect HTML structures, when you prefer BeautifulSoup's intuitive API, or when prototyping web scraping solutions.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupExample}\n</RunnableCodeBlock>\n\n## ParselCrawler\n\nThe <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> uses the [Parsel](https://parsel.readthedocs.io/) library, which provides XPath 1.0 and CSS selector support built on `lxml` for high performance. It includes built-in regex support for pattern matching, proper XML namespace handling, and offers better performance than BeautifulSoup while maintaining a clean API. Use this crawler when you need XPath functionality, require high-performance parsing, or need to extract data using regular expressions.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ParselExample}\n</RunnableCodeBlock>\n\n## HttpCrawler\n\nThe <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> provides direct access to HTTP response body and headers without automatic parsing, offering maximum performance with no parsing overhead. It supports any content type (JSON, XML, binary) and allows complete control over response processing, including memory-efficient handling of large responses. 
Use this crawler when working with non-HTML content, requiring maximum performance, implementing custom parsing logic, or needing access to raw response data.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {HttpExample}\n</RunnableCodeBlock>\n\n### Using custom parsers\n\nSince <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> provides raw HTTP responses, you can integrate any parsing library. Note that helpers like <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> and <ApiLink to=\"class/ExtractLinksFunction\">`extract_links`</ApiLink> are not available with this approach.\n\nThe following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods).\n\n<Tabs groupId=\"custom_parsers\">\n    <TabItem value=\"lxml\" label=\"lxml\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {LxmlParser}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"saxonche\" label=\"lxml with SaxonC-HE\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {LxmlSaxoncheParser}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"selectolax\" label=\"selectolax\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {LexborParser}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"pyquery\" label=\"PyQuery\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PyqueryParser}\n        
</RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"scrapling\" label=\"Scrapling\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ScraplingParser}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n## Custom HTTP crawler\n\nWhile the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from <ApiLink to=\"class/AbstractHttpCrawler\">`AbstractHttpCrawler`</ApiLink>. This approach requires implementing:\n\n1. **Custom parser class**: Inherit from <ApiLink to=\"class/AbstractHttpParser\">`AbstractHttpParser`</ApiLink>.\n2. **Custom context class**: Define what data and helpers are available to handlers.\n3. **Custom crawler class**: Tie everything together.\n\nThis approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format.\n\nThe following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine.\n\n### Parser implementation\n\nThe parser converts HTTP responses into a parsed document and provides methods for element selection. Implement <ApiLink to=\"class/AbstractHttpParser\">`AbstractHttpParser`</ApiLink> using `selectolax` with required methods for parsing and querying:\n\n<CodeBlock className=\"language-python\" language=\"python\" title=\"selectolax_parser.py\">\n    {SelectolaxParserSource}\n</CodeBlock>\n\nThis is enough to use your parser with `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below.\n\n### Crawling context definition (optional)\n\nThe crawling context is passed to request handlers and provides access to the parsed content. 
Extend <ApiLink to=\"class/ParsedHttpCrawlingContext\">`ParsedHttpCrawlingContext`</ApiLink> to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context.\n\n<CodeBlock className=\"language-python\" language=\"python\" title=\"selectolax_context.py\">\n    {SelectolaxContextSource}\n</CodeBlock>\n\n### Crawler composition\n\nThe crawler class connects the parser and context. Extend <ApiLink to=\"class/AbstractHttpCrawler\">`AbstractHttpCrawler`</ApiLink> and configure the context pipeline to use your custom components:\n\n<CodeBlock className=\"language-python\" language=\"python\" title=\"selectolax_crawler.py\">\n    {SelectolaxCrawlerSource}\n</CodeBlock>\n\n### Crawler usage\n\nThe custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink>. Additionally, the custom parser can be used with <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> for adaptive crawling:\n\n<Tabs groupId=\"crawlers\">\n    <TabItem value=\"selectolax_crawler\" label=\"SelectolaxCrawler\">\n        <CodeBlock className=\"language-python\" language=\"python\">\n            {SelectolaxCrawlerRunSource}\n        </CodeBlock>\n    </TabItem>\n    <TabItem value=\"adaptive_playwright_crawler\" label=\"AdaptivePlaywrightCrawler with SelectolaxParser\">\n        <CodeBlock className=\"language-python\" language=\"python\">\n            {AdaptiveCrawlerRunSource}\n        </CodeBlock>\n    </TabItem>\n</Tabs>\n\n## Conclusion\n\nThis guide provided a comprehensive overview of HTTP crawlers in Crawlee. 
You learned about the three main crawler types - <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> for fault-tolerant HTML parsing, <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> for high-performance extraction with XPath and CSS selectors, and <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> for raw response processing. You also discovered how to integrate third-party parsing libraries with <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink> and how to create fully custom crawlers using <ApiLink to=\"class/AbstractHttpCrawler\">`AbstractHttpCrawler`</ApiLink> for specialized parsing requirements.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/playwright_crawler.mdx",
    "content": "---\nid: playwright-crawler\ntitle: Playwright crawler\ndescription: Learn how to use PlaywrightCrawler for browser-based web scraping.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';\nimport BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';\nimport NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py';\nimport BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';\nimport PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';\n\nA <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> or <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. 
While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.\n\n## When to use Playwright crawler\n\nUse <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> in scenarios that require full browser capabilities, such as:\n\n- **Dynamic content rendering**: Required when pages rely on heavy JavaScript to load or modify content in the browser.\n- **Anti-scraping protection**: Helpful for sites using JavaScript-based security or advanced anti-automation measures.\n- **Complex cookie management**: Necessary for sites with session or cookie requirements that standard HTTP-based crawlers cannot handle easily.\n\nIf [HTTP-based crawlers](https://crawlee.dev/python/docs/guides/http-crawlers) are insufficient, <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> can address these challenges. See a [basic example](../examples/playwright-crawler) for a typical usage demonstration.\n\n## Advanced configuration\n\nThe <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> uses other Crawlee components under the hood, notably <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> and <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink>. These components let you configure the browser and context settings, launch multiple browsers, and apply pre-navigation hooks. You can create your own instances of these components and pass them to the <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> constructor.\n\n- The <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> manages how browsers are launched and how browser contexts are created. 
It accepts [browser launch](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new context](https://playwright.dev/python/docs/api/class-browser#browser-new-context) options.\n- The <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> manages the lifecycle of browser instances (launching, recycling, etc.). You can customize its behavior to suit your needs.\n\n## Managing multiple browsers\n\nThe <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> allows you to manage multiple browsers. Each browser instance is managed by a separate <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> and can be configured independently. This is useful for scenarios like testing multiple configurations or implementing browser rotation to help avoid blocks or detect different site behaviors.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {MultipleLaunchExample}\n</RunnableCodeBlock>\n\n## Browser launch and context configuration\n\nThe <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> provides access to all relevant Playwright configuration options for both [browser launches](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch) and [new browser contexts](https://playwright.dev/python/docs/api/class-browser#browser-new-context). 
You can specify these options in the constructor of <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> or <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BrowserConfigurationExample}\n</RunnableCodeBlock>\n\nYou can also configure each plugin used by <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink>:\n\n<CodeBlock className=\"language-python\">\n    {PluginBrowserConfigExample}\n</CodeBlock>\n\nFor an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>.\n\n## Page configuration with lifecycle page hooks\n\nFor additional setup or event-driven actions around page creation and closure, the <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to=\"class/BrowserPool#pre_page_create_hook\">`pre_page_create_hook`</ApiLink>, <ApiLink to=\"class/BrowserPool#post_page_create_hook\">`post_page_create_hook`</ApiLink>, <ApiLink to=\"class/BrowserPool#pre_page_close_hook\">`pre_page_close_hook`</ApiLink>, and <ApiLink to=\"class/BrowserPool#post_page_close_hook\">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BrowserPoolPageHooksExample}\n</RunnableCodeBlock>\n\n## Navigation hooks\n\nNavigation hooks allow for additional configuration at specific points during page navigation. 
The <ApiLink to=\"class/PlaywrightCrawler#pre_navigation_hook\">`pre_navigation_hook`</ApiLink> is called before each navigation and provides <ApiLink to=\"class/PlaywrightPreNavCrawlingContext\">`PlaywrightPreNavCrawlingContext`</ApiLink> - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to=\"class/PlaywrightPreNavCrawlingContext#block_requests\">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough. Similarly, the <ApiLink to=\"class/PlaywrightCrawler#post_navigation_hook\">`post_navigation_hook`</ApiLink> is called after each navigation and provides <ApiLink to=\"class/PlaywrightPostNavCrawlingContext\">`PlaywrightPostNavCrawlingContext`</ApiLink> - useful for post-load checks such as detecting CAPTCHAs or verifying page state.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {NavigationHooksExample}\n</RunnableCodeBlock>\n\n## Conclusion\n\nThis guide introduced the <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> and <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to=\"class/BrowserPool\">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/playwright_crawler_adaptive.mdx",
    "content": "---\nid: adaptive-playwright-crawler\ntitle: Adaptive Playwright crawler\ndescription: Learn how to use the Adaptive Playwright crawler to automatically switch between browser-based and HTTP-only crawling.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport AdaptivePlaywrightCrawlerHandler from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/handler.py';\nimport AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_adaptive/pre_nav_hooks.py';\n\nimport AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_beautifulsoup.py';\nimport AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_parsel.py';\nimport AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code_examples/playwright_crawler_adaptive/init_prediction.py';\n\nAn <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> or <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>.\nIt uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.\n\nDetection is done based on the <ApiLink to=\"class/RenderingTypePredictor\">`RenderingTypePredictor`</ApiLink> with default implementation <ApiLink to=\"class/DefaultRenderingTypePredictor\">`DefaultRenderingTypePredictor`</ApiLink>. 
It predicts which crawling method should be used and learns from already crawled pages.\n\n## When to use AdaptivePlaywrightCrawler\n\nUse <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> in scenarios where some target pages have to be crawled with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, but for others faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites.\n\nAnother use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by a code executed in a browsing client.\n\n## Request handler and adaptive context helpers\n\nRequest handler for <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> works on special context type - <ApiLink to=\"class/AdaptivePlaywrightCrawlingContext\">`AdaptivePlaywrightCrawlingContext`</ApiLink>. This context is sometimes created by HTTP-based sub crawler and sometimes by playwright based sub crawler. Due to its dynamic nature, you can't always access [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created.\n\n<ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#wait_for_selector\">`wait_for_selector`</ApiLink> accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector and return once it is found (within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. 
If not, the adaptive crawler will fall back to the playwright sub crawler and will try to locate the selector within the timeout using playwright.\n\n<ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#query_selector_one\">`query_selector_one`</ApiLink> accepts `css` selector as first argument and timeout as second argument. This function acts similarly to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> and `Tag` for <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>.\n\n<ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#query_selector_all\">`query_selector_all`</ApiLink> works the same as <ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#query_selector_one\">`query_selector_one`</ApiLink>, but returns all found selectors.\n\n<ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#parse_with_static_parser\">`parse_with_static_parser`</ApiLink> will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls <ApiLink to=\"class/AdaptivePlaywrightCrawlingContext#wait_for_selector\">`wait_for_selector`</ApiLink> and then does the parsing. 
This can be used in scenarios where a specific element signals that the page is already complete.\n\nSee the following example of how to create a request handler and use context helpers:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {AdaptivePlaywrightCrawlerHandler}\n</RunnableCodeBlock>\n\n## Crawler configuration\n\nTo use <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: <ApiLink to=\"class/AdaptivePlaywrightCrawler#with_beautifulsoup_static_parser\">`AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser`</ApiLink> or <ApiLink to=\"class/AdaptivePlaywrightCrawler#with_parsel_static_parser\">`AdaptivePlaywrightCrawler.with_parsel_static_parser`</ApiLink>.\n\n<ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> is internally composed of two sub crawlers and you can do a detailed configuration of both of them. 
For detailed configuration options of the sub crawlers, please refer to their pages: <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>, <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>.\n\nIn the following example you can see how to create and configure <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> with two different HTTP-based sub crawlers:\n\n<Tabs>\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\" default>\n        <CodeBlock className=\"language-python\">\n            {AdaptivePlaywrightCrawlerInitBeautifulSoup}\n        </CodeBlock>\n    </TabItem>\n    <TabItem value=\"ParselCrawler\" label=\"ParselCrawler\">\n        <CodeBlock className=\"language-python\">\n            {AdaptivePlaywrightCrawlerInitParsel}\n        </CodeBlock>\n    </TabItem>\n</Tabs>\n\n### Prediction related arguments\n\nTo control which pages are crawled by which method you can use following arguments:\n\n<ApiLink to=\"class/RenderingTypePredictor\">`RenderingTypePredictor`</ApiLink> - Class that can give recommendations about which sub crawler should be used for specific url. Predictor will also recommend to use both sub crawlers for some page from time to time, to check that the given recommendation was correct. Predictor should be able to learn from previous results and gradually give more reliable recommendations.\n\n`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`.\n\n`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. 
This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler.\n\nSee the following example about how to pass prediction related arguments:\n\n<CodeBlock className=\"language-python\">\n    {AdaptivePlaywrightCrawlerInitPrediction}\n</CodeBlock>\n\n## Page configuration with pre-navigation hooks\n\nIn some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to=\"class/AdaptivePlaywrightCrawler#pre_navigation_hook\">`pre_navigation_hook`</ApiLink> method of the <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> it is possible that the hook will be executed for HTTP-based sub crawler or playwright-based sub crawler. Using [page](https://playwright.dev/python/docs/api/class-page) object for hook that will be executed on HTTP-based sub crawler will raise an exception. To overcome this you can use optional argument `playwright_only` = `True` when registering the hook.\n\nSee the following example about how to register the pre navigation hooks:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {AdaptivePlaywrightCrawlerPreNavHooks}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/guides/playwright_crawler_stagehand.mdx",
    "content": "---\nid: playwright-crawler-stagehand\ntitle: Playwright with Stagehand\ndescription: How to integrate Stagehand AI-powered automation with PlaywrightCrawler.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py';\nimport BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py';\nimport StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py';\n\n[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic.\n\nStagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). 
This guide demonstrates how to integrate Stagehand with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider.\n\n:::info\n\nThis guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions.\n\n:::\n\n## Get Gemini API key\n\nYou need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key.\n\n## Create support classes for Stagehand\n\nTo integrate Stagehand with Crawlee, you need to create wrapper classes that allow <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> to manage the Playwright lifecycle.\n\nCreate `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance.\n\nCreate `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>.\n\n<CodeBlock className=\"language-python\" title=\"support_classes.py\">\n    {SupportClasses}\n</CodeBlock>\n\n## Create browser integration classes\n\nYou need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`.\n\nCreate `StagehandPlugin` - a subclass of <ApiLink to=\"class/PlaywrightBrowserPlugin\">`PlaywrightBrowserPlugin`</ApiLink> that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances.\n\nCreate `StagehandBrowserController` - a subclass of <ApiLink to=\"class/PlaywrightBrowserController\">`PlaywrightBrowserController`</ApiLink> that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand.\n\n<CodeBlock 
className=\"language-python\" title=\"browser_classes.py\">\n    {BrowserClasses}\n</CodeBlock>\n\n## Create a crawler\n\nNow you can create a <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> that uses Stagehand's AI capabilities to interact with web pages using natural language commands:\n\n<CodeBlock className=\"language-python\" title=\"stagehand_run.py\">\n    {StagehandRun}\n</CodeBlock>\n\nThe integration works through several key components:\n- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle\n- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances\n- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities\n- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations\n\nIn the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors.\n"
  },
  {
    "path": "docs/guides/proxy_management.mdx",
    "content": "---\nid: proxy-management\ntitle: Proxy management\ndescription: Using proxies to get around those annoying IP-blocks\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport QuickStartExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/quick_start_example.py';\nimport IntegrationBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_bs_example.py';\nimport IntegrationPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/integration_pw_example.py';\nimport TiersBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_bs_example.py';\nimport TiersPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/tiers_pw_example.py';\nimport InspectionBsExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_bs_example.py';\nimport InspectionPwExample from '!!raw-loader!roa-loader!./code_examples/proxy_management/inspecting_pw_example.py';\n\nimport SessionBsExample from '!!raw-loader!./code_examples/proxy_management/session_bs_example.py';\nimport SessionPwExample from '!!raw-loader!./code_examples/proxy_management/session_pw_example.py';\n\n[IP address blocking](https://en.wikipedia.org/wiki/IP_address_blocking) is one of the oldest and most effective ways of preventing access to a website. It is therefore paramount for a good web scraping library to provide easy to use but powerful tools which can work around IP blocking. 
The most powerful weapon in our anti IP blocking arsenal is a [proxy server](https://en.wikipedia.org/wiki/Proxy_server).\n\nWith Crawlee we can use our own proxy servers or proxy servers acquired from third-party providers.\n\n[//]: # (Check out the [avoid blocking guide]&#40;./avoid-blocking&#41; for more information about blocking.)\n\n## Quick start\n\nIf you already have proxy URLs of your own, you can start using them immediately in only a few lines of code.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {QuickStartExample}\n</RunnableCodeBlock>\n\nExamples of how to use our proxy URLs with crawlers are shown below in [Crawler integration](#crawler-integration) section.\n\n## Proxy configuration\n\nAll our proxy needs are managed by the <ApiLink to=\"class/ProxyConfiguration\">`ProxyConfiguration`</ApiLink> class. We create an instance using the <ApiLink to=\"class/ProxyConfiguration\">`ProxyConfiguration`</ApiLink> constructor function based on the provided options.\n\n### Crawler integration\n\n`ProxyConfiguration` integrates seamlessly into <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>.\n\n<Tabs>\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {IntegrationBsExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {IntegrationPwExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\nOur crawlers will now use the selected proxies for all connections.\n\n### IP Rotation and session management\n\nThe <ApiLink to=\"class/ProxyConfiguration#new_url\">`proxy_configuration.new_url()`</ApiLink> method allows us to pass a `session_id` parameter. 
This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink> class for more information on how maintaining a real session helps avoid blocking.\n\nFor more details on session management, check out the [Session management](./session-management) guide.\n\nWhen no `session_id` is provided, our proxy URLs are rotated round-robin.\n\n<Tabs>\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <CodeBlock className=\"language-python\">\n            {SessionBsExample}\n        </CodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <CodeBlock className=\"language-python\">\n            {SessionPwExample}\n        </CodeBlock>\n    </TabItem>\n</Tabs>\n\n### Tiered proxies\n\nWhen you use HTTP proxies in real world crawling scenarios, you have to decide which type of proxy to use to reach the sweet spot between cost efficiency and reliably avoiding blocking. Some websites may allow crawling with no proxy, on some you may get away with using datacenter proxies, which are cheap but easily detected, and sometimes you need to use expensive residential proxies.\n\nTo take the guesswork out of this process, Crawlee allows you to configure multiple tiers of proxy URLs. When crawling, it will automatically pick the lowest tier (smallest index) where it doesn't encounter blocking. 
If you organize your proxy server URLs in tiers so that the lowest tier contains the cheapest, least reliable ones and each higher tier contains more expensive, more reliable ones, you will get an optimal anti-blocking performance.\n\nIn an active tier, Crawlee will alternate between proxies in a round-robin fashion, just like it would with `proxy_urls`.\n\n<Tabs>\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {TiersBsExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {TiersPwExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n## Inspecting current proxy in crawlers\n\nThe <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> provide access to information about the currently used proxy via the request handler using a <ApiLink to=\"class/ProxyInfo\">`proxy_info`</ApiLink> object. This object allows easy access to the proxy URL.\n\n<Tabs>\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {InspectionBsExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {InspectionPwExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n"
  },
  {
    "path": "docs/guides/request_loaders.mdx",
    "content": "---\nid: request-loaders\ntitle: Request loaders\ndescription: How to manage the requests your crawler will go through.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';\nimport SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';\nimport RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';\nimport RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';\nimport SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';\nimport SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py';\nimport RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py';\nimport SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py';\n\nThe [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. 
Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together.\n\n## Overview\n\nThe [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package introduces the following abstract classes:\n\n- <ApiLink to=\"class/RequestLoader\">`RequestLoader`</ApiLink>: The base interface for reading requests in a crawl.\n- <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink>: Extends `RequestLoader` with write capabilities.\n- <ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink>: Combines a read-only `RequestLoader` with a writable `RequestManager`.\n\nAnd specific request loader implementations:\n\n- <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink>: A lightweight implementation for managing a static list of URLs.\n- <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink>: A specialized loader that reads URLs from XML and plain-text sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html) with filtering capabilities.\n\nBelow is a class diagram that illustrates the relationships between these components and the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>:\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass Storage {\n    <<abstract>>\n    + id\n    + name\n    + open()\n    + drop()\n}\n\nclass RequestLoader {\n    <<abstract>>\n    + handled_count\n    + total_count\n    + fetch_next_request()\n    + mark_request_as_handled()\n    + is_empty()\n    + is_finished()\n    + to_tandem()\n}\n\nclass RequestManager {\n    <<abstract>>\n    + add_request()\n    + add_requests_batched()\n    + reclaim_request()\n    + drop()\n}\n\n%% ========================\n%% Specific classes\n%% 
========================\n\nclass RequestQueue\n\nclass RequestList\n\nclass SitemapRequestLoader\n\nclass RequestManagerTandem\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nStorage --|> RequestQueue\nRequestManager --|> RequestQueue\n\nRequestLoader --|> RequestManager\nRequestLoader --|> RequestList\nRequestLoader --|> SitemapRequestLoader\nRequestManager --|> RequestManagerTandem\n```\n\n## Request loaders\n\nThe <ApiLink to=\"class/RequestLoader\">`RequestLoader`</ApiLink> interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink>, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the <ApiLink to=\"class/RequestLoader\">`RequestLoader`</ApiLink> API reference.\n\n:::info NOTE\nTo learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.\n:::\n\n### Request list\n\nThe <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.\n\nHere is a basic example of working with the <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink>:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RlBasicExample}\n</RunnableCodeBlock>\n\n### Request list with persistence\n\nThe <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> supports state persistence, allowing it to resume from where it left off after interruption. 
This is particularly useful for long-running crawls or when you need to pause and resume crawling later.\n\nTo enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RlBasicPersistExample}\n</RunnableCodeBlock>\n\n### Sitemap request loader\n\nThe <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> is a specialized request loader that reads URLs from sitemaps following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats. It's particularly useful when you want to crawl a website systematically by following its sitemap structure.\n\n:::note\nThe `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality.\n:::\n\nThe loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. 
The <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SitemapExample}\n</RunnableCodeBlock>\n\n### Sitemap request loader with persistence\n\nSimilarly, the <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SitemapPersistExample}\n</RunnableCodeBlock>\n\nWhen using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed.\n\n## Request managers\n\nThe <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> API reference.\n\n## Request manager tandem\n\nThe <ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink> class allows you to combine the read-only capabilities of a `RequestLoader` (like <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink>) with the read-write capabilities of a `RequestManager` (like <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>). This is useful for scenarios where you need to load initial requests from a static source (such as a file or database) and dynamically add or retry requests during the crawl. 
Additionally, it provides deduplication capabilities, ensuring that requests are not processed multiple times.\n\nUnder the hood, <ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink> checks whether the read-only loader still has pending requests. If so, each new request from the loader is transferred to the manager. Any newly added or reclaimed requests go directly to the manager side.\n\n### Request list with request queue\n\nThis section describes the combination of the <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> and <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> classes. This setup is particularly useful when you have a static list of URLs that you want to crawl, but also need to handle dynamic requests discovered during the crawl process. The <ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink> class facilitates this combination, with the <ApiLink to=\"class/RequestLoader#to_tandem\">`RequestLoader.to_tandem`</ApiLink> method available as a convenient shortcut. 
Requests from the <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> are processed first by being enqueued into the default <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, which handles persistence and retries for failed requests.\n\n<Tabs groupId=\"request_manager_tandem\">\n    <TabItem value=\"request_manager_tandem_explicit\" label=\"Explicit usage\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RlExplicitTandemExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"request_manager_tandem_helper\" label=\"Using to_tandem helper\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RlTandemExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n### Sitemap request loader with request queue\n\nSimilar to the <ApiLink to=\"class/RequestList\">`RequestList`</ApiLink> example above, you can combine a <ApiLink to=\"class/SitemapRequestLoader\">`SitemapRequestLoader`</ApiLink> with a <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> using the <ApiLink to=\"class/RequestManagerTandem\">`RequestManagerTandem`</ApiLink> class. This setup is particularly useful when you want to crawl URLs from a sitemap while also handling dynamic requests discovered during the crawl process. 
URLs from the sitemap are processed first by being enqueued into the default <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, which handles persistence and retries for failed requests.\n\n<Tabs groupId=\"sitemap_request_manager_tandem\">\n    <TabItem value=\"sitemap_request_manager_tandem_explicit\" label=\"Explicit usage\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {SitemapExplicitTandemExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"sitemap_request_manager_tandem_helper\" label=\"Using to_tandem helper\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {SitemapTandemExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n## Conclusion\n\nThis guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs and requests. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` implementations. You also saw practical examples of how to work with these classes to handle various crawling scenarios.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/request_router.mdx",
    "content": "---\nid: request-router\ntitle: Request router\ndescription: Learn how to use the Router class to organize request handlers, error handlers, and pre-navigation hooks in Crawlee.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BasicRequestHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/basic_request_handlers.py';\nimport SimpleDefaultHandler from '!!raw-loader!roa-loader!./code_examples/request_router/simple_default_handler.py';\nimport CustomRouterDefaultOnly from '!!raw-loader!roa-loader!./code_examples/request_router/custom_router_default_only.py';\nimport HttpPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/http_pre_navigation.py';\nimport ErrorHandler from '!!raw-loader!roa-loader!./code_examples/request_router/error_handler.py';\nimport FailedRequestHandler from '!!raw-loader!roa-loader!./code_examples/request_router/failed_request_handler.py';\nimport PlaywrightPreNavigation from '!!raw-loader!roa-loader!./code_examples/request_router/playwright_pre_navigation.py';\nimport AdaptiveCrawlerHandlers from '!!raw-loader!roa-loader!./code_examples/request_router/adaptive_crawler_handlers.py';\n\nThe <ApiLink to=\"class/Router\">`Router`</ApiLink> class manages request flow and coordinates the execution of user-defined logic in Crawlee projects. It routes incoming requests to appropriate user-defined handlers based on labels, manages error scenarios, and provides hooks for pre-navigation execution. The <ApiLink to=\"class/Router\">`Router`</ApiLink> serves as the orchestrator for all crawling operations, ensuring that each request is processed by the correct handler according to its type and label.\n\n## Request handlers\n\nRequest handlers are user-defined functions that process individual requests and their corresponding responses. 
Each handler receives a crawling context as its primary argument, which provides access to the current request, response data, and utility methods for data extraction, link enqueuing, and storage operations. Handlers determine how different types of pages are processed and how data is extracted and stored.\n\n:::note\n\nThe code examples in this guide use <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> for demonstration, but the <ApiLink to=\"class/Router\">`Router`</ApiLink> works with all crawler types.\n\n:::\n\n### Built-in router\n\nEvery crawler instance includes a built-in <ApiLink to=\"class/Router\">`Router`</ApiLink> accessible through the `crawler.router` property. This approach simplifies initial setup and covers basic use cases where request routing requirements are straightforward.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SimpleDefaultHandler}\n</RunnableCodeBlock>\n\nThe default handler processes all requests that either lack a label or have a label for which no specific handler has been registered.\n\n### Custom router\n\nApplications requiring explicit control over router configuration or router reuse across multiple crawler instances can create custom <ApiLink to=\"class/Router\">`Router`</ApiLink> instances. Custom routers provide complete control over request routing configuration and enable modular application architecture. Router instances can be configured independently and attached to your crawler instances as needed.\n\nYou can also implement a custom request router class from scratch or by inheriting from <ApiLink to=\"class/Router\">`Router`</ApiLink>. 
This allows you to define custom routing logic or manage request handlers in a different way.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CustomRouterDefaultOnly}\n</RunnableCodeBlock>\n\n### Advanced routing by labels\n\nMore complex crawling projects often require different processing logic for various page types. The router supports label-based routing, which allows registration of specialized handlers for specific content categories. This pattern enables clean separation of concerns and targeted processing logic for different URL patterns or content types.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BasicRequestHandlers}\n</RunnableCodeBlock>\n\n## Error handlers\n\nCrawlee provides error handling mechanisms to manage request processing failures. It distinguishes between recoverable errors that may succeed on retry and permanent failures that require alternative handling strategies.\n\n### Error handler\n\nThe error handler executes when exceptions occur during request processing, before any retry attempts. This handler receives the error context and can implement custom recovery logic, modify request parameters, or determine whether the request should be retried. Error handlers enable control over failure scenarios and allow applications to implement error recovery strategies.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ErrorHandler}\n</RunnableCodeBlock>\n\n### Failed request handler\n\nThe failed request handler executes when a request has exhausted all retry attempts and is considered permanently failed. 
This handler serves as the final opportunity to log failures, store failed requests for later analysis, create alternative requests, or implement fallback processing strategies.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {FailedRequestHandler}\n</RunnableCodeBlock>\n\n## Pre-navigation hooks\n\nPre-navigation hooks execute before each request is processed, providing opportunities to configure request parameters, modify browser settings, or implement request-specific optimizations. You can use pre-navigation hooks for example for viewport configuration, resource blocking, timeout management, header customization, custom proxy rotation, and request interception.\n\n### HTTP crawler\n\nHTTP crawlers support pre-navigation hooks that execute before making HTTP requests. These hooks enable request modification, header configuration, and other HTTP-specific optimizations.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {HttpPreNavigation}\n</RunnableCodeBlock>\n\n### Playwright crawler\n\nPlaywright crawlers provide extensive pre-navigation capabilities that allow browser page configuration before navigation. These hooks can modify browser behavior and configure page settings.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {PlaywrightPreNavigation}\n</RunnableCodeBlock>\n\n### Adaptive Playwright crawler\n\nThe <ApiLink to=\"class/AdaptivePlaywrightCrawler\">`AdaptivePlaywrightCrawler`</ApiLink> implements a dual-hook system with common hooks that execute for all requests and Playwright-specific hooks that execute only when browser automation is required. 
This is perfect for projects that need both static and dynamic content handling.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {AdaptiveCrawlerHandlers}\n</RunnableCodeBlock>\n\n## Conclusion\n\nThis guide introduced you to the <ApiLink to=\"class/Router\">`Router`</ApiLink> class and how to organize your crawling logic. You learned how to use built-in and custom routers, implement request handlers with label-based routing, handle errors with error and failed request handlers, and configure pre-navigation hooks for different crawler types.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/running_in_web_server.mdx",
    "content": "---\nid: running-in-web-server\ntitle: Running in web server\ndescription: Running in web server\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py';\nimport Server from '!!raw-loader!./code_examples/running_in_web_server/server.py';\n\n\nMost of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in.\n\nWe will build a simple HTTP server that receives a page URL and returns the page title in the response.\n\n## Set up a web server\n\nThere are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), ... 
In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple.\n\nThis will be our core server setup:\n\n<CodeBlock className=\"language-python\" title=\"server.py\">\n    {Server}\n</CodeBlock>\n\nThe server has two endpoints.\n- `/` - The index just gives a short description of the server with an example link to the second endpoint.\n- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL.\n\nTo run the example server, make sure that you have installed [fastapi[standard]](https://fastapi.tiangolo.com/#installation), then run the following command from the directory where the example code is located:\n```\nfastapi dev server.py\n```\n\n## Create a crawler\n\nWe will create a standard <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>. This way it will always be waiting for new requests to come in.\n\n<CodeBlock className=\"language-python\" title=\"crawler.py\">\n    {Crawler}\n</CodeBlock>\n\nThe crawler is defined inside of [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan), which is the FastAPI way to run startup/teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`:\n- `crawler` holds the instance of our crawler and allows the app to interact with it.\n- `requests_to_results` is a dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler.\n"
  },
  {
    "path": "docs/guides/scaling_crawlers.mdx",
    "content": "---\nid: scaling-crawlers\ntitle: Scaling crawlers\ndescription: Learn how to scale your crawlers by controlling concurrency and limiting requests per minute.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport MaxTasksPerMinuteExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/max_tasks_per_minute_example.py';\nimport MinAndMaxConcurrencyExample from '!!raw-loader!roa-loader!./code_examples/scaling_crawlers/min_and_max_concurrency_example.py';\n\nAs we build our crawler, we may want to control how many tasks it performs at any given time. In other words, how many requests it makes to the web we are trying to scrape. Crawlee offers several options to fine-tune the number of parallel tasks, limit the number of requests per minute, and optimize scaling based on available system resources.\n\n:::tip\n\nAll of these options are available across all crawlers provided by Crawlee. In this guide, we are using the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> as an example. You should also explore the <ApiLink to=\"class/ConcurrencySettings\">`ConcurrencySettings`</ApiLink>.\n\n:::\n\n## Max tasks per minute\n\nThe `max_tasks_per_minute` setting in <ApiLink to=\"class/ConcurrencySettings\">`ConcurrencySettings`</ApiLink> controls how many total tasks the crawler can process per minute. It ensures that tasks are spread evenly throughout the minute, preventing a sudden burst at the `max_concurrency` limit followed by idle time. By default, this is set to `Infinity`, meaning the crawler can run at full speed, limited only by `max_concurrency`. 
Use this option if you want to throttle your crawler to avoid overwhelming the target website with continuous requests.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {MaxTasksPerMinuteExample}\n</RunnableCodeBlock>\n\n## Minimum and maximum concurrency\n\nThe `min_concurrency` and `max_concurrency` options in the <ApiLink to=\"class/ConcurrencySettings\">`ConcurrencySettings`</ApiLink> define the minimum and maximum number of parallel tasks that can run at any given time. By default, crawlers start with a single parallel task and gradually scale up to the configured maximum number of concurrent requests.\n\n:::caution Avoid setting minimum concurrency too high\n\nIf you set `min_concurrency` too high compared to the available system resources, the crawler may run very slowly or even crash. It is recommended to stick with the default value and let the crawler automatically adjust concurrency based on the system's available resources.\n\n:::\n\n## Desired concurrency\n\nThe `desired_concurrency` option in the <ApiLink to=\"class/ConcurrencySettings\">`ConcurrencySettings`</ApiLink> specifies the initial number of parallel tasks to start with, assuming sufficient resources are available. It defaults to the same value as `min_concurrency`.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {MinAndMaxConcurrencyExample}\n</RunnableCodeBlock>\n\n## Autoscaled pool\n\nThe <ApiLink to=\"class/AutoscaledPool\">`AutoscaledPool`</ApiLink> manages a pool of asynchronous, resource-intensive tasks that run in parallel. It automatically starts new tasks only when there is enough free CPU and memory. To monitor system resources, it leverages the <ApiLink to=\"class/Snapshotter\">`Snapshotter`</ApiLink> and <ApiLink to=\"class/SystemStatus\">`SystemStatus`</ApiLink> classes. If any task raises an exception, the error is propagated, and the pool is stopped. 
Every crawler uses an <ApiLink to=\"class/AutoscaledPool\">`AutoscaledPool`</ApiLink> under the hood.\n"
  },
  {
    "path": "docs/guides/service_locator.mdx",
    "content": "---\nid: service-locator\ntitle: Service locator\ndescription: Crawlee's service locator is a central registry for global services, managing and providing access to them throughout the whole framework.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport ServiceLocatorConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_configuration.py';\nimport ServiceLocatorStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_storage_client.py';\nimport ServiceLocatorEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_locator_event_manager.py';\n\nimport ServiceCrawlerConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_configuration.py';\nimport ServiceCrawlerStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_storage_client.py';\nimport ServiceCrawlerEventManager from '!!raw-loader!roa-loader!./code_examples/service_locator/service_crawler_event_manager.py';\n\nimport ServiceStorageConfiguration from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_configuration.py';\nimport ServiceStorageStorageClient from '!!raw-loader!roa-loader!./code_examples/service_locator/service_storage_storage_client.py';\n\nimport ServiceConflicts from '!!raw-loader!roa-loader!./code_examples/service_locator/service_conflicts.py';\n\nThe <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> is a central registry for global services. 
It manages and provides access to these services throughout the framework, ensuring their consistent configuration across all components.\n\nThe service locator manages three core services: <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink>, <ApiLink to=\"class/EventManager\">`EventManager`</ApiLink>, and <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink>. All services are initialized lazily with defaults when first accessed.\n\n## Services\n\nThere are three core services that are managed by the service locator:\n\n### Configuration\n\n<ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables.\n\n### StorageClient\n\n<ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> is the backend implementation for storages in Crawlee. It provides a unified interface for <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, and <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, regardless of the underlying storage implementation. Storage clients were already explained in the storage clients section.\n\nRefer to the [Storage clients guide](./storage-clients) for more information about storage clients and how to use them.\n\n### EventManager\n\n<ApiLink to=\"class/EventManager\">`EventManager`</ApiLink> is responsible for coordinating internal events in Crawlee. It allows you to register event listeners and emit events throughout the framework. Examples of such events include aborting, migrating, system info, or browser-specific events like page created, page closed and more. 
It provides a way to listen to events and execute custom logic when certain events occur.\n\n## Service registration\n\nThere are several ways to register services in Crawlee, depending on your use case and preferences.\n\n### Via service locator\n\nServices can be registered globally through the <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> before they are first accessed. There is a singleton `service_locator` instance that is used throughout the framework, making the services available to all components throughout the whole framework.\n\n<Tabs>\n\n<TabItem value=\"storage-client\" label=\"Storage client\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceLocatorStorageClient}\n    </RunnableCodeBlock>\n</TabItem>\n\n<TabItem value=\"configuration\" label=\"Configuration\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceLocatorConfiguration}\n    </RunnableCodeBlock>\n</TabItem>\n\n<TabItem value=\"event-manager\" label=\"Event manager\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceLocatorEventManager}\n    </RunnableCodeBlock>\n</TabItem>\n\n</Tabs>\n\n### Via crawler constructors\n\nAlternatively, services can be passed to the crawler constructors. 
They will be registered globally to the <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> under the hood, making them available to all components and ensuring consistent configuration.\n\n<Tabs>\n\n<TabItem value=\"storage-client\" label=\"Storage client\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceCrawlerStorageClient}\n    </RunnableCodeBlock>\n</TabItem>\n\n<TabItem value=\"configuration\" label=\"Configuration\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceCrawlerConfiguration}\n    </RunnableCodeBlock>\n</TabItem>\n\n<TabItem value=\"event-manager\" label=\"Event manager\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceCrawlerEventManager}\n    </RunnableCodeBlock>\n</TabItem>\n\n</Tabs>\n\n### Via storage constructors\n\nAlternatively, services can be provided when opening specific storage instances, which uses them only for that particular instance without affecting global configuration.\n\n<Tabs>\n\n<TabItem value=\"storage-client\" label=\"Storage client\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceStorageStorageClient}\n    </RunnableCodeBlock>\n</TabItem>\n\n<TabItem value=\"configuration\" label=\"Configuration\">\n    <RunnableCodeBlock className=\"language-python\" language=\"python\">\n        {ServiceStorageConfiguration}\n    </RunnableCodeBlock>\n</TabItem>\n\n</Tabs>\n\n## Conflict prevention\n\nOnce a service has been retrieved from the service locator, attempting to set a different instance will raise a <ApiLink to=\"class/ServiceConflictError\">`ServiceConflictError`</ApiLink> to prevent accidental configuration conflicts.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ServiceConflicts}\n</RunnableCodeBlock>\n\n## Conclusion\n\nThe <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> is a tool for 
managing global services in Crawlee. It provides a consistent way to configure and access services throughout the framework, ensuring that all components have access to the same configuration and services.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/session_management.mdx",
    "content": "---\nid: session-management\ntitle: Session management\ndescription: How to manage your cookies, proxy IP rotations and more.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BasicSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_basic.py';\nimport HttpSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_http.py';\nimport BeautifulSoupSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_beautifulsoup.py';\nimport ParselSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_parsel.py';\nimport PlaywrightSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_playwright.py';\nimport StandaloneSource from '!!raw-loader!roa-loader!./code_examples/session_management/sm_standalone.py';\nimport OneSession from '!!raw-loader!roa-loader!./code_examples/session_management/one_session_http.py';\nimport MultiSessions from '!!raw-loader!roa-loader!./code_examples/session_management/multi_sessions_http.py';\n\nThe <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink> class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies.\n\nAdditionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address.\n\nFinally, it ensures even IP address rotation by randomly selecting sessions. 
This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper.\n\nFor more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide.\n\nNow, let's explore examples of how to use the <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink> in different scenarios:\n- with <ApiLink to=\"class/BasicCrawler\">`BasicCrawler`</ApiLink>;\n- with <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>;\n- with <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>;\n- with <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>;\n- with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>;\n- without a crawler (standalone usage to manage sessions manually).\n\n<Tabs groupId=\"session_pool\">\n    <TabItem value=\"basic\" label=\"BasicCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BasicSource}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"http\" label=\"HttpCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {HttpSource}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"beautifulsoup\" label=\"BeautifulSoupCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulSoupSource}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"parsel\" label=\"ParselCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ParselSource}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"playwright\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightSource}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"standalone\" label=\"Standalone\">\n        <RunnableCodeBlock 
className=\"language-python\" language=\"python\">\n            {StandaloneSource}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\nThese examples demonstrate the basics of configuring and using the <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink>.\n\nPlease, bear in mind that <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink> requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected and will improve over time.\n\n## Configuring a single session\n\nIn some cases, you need full control over session usage. For example, when working with websites requiring authentication or initialization of certain parameters like cookies.\n\nWhen working with a site that requires authentication, we typically don't want multiple sessions with different browser fingerprints or client parameters accessing the site. In this case, we need to configure the <ApiLink to=\"class/SessionPool\">`SessionPool`</ApiLink> appropriately:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {OneSession}\n</RunnableCodeBlock>\n\n## Binding requests to specific sessions\n\nIn the previous example, there's one obvious limitation - you're restricted to only one session.\n\nIn some cases, we need to achieve the same behavior but using multiple sessions in parallel, such as authenticating with different profiles or using different proxies.\n\nTo do this, use the `session_id` parameter for the <ApiLink to=\"class/Request\">`Request`</ApiLink> object to bind a request to a specific session:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {MultiSessions}\n</RunnableCodeBlock>\n"
  },
  {
    "path": "docs/guides/storage_clients.mdx",
    "content": "---\nid: storage-clients\ntitle: Storage clients\ndescription: How to work with storage clients in Crawlee, including the built-in clients and how to create your own.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py';\nimport FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py';\nimport FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py';\nimport CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py';\nimport RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';\nimport SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py';\nimport SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py';\nimport RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py';\nimport RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py';\n\nStorage clients provide a unified interface for interacting with <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, and <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, regardless of the 
underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.\n\n## Built-in storage clients\n\nThe following storage client implementations are available:\n\n- <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.\n- <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.\n- <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/), [PostgreSQL](https://www.postgresql.org/), [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite, `crawlee[sql_postgres]` for PostgreSQL or `crawlee[sql_mysql]` for MySQL and MariaDB.\n- <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. 
Requires installing the extra dependency `crawlee[redis]`.\n- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass StorageClient {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass FileSystemStorageClient\n\nclass MemoryStorageClient\n\nclass SqlStorageClient\n\nclass RedisStorageClient\n\nclass ApifyStorageClient\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nStorageClient --|> FileSystemStorageClient\nStorageClient --|> MemoryStorageClient\nStorageClient --|> SqlStorageClient\nStorageClient --|> RedisStorageClient\nStorageClient --|> ApifyStorageClient\n```\n\n### File system storage client\n\nThe <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink> provides persistent storage by writing data directly to the file system. It uses intelligent caching and batch processing for better performance while storing data in human-readable JSON format. This is the default storage client used by Crawlee when no other storage client is specified, making it ideal for large datasets and long-running operations where data persistence is required.\n\n:::warning Concurrency limitation\nThe `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time.\n:::\n\nThis storage client is ideal for large datasets, and long-running operations where data persistence is required. 
Data can be easily inspected and shared with other tools.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {FileSystemStorageClientBasicExample}\n</RunnableCodeBlock>\n\nConfiguration options for the <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> class:\n\n- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory for all storage data.\n- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.\n\nData is stored using the following directory structure:\n\n```text\n{CRAWLEE_STORAGE_DIR}/\n├── datasets/\n│   └── {DATASET_NAME}/\n│       ├── __metadata__.json\n│       ├── 000000001.json\n│       └── 000000002.json\n├── key_value_stores/\n│   └── {KVS_NAME}/\n│       ├── __metadata__.json\n│       ├── key1.json\n│       ├── key2.txt\n│       └── key3.json\n└── request_queues/\n    └── {RQ_NAME}/\n        ├── __metadata__.json\n        ├── {REQUEST_ID_1}.json\n        └── {REQUEST_ID_2}.json\n```\n\nWhere:\n- `{CRAWLEE_STORAGE_DIR}` - The root directory for local storage.\n- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}` - The unique names for each storage instance (defaults to `\"default\"`).\n- Files are stored directly without additional metadata files for simpler structure.\n\nHere is an example of how to configure the <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink>:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {FileSystemStorageClientConfigurationExample}\n</RunnableCodeBlock>\n\n### Memory storage client\n\nThe <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink> stores all data in memory using Python data structures. 
It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. This storage client is primarily suitable for testing and development, and is usually not a good fit for production use. However, in some cases where speed is prioritized over persistence, it can make sense.\n\n:::warning Persistence limitation\nThe `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates.\n:::\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {MemoryStorageClientBasicExample}\n</RunnableCodeBlock>\n\n### SQL storage client\n\n:::warning Experimental feature\nThe `SqlStorageClient` is experimental. Its API and behavior may change in future releases.\n:::\n\nThe <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> provides persistent storage using a SQL database (SQLite by default, or PostgreSQL, MySQL, MariaDB). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes.\n\n:::note dependencies\nThe <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> is not included in the core Crawlee package.\nTo use it, you need to install Crawlee with the appropriate extra dependency:\n\n- For SQLite support, run:\n  <code>pip install 'crawlee[sql_sqlite]'</code>\n- For PostgreSQL support, run:\n  <code>pip install 'crawlee[sql_postgres]'</code>\n- For MySQL or MariaDB support, run:\n  <code>pip install 'crawlee[sql_mysql]'</code>\n:::\n\nBy default, <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> uses SQLite.\nTo use a different database, just provide the appropriate connection string via the `connection_string` parameter. 
No other code changes are needed—the same client works for all supported databases.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SQLStorageClientBasicExample}\n</RunnableCodeBlock>\n\nData is organized in relational tables. Below are the main tables and columns used for each storage type:\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Storage Clients\n%% ========================\n\nclass SqlDatasetClient {\n    <<Dataset>>\n}\n\nclass SqlKeyValueStoreClient {\n    <<Key-value store>>\n}\n\n%% ========================\n%% Dataset Tables\n%% ========================\n\nclass datasets {\n    <<table>>\n    + dataset_id (PK)\n    + internal_name\n    + name\n    + accessed_at\n    + created_at\n    + modified_at\n    + item_count\n    + buffer_locked_until\n}\n\nclass dataset_records {\n    <<table>>\n    + item_id (PK)\n    + dataset_id (FK)\n    + data\n}\n\nclass dataset_metadata_buffer {\n    <<table>>\n    + id (PK)\n    + accessed_at\n    + modified_at\n    + delta_item_count\n}\n\n%% ========================\n%% Key-Value Store Tables\n%% ========================\n\nclass key_value_stores {\n    <<table>>\n    + key_value_store_id (PK)\n    + internal_name\n    + name\n    + accessed_at\n    + created_at\n    + modified_at\n    + buffer_locked_until\n}\n\nclass key_value_store_records {\n    <<table>>\n    + key_value_store_id (FK, PK)\n    + key (PK)\n    + value\n    + content_type\n    + size\n}\n\nclass key_value_store_metadata_buffer {\n    <<table>>\n    + id (PK)\n    + accessed_at\n    + modified_at\n}\n\n%% ========================\n%% Client to Table arrows\n%% ========================\n\nSqlDatasetClient --> datasets\nSqlDatasetClient --> dataset_records\nSqlDatasetClient --> dataset_metadata_buffer\n\nSqlKeyValueStoreClient --> key_value_stores\nSqlKeyValueStoreClient --> key_value_store_records\nSqlKeyValueStoreClient --> 
key_value_store_metadata_buffer\n```\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Storage Clients\n%% ========================\n\nclass SqlRequestQueueClient {\n    <<Request queue>>\n}\n\n%% ========================\n%% Request Queue Tables\n%% ========================\n\nclass request_queues {\n    <<table>>\n    + request_queue_id (PK)\n    + internal_name\n    + name\n    + accessed_at\n    + created_at\n    + modified_at\n    + had_multiple_clients\n    + handled_request_count\n    + pending_request_count\n    + total_request_count\n    + buffer_locked_until\n}\n\nclass request_queue_records {\n    <<table>>\n    + request_id (PK)\n    + request_queue_id (FK, PK)\n    + data\n    + sequence_number\n    + is_handled\n    + time_blocked_until\n    + client_key\n}\n\nclass request_queue_state {\n    <<table>>\n    + request_queue_id (FK, PK)\n    + sequence_counter\n    + forefront_sequence_counter\n}\n\nclass request_queue_metadata_buffer {\n    <<table>>\n    + id (PK)\n    + accessed_at\n    + modified_at\n    + client_id\n    + delta_handled_count\n    + delta_pending_count\n    + delta_total_count\n    + need_recalc\n}\n\n%% ========================\n%% Client to Table arrows\n%% ========================\n\nSqlRequestQueueClient --> request_queues\nSqlRequestQueueClient --> request_queue_records\nSqlRequestQueueClient --> request_queue_state\nSqlRequestQueueClient --> request_queue_metadata_buffer\n```\n\nConfiguration options for the <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> class:\n\n- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided.\n- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - 
Whether to purge default storages on start.\n\nConfiguration options for the <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> can be set via constructor arguments:\n\n- **`connection_string`** (default: SQLite in <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db`, `postgresql+asyncpg://user:pass@host/db`, `mysql+aiomysql://user:pass@host/db` or `mariadb+aiomysql://user:pass@host/db`.\n- **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional).\n\nFor advanced scenarios, you can configure <ApiLink to=\"class/SqlStorageClient\">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.\n\n:::warning\nIf you use MySQL or MariaDB, pass the `isolation_level='READ COMMITTED'` argument to `create_async_engine`. MySQL/MariaDB default to the `REPEATABLE READ` isolation level, which can cause unnecessary locking, deadlocks, or stale reads when multiple Crawlee workers access the same tables concurrently. Using `READ COMMITTED` ensures more predictable row-level locking and visibility semantics for `SqlStorageClient`.\n:::\n\n<CodeBlock className=\"language-python\" language=\"python\">\n    {SQLStorageClientConfigurationExample}\n</CodeBlock>\n\n### Redis storage client\n\n:::warning Experimental feature\nThe <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> is experimental. Its API and behavior may change in future releases.\n:::\n\nThe <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> provides persistent storage using a [Redis](https://redis.io/) database. 
It supports concurrent access from multiple independent clients or processes and uses Redis native data structures for efficient operations.\n\n:::note dependencies\nThe <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> is not included in the core Crawlee package.\nTo use it, you need to install Crawlee with the Redis extra dependency:\n\n<code>pip install 'crawlee[redis]'</code>\n\nAdditionally, Redis version 8.0 or higher is required.\n:::\n\n:::note Redis persistence\nData persistence in Redis depends on your [database configuration](https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/).\n:::\n\nThe client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption.\n\n<CodeBlock className=\"language-python\" language=\"python\">\n    {RedisStorageClientBasicExample}\n</CodeBlock>\n\nData is organized using Redis key patterns. 
Below are the main data structures used for each storage type:\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Storage Client\n%% ========================\n\nclass RedisDatasetClient {\n    <<Dataset>>\n}\n\n%% ========================\n%% Dataset Keys\n%% ========================\n\nclass DatasetKeys {\n    datasets:[name]:items - JSON Array\n    datasets:[name]:metadata - JSON Object\n}\n\nclass DatasetsIndexes {\n    datasets:id_to_name - Hash\n    datasets:name_to_id - Hash\n}\n\n%% ========================\n%% Client to Keys arrows\n%% ========================\n\nRedisDatasetClient --> DatasetKeys\nRedisDatasetClient --> DatasetsIndexes\n```\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Storage Clients\n%% ========================\n\nclass RedisKeyValueStoreClient {\n    <<Key-value store>>\n}\n\n%% ========================\n%% Key-Value Store Keys\n%% ========================\n\nclass KeyValueStoreKeys {\n    key_value_stores:[name]:items - Hash\n    key_value_stores:[name]:metadata_items - Hash\n    key_value_stores:[name]:metadata - JSON Object\n}\n\nclass KeyValueStoresIndexes {\n    key_value_stores:id_to_name - Hash\n    key_value_stores:name_to_id - Hash\n}\n\n%% ========================\n%% Client to Keys arrows\n%% ========================\n\nRedisKeyValueStoreClient --> KeyValueStoreKeys\nRedisKeyValueStoreClient --> KeyValueStoresIndexes\n```\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Storage Clients\n%% ========================\n\nclass RedisRequestQueueClient {\n    <<Request queue>>\n}\n\n%% ========================\n%% Request Queue Keys\n%% ========================\n\nclass RequestQueueKeys{\n    request_queues:[name]:queue - List\n    request_queues:[name]:data - Hash\n    
request_queues:[name]:in_progress - Hash\n    request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy\n    request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy\n    request_queues:[name]:pending_set - Set | default queue_dedup_strategy\n    request_queues:[name]:handled_set - Set | default queue_dedup_strategy\n    request_queues:[name]:metadata - JSON Object\n}\n\nclass RequestQueuesIndexes {\n    request_queues:id_to_name - Hash\n    request_queues:name_to_id - Hash\n}\n\n%% ========================\n%% Client to Keys arrows\n%% ========================\n\nRedisRequestQueueClient --> RequestQueueKeys\nRedisRequestQueueClient --> RequestQueuesIndexes\n```\n\nConfiguration options for the <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to=\"class/Configuration\">`Configuration`</ApiLink> class:\n\n- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.\n\nConfiguration options for the <ApiLink to=\"class/RedisStorageClient\">`RedisStorageClient`</ApiLink> can be set via constructor arguments:\n\n- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`.\n- **`redis`** - Pre-configured Redis client instance (optional).\n\n<CodeBlock className=\"language-python\" language=\"python\">\n    {RedisStorageClientConfigurationExample}\n</CodeBlock>\n\n## Creating a custom storage client\n\nA storage client consists of two parts: the storage client factory and individual storage type clients. 
The <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to=\"class/DatasetClient\">`DatasetClient`</ApiLink>, <ApiLink to=\"class/KeyValueStoreClient\">`KeyValueStoreClient`</ApiLink>, <ApiLink to=\"class/RequestQueueClient\">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.\n\nHere is an example of a custom storage client that implements the <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> interface:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CustomStorageClientExample}\n</RunnableCodeBlock>\n\nCustom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages.\n\n## Registering storage clients\n\nStorage clients can be registered in multiple ways:\n- **Globally** - Using the <ApiLink to=\"class/ServiceLocator\">`ServiceLocator`</ApiLink> or passing directly to the crawler.\n- **Per storage** - When opening a specific storage instance like <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, or <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RegisteringStorageClientsExample}\n</RunnableCodeBlock>\n\nYou can also register different storage clients for each storage instance, allowing you to use different backends for different storages. 
This is useful when you want to use a fast in-memory storage for <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> while persisting scraping results in <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> or <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>.\n\n## Conclusion\n\nStorage clients in Crawlee provide different backends for data storage. Use <ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink> for testing and fast operations without persistence, or <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink> for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the <ApiLink to=\"class/StorageClient\">`StorageClient`</ApiLink> interface.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/storages.mdx",
    "content": "---\nid: storages\ntitle: Storages\ndescription: How to work with storages in Crawlee, how to manage requests and how to store and retrieve scraping results.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport OpeningExample from '!!raw-loader!roa-loader!./code_examples/storages/opening.py';\n\nimport RqBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_basic_example.py';\nimport RqWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_example.py';\nimport RqWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/rq_with_crawler_explicit_example.py';\nimport RqHelperAddRequestsExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_add_requests_example.py';\nimport RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples/storages/helper_enqueue_links_example.py';\n\nimport DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py';\nimport DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py';\nimport DatasetWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py';\n\nimport KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py';\nimport KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py';\nimport KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py';\n\nimport CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py';\nimport CleaningPurgeExplicitlyExample from 
'!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py';\n\nCrawlee offers several storage types for managing and persisting your crawling data. Request-oriented storages, such as the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, help you store and deduplicate URLs, while result-oriented storages, like <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> and <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, focus on storing and retrieving scraping results. This guide explains when to use each type, how to interact with them, and how to control their lifecycle.\n\n## Overview\n\nCrawlee's storage system consists of two main layers:\n- **Storages** (<ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>): High-level interfaces for interacting with different storage types.\n- **Storage clients** (<ApiLink to=\"class/MemoryStorageClient\">`MemoryStorageClient`</ApiLink>, <ApiLink to=\"class/FileSystemStorageClient\">`FileSystemStorageClient`</ApiLink>, etc.): Backend implementations that handle the actual data persistence and management.\n\nFor more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients).\n\n```mermaid\n---\nconfig:\n    class:\n        hideEmptyMembersBox: true\n---\n\nclassDiagram\n\n%% ========================\n%% Abstract classes\n%% ========================\n\nclass Storage {\n    <<abstract>>\n}\n\n%% ========================\n%% Specific classes\n%% ========================\n\nclass Dataset\n\nclass KeyValueStore\n\nclass RequestQueue\n\n%% ========================\n%% Inheritance arrows\n%% ========================\n\nStorage --|> Dataset\nStorage --|> KeyValueStore\nStorage --|> RequestQueue\n```\n\n### Named and unnamed storages\n\nCrawlee supports two types of storages:\n\n- **Named storages**: Persistent storages 
with a specific name that persist across runs. These are useful when you want to share data between different crawler runs or access the same storage from multiple places.\n- **Unnamed storages**: Temporary storages identified by an alias that are scoped to a single run. These are automatically purged at the start of each run (when `purge_on_start` is enabled, which is the default).\n\n### Default storage\n\nEach storage type (<ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>, <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>, <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>) has a default instance that can be accessed without specifying `id`, `name` or `alias`. Default unnamed storage is accessed by calling storage's `open` method without parameters. This is the most common way to use storages in simple crawlers.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {OpeningExample}\n</RunnableCodeBlock>\n\n## Request queue\n\nThe <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). 
Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run.\n\nThe following code demonstrates the usage of the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>:\n\n<Tabs groupId=\"request_queue\">\n    <TabItem value=\"request_queue_basic_example\" label=\"Basic usage\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RqBasicExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"request_queue_with_crawler\" label=\"Usage with Crawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RqWithCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"request_queue_with_crawler_explicit\" label=\"Explicit usage with Crawler\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RqWithCrawlerExplicitExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n### Request-related helpers\n\nCrawlee provides helper functions to simplify interactions with the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>:\n\n- The <ApiLink to=\"class/AddRequestsFunction\">`add_requests`</ApiLink> function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the <ApiLink to=\"class/Request\">`Request`</ApiLink> class to the helper.\n- The <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> function is designed to discover new URLs in the current page and add them to the request storage. 
It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See the [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details.\n\n<Tabs groupId=\"request_helpers\">\n    <TabItem value=\"request_helper_add_requests\" label=\"Add requests\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RqHelperAddRequestsExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"request_helper_enqueue_links\" label=\"Enqueue links\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {RqHelperEnqueueLinksExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n### Request manager\n\nThe <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> implements the <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> interface, which provides a unified API for interacting with different request storage types.\n\nIf you need custom functionality, you can create your own request storage by subclassing the <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> class and implementing its required methods.\n\nFor a detailed explanation of the <ApiLink to=\"class/RequestManager\">`RequestManager`</ApiLink> and other related components, refer to the [Request loaders guide](https://crawlee.dev/python/docs/guides/request-loaders).\n\n## Dataset\n\nThe <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. 
Think of a <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional.\n\nThe following code demonstrates basic operations of the dataset:\n\n<Tabs groupId=\"dataset_storage\">\n    <TabItem value=\"dataset_basic_example\" label=\"Basic usage\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {DatasetBasicExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"dataset_with_crawler\" label=\"Usage with Crawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {DatasetWithCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"dataset_with_crawler_explicit\" label=\"Explicit usage with Crawler\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {DatasetWithCrawlerExplicitExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\n### Dataset-related helpers\n\nCrawlee provides the following helper function to simplify interactions with the <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink>:\n\n- The <ApiLink to=\"class/PushDataFunction\">`push_data`</ApiLink> function allows you to manually add data to the dataset. You can optionally specify the dataset ID or its name.\n\n## Key-value store\n\nThe <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink> is designed to save and retrieve data records or files efficiently. 
Each record is uniquely identified by a key and is associated with a specific MIME type, making the <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink> ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers.\n\nThe following code demonstrates the usage of the <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>:\n\n<Tabs groupId=\"kv_storage\">\n    <TabItem value=\"kvs_basic_example\" label=\"Basic usage\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {KvsBasicExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"kvs_with_crawler\" label=\"Usage with Crawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {KvsWithCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"kvs_with_crawler_explicit\" label=\"Explicit usage with Crawler\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {KvsWithCrawlerExplicitExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\nTo see a real-world example of how to get the input from the key-value store, see the [Screenshots](https://crawlee.dev/python/docs/examples/capture-screenshots-using-playwright) example.\n\n### Key-value store-related helpers\n\nCrawlee provides the following helper function to simplify interactions with the <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>:\n\n- The <ApiLink to=\"class/GetKeyValueStoreFunction\">`get_key_value_store`</ApiLink> function retrieves the key-value store for the current crawler run. If the KVS does not exist, it will be created. You can also specify the KVS's ID or its name.\n\n## Cleaning up the storages\n\nBy default, Crawlee cleans up all unnamed storages (including the default one) at the start of each run, so every crawl begins with a clean state. 
This behavior is controlled by <ApiLink to=\"class/Configuration#purge_on_start\">`Configuration.purge_on_start`</ApiLink> (default: True). In contrast, named storages are never purged automatically and persist across runs. The exact behavior may vary depending on the storage client implementation.\n\n### When purging happens\n\nThe cleanup occurs as soon as a storage is accessed:\n- When opening a storage explicitly (e.g., <ApiLink to=\"class/RequestQueue#open\">`RequestQueue.open`</ApiLink>, <ApiLink to=\"class/Dataset#open\">`Dataset.open`</ApiLink>, <ApiLink to=\"class/KeyValueStore#open\">`KeyValueStore.open`</ApiLink>).\n- When using helper functions that implicitly open storages (e.g., <ApiLink to=\"class/PushDataFunction\">`push_data`</ApiLink>).\n- Automatically when <ApiLink to=\"class/BasicCrawler#run\">`BasicCrawler.run`</ApiLink> is invoked.\n\n### Disabling automatic purging\n\nTo disable automatic purging, set `purge_on_start=False` in your configuration:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CleaningDoNotPurgeExample}\n</RunnableCodeBlock>\n\n### Manual purging\n\nPurge on start behavior just triggers the storage's `purge` method, which removes all data from the storage. If you want to purge the storage manually, you can do so by calling the `purge` method on the storage instance. Or if you want to delete the storage completely, you can call the `drop` method on the storage instance, which will remove the storage, including metadata and all its data.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CleaningPurgeExplicitlyExample}\n</RunnableCodeBlock>\n\nNote that purging behavior may vary between storage client implementations. For more details on storage configuration and client implementations, see the [Storage clients guide](./storage-clients).\n\n## Conclusion\n\nThis guide introduced you to the different storage types available in Crawlee and how to interact with them. 
You learned about the distinction between named storages (persistent across runs) and unnamed storages with aliases (temporary and purged on start). You discovered how to manage requests using the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> and store and retrieve scraping results using the <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> and <ApiLink to=\"class/KeyValueStore\">`KeyValueStore`</ApiLink>. You also learned how to use helper functions to simplify interactions with these storages and how to control storage cleanup behavior.\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!\n"
  },
  {
    "path": "docs/guides/trace_and_monitor_crawlers.mdx",
    "content": "---\nid: trace-and-monitor-crawlers\ntitle: Trace and monitor crawlers\ndescription: Learn how to instrument your crawlers with OpenTelemetry to trace request handling, identify bottlenecks, monitor performance, and visualize telemetry data using Jaeger for performance optimization.\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport InstrumentCrawler from '!!raw-loader!./code_examples/trace_and_monitor_crawlers/instrument_crawler.py';\n\n[OpenTelemetry](https://opentelemetry.io/) is a collection of APIs, SDKs, and tools to instrument, generate, collect, and export telemetry data (metrics, logs, and traces) to help you analyze your software’s performance and behavior. In the context of crawler development, it can be used to better understand how the crawler internally works, identify bottlenecks, debug, log metrics, and more. The topic described in this guide requires at least a basic understanding of OpenTelemetry. A good place to start is [What is OpenTelemetry](https://opentelemetry.io/docs/what-is-opentelemetry/).\n\nThis guide shows how to set up OpenTelemetry and instrument a specific crawler to see traces of individual requests that are being processed by the crawler. OpenTelemetry on its own does not provide an out-of-the-box tool for convenient visualisation of the exported data (apart from printing to the console), but there are several good tools available to do that. In this guide, we will use [Jaeger](https://www.jaegertracing.io/) to visualise the telemetry data. 
To better understand concepts such as exporter, collector, and visualisation backend, please refer to the [OpenTelemetry documentation](https://opentelemetry.io/docs/collector/).\n\n## Set up the Jaeger\n\nThis guide will show how to set up the environment locally to run the example code and visualize the telemetry data in Jaeger that will be running locally in a [Docker](https://www.docker.com/) container.\n\nTo start the preconfigured Docker container, you can use the following command:\n\n```bash\ndocker run -d --name jaeger -e COLLECTOR_OTLP_ENABLED=true -p 16686:16686 -p 4317:4317 -p 4318:4318 jaegertracing/all-in-one:latest\n```\nFor more details about the Jaeger setup, see the [getting started](https://www.jaegertracing.io/docs/2.7/getting-started/) section in their documentation.\nYou can see the Jaeger UI in your browser by navigating to http://localhost:16686\n\n## Instrument the Crawler\n\nNow you can proceed with instrumenting the crawler to send the telemetry data to Jaeger and running it. To have the Python environment ready, you should install either **crawlee[all]** or **crawlee[otel]**. This will ensure that OpenTelemetry dependencies are installed, and you can run the example code snippet.\nIn the following example, you can see the function `instrument_crawler` that contains the instrumentation setup and is called before the crawler is started. If you have already set up the Jaeger, then you can just run the following code snippet.\n\n<CodeBlock className=\"language-python\">\n    {InstrumentCrawler}\n</CodeBlock>\n\n## Analyze the results\n\nIn the Jaeger UI, you can search for different traces, apply filtering, compare traces, view their detailed attributes, view timing details, and more. 
For the detailed description of the tool's capabilities, please refer to the [Jaeger documentation](https://www.jaegertracing.io/docs/1.47/deployment/frontend-ui/#trace-page).\n\n![Jaeger search view](/img/guides/jaeger_otel_search_view_example.png 'Example visualisation of search view in Jaeger')\n![Jaeger trace view](/img/guides/jaeger_otel_trace_example.png 'Example visualisation of crawler request trace in Jaeger')\n\nYou can use different tools to consume the OpenTelemetry data that might better suit your needs. Please see the list of known Vendors in [OpenTelemetry documentation](https://opentelemetry.io/ecosystem/vendors/).\n\n## Customize the instrumentation\n\nYou can customize the <ApiLink to=\"class/CrawlerInstrumentor\">`CrawlerInstrumentor`</ApiLink>. Depending on the arguments used during its initialization, the instrumentation will be applied to different parts of the Crawlee code. By default, it instruments some functions that can give quite a good picture of each individual request handling. To turn this default instrumentation off, you can pass `request_handling_instrumentation=False` during initialization. You can also extend instrumentation by passing `instrument_classes=[...]` initialization argument that contains classes you want to be auto-instrumented. All their public methods will be automatically instrumented. Bear in mind that instrumentation has some runtime costs as well. The more instrumentation is used, the more overhead it will add to the crawler execution.\n\nYou can also create your instrumentation by selecting only the methods you want to instrument. 
For more details, see the <ApiLink to=\"class/CrawlerInstrumentor\">`CrawlerInstrumentor`</ApiLink> source code and the [Python documentation for OpenTelemetry](https://opentelemetry.io/docs/languages/python/).\n\nIf you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU).\n"
  },
  {
    "path": "docs/introduction/01_setting_up.mdx",
    "content": "---\nid: setting-up\ntitle: Setting up\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\nThis guide will help you get started with Crawlee by setting it up on your computer. Follow the steps below to ensure a smooth installation process.\n\n## Prerequisites\n\nBefore installing Crawlee itself, make sure that your system meets the following requirements:\n\n- **Python 3.10 or higher**: Crawlee requires Python 3.10 or a newer version. You can download Python from the [official website](https://python.org/downloads/).\n- **Python package manager**: While this guide uses [pip](https://pip.pypa.io/) (the most common package manager), you can also use any package manager you want. You can download pip from the [official website](https://pip.pypa.io/en/stable/installation/).\n\n### Verifying prerequisites\n\nTo check if Python and pip are installed, run the following commands:\n\n```sh\npython --version\n```\n\n```sh\npython -m pip --version\n```\n\nIf these commands return the respective versions, you're ready to continue.\n\n## Installing Crawlee\n\nCrawlee is available as [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. 
This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.\n\n### Basic installation\n\nTo install the core package, run:\n\n```sh\npython -m pip install crawlee\n```\n\nAfter installation, verify that Crawlee is installed correctly by checking its version:\n\n```sh\npython -c 'import crawlee; print(crawlee.__version__)'\n```\n\n### Full installation\n\nIf you do not mind the package size, you can run the following command to install Crawlee with all optional features:\n\n```sh\npython -m pip install 'crawlee[all]'\n```\n\n### Installing specific extras\n\nDepending on your use case, you may want to install specific extras to enable additional functionality:\n\nFor using the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, install the `beautifulsoup` extra:\n\n```sh\npython -m pip install 'crawlee[beautifulsoup]'\n```\n\nFor using the <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>, install the `parsel` extra:\n\n```sh\npython -m pip install 'crawlee[parsel]'\n```\n\nFor using the <ApiLink to=\"class/CurlImpersonateHttpClient\">`CurlImpersonateHttpClient`</ApiLink>, install the `curl-impersonate` extra:\n\n```sh\npython -m pip install 'crawlee[curl-impersonate]'\n```\n\nIf you plan to use a (headless) browser with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, install Crawlee with the `playwright` extra:\n\n```sh\npython -m pip install 'crawlee[playwright]'\n```\n\nAfter installing the playwright extra, install the necessary Playwright dependencies:\n\n```sh\nplaywright install\n```\n\n### Installing multiple extras\n\nYou can install multiple extras at once by using a comma as a separator:\n\n```sh\npython -m pip install 'crawlee[beautifulsoup,curl-impersonate]'\n```\n\n## Start a new project\n\nThe quickest way to get started with Crawlee is by using the Crawlee CLI and selecting one of the prepared 
templates. The CLI helps you set up a new project in seconds.\n\n### Using Crawlee CLI with uv\n\nFirst, ensure you have [uv](https://pypi.org/project/uv/) installed. You can check if it is installed by running:\n\n```sh\nuv --version\n```\n\nIf [uv](https://pypi.org/project/uv/) is not installed, follow the official [installation guide](https://docs.astral.sh/uv/getting-started/installation/).\n\nThen, run the Crawlee CLI using `uvx` and choose from the available templates:\n\n```sh\nuvx 'crawlee[cli]' create my_crawler\n```\n\n### Using Crawlee CLI directly\n\nIf you already have `crawlee` installed, you can spin it up by running:\n\n```sh\ncrawlee create my_crawler\n```\n\nFollow the interactive prompts in the CLI to choose a crawler type and set up your new project.\n\n### Running your project\n\nTo run your newly created project, navigate to the project directory, activate the virtual environment, and execute the Python interpreter with the project module:\n\n<Tabs>\n  <TabItem value=\"Linux\" label=\"Linux\" default>\n      <CodeBlock language=\"sh\">cd my_crawler/</CodeBlock>\n      <CodeBlock language=\"sh\">source .venv/bin/activate</CodeBlock>\n      <CodeBlock language=\"sh\">python -m my_crawler</CodeBlock>\n  </TabItem>\n  <TabItem value=\"Windows\" label=\"Windows\">\n      <CodeBlock language=\"sh\">cd my_crawler/</CodeBlock>\n      <CodeBlock language=\"sh\">.venv\\Scripts\\activate</CodeBlock>\n      <CodeBlock language=\"sh\">python -m my_crawler</CodeBlock>\n  </TabItem>\n</Tabs>\n\nCongratulations! You have successfully set up and executed your first Crawlee project.\n\n## Next steps\n\nNext, you will learn how to create a very simple crawler and Crawlee components while building it.\n"
  },
  {
    "path": "docs/introduction/02_first_crawler.mdx",
    "content": "---\nid: first-crawler\ntitle: First crawler\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport RequestQueueExample from '!!raw-loader!roa-loader!./code_examples/02_request_queue.py';\nimport BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/02_bs.py';\nimport BeautifulSoupBetterExample from '!!raw-loader!roa-loader!./code_examples/02_bs_better.py';\n\nNow, you will build your first crawler. But before you do, let's briefly introduce the Crawlee classes involved in the process.\n\n## How Crawlee works\n\nThere are 3 main crawler classes available for use in Crawlee.\n\n- <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>\n- <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>\n- <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>\n\nWe'll talk about their differences later. Now, let's talk about what they have in common.\n\nThe general idea of each crawler is to go to a web page, open it, do some stuff there, save some results, continue to the next page, and repeat this process until the crawler's done its job. So the crawler always needs to find answers to two questions: _Where should I go?_ and _What should I do there?_ Answering those two questions is the only required setup. The crawlers have reasonable defaults for everything else.\n\n### The where - `Request` and `RequestQueue`\n\nAll crawlers use instances of the <ApiLink to=\"class/Request\">`Request`</ApiLink> class to determine where they need to go. Each request may hold a lot of information, but at the very least, it must hold a URL - a web page to open. But having only one URL would not make sense for crawling. Sometimes you have a pre-existing list of your own URLs that you wish to visit, perhaps a thousand. Other times you need to build this list dynamically as you crawl, adding more and more URLs to the list as you progress. 
Most of the time, you will use both options.\n\nThe requests are stored in a <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>, a dynamic queue of <ApiLink to=\"class/Request\">`Request`</ApiLink> instances. You can seed it with start URLs and also add more requests while the crawler is running. This allows the crawler to open one page, extract interesting data, such as links to other pages on the same domain, add them to the queue (called _enqueuing_) and repeat this process to build a queue of virtually unlimited number of URLs.\n\n### The what - request handler\n\nIn the request handler you tell the crawler what to do at each and every page it visits. You can use it to handle extraction of data from the page, processing the data, saving it, calling APIs, doing calculations and so on.\n\nThe request handler is a user-defined function, invoked automatically by the crawler for each <ApiLink to=\"class/Request\">`Request`</ApiLink> from the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>. It always receives a single argument - <ApiLink to=\"class/BasicCrawlingContext\">`BasicCrawlingContext`</ApiLink> (or its descendants). Its properties change depending on the crawler class used, but it always includes the `request` property, which represents the currently crawled URL and related metadata.\n\n## Building a crawler\n\nLet's put the theory into practice and start with something easy. Visit a page and get its HTML title. In this tutorial, you'll scrape the Crawlee website [https://crawlee.dev](https://crawlee.dev), but the same code will work for any website.\n\n### Adding requests to the crawling queue\n\nEarlier you learned that the crawler uses a queue of requests as its source of URLs to crawl. 
Let's create it and add the first request.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {RequestQueueExample}\n</RunnableCodeBlock>\n\nThe <ApiLink to=\"class/RequestQueue#add_request\">`RequestQueue.add_request`</ApiLink> method automatically converts the object with URL string to a <ApiLink to=\"class/Request\">`Request`</ApiLink> instance. So now you have a <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> that holds one request which points to `https://crawlee.dev`.\n\n:::tip Bulk add requests\n\nThe code above is for illustration of the request queue concept. Soon you'll learn about the  <ApiLink to=\"class/BasicCrawler#add_requests\">`BasicCrawler.add_requests`</ApiLink> method which allows you to skip this initialization code, and it also supports adding a large number of requests without blocking.\n\n:::\n\n### Building a BeautifulSoupCrawler\n\nCrawlee comes with three main crawler classes: <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>, and <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>. You can read their short descriptions in the [Quick start](../quick-start) lesson.\n\nUnless you have a good reason to start with a different one, you should try building a <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> first. It is an HTTP crawler with HTTP2 support, anti-blocking features and integrated HTML parser - [BeautifulSoup](https://pypi.org/project/beautifulsoup4/). It's fast, simple, cheap to run and does not require complicated dependencies. The only downside is that it won't work out of the box for websites which require JavaScript rendering. 
But you might not need JavaScript rendering at all, because many modern websites use server-side rendering.\n\nLet's continue with the earlier <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> example.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupExample}\n</RunnableCodeBlock>\n\nWhen you run the example, you will see the title of https://crawlee.dev printed to the log. What really happens is that <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> first makes an HTTP request to `https://crawlee.dev`, then parses the received HTML with BeautifulSoup and makes it available as the `context` argument of the request handler.\n\n```log\n[__main__] INFO  The title of \"https://crawlee.dev\" is \"Crawlee · Build reliable crawlers. Fast. | Crawlee\".\n```\n\n### Add requests faster\n\nEarlier we mentioned that you'll learn how to use the  <ApiLink to=\"class/BasicCrawler#add_requests\">`BasicCrawler.add_requests`</ApiLink> method to skip the request queue initialization. It's simple. Every crawler has an implicit <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> instance, and you can add requests to it with the  <ApiLink to=\"class/BasicCrawler#add_requests\">`BasicCrawler.add_requests`</ApiLink> method. In fact, you can go even further and just use the first parameter of `crawler.run()`!\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {BeautifulSoupBetterExample}\n</RunnableCodeBlock>\n\nWhen you run this code, you'll see exactly the same output as with the earlier, longer example. The <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> is still there, it's just managed by the crawler automatically.\n\n:::info\n\nThis method not only makes the code shorter, it will help with performance too! Internally it calls  <ApiLink to=\"class/RequestQueue#add_requests_batched\">`RequestQueue.add_requests_batched`</ApiLink> method. 
It will wait only for the initial batch of 1000 requests to be added to the queue before resolving, which means the processing will start almost instantly. After that, it will continue adding the rest of the requests in the background (again, in batches of 1000 items, once every second).\n\n:::\n\n## Next steps\n\nNext, you'll learn about crawling links. That means finding new URLs on the pages you crawl and adding them to the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> for the crawler to visit.\n"
  },
  {
    "path": "docs/introduction/03_adding_more_urls.mdx",
    "content": "---\nid: adding-more-urls\ntitle: Adding more URLs\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport OriginalCodeExample from '!!raw-loader!roa-loader!./code_examples/03_original_code.py';\nimport FindingNewLinksExample from '!!raw-loader!roa-loader!./code_examples/03_finding_new_links.py';\nimport EnqueueStrategyExample from '!!raw-loader!roa-loader!./code_examples/03_enqueue_strategy.py';\nimport GlobsExample from '!!raw-loader!roa-loader!./code_examples/03_globs.py';\nimport TransformExample from '!!raw-loader!roa-loader!./code_examples/03_transform_request.py';\n\nPreviously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {OriginalCodeExample}\n</RunnableCodeBlock>\n\nNow you'll use the example from the previous section and improve on it. You'll add more URLs to the queue and thanks to that the crawler will keep going, finding new links, enqueuing them into the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> and then scraping them.\n\n## How crawling works\n\nThe process is simple:\n\n1. Find new links on the page.\n2. Filter only those pointing to the same domain, in this case [crawlee.dev](https://crawlee.dev/).\n3. Enqueue (add) them to the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>.\n4. Visit the newly enqueued links.\n5. Repeat the process.\n\nIn the following paragraphs you will learn about the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> function which simplifies crawling to a single function call.\n\n:::tip context awareness\n\nThe <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> function is context aware. 
It means that it will read the information about the currently crawled page from the context, and you don't need to explicitly provide any arguments. However, you can specify filtering criteria or an enqueuing strategy if desired. It will find the links and automatically add the links to the running crawler's <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>.\n\n:::\n\n## Limit your crawls\n\nWhen you're just testing your code or when your crawler could potentially find millions of links, it's very useful to set a maximum limit of crawled pages. The option is called <ApiLink to=\"class/BasicCrawlerOptions#max_requests_per_crawl\">`max_requests_per_crawl`</ApiLink>, is available in all crawlers, and you can set it like this:\n\n```python\ncrawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n```\n\nThis means that no new requests will be started after the 10th request is finished. The actual number of processed requests might be a little higher thanks to parallelization, because the running requests won't be forcefully aborted. It's not even possible in most cases.\n\n## Finding new links\n\nThere are numerous approaches to finding links to follow when crawling the web. For our purposes, we will be looking for `<a>` elements that contain the `href` attribute because that's what you need in most cases. 
For example:\n\n```html\n<a href=\"https://crawlee.dev/docs/introduction\">This is a link to Crawlee introduction</a>\n```\n\nSince this is the most common case, it is also the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> default.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {FindingNewLinksExample}\n</RunnableCodeBlock>\n\nIf you need to override the default selection of elements in <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink>, you can use the `selector` argument.\n\n```python\nawait context.enqueue_links(selector='a.article-link')\n```\n\n## Filtering links to same domain\n\nWebsites typically contain a lot of links that lead away from the original page. This is normal, but when crawling a website, we usually want to crawl that one site and not let our crawler wander away to Google, Facebook and Twitter. Therefore, we need to filter out the off-domain links and only keep the ones that lead to the same domain.\n\n```python\n# The default behavior of enqueue_links is to stay on the same hostname, so it does not require\n# any parameters. This will ensure the subdomain stays the same.\nawait context.enqueue_links()\n```\n\nThe default behavior of <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> is to stay on the same hostname. This **does not include subdomains**. To include subdomains in your crawl, use the `strategy` argument. 
The `strategy` argument is an instance of the `EnqueueStrategy` type alias.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {EnqueueStrategyExample}\n</RunnableCodeBlock>\n\nWhen you run the code, you will see the crawler log the **title** of the first page, then the **enqueueing** message showing number of URLs, followed by the **title** of the first enqueued page and so on and so on.\n\n## Skipping duplicate URLs\n\nSkipping of duplicate URLs is critical, because visiting the same page multiple times would lead to duplicate results. This is automatically handled by the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink> which deduplicates requests using their `unique_key`. This `unique_key` is automatically generated from the request's URL by lowercasing the URL, lexically ordering query parameters, removing fragments and a few other tweaks that ensure the queue only includes unique URLs.\n\n## Advanced filtering arguments\n\nWhile the defaults for <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> can be often exactly what you need, it also gives you fine-grained control over which URLs should be enqueued. One way we already mentioned above. It is using the `EnqueueStrategy` type alias. You can use the `all` strategy if you want to follow every single link, regardless of its domain, or you can enqueue links that target the same domain name with the `same-domain` strategy.\n\n```python\n# Wanders the internet.\nawait context.enqueue_links(strategy='all')\n```\n\n### Filter URLs with patterns\n\nFor even more control, you can use the `include` or `exclude` parameters, either as glob patterns or regular expressions, to filter the URLs. 
Refer to the API documentation for <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> for detailed information on these and other available options.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {GlobsExample}\n</RunnableCodeBlock>\n\n### Transform requests before enqueuing\n\nFor cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function receives a <ApiLink to=\"class/RequestOptions\">`RequestOptions`</ApiLink> object and should return either a modified <ApiLink to=\"class/RequestOptions\">`RequestOptions`</ApiLink> object, or a string of type `RequestTransformAction`, which only allows the values `skip` and `unchanged`. Returning `skip` means the request will be skipped, while `unchanged` will add it without any changes.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {TransformExample}\n</RunnableCodeBlock>\n\n## Next steps\n\nNext, you will start your project of scraping a production website and learn some more Crawlee tricks in the process.\n"
  },
  {
    "path": "docs/introduction/04_real_world_project.mdx",
    "content": "---\nid: real-world-project\ntitle: Real-world project\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport SanityCheckExample from '!!raw-loader!roa-loader!./code_examples/04_sanity_check.py';\n\n> _Hey, guys, you know, it's cool that we can scrape the `<title>` elements of web pages, but that's not very useful. Can we finally scrape some real data and save it somewhere in a machine-readable format? Because that's why I started reading this tutorial in the first place!_\n\nWe hear you, young padawan! First, learn how to crawl, you must. Only then, walk through data, you can!\n\n## Making a production-grade crawler\n\nMaking a production-grade crawler is not difficult, but there are many pitfalls of scraping that can catch you off guard. So for the real world project you'll learn how to scrape an [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) instead of the Crawlee website. It contains a list of products of different categories, and each product has its own detail page.\n\nThe website requires JavaScript rendering, which allows us to showcase more features of Crawlee. We've also added some helpful tips that prepare you for the real-world issues that you will surely encounter when scraping at scale.\n\n:::tip Not interested in theory?\n\nIf you're not interested in crawling theory, feel free to [skip to the next chapter](./crawling) and get right back to coding.\n\n:::\n\n## Drawing a plan\n\nSometimes scraping is really straightforward, but most of the time, it really pays off to do a bit of research first and try to answer some of these questions:\n\n- How is the website structured?\n- Can I scrape it only with HTTP requests (read \"with some <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>, e.g. 
<ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>\")?\n- Do I need a headless browser for something?\n- Are there any anti-scraping protections in place?\n- Do I need to parse the HTML or can I get the data otherwise, such as directly from the website's API?\n\nFor the purposes of this tutorial, let's assume that the website cannot be scraped with <ApiLink to=\"class/HttpCrawler\">`HttpCrawler`</ApiLink>. It actually can, but we would have to dive a bit deeper than this introductory guide allows. So for now we will make things easier for you, scrape it with <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, and you'll learn about headless browsers in the process.\n\n## Choosing the data you need\n\nA good first step is to figure out what data you want to scrape and where to find it. For the time being, let's just agree that we want to scrape all products from all categories available on the [all collections page of the store](https://warehouse-theme-metal.myshopify.com/collections) and for each product we want to get its:\n\n- URL\n- Manufacturer\n- SKU\n- Title\n- Current price\n- Stock available\n\nYou will notice that some information is available directly on the list page, but for details such as \"SKU\" we'll also need to open the product's detail page.\n\n![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.')\n\n### The start URL(s)\n\nThis is where you start your crawl. It's convenient to start as close to the data as possible. For example, it wouldn't make much sense to start at https://warehouse-theme-metal.myshopify.com and look for a `collections` link there, when we already know that everything we want to extract can be found at the https://warehouse-theme-metal.myshopify.com/collections page.\n\n## Exploring the page\n\nLet's take a look at the https://warehouse-theme-metal.myshopify.com/collections page more carefully. 
There are some **categories** on the page, and each category has a list of **items**. On some category pages, at the bottom you will notice there are links to the next pages of results. This is usually called **the pagination**.\n\n### Categories and sorting\n\nWhen you click the categories, you'll see that they load a page of products filtered by that category. By going through a few categories and observing the behavior, we can also observe that we can sort by different conditions (such as `Best selling`, or `Price, low to high`), but for this example, we will not be looking into those.\n\n:::caution Limited pagination\n\nBe careful, because on some websites, like [amazon.com](https://amazon.com), this is not true and the sum of products in categories is actually larger than what's available without filters. Learn more in our [tutorial on scraping websites with limited pagination](https://docs.apify.com/tutorials/scrape-paginated-sites).\n\n:::\n\n### Pagination\n\nThe pagination of the demo Warehouse Store is simple enough. When switching between pages, you will see that the URL changes to:\n\n```text\nhttps://warehouse-theme-metal.myshopify.com/collections/headphones?page=2\n```\n\nTry clicking on the link to page 4. You'll see that the pagination links update and show more pages. But can you trust that this will include all pages and won't stop at some point?\n\n:::caution Test your assumptions\n\nSimilarly to the issue with filters explained above, the existence of pagination does not guarantee that you can simply paginate through all the results. Always test your assumptions about pagination. Otherwise, you might miss a chunk of results, and not even know about it.\n\n:::\n\nAt the time of writing the `Headphones` collection results counter showed 75 results - products. Quick count of products on one page of results makes 24. 6 rows times 4 products. 
This means that there are 4 pages of results.\n\nIf you're not convinced, you can visit a page somewhere in the middle, like `https://warehouse-theme-metal.myshopify.com/collections/headphones?page=2` and see how the pagination looks there.\n\n## The crawling strategy\n\nNow that you know where to start and how to find all the collection details, let's look at the crawling process.\n\n1. Visit the store page containing the list of categories (our start URL).\n2. Enqueue all links to all categories.\n3. Enqueue all product pages from the current page.\n4. Enqueue links to next pages of results.\n5. Open the next page in queue.\n    - When it's a results list page, go to 2.\n    - When it's a product page, scrape the data.\n6. Repeat until all results pages and all products have been processed.\n\n`PlaywrightCrawler` will make sure to visit the pages for you, if you provide the correct requests, and you already know how to enqueue pages, so this should be fairly easy. Nevertheless, there are few more tricks that we'd like to showcase.\n\n## Sanity check\n\nLet's check that everything is set up correctly before writing the scraping logic itself. You might realize that something in your previous analysis doesn't quite add up, or the website might not behave exactly as you expected.\n\nThe example below creates a new crawler that visits the start URL and prints the text content of all the categories on that page. When you run the code, you will see the _very badly formatted_ content of the individual category card.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {SanityCheckExample}\n</RunnableCodeBlock>\n\nIf you're wondering how to get that `.collection-block-item` selector. 
We'll explain it in the next chapter on DevTools.\n\n## DevTools - the scraper's toolbox\n\n:::info DevTool choice\n\nWe'll use Chrome DevTools here, since it's the most common browser, but feel free to use any other, they're all very similar.\n\n:::\n\nLet's open DevTools by going to https://warehouse-theme-metal.myshopify.com/collections in Chrome and then right-clicking anywhere in the page and selecting **Inspect**, or by pressing **F12** or whatever your system prefers. With DevTools, you can inspect or manipulate any aspect of the currently open web page. You can learn more about DevTools in their [official documentation](https://developer.chrome.com/docs/devtools/).\n\n## Selecting elements\n\nIn the DevTools, choose the **Select an element** tool and try hovering over one of the collection cards.\n\n![select an element](/img/getting-started/select-an-element.jpg 'Finding the select an element tool.')\n\nYou'll see that you can select different elements inside the card. Instead, select the whole card, not just some of its contents, such as its title or description.\n\n![selected element](/img/getting-started/selected-element.jpg 'Selecting an element by hovering over it.')\n\nSelecting an element will highlight it in the DevTools HTML inspector. When you look carefully at the elements, you'll see that there are some **classes** attached to the different HTML elements. Those are called **CSS classes**, and we can make use of them in scraping.\n\nConversely, by hovering over elements in the HTML inspector, you will see them highlight on the page. Inspect the page's structure around the collection card. You'll see that all the card's data is displayed in an `<a>` element with a `class` attribute that includes **collection-block-item**. It should now make sense how we got that `.collection-block-item` selector. 
It's just a way to find all elements that are annotated with the `collection-block-item`.\n\nIt's always a good idea to double-check that you're not getting any unwanted elements with this class. To do that, go into the **Console** tab of DevTools and run:\n\n```ts\ndocument.querySelectorAll('.collection-block-item');\n```\n\nYou will see that only the 31 collection cards will be returned, and nothing else.\n\n:::tip Learn more about CSS selectors and DevTools\n\nCSS selectors and DevTools are quite a big topic. If you want to learn more, visit the [Web scraping for beginners course](https://developers.apify.com/academy/web-scraping-for-beginners) in the Apify Academy. **It's free and open-source** ❤️.\n\n:::\n\n## Next steps\n\nNext, you will crawl the whole store, including all the listing pages and all the product detail pages.\n"
  },
  {
    "path": "docs/introduction/05_crawling.mdx",
    "content": "---\nid: crawling\ntitle: Crawling\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport CrawlingListingExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_listing.py';\nimport CrawlingDetailExample from '!!raw-loader!roa-loader!./code_examples/05_crawling_detail.py';\n\nTo crawl the whole [Warehouse store example](https://warehouse-theme-metal.myshopify.com/collections) and find all the data, you first need to visit all the pages with products - going through all categories available and also all the product detail pages.\n\n## Crawling the listing pages\n\nIn previous lessons, you used the <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> function like this:\n\n```python\nawait enqueue_links()\n```\n\nWhile useful in that scenario, you need something different now. Instead of finding all the `<a href=\"..\">` elements with links to the same hostname, you need to find only the specific ones that will take your crawler to the next page of results. Otherwise, the crawler will visit a lot of other pages that you're not interested in. Using the power of DevTools and yet another <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> parameter, this becomes fairly easy.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CrawlingListingExample}\n</RunnableCodeBlock>\n\nThe code should look pretty familiar to you. It's a very simple request handler where we log the currently processed URL to the console and enqueue more links. But there are also a few new, interesting additions. Let's break it down.\n\n### The `selector` parameter of `enqueue_links`\n\nWhen you previously used <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink>, you were not providing any `selector` parameter, and it was fine, because you wanted to use the default value, which is `a` - finds all `<a>` elements. 
But now, you need to be more specific. There are multiple `<a>` links on the `Categories` page, and you're only interested in those that will take your crawler to the available list of results. Using the DevTools, you'll find that you can select the links you need using the `.collection-block-item` selector, which selects all the elements that have the `class=collection-block-item` attribute.\n\n### The `label` of `enqueue_links`\n\nYou will see `label` used often throughout Crawlee, as it's a convenient way of labelling a <ApiLink to=\"class/Request\">`Request`</ApiLink> instance for quick identification later. You can access it with `request.label` and it's a `string`. You can name your requests any way you want. Here, we used the label `CATEGORY` to note that we're enqueueing pages that represent a category of products. The <ApiLink to=\"class/EnqueueLinksFunction\">`enqueue_links`</ApiLink> function will add this label to all requests before enqueueing them to the <ApiLink to=\"class/RequestQueue\">`RequestQueue`</ApiLink>. Why this is useful will become obvious in a minute.\n\n## Crawling the detail pages\n\nIn a similar fashion, you need to collect all the URLs to the product detail pages, because only from there you can scrape all the data you need. The following code only repeats the concepts you already know for another set of links.\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {CrawlingDetailExample}\n</RunnableCodeBlock>\n\nThe crawling code is now complete. When you run the code, you'll see the crawler visit all the listing URLs and all the detail URLs.\n\n## Next steps\n\nThis concludes the Crawling lesson, because you have taught the crawler to visit all the pages it needs. Let's continue with scraping data.\n"
  },
  {
    "path": "docs/introduction/06_scraping.mdx",
    "content": "---\nid: scraping\ntitle: Scraping\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport ScrapingExample from '!!raw-loader!roa-loader!./code_examples/06_scraping.py';\n\nIn the [Real-world project](./real-world-project#choosing-the-data-you-need) chapter, you've created a list of the information you wanted to collect about the products in the example Warehouse store. Let's review that and figure out ways to access the data.\n\n- URL\n- Manufacturer\n- SKU\n- Title\n- Current price\n- Stock available\n\n![data to scrape](/img/getting-started/scraping-practice.jpg 'Overview of data to be scraped.')\n\n## Scraping the URL and manufacturer\n\nSome information is lying right there in front of us without even having to touch the product detail pages. The `URL` we already have - the `context.request.url`. And by looking at it carefully, we realize that we can also extract the manufacturer from the URL (as all product urls start with `/products/<manufacturer>`). We can just split the `string` and be on our way then!\n\n:::info url vs loaded url\n\nYou can use `request.loaded_url` as well. Remember the difference: `request.url` is what you enqueue, `request.loaded_url` is what gets processed (after possible redirects).\n\n:::\n\nBy splitting the `request.url`, we can extract the manufacturer name directly from the URL. 
This is done by first splitting the URL to get the product identifier and then splitting that identifier to get the manufacturer name.\n\n```python\n# context.request.url:\n# https://warehouse-theme-metal.myshopify.com/products/sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440\n\n# Split the URL and get the last part.\nurl_part = context.request.url.split('/').pop()\n# url_part: sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440\n\n# Split the last part by '-' and get the first element.\nmanufacturer = url_part.split('-')[0]\n# manufacturer: 'sennheiser'\n```\n\n:::tip Storing information\n\nIt's a matter of preference, whether to store this information separately in the resulting dataset, or not. Whoever uses the dataset can easily parse the `manufacturer` from the `URL`, so should you duplicate the data unnecessarily? Our opinion is that unless the increased data consumption would be too large to bear, it's better to make the dataset as rich as possible. For example, someone might want to filter by `manufacturer`.\n\n:::\n\n:::caution Adapt and extract\n\nOne thing you may notice is that the `manufacturer` might have a `-` in its name. If that's the case, your best bet is extracting it from the details page instead, but it's not mandatory. At the end of the day, you should always adjust and pick the best solution for your use case, and website you are crawling.\n\n:::\n\nNow it's time to add more data to the results. Let's open one of the product detail pages, for example the [Sony XBR-950G](https://warehouse-theme-metal.myshopify.com/products/sony-xbr-65x950g-65-class-64-5-diag-bravia-4k-hdr-ultra-hd-tv) page and use our DevTools-Fu 🥋 to figure out how to get the title of the product.\n\n## Scraping title\n\nTo scrape the product title from a webpage, you need to identify its location in the HTML structure. 
By using the element selector tool in your browser's DevTools, you can see that the title is within an `<h1>` tag, which is a common practice for important headers. This `<h1>` tag is enclosed in a `<div>` with the class `product-meta`. We can leverage this structure to create a combined selector `.product-meta h1`. This selector targets any `<h1>` element that is a descendant of an element with the class `product-meta`.\n\n![product title](/img/getting-started/title.jpg 'Finding product title in DevTools.')\n\n:::tip Verifying selectors with DevTools\n\nRemember that you can press CTRL+F (or CMD+F on Mac) in the **Elements** tab of DevTools to open the search bar where you can quickly search for elements using their selectors. Always verify your scraping process and assumptions using the DevTools. It's faster than changing the crawler code all the time.\n\n:::\n\nTo get the title, you need to locate it using Playwright with the `.product-meta h1` selector. This selector specifically targets the `<h1>` element you need. If multiple elements match, the `text_content()` call will throw an error, which is beneficial as it prevents returning incorrect data silently. Ensuring the accuracy of your selectors is crucial for reliable data extraction.\n\n```python\ntitle = await context.page.locator('.product-meta h1').text_content()\n```\n\n## Scraping SKU\n\nUsing the DevTools, you can find that the product SKU is inside a `<span>` tag with the class `product-meta__sku-number`. Since there is no other `<span>` with that class on the page, you can safely use this selector to extract the SKU.\n\n![product sku selector](/img/getting-started/sku.jpg 'Finding product SKU in DevTools.')\n\n```python\n# Find the SKU element using the selector and get its text content.\nsku = await context.page.locator('span.product-meta__sku-number').text_content()\n```\n\n## Scraping current price\n\nUsing DevTools, you can find that the current price is within a `<span>` element tagged with the `price` class. 
However, it is nested alongside another `<span>` element with the `visually-hidden` class. To avoid extracting the wrong text, you can filter the elements to get the correct one using the `has_text` helper.\n\n![product current price selector](/img/getting-started/current-price.jpg 'Finding product current price in DevTools.')\n\n```python\n# Locate the price element and filter out the visually hidden elements.\nprice_element = context.page.locator('span.price', has_text='$').first\n\n# Extract the text content of the price element.\ncurrent_price_string = await price_element.text_content() or ''\n# current_price_string: 'Sale price$1,398.00'\n\n# Split the string by the '$' sign to get the numeric part.\nraw_price = current_price_string.split('$')[1]\n# raw_price: '1,398.00'\n\n# Convert the raw price string to a float after removing commas.\nprice = float(raw_price.replace(',', ''))\n# price: 1398.00\n```\n\nIt might look a little complex at first glance, but let's walk through what you did. First, you locate the correct part of the `price` span by filtering for elements containing the `$` sign. This ensures that you get the actual price element. Once you have the right element, you extract its text content, which gives you a string similar to `Sale price$1,398.00`. To get the numeric value, you split this string by the `$` sign. Next, you remove any commas from the resulting numeric string and convert it to a float, allowing you to work with the price as a number. This process ensures that you accurately extract and convert the current price from the product page.\n\n## Scraping stock availability\n\nThe final step is to scrape the stock availability information. There is a `<span>` with the class `product-form__inventory`, which contains the text `In stock` if the product is available. 
You can use the `has_text` helper to pick out the correct element.\n\n```python\n# Locate the element that contains the text 'In stock' and filter out other elements.\nin_stock_element = context.page.locator(\n    selector='span.product-form__inventory',\n    has_text='In stock',\n).first\n\n# Check if the element exists by counting the matching elements.\nin_stock = await in_stock_element.count() > 0\n```\n\nFor this, all that matters is whether the element exists or not. You can use the `count()` method to check if any elements match the selector. If any do, the product is in stock.\n\n## Trying it out\n\nYou have everything that is needed, so grab your newly created scraping logic, dump it into your original request handler and see the magic happen!\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {ScrapingExample}\n</RunnableCodeBlock>\n\nWhen you run the crawler, you will see the crawled URLs and their scraped data printed to the console. The output will look something like this:\n\n```json\n{\n    \"url\": \"https://warehouse-theme-metal.myshopify.com/products/sony-str-za810es-7-2-channel-hi-res-wi-fi-network-av-receiver\",\n    \"manufacturer\": \"sony\",\n    \"title\": \"Sony STR-ZA810ES 7.2-Ch Hi-Res Wi-Fi Network A/V Receiver\",\n    \"sku\": \"SON-692802-STR-DE\",\n    \"price\": 698,\n    \"in_stock\": true\n}\n```\n\n## Next steps\n\nNext, you'll see how to save the data you scraped to the disk for further processing.\n"
  },
  {
    "path": "docs/introduction/07_saving_data.mdx",
    "content": "---\nid: saving-data\ntitle: Saving data\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport FirstCodeExample from '!!raw-loader!./code_examples/07_first_code.py';\n\nimport FinalCodeExample from '!!raw-loader!roa-loader!./code_examples/07_final_code.py';\n\nA data extraction job would not be complete without saving the data for later use and processing. You've come to the final and most difficult part of this tutorial so make sure to pay attention very carefully!\n\n## Save data to the dataset\n\nCrawlee provides a <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> class, which acts as an abstraction over tabular storage, making it useful for storing scraping results. To get started:\n\n- Add the necessary imports: Include the <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> and any required crawler classes at the top of your file.\n- Create a Dataset instance: Use the asynchronous <ApiLink to=\"class/Dataset#open\">`Dataset.open`</ApiLink> constructor to initialize the dataset instance within your crawler's setup.\n\nHere's an example:\n\n<CodeBlock language=\"python\">\n    {FirstCodeExample}\n</CodeBlock>\n\nFinally, instead of logging the extracted data to stdout, we can export them to the dataset:\n\n```python\n# ...\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        # ...\n\n        data = {\n            'manufacturer': manufacturer,\n            'title': title,\n            'sku': sku,\n            'price': price,\n            'in_stock': in_stock,\n        }\n\n        # Push the data to the dataset.\n        await dataset.push_data(data)\n\n        # ...\n```\n\n### Using a context helper\n\nInstead of importing a new class and manually creating an instance of the dataset, you can use the context helper  <ApiLink 
to=\"class/PushDataFunction\">`context.push_data`</ApiLink>. Remove the dataset import and instantiation, and replace `dataset.push_data` with the following:\n\n```python\n# ...\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        # ...\n\n        data = {\n            'manufacturer': manufacturer,\n            'title': title,\n            'sku': sku,\n            'price': price,\n            'in_stock': in_stock,\n        }\n\n        # Push the data to the dataset.\n        await context.push_data(data)\n\n        # ...\n```\n\n### Final code\n\nAnd that's it. Unlike earlier, we are being serious now. That's it, you're done. The final code looks like this:\n\n<RunnableCodeBlock className=\"language-python\" language=\"python\">\n    {FinalCodeExample}\n</RunnableCodeBlock>\n\n## What `push_data` does?\n\nA helper <ApiLink to=\"class/PushDataFunction\">`context.push_data`</ApiLink> saves data to the default dataset. You can provide additional arguments there like `id` or `name` to open a different dataset. Dataset is a storage designed to hold data in a format similar to a table. Each time you call <ApiLink to=\"class/PushDataFunction\">`context.push_data`</ApiLink> or direct <ApiLink to=\"class/Dataset#push_data\">`Dataset.push_data`</ApiLink> a new row in the table is created, with the property names serving as column titles. In the default configuration, the rows are represented as JSON files saved on your file system, but other backend storage systems can be plugged into Crawlee as well. More on that later.\n\n:::info Automatic dataset initialization\n\nEach time you start Crawlee a default <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. 
For more details see the <ApiLink to=\"class/Dataset#open\">`Dataset.open`</ApiLink> function.\n\n:::\n\n{/* TODO: mention result storage guide once it's done\n\n:::info Automatic dataset initialization\n\nEach time you start Crawlee a default <ApiLink to=\"class/Dataset\">`Dataset`</ApiLink> is automatically created, so there's no need to initialize it or create an instance first. You can create as many datasets as you want and even give them names. For more details see the [Result storage guide](../guides/result-storage#dataset) and the `Dataset.open()` function.\n\n:::\n*/}\n\n## Finding saved data\n\nUnless you changed the configuration that Crawlee uses locally, which would suggest that you knew what you were doing, and you didn't need this tutorial anyway, you'll find your data in the storage directory that Crawlee creates in the working directory of the running script:\n\n```text\n{PROJECT_FOLDER}/storage/datasets/default/\n```\n\nThe above folder will hold all your saved data in numbered files, as they were pushed into the dataset. Each file represents one invocation of <ApiLink to=\"class/Dataset#push_data\">`Dataset.push_data`</ApiLink> or one table row.\n\n{/* TODO: add mention of \"Result storage guide\" once it's ready:\n\n:::tip Single file data storage options\n\nIf you would like to store your data in a single big file, instead of many small ones, see the [Result storage guide](../guides/result-storage#key-value-store) for Key-value stores.\n\n:::\n\n*/}\n\n## Next steps\n\nNext, you'll see some improvements that you can add to your crawler code that will make it more readable and maintainable in the long run.\n"
  },
  {
    "path": "docs/introduction/08_refactoring.mdx",
    "content": "---\nid: refactoring\ntitle: Refactoring\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport CodeBlock from '@theme/CodeBlock';\n\nimport MainExample from '!!raw-loader!./code_examples/08_main.py';\nimport RoutesExample from '!!raw-loader!./code_examples/08_routes.py';\n\nIt may seem that the data is extracted and the crawler is done, but honestly, this is just the beginning. For the sake of brevity, we've completely omitted error handling, proxies, logging, architecture, tests, documentation and other stuff that reliable software should have. The good thing is, error handling is mostly done by Crawlee itself, so no worries on that front, unless you need some custom magic.\n\n:::info Navigating automatic bot-protection avoidance\n\nYou might be wondering about the **anti-blocking, bot-protection avoiding stealthy features** and why we haven't highlighted them yet. The reason is straightforward: these features are **automatically used** within the default configuration, providing a smooth start without manual adjustments.\n\n:::\n\n{/* TODO: add this to the info once the relevant guide is ready\n\nHowever, the default configuration, while powerful, may not cover every scenario.\n\nIf you want to learn more, browse the [Avoid getting blocked](../guides/avoid-blocking), [Proxy management](../guides/proxy-management) and [Session management](../guides/session-management) guides.\n*/}\n\nTo promote good coding practices, let's look at how you can use a <ApiLink to=\"class/Router\">`Router`</ApiLink> class to better structure your crawler code.\n\n## Request routing\n\nIn the following code, we've made several changes:\n\n- Split the code into multiple files.\n- Added a custom instance of <ApiLink to=\"class/Router\">`Router`</ApiLink> to make our routing cleaner, without if clauses.\n- Moved route definitions to a separate `routes.py` file.\n- Simplified the `main.py` file to focus on the general structure of the crawler.\n\n### Routes 
file\n\nFirst, let's define our routes in a separate file:\n\n<CodeBlock className=\"language-python\" title=\"src/routes.py\">\n    {RoutesExample}\n</CodeBlock>\n\n### Main file\n\nNext, our main file becomes much simpler and cleaner:\n\n<CodeBlock className=\"language-python\" title=\"src/main.py\">\n    {MainExample}\n</CodeBlock>\n\nBy structuring your code this way, you achieve better separation of concerns, making the code easier to read, manage and extend. The <ApiLink to=\"class/Router\">`Router`</ApiLink> class keeps your routing logic clean and modular, replacing if clauses with function decorators.\n\n## Summary\n\nRefactoring your crawler code with these practices enhances readability, maintainability, and scalability.\n\n### Splitting your code into multiple files\n\nThere's no reason not to split your code into multiple files and keep your logic separate. Less code in a single file means less complexity to handle at any time, which improves overall readability and maintainability. Consider further splitting the routes into separate files for even better organization.\n\n### Using a router to structure your crawling\n\nInitially, using a simple `if` / `else` statement for selecting different logic based on the crawled pages might appear more readable. However, this approach can become cumbersome with more than two types of pages, especially when the logic for each page extends over dozens or even hundreds of lines of code.\n\nIt's good practice in any programming language to split your logic into bite-sized chunks that are easy to read and reason about. Scrolling through a thousand line long `request_handler()` where everything interacts with everything and variables can be used everywhere is not a beautiful thing to do and a pain to debug. That's why we prefer the separation of routes into their own files.\n\n## Next steps\n\nIn the next and final step, you'll see how to deploy your Crawlee project to the cloud. 
If you used the CLI to bootstrap your project, you already have a `Dockerfile` ready, and the next section will show you how to deploy it to the [Apify platform](../deployment/apify-platform) with ease.\n"
  },
  {
    "path": "docs/introduction/09_running_in_cloud.mdx",
    "content": "---\nid: deployment\ntitle: Running your crawler in the Cloud\nsidebar_label: Running in the Cloud\ndescription: Deploying Crawlee-python projects to the Apify platform\n---\n\nimport CodeBlock from '@theme/CodeBlock';\nimport MainExample from '!!raw-loader!./code_examples/09_apify_sdk.py';\n\n## Apify platform\n\nCrawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify platform**](https://console.apify.com) will give you the best experience.\n\n{/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/}\n\nWith a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infra, proxies, and storages are ready to go. [Learn more about Actors](https://apify.com/actors).\n\n{/*:::info Choosing between Crawlee CLI and Apify CLI for project setup\n\nWe started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify platform right ahead.\n\n:::*/}\n\n## Dependencies\n\nBefore we get started, you'll need to install two new dependencies:\n\n- [**Apify SDK**](https://pypi.org/project/apify/), a toolkit for working with the Apify platform. This will allow us to wire the storages (e.g. 
[`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. The Apify SDK, like Crawlee itself, is available as a PyPI package and can be installed with any Python package manager. To install it using [pip](https://pip.pypa.io/), run:\n\n    ```sh\n    pip install apify\n    ```\n\n- [**Apify CLI**](https://docs.apify.com/cli/), a command-line tool that will help us with authentication and deployment. It is a [Node.js](https://nodejs.org/) package, and can be installed using any Node.js package manager. In this guide, we will use [npm](https://npmjs.com/). We will install it globally, so you can use it across all your Crawlee and Apify projects. To install it using npm, run:\n\n    ```sh\n    npm install -g apify-cli\n    ```\n\n## Logging in to the Apify platform\n\nThe next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). You will need your personal access token, which you can find at https://console.apify.com/account#/integrations.\n\n```sh\napify login\n```\n\n## Adjusting the code\n\nNow that you have your account set up, you will need to adjust the code a tiny bit. We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us to wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify platform counterparts - otherwise Crawlee would keep things only in memory.\n\nOpen your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. 
Your code should look like this:\n\n<CodeBlock className=\"language-python\" title=\"src/main.py\">\n    {MainExample}\n</CodeBlock>\n\nThe context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up a few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown.\n\n:::info Understanding `async with Actor` behavior with environment variables\n\nThe [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on the environment variables, namely based on the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify platform. This means that your project will keep working the same way locally, but will use the Apify API when deployed to the Apify platform.\n\n:::\n\n## Initializing the project\n\nYou will also need to initialize the project for Apify. To do that, use the Apify CLI again:\n\n```sh\napify init\n```\n\nThe CLI will check the project structure and guide you through the setup process. If prompted, follow the instructions and answer the questions to configure the project correctly. For more information follow the [Apify CLI documentation](https://docs.apify.com/cli/docs).\n\nThis will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see all the different things you can set up there.\n\n## Ship it!\n\nAnd that's all, your project is now ready to be published on the Apify platform. You can use the Apify CLI once more to do that:\n\n```sh\napify push\n```\n\nThis command will create an archive from your project, upload it to the Apify platform and initiate a Docker build. 
Once finished, you will get a link to your new Actor on the platform.\n\n## Learning more about web scraping\n\n:::tip Explore Apify Academy Resources\n\nIf you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic. From beginner to advanced. And the best thing: **It's free and open source** ❤️\n\n{/*If you want to do one more project, checkout our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/}\n\n:::\n\n## Thank you! 🎉\n\nThat's it! Thanks for reading the whole introduction and if there's anything wrong, please 🙏 let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 👋\n"
  },
  {
    "path": "docs/introduction/code_examples/02_bs.py",
    "content": "import asyncio\n\n# Add import of crawler and crawling context.\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # First you create the request queue instance.\n    rq = await RequestQueue.open()\n\n    # And then you add one or more requests to it.\n    await rq.add_request('https://crawlee.dev')\n\n    crawler = BeautifulSoupCrawler(request_manager=rq)\n\n    # Define a request handler and attach it to the crawler using the decorator.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        # Extract <title> text with BeautifulSoup.\n        # See BeautifulSoup documentation for API docs.\n        url = context.request.url\n        title = context.soup.title.string if context.soup.title else ''\n        context.log.info(f'The title of {url} is: {title}.')\n\n    await crawler.run()\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/02_bs_better.py",
    "content": "import asyncio\n\n# You don't need to import RequestQueue anymore.\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        url = context.request.url\n        title = context.soup.title.string if context.soup.title else ''\n        context.log.info(f'The title of {url} is: {title}.')\n\n    # Start the crawler with the provided URLs.\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/02_request_queue.py",
    "content": "import asyncio\n\nfrom crawlee.storages import RequestQueue\n\n\nasync def main() -> None:\n    # First you create the request queue instance.\n    rq = await RequestQueue.open()\n\n    # And then you add one or more requests to it.\n    await rq.add_request('https://crawlee.dev')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/03_enqueue_strategy.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}.')\n\n        # See the `EnqueueStrategy` type alias for more strategy options.\n        # highlight-next-line\n        await context.enqueue_links(\n            # highlight-next-line\n            strategy='same-domain',\n            # highlight-next-line\n        )\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/03_finding_new_links.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # Let's limit our crawls to make our tests shorter and safer.\n    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        url = context.request.url\n        title = context.soup.title.string if context.soup.title else ''\n        context.log.info(f'The title of {url} is: {title}.')\n\n        # The enqueue_links function is available as one of the fields of the context.\n        # It is also context aware, so it does not require any parameters.\n        await context.enqueue_links()\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/03_globs.py",
    "content": "import asyncio\n\nfrom crawlee import Glob\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}.')\n\n        # Enqueue links that match the 'include' glob pattern and\n        # do not match the 'exclude' glob pattern.\n        # highlight-next-line\n        await context.enqueue_links(\n            # highlight-next-line\n            include=[Glob('https://someplace.com/**/cats')],\n            # highlight-next-line\n            exclude=[Glob('https://**/archive/**')],\n            # highlight-next-line\n        )\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/03_original_code.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        url = context.request.url\n        title = context.soup.title.string if context.soup.title else ''\n        context.log.info(f'The title of {url} is: {title}.')\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/03_transform_request.py",
    "content": "from __future__ import annotations\n\nimport asyncio\n\nfrom crawlee import HttpHeaders, RequestOptions, RequestTransformAction\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\ndef transform_request(\n    request_options: RequestOptions,\n) -> RequestOptions | RequestTransformAction:\n    # Skip requests to PDF files\n    if request_options['url'].endswith('.pdf'):\n        return 'skip'\n\n    if '/docs' in request_options['url']:\n        # Add custom headers to requests to specific URLs\n        request_options['headers'] = HttpHeaders({'Custom-Header': 'value'})\n\n    elif '/blog' in request_options['url']:\n        # Add label for certain URLs\n        request_options['label'] = 'BLOG'\n\n    else:\n        # Signal that the request should proceed without any transformation\n        return 'unchanged'\n\n    return request_options\n\n\nasync def main() -> None:\n    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}.')\n\n        # Transform request before enqueueing\n        await context.enqueue_links(transform_request_function=transform_request)\n\n    @crawler.router.handler('BLOG')\n    async def blog_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Blog Processing {context.request.url}.')\n\n    await crawler.run(['https://crawlee.dev/'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/04_sanity_check.py",
    "content": "import asyncio\n\n# Instead of BeautifulSoupCrawler let's use Playwright to be able to render JavaScript.\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        # Wait for the collection cards to render on the page. This ensures that\n        # the elements we want to interact with are present in the DOM.\n        await context.page.wait_for_selector('.collection-block-item')\n\n        # Execute a function within the browser context to target the collection\n        # card elements and extract their text content, trimming any leading or\n        # trailing whitespace.\n        category_texts = await context.page.eval_on_selector_all(\n            '.collection-block-item',\n            '(els) => els.map(el => el.textContent.trim())',\n        )\n\n        # Log the extracted texts.\n        for i, text in enumerate(category_texts):\n            context.log.info(f'CATEGORY_{i + 1}: {text}')\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/05_crawling_detail.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # We're not processing detail pages yet, so we just pass.\n        if context.request.label == 'DETAIL':\n            pass\n\n        # We are now on a category page. We can use this to paginate through and\n        # enqueue all products, as well as any subsequent pages we find.\n        elif context.request.label == 'CATEGORY':\n            # Wait for the product items to render.\n            await context.page.wait_for_selector('.product-item > a')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling queue with the label DETAIL.\n            await context.enqueue_links(\n                selector='.product-item > a',\n                label='DETAIL',\n            )\n\n            # Find the \"Next\" button to paginate through the category pages.\n            next_button = await context.page.query_selector('a.pagination__next')\n\n            # If a \"Next\" button is found, enqueue the next page of results.\n            if next_button:\n                await context.enqueue_links(\n                    selector='a.pagination__next',\n                    label='CATEGORY',\n                )\n\n        # This indicates we're on the start page with no specific label.\n        # On the start page, we want to enqueue all the category pages.\n        else:\n            # Wait for the collection cards to render.\n            await context.page.wait_for_selector('.collection-block-item')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling 
queue with the label CATEGORY.\n            await context.enqueue_links(\n                selector='.collection-block-item',\n                label='CATEGORY',\n            )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/05_crawling_listing.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # Wait for the category cards to render on the page. This ensures that\n        # the elements we want to interact with are present in the DOM.\n        await context.page.wait_for_selector('.collection-block-item')\n\n        # Enqueue links found within elements that match the specified selector.\n        # These links will be added to the crawling queue with the label CATEGORY.\n        await context.enqueue_links(\n            selector='.collection-block-item',\n            label='CATEGORY',\n        )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/06_scraping.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Let's limit our crawls to make our tests shorter and safer.\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # We're not processing detail pages yet, so we just pass.\n        if context.request.label == 'DETAIL':\n            # Split the URL and get the last part to extract the manufacturer.\n            url_part = context.request.url.split('/').pop()\n            manufacturer = url_part.split('-')[0]\n\n            # Extract the title using the combined selector.\n            title = await context.page.locator('.product-meta h1').text_content()\n\n            # Extract the SKU using its selector.\n            sku = await context.page.locator(\n                'span.product-meta__sku-number'\n            ).text_content()\n\n            # Locate the price element that contains the '$' sign and filter out\n            # the visually hidden elements.\n            price_element = context.page.locator('span.price', has_text='$').first\n            current_price_string = await price_element.text_content() or ''\n            raw_price = current_price_string.split('$')[1]\n            price = float(raw_price.replace(',', ''))\n\n            # Locate the element that contains the text 'In stock'\n            # and filter out other elements.\n            in_stock_element = context.page.locator(\n                selector='span.product-form__inventory',\n                has_text='In stock',\n            ).first\n            in_stock = await in_stock_element.count() > 0\n\n            # Put it all together in a dictionary.\n            data = {\n                'manufacturer': manufacturer,\n                'title': 
title,\n                'sku': sku,\n                'price': price,\n                'in_stock': in_stock,\n            }\n\n            # Print the extracted data.\n            context.log.info(data)\n\n        # We are now on a category page. We can use this to paginate through and\n        # enqueue all products, as well as any subsequent pages we find.\n        elif context.request.label == 'CATEGORY':\n            # Wait for the product items to render.\n            await context.page.wait_for_selector('.product-item > a')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling queue with the label DETAIL.\n            await context.enqueue_links(\n                selector='.product-item > a',\n                label='DETAIL',\n            )\n\n            # Find the \"Next\" button to paginate through the category pages.\n            next_button = await context.page.query_selector('a.pagination__next')\n\n            # If a \"Next\" button is found, enqueue the next page of results.\n            if next_button:\n                await context.enqueue_links(\n                    selector='a.pagination__next',\n                    label='CATEGORY',\n                )\n\n        # This indicates we're on the start page with no specific label.\n        # On the start page, we want to enqueue all the category pages.\n        else:\n            # Wait for the collection cards to render.\n            await context.page.wait_for_selector('.collection-block-item')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling queue with the label CATEGORY.\n            await context.enqueue_links(\n                selector='.collection-block-item',\n                label='CATEGORY',\n            )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == 
'__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/07_final_code.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Let's limit our crawls to make our tests shorter and safer.\n        max_requests_per_crawl=10,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url}')\n\n        # We're not processing detail pages yet, so we just pass.\n        if context.request.label == 'DETAIL':\n            # Split the URL and get the last part to extract the manufacturer.\n            url_part = context.request.url.split('/').pop()\n            manufacturer = url_part.split('-')[0]\n\n            # Extract the title using the combined selector.\n            title = await context.page.locator('.product-meta h1').text_content()\n\n            # Extract the SKU using its selector.\n            sku = await context.page.locator(\n                'span.product-meta__sku-number'\n            ).text_content()\n\n            # Locate the price element that contains the '$' sign and filter out\n            # the visually hidden elements.\n            price_element = context.page.locator('span.price', has_text='$').first\n            current_price_string = await price_element.text_content() or ''\n            raw_price = current_price_string.split('$')[1]\n            price = float(raw_price.replace(',', ''))\n\n            # Locate the element that contains the text 'In stock' and filter out\n            # other elements.\n            in_stock_element = context.page.locator(\n                selector='span.product-form__inventory',\n                has_text='In stock',\n            ).first\n            in_stock = await in_stock_element.count() > 0\n\n            # Put it all together in a dictionary.\n            data = {\n                'manufacturer': manufacturer,\n                'title': 
title,\n                'sku': sku,\n                'price': price,\n                'in_stock': in_stock,\n            }\n\n            # Push the data to the dataset.\n            await context.push_data(data)\n\n        # We are now on a category page. We can use this to paginate through and\n        # enqueue all products, as well as any subsequent pages we find.\n        elif context.request.label == 'CATEGORY':\n            # Wait for the product items to render.\n            await context.page.wait_for_selector('.product-item > a')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling queue with the label DETAIL.\n            await context.enqueue_links(\n                selector='.product-item > a',\n                label='DETAIL',\n            )\n\n            # Find the \"Next\" button to paginate through the category pages.\n            next_button = await context.page.query_selector('a.pagination__next')\n\n            # If a \"Next\" button is found, enqueue the next page of results.\n            if next_button:\n                await context.enqueue_links(\n                    selector='a.pagination__next',\n                    label='CATEGORY',\n                )\n\n        # This indicates we're on the start page with no specific label.\n        # On the start page, we want to enqueue all the category pages.\n        else:\n            # Wait for the collection cards to render.\n            await context.page.wait_for_selector('.collection-block-item')\n\n            # Enqueue links found within elements matching the provided selector.\n            # These links will be added to the crawling queue with the label CATEGORY.\n            await context.enqueue_links(\n                selector='.collection-block-item',\n                label='CATEGORY',\n            )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif 
__name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/07_first_code.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\nfrom crawlee.storages import Dataset\n\n# ...\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler()\n    dataset = await Dataset.open()\n\n    # ...\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        ...\n        # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/08_main.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler\n\nfrom .routes import router\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Let's limit our crawls to make our tests shorter and safer.\n        max_requests_per_crawl=10,\n        # Provide our router instance to the crawler.\n        request_handler=router,\n    )\n\n    await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/08_routes.py",
    "content": "from crawlee.crawlers import PlaywrightCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[PlaywrightCrawlingContext]()\n\n\n@router.default_handler\nasync def default_handler(context: PlaywrightCrawlingContext) -> None:\n    # This is a fallback route which will handle the start URL.\n    context.log.info(f'default_handler is processing {context.request.url}')\n\n    await context.page.wait_for_selector('.collection-block-item')\n\n    await context.enqueue_links(\n        selector='.collection-block-item',\n        label='CATEGORY',\n    )\n\n\n@router.handler('CATEGORY')\nasync def category_handler(context: PlaywrightCrawlingContext) -> None:\n    # This replaces the context.request.label == CATEGORY branch of the if clause.\n    context.log.info(f'category_handler is processing {context.request.url}')\n\n    await context.page.wait_for_selector('.product-item > a')\n\n    await context.enqueue_links(\n        selector='.product-item > a',\n        label='DETAIL',\n    )\n\n    next_button = await context.page.query_selector('a.pagination__next')\n\n    if next_button:\n        await context.enqueue_links(\n            selector='a.pagination__next',\n            label='CATEGORY',\n        )\n\n\n@router.handler('DETAIL')\nasync def detail_handler(context: PlaywrightCrawlingContext) -> None:\n    # This replaces the context.request.label == DETAIL branch of the if clause.\n    context.log.info(f'detail_handler is processing {context.request.url}')\n\n    url_part = context.request.url.split('/').pop()\n    manufacturer = url_part.split('-')[0]\n\n    title = await context.page.locator('.product-meta h1').text_content()\n\n    sku = await context.page.locator('span.product-meta__sku-number').text_content()\n\n    price_element = context.page.locator('span.price', has_text='$').first\n    current_price_string = await price_element.text_content() or ''\n    raw_price = current_price_string.split('$')[1]\n    price = 
float(raw_price.replace(',', ''))\n\n    in_stock_element = context.page.locator(\n        selector='span.product-form__inventory',\n        has_text='In stock',\n    ).first\n    in_stock = await in_stock_element.count() > 0\n\n    data = {\n        'manufacturer': manufacturer,\n        'title': title,\n        'sku': sku,\n        'price': price,\n        'in_stock': in_stock,\n    }\n\n    await context.push_data(data)\n"
  },
  {
    "path": "docs/introduction/code_examples/09_apify_sdk.py",
    "content": "import asyncio\n\n# highlight-next-line\nfrom apify import Actor\n\nfrom crawlee.crawlers import PlaywrightCrawler\n\nfrom .routes import router\n\n\nasync def main() -> None:\n    # highlight-next-line\n    async with Actor:\n        crawler = PlaywrightCrawler(\n            # Let's limit our crawls to make our tests shorter and safer.\n            max_requests_per_crawl=10,\n            # Provide our router instance to the crawler.\n            request_handler=router,\n        )\n\n        await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/introduction/code_examples/__init__.py",
    "content": ""
  },
  {
    "path": "docs/introduction/code_examples/routes.py",
    "content": "from crawlee.crawlers import PlaywrightCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[PlaywrightCrawlingContext]()\n"
  },
  {
    "path": "docs/introduction/index.mdx",
    "content": "---\nid: introduction\ntitle: Introduction\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\n\nCrawlee covers your crawling and scraping end-to-end and helps you **build reliable scrapers. Fast.**\n\nYour crawlers will appear human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.\n\n## What you will learn\n\nThe goal of the introduction is to provide a step-by-step guide to the most important features of Crawlee. It will walk you through creating the simplest of crawlers that only prints text to console, all the way up to a full-featured scraper that collects links from a website and extracts data.\n\n## 🛠 Features\n\nWhy Crawlee is the preferred choice for web scraping and crawling?\n\n### Why use Crawlee instead of just a random HTTP library with an HTML parser?\n\n- Unified interface for **HTTP & headless browser** crawling.\n- Automatic **parallel crawling** based on available system resources.\n- Written in Python with **type hints** - enhances DX (IDE autocompletion) and reduces bugs (static type checking).\n- Automatic **retries** on errors or when you are getting blocked.\n- Integrated **proxy rotation** and session management.\n- Configurable **request routing** - direct URLs to the appropriate handlers.\n- Persistent **queue for URLs** to crawl.\n- Pluggable **storage** of both tabular data and files.\n- Robust **error handling**.\n\n### Why to use Crawlee rather than Scrapy?\n\n- Crawlee has out-of-the-box support for **headless browser** crawling (Playwright).\n- Crawlee has a **minimalistic & elegant interface** - Set up your scraper with fewer than 10 lines of 
code.\n- Complete **type hint** coverage.\n- Based on standard **Asyncio**.\n\n{/* TODO:\n\n### 👾 HTTP crawling\n\n- ...\n*/}\n\n{/* TODO:\n### 💻 Real browser crawling\n\n- ...\n*/}\n\n## Next steps\n\nNext, you will install Crawlee and learn how to bootstrap projects with the prepared Crawlee templates.\n"
  },
  {
    "path": "docs/pyproject.toml",
    "content": "# Line length different from the rest of the code to make sure that the example codes visualised on the generated\n# documentation webpages are shown without vertical slider to make them more readable.\n\n[tool.ruff]\n# Inherit all from project top configuration file.\nextend = \"../pyproject.toml\"\n\n# Override just line length\nline-length = 90 # Maximum possible fit to the doc webpage. Longer lines need slider.\n"
  },
  {
    "path": "docs/quick-start/code_examples/beautifulsoup_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n\nasync def main() -> None:\n    # BeautifulSoupCrawler crawls the web using HTTP requests\n    # and parses HTML using the BeautifulSoup library.\n    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)\n\n    # Define a request handler to process each crawled page\n    # and attach it to the crawler using a decorator.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Extract relevant data from the page context.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n        # Store the extracted data.\n        await context.push_data(data)\n        # Extract links from the current page and add them to the crawling queue.\n        await context.enqueue_links()\n\n    # Add first URL to the queue and start the crawl.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/quick-start/code_examples/parsel_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n\nasync def main() -> None:\n    # ParselCrawler crawls the web using HTTP requests\n    # and parses HTML using the Parsel library.\n    crawler = ParselCrawler(max_requests_per_crawl=10)\n\n    # Define a request handler to process each crawled page\n    # and attach it to the crawler using a decorator.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Extract relevant data from the page context.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.xpath('//title/text()').get(),\n        }\n        # Store the extracted data.\n        await context.push_data(data)\n        # Extract links from the current page and add them to the crawling queue.\n        await context.enqueue_links()\n\n    # Add first URL to the queue and start the crawl.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/quick-start/code_examples/playwright_crawler_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    # PlaywrightCrawler crawls the web using a headless browser\n    # controlled by the Playwright library.\n    crawler = PlaywrightCrawler()\n\n    # Define a request handler to process each crawled page\n    # and attach it to the crawler using a decorator.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Extract relevant data from the page context.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n        }\n        # Store the extracted data.\n        await context.push_data(data)\n        # Extract links from the current page and add them to the crawling queue.\n        await context.enqueue_links()\n\n    # Add first URL to the queue and start the crawl.\n    await crawler.run(['https://crawlee.dev'])\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/quick-start/code_examples/playwright_crawler_headful_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        # Run with a visible browser window.\n        # highlight-next-line\n        headless=False,\n        # Switch to the Firefox browser.\n        browser_type='firefox',\n    )\n\n    # ...\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "docs/quick-start/index.mdx",
    "content": "---\nid: quick-start\ntitle: Quick start\n---\n\nimport ApiLink from '@site/src/components/ApiLink';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\nimport CodeBlock from '@theme/CodeBlock';\nimport RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';\n\nimport BeautifulsoupCrawlerExample from '!!raw-loader!roa-loader!./code_examples/beautifulsoup_crawler_example.py';\nimport ParselCrawlerExample from '!!raw-loader!roa-loader!./code_examples/parsel_crawler_example.py';\nimport PlaywrightCrawlerExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler_example.py';\n\nimport PlaywrightCrawlerHeadfulExample from '!!raw-loader!./code_examples/playwright_crawler_headful_example.py';\n\nThis short tutorial will help you start scraping with Crawlee in just a minute or two. For an in-depth understanding of how Crawlee works, check out the [Introduction](../introduction/index.mdx) section, which provides a comprehensive step-by-step guide to creating your first scraper.\n\n## Choose your crawler\n\nCrawlee offers the following main crawler classes: <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink>, and <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>. All crawlers share the same interface, providing maximum flexibility when switching between them.\n\n:::caution Minimum Python version\n\nCrawlee requires Python 3.10 or higher.\n\n:::\n\n### BeautifulSoupCrawler\n\nThe <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> is a plain HTTP crawler that parses HTML using the well-known [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library. It crawls the web using an HTTP client that mimics a browser. 
This crawler is very fast and efficient but cannot handle JavaScript rendering.\n\n### ParselCrawler\n\nThe <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> is similar to the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink> but uses the [Parsel](https://pypi.org/project/parsel/) library for HTML parsing. Parsel is a lightweight library that provides a CSS selector-based API for extracting data from HTML documents. If you are familiar with the [Scrapy](https://scrapy.org/) framework, you will feel right at home with Parsel. As with the <ApiLink to=\"class/BeautifulSoupCrawler\">`BeautifulSoupCrawler`</ApiLink>, the <ApiLink to=\"class/ParselCrawler\">`ParselCrawler`</ApiLink> cannot handle JavaScript rendering.\n\n### PlaywrightCrawler\n\nThe <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink> uses a headless browser controlled by the [Playwright](https://playwright.dev/) library. It can manage Chromium, Firefox, WebKit, and other browsers. Playwright is the successor to the [Puppeteer](https://pptr.dev/) library and is becoming the de facto standard in headless browser automation. If you need a headless browser, choose Playwright.\n\n## Installation\n\nCrawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) package on PyPI. This package includes the core functionality, while additional features are available as optional extras to keep dependencies and package size minimal.\n\nYou can install Crawlee with all features or choose only the ones you need. 
For installing it using the [pip](https://pip.pypa.io/en/stable/) package manager, run the following command:\n\n```sh\npython -m pip install 'crawlee[all]'\n```\n\nVerify that Crawlee is successfully installed:\n\n```sh\npython -c 'import crawlee; print(crawlee.__version__)'\n```\n\nIf you plan to use the <ApiLink to=\"class/PlaywrightCrawler\">`PlaywrightCrawler`</ApiLink>, you'll need to install Playwright dependencies, including the browser binaries. To do this, run the following command:\n\n```sh\nplaywright install\n```\n\nFor detailed installation instructions, see the [Setting up](../introduction/01_setting_up.mdx) documentation page.\n\n## Crawling\n\nRun the following example to perform a recursive crawl of the Crawlee website using the selected crawler.\n\n<Tabs groupId=\"quickStart\">\n    <TabItem value=\"BeautifulSoupCrawler\" label=\"BeautifulSoupCrawler\" default>\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {BeautifulsoupCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"ParselCrawler\" label=\"ParselCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {ParselCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n    <TabItem value=\"PlaywrightCrawler\" label=\"PlaywrightCrawler\">\n        <RunnableCodeBlock className=\"language-python\" language=\"python\">\n            {PlaywrightCrawlerExample}\n        </RunnableCodeBlock>\n    </TabItem>\n</Tabs>\n\nWhen you run the example, you will see Crawlee automating the data extraction process in your terminal.\n\n{/* TODO: improve the logging and add here a sample */}\n\n## Running headful browser\n\nBy default, browsers controlled by Playwright run in headless mode (without a visible window). However, you can configure the crawler to run in a headful mode, which is useful during the development phase to observe the browser's actions. 
You can also switch from the default Chromium browser to Firefox or WebKit.\n\n<CodeBlock language=\"python\">\n    {PlaywrightCrawlerHeadfulExample}\n</CodeBlock>\n\nWhen you run the example code, you'll see an automated browser navigating through the Crawlee website.\n\n{/* TODO: add video example */}\n\n## Results\n\nBy default, Crawlee stores data in the `./storage` directory within your current working directory. The results of your crawl will be saved as JSON files under `./storage/datasets/default/`.\n\nTo view the results, you can use the `cat` command:\n\n```sh\ncat ./storage/datasets/default/000000001.json\n```\n\nThe JSON file will contain data similar to the following:\n\n```json\n{\n    \"url\": \"https://crawlee.dev/\",\n    \"title\": \"Crawlee · Build reliable crawlers. Fast. | Crawlee\"\n}\n```\n\n:::tip\n\nIf you want to change the storage directory, you can set the `CRAWLEE_STORAGE_DIR` environment variable to your preferred path.\n\n:::\n\n## Examples and further reading\n\nFor more examples showcasing various features of Crawlee, visit the [Examples](/docs/examples) section of the documentation. To get a deeper understanding of Crawlee and its components, read the step-by-step [Introduction](../introduction/index.mdx) guide.\n\n[//]: # (TODO: add related links once they are ready)\n"
  },
  {
    "path": "docs/upgrading/upgrading_to_v0x.md",
    "content": "---\nid: upgrading-to-v0x\ntitle: Upgrading to v0.x\n---\n\nThis page summarizes the breaking changes between Crawlee for Python zero-based versions.\n\n## Upgrading to v0.6\n\nThis section summarizes the breaking changes between v0.5.x and v0.6.0.\n\n### HttpCrawlerOptions\n\n- Removed `HttpCrawlerOptions` - which contained options from `BasicCrawlerOptions` and unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both of the unique options were added to `BasicCrawlerOptions` instead.\n\n### HttpClient\n\n- The signature of the `HttpClient` class has been updated. The constructor parameters `additional_http_error_status_codes` and `ignore_http_error_status_codes` have been removed and are now only available in `BasicCrawlerOptions`.\n- The method `_raise_for_error_status_code` has been removed from `HttpClient`. Its logic has been moved to the `BasicCrawler` class.\n\n### SessionCookies\n\n- Replaces the `dict` used for cookie storage in `Session.cookies` with a new `SessionCookies` class. `SessionCookies` uses `CookieJar`, which enables support for multiple domains.\n\n### PlaywrightCrawler and PlaywrightBrowserPlugin\n\n- `PlaywrightCrawler` now use a persistent browser context instead of the standard browser context.\n- Added `user_data_dir` parameter for `PlaywrightCrawler` and `PlaywrightBrowserPlugin` to specify the directory for the persistent context. If not provided, a temporary directory will be created automatically.\n\n### Configuration\n\nThe `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`.\n\n### CLI dependencies\n\nCLI dependencies have been moved to optional dependencies. 
If you need the CLI, install `crawlee[cli]`.\n\n### Abstract base classes\n\nWe decided to move away from [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation) and remove all the `Base` prefixes from the abstract classes. This includes the following public classes:\n- `BaseStorageClient` -> `StorageClient`\n- `BaseBrowserController` -> `BrowserController`\n- `BaseBrowserPlugin` -> `BrowserPlugin`\n\n### EnqueueStrategy\n\nThe `EnqueueStrategy` has been changed from an enum to a string literal type. All its values and their meaning remain unchanged.\n\n## Upgrading to v0.5\n\nThis section summarizes the breaking changes between v0.4.x and v0.5.0.\n\n### Crawlers & CrawlingContexts\n\n- All crawler and crawling context classes have been consolidated into a single sub-package called `crawlers`.\n- The affected classes include: `AbstractHttpCrawler`, `AbstractHttpParser`, `BasicCrawler`, `BasicCrawlerOptions`, `BasicCrawlingContext`, `BeautifulSoupCrawler`, `BeautifulSoupCrawlingContext`, `BeautifulSoupParserType`, `ContextPipeline`, `HttpCrawler`, `HttpCrawlerOptions`, `HttpCrawlingContext`, `HttpCrawlingResult`, `ParsedHttpCrawlingContext`, `ParselCrawler`, `ParselCrawlingContext`, `PlaywrightCrawler`, `PlaywrightCrawlingContext`, `PlaywrightPreNavCrawlingContext`.\n\nExample update:\n```diff\n- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n+ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n```\n\n### Storage clients\n\n- All storage client classes have been moved into a single sub-package called `storage_clients`.\n- The affected classes include: `MemoryStorageClient`, `BaseStorageClient`.\n\nExample update:\n```diff\n- from crawlee.memory_storage_client import MemoryStorageClient\n+ from crawlee.storage_clients import MemoryStorageClient\n```\n\n### CurlImpersonateHttpClient\n\n- The `CurlImpersonateHttpClient` changed its import location.\n\nExample update:\n```diff\n- 
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient\n+ from crawlee.http_clients import CurlImpersonateHttpClient\n```\n\n### BeautifulSoupParser\n\n- Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. Probably used only in type hints. Please replace previous usages of `BeautifulSoupParser` by `BeautifulSoupParserType`.\n- `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`.\n\n### Service locator\n\n- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`.\n- You can use it to set the configuration, event manager or storage client globally. Or you can pass them to your crawler instance directly and it will use the service locator under the hood.\n\n### Statistics\n\n- The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one.\n- If you want to set your custom event manager, do it either via the service locator or pass it to the crawler.\n\n### Request\n\n- The properties `json_` and `order_no` were removed. 
They were there only for the internal purpose of the memory storage client, you should not need them.\n\n### Request storages and loaders\n\n- The `request_provider` parameter of `BasicCrawler.__init__` has been renamed to `request_manager`\n- The `BasicCrawler.get_request_provider` method has been renamed to `BasicCrawler.get_request_manager` and it does not accept the `id` and `name` arguments anymore\n    - If using a specific request queue is desired, pass it as the `request_manager` on `BasicCrawler` creation\n- The `RequestProvider` interface has been renamed to `RequestManager` and moved to the `crawlee.request_loaders` package\n- `RequestList` has been moved to the `crawlee.request_loaders` package\n- `RequestList` does not support `.drop()`, `.reclaim_request()`, `.add_request()` and `add_requests_batched()` anymore\n    - It implements the new `RequestLoader` interface instead of `RequestManager`\n    - `RequestManagerTandem` with a `RequestQueue` should be used to enable passing a `RequestList` (or any other `RequestLoader` implementation) as a `request_manager`, `await list.to_tandem()` can be used as a shortcut\n\n### PlaywrightCrawler\n\n- The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`.\n- The input arguments in `PlaywrightCrawler.__init__` have been renamed:\n    - `browser_options` is now `browser_launch_options`,\n    - `page_options` is now `browser_new_context_options`.\n- These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`.\n\n## Upgrading to v0.4\n\nThis section summarizes the breaking changes between v0.3.x and v0.4.0.\n\n### Request model\n\n- The `Request.query_params` field has been removed. Please add query parameters directly to the URL, which was possible before as well, and is now the only supported approach.\n- The `Request.payload` and `Request.data` fields have been consolidated. 
Now, only `Request.payload` remains, and it should be used for all payload data in requests.\n\n### Extended unique key computation\n\n- The computation of `extended_unique_key` now includes HTTP headers. While this change impacts the behavior, the interface remains the same.\n\n## Upgrading to v0.3\n\nThis section summarizes the breaking changes between v0.2.x and v0.3.0.\n\n### Public and private interface declaration\n\nIn previous versions, the majority of the package was fully public, including many elements intended for internal use only. With the release of v0.3, we have clearly defined the public and private interface of the package. As a result, some imports have been updated (see below). If you are importing something now designated as private, we recommend reconsidering its use or discussing your use case with us in the discussions/issues.\n\nHere is a list of the updated public imports:\n\n```diff\n- from crawlee.enqueue_strategy import EnqueueStrategy\n+ from crawlee import EnqueueStrategy\n```\n\n```diff\n- from crawlee.models import Request\n+ from crawlee import Request\n```\n\n```diff\n- from crawlee.basic_crawler import Router\n+ from crawlee.router import Router\n```\n\n### Request queue\n\nThere were internal changes that should not affect the intended usage:\n\n- The unused `BaseRequestQueueClient.list_requests()` method was removed\n- `RequestQueue` internals were updated to match the \"Request Queue V2\" implementation in Crawlee for JS\n\n### Service container\n\nA new module, `crawlee.service_container`, was added to allow management of \"global instances\" - currently it contains `Configuration`, `EventManager` and `BaseStorageClient`. The module also replaces the `StorageClientManager` static class. It is likely that its interface will change in the future. If your use case requires working with it, please get in touch - we'll be glad to hear any feedback.\n"
  },
  {
    "path": "docs/upgrading/upgrading_to_v1.md",
    "content": "---\nid: upgrading-to-v1\ntitle: Upgrading to v1\n---\n\nThis page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.\n\n## Terminology change: \"browser\" in different contexts\n\nThe word \"browser\" is now used distinctly in two contexts:\n\n- **Playwright context** - Refers to Playwright-supported browsers (`chromium`, `firefox`, `webkit`, `edge`).\n- **Fingerprinting context** - Refers to browsers supported by fingerprint generation (`chrome`, `firefox`, `safari`, `edge`).\n\nThe type of `HeaderGeneratorOptions.browsers` has changed accordingly:\n\n**Before (v0.6):**\n\n```python\nfrom crawlee.fingerprint_suite import HeaderGeneratorOptions\n\nHeaderGeneratorOptions(browsers=['chromium'])\nHeaderGeneratorOptions(browsers=['webkit'])\n```\n\n**Now (v1.0):**\n\n```python\nfrom crawlee.fingerprint_suite import HeaderGeneratorOptions\n\nHeaderGeneratorOptions(browsers=['chrome'])\nHeaderGeneratorOptions(browsers=['safari'])\n```\n\n## New default HTTP client\n\nCrawlee v1.0 now uses `ImpitHttpClient` (based on [impit](https://apify.github.io/impit/) library) as the **default HTTP client**, replacing `HttpxHttpClient` (based on [httpx](https://www.python-httpx.org/) library).\n\nIf you want to keep using `HttpxHttpClient`, install Crawlee with `httpx` extra, e.g. using pip:\n\n```bash\npip install 'crawlee[httpx]'\n```\n\nAnd then provide the HTTP client explicitly to the crawler:\n\n```python\nfrom crawlee.crawlers import HttpCrawler\nfrom crawlee.http_clients import HttpxHttpClient\n\nclient = HttpxHttpClient()\ncrawler = HttpCrawler(http_client=client)\n```\n\nSee the [HTTP clients guide](https://crawlee.dev/python/docs/guides/http-clients) for all options.\n\n## Changes in storages\n\nIn Crawlee v1.0, the `Dataset`, `KeyValueStore`, and `RequestQueue` storage APIs have been updated for consistency and simplicity. 
Below is a detailed overview of what's new, what's changed, and what's been removed.\n\nSee the [Storages guide](https://crawlee.dev/python/docs/guides/storages) for more details.\n\n### Dataset\n\nThe `Dataset` API now includes several new methods, such as:\n\n- `get_metadata` - retrieves metadata information for the dataset.\n- `purge` - completely clears the dataset, including all items (keeps the metadata only).\n- `list_items` - returns the dataset's items in a list format.\n\nSome older methods have been removed or replaced:\n\n- `from_storage_object` constructor has been removed. You should now use the `open` method with either a `name` or `id` parameter.\n- `get_info` method and the `storage_object` property have been replaced by the new `get_metadata` method.\n- `set_metadata` method has been removed.\n- `write_to_json` and `write_to_csv` methods have been removed; instead, use the `export_to` method for exporting data in different formats.\n\n### Key-value store\n\nThe `KeyValueStore` API now includes several new methods, such as:\n\n- `get_metadata` - retrieves metadata information for the key-value store.\n- `purge` - completely clears the key-value store, removing all keys and values (keeps the metadata only).\n- `delete_value` - deletes a specific key and its associated value.\n- `list_keys` - lists all keys in the key-value store.\n\nSome older methods have been removed or replaced:\n\n- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.\n- `get_info` and `storage_object` - replaced by the new `get_metadata` method.\n- `set_metadata` method has been removed.\n\n### Request queue\n\nThe `RequestQueue` API now includes several new methods, such as:\n\n- `get_metadata` - retrieves metadata information for the request queue.\n- `purge` - completely clears the request queue, including all pending and processed requests (keeps the metadata only).\n- `add_requests` - replaces the previous `add_requests_batched` 
method, offering the same functionality under a simpler name.\n\nSome older methods have been removed or replaced:\n\n- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.\n- `get_info` and `storage_object` - replaced by the new `get_metadata` method.\n- `get_request` has argument `unique_key` instead of `request_id` as the `id` field was removed from the `Request`.\n- `set_metadata` method has been removed.\n\nSome changes in the related model classes:\n\n- `resource_directory` in `RequestQueueMetadata` - removed; use the corresponding `path_to_*` property instead.\n- `stats` field in `RequestQueueMetadata` - removed as it was unused.\n- `RequestQueueHead` - replaced by `RequestQueueHeadWithLocks`.\n\n## New architecture of storage clients\n\nIn v1.0, the storage client system has been completely reworked to simplify implementation and make custom storage clients easier to write.\n\nSee the [Storage clients guide](https://crawlee.dev/python/docs/guides/storage-clients) for more details.\n\n### New dedicated storage clients\n\nPreviously, `MemoryStorageClient` handled both in-memory storage and optional file system persistence. 
This has now been split into two distinct storage clients:\n\n- **`MemoryStorageClient`** - Stores all data in memory only.\n- **`FileSystemStorageClient`** - Persists data on the file system, with in-memory caching for better performance.\n\n**Before (v0.6):**\n\n```python\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import MemoryStorageClient\n\n# In-memory only\nconfiguration = Configuration(persist_storage=False)\nstorage_client = MemoryStorageClient.from_config(configuration)\n\n# File-system persistence\nconfiguration = Configuration(persist_storage=True)\nstorage_client = MemoryStorageClient.from_config(configuration)\n```\n\n**Now (v1.0):**\n\n```python\nfrom crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient\n\n# In-memory only\nstorage_client = MemoryStorageClient()\n\n# File-system persistence\nstorage_client = FileSystemStorageClient()\n```\n\n### Registering a storage client\n\nThe way you register a storage client remains unchanged:\n\n```python\nfrom crawlee import service_locator\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n# Create custom storage client\nstorage_client = MemoryStorageClient()\n\n# Then register it globally\nservice_locator.set_storage_client(storage_client)\n\n# Or use it for a single crawler only\ncrawler = ParselCrawler(storage_client=storage_client)\n\n# Or use it for a single storage only\ndataset = await Dataset.open(\n    name='my-dataset',\n    storage_client=storage_client,\n)\n```\n\n### Instance caching\n\nThanks to instance caching, `Dataset.open`, `KeyValueStore.open`, and `RequestQueue.open` now return the same instance for the same arguments. 
Direct calls to `StorageClient.open_*` always return new instances.\n\n### Writing custom storage clients\n\nThe interface for custom storage clients has been simplified:\n\n- One storage client per storage type (`RequestQueue`, `KeyValueStore`, `Dataset`).\n- Collection storage clients have been removed.\n- The number of methods that have to be implemented has been reduced.\n\n## ServiceLocator changes\n\n### ServiceLocator is stricter with registering services\nYou can register the services just once, and you can no longer override already registered services.\n\n**Before (v0.6):**\n```python\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import MemoryStorageClient\n\nservice_locator.set_storage_client(MemoryStorageClient())\nservice_locator.set_storage_client(MemoryStorageClient())\n```\n**Now (v1.0):**\n\n```python\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import MemoryStorageClient\n\nservice_locator.set_storage_client(MemoryStorageClient())\nservice_locator.set_storage_client(MemoryStorageClient())  # Raises an error\n```\n\n### BasicCrawler has its own instance of ServiceLocator to track its own services\nServices passed explicitly to the crawler can differ from the global ones accessible in `crawlee.service_locator`. 
`BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services.\n\n**Before (v0.6):**\n```python\nfrom crawlee import service_locator\nfrom crawlee.crawlers import BasicCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    custom_storage_client = MemoryStorageClient()\n    crawler = BasicCrawler(storage_client=custom_storage_client)\n\n    assert service_locator.get_storage_client() is custom_storage_client\n    assert await crawler.get_dataset() is await Dataset.open()\n```\n**Now (v1.0):**\n\n```python\nfrom crawlee import service_locator\nfrom crawlee.crawlers import BasicCrawler\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storages import Dataset\n\n\nasync def main() -> None:\n    custom_storage_client = MemoryStorageClient()\n    crawler = BasicCrawler(storage_client=custom_storage_client)\n\n    assert service_locator.get_storage_client() is not custom_storage_client\n    assert await crawler.get_dataset() is not await Dataset.open()\n```\n\nThis allows two crawlers with different services at the same time.\n\n**Now (v1.0):**\n\n```python\nfrom crawlee.crawlers import BasicCrawler\nfrom crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient\nfrom crawlee.configuration import Configuration\nfrom crawlee.events import LocalEventManager\n\ncustom_configuration_1 = Configuration()\ncustom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1)\ncustom_storage_client_1 = MemoryStorageClient()\n\ncustom_configuration_2 = Configuration()\ncustom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2)\ncustom_storage_client_2 = FileSystemStorageClient()\n\ncrawler_1 = BasicCrawler(\n    configuration=custom_configuration_1,\n    event_manager=custom_event_manager_1,\n    storage_client=custom_storage_client_1,\n)\n\ncrawler_2 = 
BasicCrawler(\n    configuration=custom_configuration_2,\n    event_manager=custom_event_manager_2,\n    storage_client=custom_storage_client_2,\n)\n\n# use crawlers without runtime crash...\n```\n\n## Other smaller updates\n\nThere are more smaller updates.\n\n### Python version support\n\nWe drop support for Python 3.9. The minimum supported version is now Python 3.10.\n\n### Changes in Configuration\n\nThe fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use.\n\n### Changes in Request\n\n`Request` objects no longer have an `id` field; all its usages have been transferred to the `unique_key` field.\n\n### Changes in HttpResponse\n\nThe method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers.\n\n**Before (v0.6):**\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\nasync def main() -> None:\n    crawler = ParselCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        # highlight-next-line\n        content = context.http_response.read()\n        # ...\n\n    await crawler.run(['https://crawlee.dev/'])\n```\n\n**Now (v1.0):**\n\n```python\nfrom crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\nasync def main() -> None:\n    crawler = ParselCrawler()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        # highlight-next-line\n        content = await context.http_response.read()\n        # ...\n\n    await crawler.run(['https://crawlee.dev/'])\n```\n\n### New storage naming restrictions\n\nWe've introduced naming restrictions for storages to ensure compatibility with Apify Platform requirements and prevent potential conflicts. 
Storage names may include only letters (a–z, A–Z), digits (0–9), and hyphens (-), with hyphens allowed only in the middle of the name (for example, my-storage-1).\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[project]\nname = \"crawlee\"\nversion = \"1.6.0\"\ndescription = \"Crawlee for Python\"\nauthors = [{ name = \"Apify Technologies s.r.o.\", email = \"support@apify.com\" }]\nlicense = { file = \"LICENSE\" }\nreadme = \"README.md\"\nrequires-python = \">=3.10\"\nclassifiers = [\n    \"Development Status :: 5 - Production/Stable\",\n    \"Environment :: Console\",\n    \"Intended Audience :: Developers\",\n    \"License :: OSI Approved :: Apache Software License\",\n    \"Operating System :: OS Independent\",\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n    \"Programming Language :: Python :: 3.14\",\n    \"Topic :: Software Development :: Libraries\",\n]\nkeywords = [\n    \"apify\",\n    \"automation\",\n    \"chrome\",\n    \"crawlee\",\n    \"crawler\",\n    \"headless\",\n    \"scraper\",\n    \"scraping\",\n]\ndependencies = [\n    \"async-timeout>=5.0.1\",\n    \"cachetools>=5.5.0\",\n    \"colorama>=0.4.0\",\n    \"impit>=0.8.0\",\n    \"more-itertools>=10.2.0\",\n    \"protego>=0.5.0\",\n    \"psutil>=6.0.0\",\n    \"pydantic-settings>=2.12.0\",\n    \"pydantic>=2.11.0\",\n    \"pyee>=9.0.0\",\n    \"tldextract>=5.1.0\",\n    \"typing-extensions>=4.1.0\",\n    \"yarl>=1.18.0\",\n]\n\n[project.optional-dependencies]\nall = [\"crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]\"]\nadaptive-crawler = [\n    \"jaro-winkler>=2.0.3\",\n    \"playwright>=1.27.0\",\n    \"scikit-learn>=1.6.0\",\n    \"apify_fingerprint_datapoints>=0.0.3\",\n    \"browserforge>=1.2.4\"\n]\nbeautifulsoup = [\"beautifulsoup4[lxml]>=4.12.0\", \"html5lib>=1.0\"]\ncli = [\"cookiecutter>=2.6.0\", \"inquirer>=3.3.0\", \"rich>=13.9.0\", \"typer>=0.12.0\"]\ncurl-impersonate = 
[\"curl-cffi>=0.9.0\"]\nhttpx = [\"httpx[brotli,http2,zstd]>=0.27.0\", \"apify_fingerprint_datapoints>=0.0.2\", \"browserforge>=1.2.3\"]\nparsel = [\"parsel>=1.10.0\"]\nplaywright = [\"playwright>=1.27.0\", \"apify_fingerprint_datapoints>=0.0.2\", \"browserforge>=1.2.3\"]\notel = [\n    \"opentelemetry-api>=1.34.1\",\n    \"opentelemetry-distro[otlp]>=0.54\",\n    \"opentelemetry-instrumentation>=0.54\",\n    \"opentelemetry-instrumentation-httpx>=0.54\",\n    \"opentelemetry-sdk>=1.34.1\",\n    \"opentelemetry-semantic-conventions>=0.54\",\n    \"wrapt>=1.17.0\",\n]\nsql_postgres = [\n    \"sqlalchemy[asyncio]>=2.0.0,<3.0.0\",\n    \"asyncpg>=0.24.0\"\n]\nsql_sqlite = [\n    \"sqlalchemy[asyncio]>=2.0.0,<3.0.0\",\n    \"aiosqlite>=0.21.0\",\n]\nsql_mysql = [\n    \"sqlalchemy[asyncio]>=2.0.0,<3.0.0\",\n    \"aiomysql>=0.3.2\",\n    \"cryptography>=46.0.5\",\n]\nredis = [\"redis[hiredis] >= 7.0.0\"]\n\n[project.scripts]\ncrawlee = \"crawlee._cli:cli\"\n\n[project.urls]\n\"Apify Homepage\" = \"https://apify.com\"\n\"Changelog\" = \"https://crawlee.dev/python/docs/changelog\"\n\"Discord\" = \"https://discord.com/invite/jyEM2PRvMU\"\n\"Documentation\" = \"https://crawlee.dev/python/docs/quick-start\"\n\"Homepage\" = \"https://crawlee.dev/python\"\n\"Issue Tracker\" = \"https://github.com/apify/crawlee-python/issues\"\n\"Release Notes\" = \"https://crawlee.dev/python/docs/upgrading\"\n\"Source Code\" = \"https://github.com/apify/crawlee-python\"\n\n[dependency-groups]\ndev = [\n    # TODO: Remove this constraint once pydoc-markdown updates its dependencies.\n    # Package pydoc-markdown is unmaintained and pins old docspec-python with vulnerable black.\n    # See https://github.com/apify/apify-client-python/pull/582/ for more details.\n    # We explicitly constrain black>=24.3.0 to override the transitive dependency.\n    \"black>=24.3.0\",\n    \"anyio<5.0.0\",\n    \"apify_client\", # For e2e tests.\n    \"build<2.0.0\", # For e2e tests.\n    
\"dycw-pytest-only<3.0.0\",\n    \"fakeredis[probabilistic,json,lua]<3.0.0\",\n    \"poethepoet<1.0.0\",\n    \"pre-commit<5.0.0\",\n    \"proxy-py<3.0.0\",\n    \"pydoc-markdown<5.0.0\",\n    \"pytest-asyncio<2.0.0\",\n    \"pytest-cov<8.0.0\",\n    \"pytest-rerunfailures<17.0.0\",\n    \"pytest-timeout<3.0.0\",\n    \"pytest-xdist<4.0.0\",\n    \"pytest<10.0.0\",\n    \"ruff~=0.15.0\",\n    \"setuptools\", # setuptools are used by pytest, but not explicitly required\n    \"ty~=0.0.0\",\n    \"types-beautifulsoup4<5.0.0\",\n    \"types-cachetools<7.0.0\",\n    \"types-colorama<1.0.0\",\n    \"types-psutil<8.0.0\",\n    \"types-python-dateutil<3.0.0\",\n    \"uvicorn[standard]<1.0.0\",\n]\n\n[tool.hatch.build.targets.wheel]\npackages = [\"src/crawlee\"]\n\n[tool.ruff]\nline-length = 120\ninclude = [\"src/**/*.py\", \"tests/**/*.py\", \"docs/**/*.py\", \"website/**/*.py\"]\nextend-exclude = [\"src/crawlee/project_template\"]\n\n[tool.ruff.lint]\nselect = [\"ALL\"]\nignore = [\n    \"ANN401\",   # Dynamically typed expressions (typing.Any) are disallowed in {filename}\n    \"ASYNC109\", # Async function definition with a `timeout` parameter\n    \"BLE001\",   # Do not catch blind exception\n    \"C901\",     # `{name}` is too complex\n    \"COM812\",   # This rule may cause conflicts when used with the formatter\n    \"D100\",     # Missing docstring in public module\n    \"D104\",     # Missing docstring in public package\n    \"D107\",     # Missing docstring in `__init__`\n    \"D203\",     # One blank line required before class docstring\n    \"D213\",     # Multi-line docstring summary should start at the second line\n    \"D413\",     # Missing blank line after last section\n    \"EM\",       # flake8-errmsg\n    \"G004\",     # Logging statement uses f-string\n    \"ISC001\",   # This rule may cause conflicts when used with the formatter\n    \"FIX\",      # flake8-fixme\n    \"PLR0911\",  # Too many return statements\n    \"PLR0912\",  # Too many branches\n   
 \"PLR0913\",  # Too many arguments in function definition\n    \"PLR0915\",  # Too many statements\n    \"PYI034\",   # `__aenter__` methods in classes like `{name}` usually return `self` at runtime\n    \"PYI036\",   # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`\n    \"S102\",     # Use of `exec` detected\n    \"S105\",     # Possible hardcoded password assigned to\n    \"S106\",     # Possible hardcoded password assigned to argument: \"{name}\"\n    \"S301\",     # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue\n    \"S303\",     # Use of insecure MD2, MD4, MD5, or SHA1 hash function\n    \"S311\",     # Standard pseudo-random generators are not suitable for cryptographic purposes\n    \"TD002\",    # Missing author in TODO; try: `# TODO(<author_name>): ...` or `# TODO @<author_name>: ...\n    \"TRY003\",   # Avoid specifying long messages outside the exception class\n]\n\n[tool.ruff.format]\nquote-style = \"single\"\nindent-style = \"space\"\n\n[tool.ruff.lint.per-file-ignores]\n\"**/__init__.py\" = [\n    \"F401\", # Unused imports\n]\n\"**/{tests}/*\" = [\n    \"ASYNC230\", # Async functions should not open files with blocking methods like `open`\n    \"D\",       # Everything from the pydocstyle\n    \"INP001\",  # File {filename} is part of an implicit namespace package, add an __init__.py\n    \"PLR2004\", # Magic value used in comparison, consider replacing {value} with a constant variable\n    \"S101\",    # Use of assert detected\n    \"SLF001\",  # Private member accessed: `{name}`\n    \"T20\",     # flake8-print\n    \"TRY301\",  # Abstract `raise` to an inner function\n]\n\"**/{docs,website}/**\" = [\n    \"D\",      # Everything from the pydocstyle\n    \"INP001\", # File {filename} is part of an implicit namespace package, add an __init__.py\n    \"F841\",   # Local variable {variable} is assigned to but never used\n    \"N999\",   
# Invalid module name\n    \"T201\",   # `print` found\n]\n\"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py\" = [\n    \"PLR2004\", # Magic value used in comparison. Ignored for simplicity and readability of example code.\n]\n\"**/docs/guides/code_examples/running_in_web_server/server.py\" = [\n    \"TC002\", # ruff false positive. Import actually needed during runtime.\n]\n\"**/docs/guides/code_examples/creating_web_archive/*.*\" = [\n    \"ASYNC230\", # Ignore for simplicity of the example.\n]\n\n[tool.ruff.lint.flake8-quotes]\ndocstring-quotes = \"double\"\ninline-quotes = \"single\"\n\n[tool.ruff.lint.flake8-type-checking]\nruntime-evaluated-base-classes = [\n    \"pydantic.BaseModel\",\n    \"pydantic_settings.BaseSettings\",\n]\n\n[tool.ruff.lint.flake8-builtins]\nbuiltins-ignorelist = [\"id\"]\n\n[tool.ruff.lint.isort]\nknown-first-party = [\"crawlee\"]\n\n[tool.pytest.ini_options]\naddopts = \"-r a --verbose\"\nasyncio_default_fixture_loop_scope = \"function\"\nasyncio_mode = \"auto\"\ntimeout = 1800\nmarkers = [\n    \"run_alone: marks tests that must run in isolation\",\n]\n# Ignore DeprecationWarnings coming from Uvicorn's internal imports. Uvicorn relies on deprecated\n# modules from `websockets`, which triggers warnings during tests. 
These are safe to ignore until\n# Uvicorn updates its internals.\nfilterwarnings = [\n    \"ignore:websockets.legacy is deprecated:DeprecationWarning\",\n    \"ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning\",\n]\n\n[tool.ty.environment]\npython-version = \"3.10\"\n\n[tool.ty.src]\ninclude = [\"src\", \"tests\", \"scripts\", \"docs\", \"website\"]\nexclude = [\n    \"src/crawlee/project_template\",\n    \"docs/guides/code_examples/storage_clients/custom_storage_client_example.py\",\n]\n\n[[tool.ty.overrides]]\ninclude = [\n    \"docs/**/*.py\",\n    \"website/**/*.py\",\n]\n\n[tool.ty.overrides.rules]\nunresolved-import = \"ignore\"\n\n[tool.coverage.report]\nexclude_lines = [\"pragma: no cover\", \"if TYPE_CHECKING:\", \"assert_never()\"]\n\n[tool.ipdb]\ncontext = 7\n\n# Run tasks with: uv run poe <task>\n[tool.poe.tasks]\nclean = \"rm -rf .coverage .pytest_cache .ruff_cache .ty_cache .uv-cache build coverage-unit.xml dist htmlcov website/.docusaurus website/.yarn website/module_shortcuts.json website/node_modules \"\ninstall-sync = \"uv sync --all-extras\"\nbuild = \"uv build --verbose\"\npublish-to-pypi = \"uv publish --verbose --token ${APIFY_PYPI_TOKEN_CRAWLEE}\"\ntype-check = \"uv run ty check\"\ncheck-code = [\"lint\", \"type-check\", \"unit-tests\"]\n\n[tool.poe.tasks.install-dev]\nshell = \"uv sync --all-extras && uv run pre-commit install && uv run playwright install\"\n\n[tool.poe.tasks.lint]\nshell = \"uv run ruff format --check && uv run ruff check\"\n\n[tool.poe.tasks.format]\nshell = \"uv run ruff check --fix && uv run ruff format\"\n\n[tool.poe.tasks.unit-tests]\nshell = \"\"\"\nuv run pytest \\\n    --numprocesses=1 \\\n    -m \"run_alone\" \\\n    tests/unit && \\\nuv run pytest \\\n    --numprocesses=${TESTS_CONCURRENCY:-auto} \\\n    -m \"not run_alone\" \\\n    tests/unit\n\"\"\"\n\n[tool.poe.tasks.unit-tests-cov]\nshell = \"\"\"\nuv run pytest \\\n    --numprocesses=1 \\\n    -m \"run_alone\" \\\n    
--cov=src/crawlee \\\n    --cov-report=xml:coverage-unit.xml \\\n    tests/unit && \\\nuv run pytest \\\n    --numprocesses=${TESTS_CONCURRENCY:-auto} \\\n    -m \"not run_alone\" \\\n    --cov=src/crawlee \\\n    --cov-report=xml:coverage-unit.xml \\\n    --cov-append \\\n    tests/unit\n\"\"\"\n\n[tool.poe.tasks.e2e-templates-tests]\ncmd = \"\"\"\nuv run pytest \\\n    --numprocesses=${TESTS_CONCURRENCY:-auto} \\\n    tests/e2e/project_template\n\"\"\"\n\n[tool.poe.tasks.build-docs]\nshell = \"./build_api_reference.sh && corepack enable && yarn && yarn build\"\ncwd = \"website\"\n\n[tool.poe.tasks.run-docs]\nshell = \"./build_api_reference.sh && corepack enable && yarn && yarn start\"\ncwd = \"website\"\n"
  },
  {
    "path": "renovate.json",
    "content": "{\n    \"extends\": [\"config:base\", \":semanticCommitTypeAll(chore)\"],\n    \"ignorePaths\": [\"docs/**\", \"src/crawlee/project_template/**\"],\n    \"pinVersions\": false,\n    \"separateMajorMinor\": false,\n    \"dependencyDashboard\": false,\n    \"semanticCommits\": \"enabled\",\n    \"lockFileMaintenance\": {\n        \"enabled\": true,\n        \"automerge\": true,\n        \"automergeType\": \"branch\"\n    },\n    \"packageRules\": [\n        {\n            \"matchPaths\": [\"pyproject.toml\"],\n            \"matchDepTypes\": [\"devDependencies\"],\n            \"matchUpdateTypes\": [\"major\", \"minor\"],\n            \"groupName\": \"major/minor dev dependencies\",\n            \"groupSlug\": \"dev-dependencies\",\n            \"automerge\": true,\n            \"automergeType\": \"branch\"\n        }\n    ],\n    \"schedule\": [\"before 7am every weekday\"],\n    \"ignoreDeps\": [\"crawlee\", \"docusaurus-plugin-typedoc-api\"]\n}\n"
  },
  {
    "path": "src/crawlee/__init__.py",
    "content": "from importlib import metadata\n\nfrom ._request import Request, RequestOptions, RequestState\nfrom ._service_locator import service_locator\nfrom ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason\nfrom ._utils.globs import Glob\n\n__version__ = metadata.version('crawlee')\n\n__all__ = [\n    'ConcurrencySettings',\n    'EnqueueStrategy',\n    'Glob',\n    'HttpHeaders',\n    'Request',\n    'RequestOptions',\n    'RequestState',\n    'RequestTransformAction',\n    'SkippedReason',\n    'service_locator',\n]\n"
  },
  {
    "path": "src/crawlee/_autoscaling/__init__.py",
    "content": "from .autoscaled_pool import AutoscaledPool\nfrom .snapshotter import Snapshotter\nfrom .system_status import SystemStatus\n\n__all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus']\n"
  },
  {
    "path": "src/crawlee/_autoscaling/_types.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass, field\nfrom datetime import datetime, timedelta, timezone\nfrom typing import TYPE_CHECKING, Annotated\n\nfrom pydantic import Field\nfrom pydantic.dataclasses import dataclass as pydantic_dataclass\n\nif TYPE_CHECKING:\n    from crawlee._utils.byte_size import ByteSize\n\nSYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD = 0.97\n\n\n@dataclass\nclass LoadRatioInfo:\n    \"\"\"Represent the load ratio of a resource.\"\"\"\n\n    limit_ratio: float\n    \"\"\"The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,\n    the resource is considered as overloaded.\"\"\"\n\n    actual_ratio: float\n    \"\"\"The actual ratio of overloaded and non-overloaded samples.\"\"\"\n\n    @property\n    def is_overloaded(self) -> bool:\n        \"\"\"Indicate whether the resource is currently overloaded.\"\"\"\n        return self.actual_ratio > self.limit_ratio\n\n\n@dataclass\nclass SystemInfo:\n    \"\"\"Represent the current status of the system.\"\"\"\n\n    cpu_info: LoadRatioInfo\n    \"\"\"The CPU load ratio.\"\"\"\n\n    memory_info: LoadRatioInfo\n    \"\"\"The memory load ratio.\"\"\"\n\n    event_loop_info: LoadRatioInfo\n    \"\"\"The event loop load ratio.\"\"\"\n\n    client_info: LoadRatioInfo\n    \"\"\"The client load ratio.\"\"\"\n\n    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))\n    \"\"\"The time at which the system load information was measured.\"\"\"\n\n    @property\n    def is_system_idle(self) -> bool:\n        \"\"\"Indicate whether the system is currently idle or overloaded.\"\"\"\n        return (\n            not self.cpu_info.is_overloaded\n            and not self.memory_info.is_overloaded\n            and not self.event_loop_info.is_overloaded\n            and not self.client_info.is_overloaded\n        )\n\n    def __str__(self) -> str:\n        \"\"\"Get a string representation of the 
system info.\"\"\"\n        stats = {\n            'cpu': self.cpu_info.actual_ratio,\n            'mem': self.memory_info.actual_ratio,\n            'event_loop': self.event_loop_info.actual_ratio,\n            'client_info': self.client_info.actual_ratio,\n        }\n        return '; '.join(f'{name} = {ratio}' for name, ratio in stats.items())\n\n\n@dataclass\nclass CpuSnapshot:\n    \"\"\"A snapshot of CPU usage.\"\"\"\n\n    used_ratio: float\n    \"\"\"The ratio of CPU currently in use.\"\"\"\n\n    max_used_ratio: float\n    \"\"\"The maximum ratio of CPU that is considered acceptable.\"\"\"\n\n    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))\n    \"\"\"The time at which the system load information was measured.\"\"\"\n\n    @property\n    def is_overloaded(self) -> bool:\n        \"\"\"Indicate whether the CPU is considered as overloaded.\"\"\"\n        return self.used_ratio > self.max_used_ratio\n\n\n@dataclass\nclass MemorySnapshot:\n    \"\"\"A snapshot of memory usage.\"\"\"\n\n    current_size: ByteSize\n    \"\"\"Memory usage of the current Python process and its children.\"\"\"\n\n    system_wide_used_size: ByteSize | None\n    \"\"\"Memory usage of all processes, system-wide.\"\"\"\n\n    max_memory_size: ByteSize\n    \"\"\"The maximum memory that can be used by `AutoscaledPool`.\"\"\"\n\n    system_wide_memory_size: ByteSize | None\n    \"\"\"Total memory available in the whole system.\"\"\"\n\n    max_used_memory_ratio: float\n    \"\"\"The maximum acceptable ratio of `current_size` to `max_memory_size`.\"\"\"\n\n    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))\n    \"\"\"The time at which the system load information was measured.\"\"\"\n\n    @property\n    def is_overloaded(self) -> bool:\n        \"\"\"Indicate whether the memory is considered as overloaded.\"\"\"\n        if self.system_wide_memory_size is not None and self.system_wide_used_size is not None:\n            
system_wide_utilization = self.system_wide_used_size / self.system_wide_memory_size\n            if system_wide_utilization > SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD:\n                return True\n\n        return (self.current_size / self.max_memory_size) > self.max_used_memory_ratio\n\n\n@dataclass\nclass EventLoopSnapshot:\n    \"\"\"Snapshot of the state of the event loop.\"\"\"\n\n    delay: timedelta\n    \"\"\"The current delay of the event loop.\"\"\"\n\n    max_delay: timedelta\n    \"\"\"The maximum delay that is considered acceptable.\"\"\"\n\n    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))\n    \"\"\"The time at which the system load information was measured.\"\"\"\n\n    @property\n    def max_delay_exceeded(self) -> timedelta:\n        \"\"\"The amount of time by which the delay exceeds the maximum delay.\"\"\"\n        return max(self.delay - self.max_delay, timedelta(seconds=0))\n\n    @property\n    def is_overloaded(self) -> bool:\n        \"\"\"Indicate whether the event loop is considered as overloaded.\"\"\"\n        return self.delay > self.max_delay\n\n\n@dataclass\nclass ClientSnapshot:\n    \"\"\"Snapshot of the state of the client.\"\"\"\n\n    error_count: int\n    \"\"\"The number of errors (HTTP 429) that occurred.\"\"\"\n\n    new_error_count: int\n    \"\"\"The number of new errors (HTTP 429) that occurred since the last snapshot.\"\"\"\n\n    max_error_count: int\n    \"\"\"The maximum number of errors that is considered acceptable.\"\"\"\n\n    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))\n    \"\"\"The time at which the system load information was measured.\"\"\"\n\n    @property\n    def is_overloaded(self) -> bool:\n        \"\"\"Indicate whether the client is considered as overloaded.\"\"\"\n        return self.new_error_count > self.max_error_count\n\n\nSnapshot = MemorySnapshot | CpuSnapshot | EventLoopSnapshot | 
ClientSnapshot\n\n\n@pydantic_dataclass\nclass Ratio:\n    \"\"\"Represent a ratio of memory (greater than 0 and at most 1).\"\"\"\n\n    value: Annotated[float, Field(gt=0.0, le=1.0)]\n"
  },
  {
    "path": "src/crawlee/_autoscaling/autoscaled_pool.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/autoscaled_pool.ts\n\nfrom __future__ import annotations\n\nimport asyncio\nimport math\nfrom contextlib import suppress\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._types import ConcurrencySettings\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recurring_task import RecurringTask\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable\n\n    from crawlee._autoscaling import SystemStatus\n\nlogger = getLogger(__name__)\n\n\nclass AbortError(Exception):\n    \"\"\"Raised when an AutoscaledPool run is aborted. Not for direct use.\"\"\"\n\n\nclass _AutoscaledPoolRun:\n    def __init__(self) -> None:\n        self.worker_tasks = list[asyncio.Task]()\n        \"\"\"A list of worker tasks currently in progress\"\"\"\n\n        self.worker_tasks_updated = asyncio.Event()\n        self.cleanup_done = asyncio.Event()\n        self.result: asyncio.Future = asyncio.Future()\n\n\n@docs_group('Autoscaling')\nclass AutoscaledPool:\n    \"\"\"Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.\n\n    The pool only starts new tasks if there is enough free CPU and memory available. 
If an exception is thrown in\n    any of the tasks, it is propagated and the pool is stopped.\n    \"\"\"\n\n    _AUTOSCALE_INTERVAL = timedelta(seconds=10)\n    \"\"\"Interval at which the autoscaled pool adjusts the desired concurrency based on the latest system status.\"\"\"\n\n    _LOGGING_INTERVAL = timedelta(minutes=1)\n    \"\"\"Interval at which the autoscaled pool logs its current state.\"\"\"\n\n    _DESIRED_CONCURRENCY_RATIO = 0.9\n    \"\"\"Minimum ratio of desired concurrency that must be reached before allowing further scale-up.\"\"\"\n\n    _SCALE_UP_STEP_RATIO = 0.05\n    \"\"\"Fraction of desired concurrency to add during each scale-up operation.\"\"\"\n\n    _SCALE_DOWN_STEP_RATIO = 0.05\n    \"\"\"Fraction of desired concurrency to remove during each scale-down operation.\"\"\"\n\n    _TASK_TIMEOUT: timedelta | None = None\n    \"\"\"Timeout within which the `run_task_function` must complete.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        system_status: SystemStatus,\n        concurrency_settings: ConcurrencySettings | None = None,\n        run_task_function: Callable[[], Awaitable],\n        is_task_ready_function: Callable[[], Awaitable[bool]],\n        is_finished_function: Callable[[], Awaitable[bool]],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            system_status: Provides data about system utilization (load).\n            concurrency_settings: Settings of concurrency levels.\n            run_task_function: A function that performs an asynchronous resource-intensive task.\n            is_task_ready_function: A function that indicates whether `run_task_function` should be called. This\n                function is called every time there is free capacity for a new task and it should indicate whether\n                it should start a new task or not by resolving to either `True` or `False`. 
Besides its obvious use,\n                it is also useful for task throttling to save resources.\n            is_finished_function: A function that is called only when there are no tasks to be processed. If it\n                resolves to `True` then the pool's run finishes. Being called only when there are no tasks being\n                processed means that as long as `is_task_ready_function` keeps resolving to `True`,\n                `is_finished_function` will never be called. To abort a run, use the `abort` method.\n        \"\"\"\n        concurrency_settings = concurrency_settings or ConcurrencySettings()\n\n        self._system_status = system_status\n        self._run_task_function = run_task_function\n        self._is_task_ready_function = is_task_ready_function\n        self._is_finished_function = is_finished_function\n        self._desired_concurrency = concurrency_settings.desired_concurrency\n        self._max_concurrency = concurrency_settings.max_concurrency\n        self._min_concurrency = concurrency_settings.min_concurrency\n        self._max_tasks_per_minute = concurrency_settings.max_tasks_per_minute\n\n        self._log_system_status_task = RecurringTask(self._log_system_status, self._LOGGING_INTERVAL)\n        self._autoscale_task = RecurringTask(self._autoscale, self._AUTOSCALE_INTERVAL)\n\n        self._is_paused = False\n        self._current_run: _AutoscaledPoolRun | None = None\n\n    async def run(self) -> None:\n        \"\"\"Start the autoscaled pool and return when all tasks are completed and `is_finished_function` returns True.\n\n        If there is an exception in one of the tasks, it will be re-raised.\n        \"\"\"\n        if self._current_run is not None:\n            raise RuntimeError('The pool is already running')\n\n        run = _AutoscaledPoolRun()\n        self._current_run = run\n\n        logger.debug('Starting the pool')\n\n        self._autoscale_task.start()\n        self._log_system_status_task.start()\n\n   
     orchestrator = asyncio.create_task(\n            self._worker_task_orchestrator(run), name='autoscaled pool worker task orchestrator'\n        )\n\n        try:\n            await run.result\n        except AbortError:\n            orchestrator.cancel()\n            for task in run.worker_tasks:\n                if not task.done():\n                    task.cancel()\n        finally:\n            with suppress(asyncio.CancelledError):\n                await self._autoscale_task.stop()\n            with suppress(asyncio.CancelledError):\n                await self._log_system_status_task.stop()\n\n            if not orchestrator.done():\n                orchestrator.cancel()\n            elif not orchestrator.cancelled() and orchestrator.exception() is not None:\n                logger.error('Exception in worker task orchestrator', exc_info=orchestrator.exception())\n\n            logger.info('Waiting for remaining tasks to finish')\n\n            for task in run.worker_tasks:\n                if not task.done():\n                    with suppress(BaseException):\n                        await task\n\n            run.cleanup_done.set()\n            self._current_run = None\n\n            logger.debug('Pool cleanup finished')\n\n    async def abort(self) -> None:\n        \"\"\"Interrupt the autoscaled pool and all the tasks in progress.\"\"\"\n        if not self._current_run:\n            raise RuntimeError('The pool is not running')\n\n        self._current_run.result.set_exception(AbortError())\n        await self._current_run.cleanup_done.wait()\n\n    def pause(self) -> None:\n        \"\"\"Pause the autoscaled pool so that it does not start new tasks.\"\"\"\n        self._is_paused = True\n\n    def resume(self) -> None:\n        \"\"\"Resume a paused autoscaled pool so that it continues starting new tasks.\"\"\"\n        self._is_paused = False\n\n    @property\n    def desired_concurrency(self) -> int:\n        \"\"\"The current desired concurrency, 
possibly updated by the pool according to system load.\"\"\"\n        return self._desired_concurrency\n\n    @property\n    def current_concurrency(self) -> int:\n        \"\"\"The number of concurrent tasks in progress.\"\"\"\n        if self._current_run is None:\n            return 0\n\n        return len(self._current_run.worker_tasks)\n\n    def _autoscale(self) -> None:\n        \"\"\"Inspect system load status and adjust desired concurrency if necessary. Do not call directly.\"\"\"\n        status = self._system_status.get_historical_system_info()\n\n        min_current_concurrency = math.floor(self._DESIRED_CONCURRENCY_RATIO * self.desired_concurrency)\n        should_scale_up = (\n            status.is_system_idle\n            and self._desired_concurrency < self._max_concurrency\n            and self.current_concurrency >= min_current_concurrency\n        )\n\n        should_scale_down = not status.is_system_idle and self._desired_concurrency > self._min_concurrency\n\n        if should_scale_up:\n            step = math.ceil(self._SCALE_UP_STEP_RATIO * self._desired_concurrency)\n            self._desired_concurrency = min(self._max_concurrency, self._desired_concurrency + step)\n        elif should_scale_down:\n            step = math.ceil(self._SCALE_DOWN_STEP_RATIO * self._desired_concurrency)\n            self._desired_concurrency = max(self._min_concurrency, self._desired_concurrency - step)\n\n    def _log_system_status(self) -> None:\n        system_status = self._system_status.get_historical_system_info()\n\n        logger.info(\n            f'current_concurrency = {self.current_concurrency}; '\n            f'desired_concurrency = {self.desired_concurrency}; '\n            f'{system_status!s}'\n        )\n\n    async def _worker_task_orchestrator(self, run: _AutoscaledPoolRun) -> None:\n        \"\"\"Launch worker tasks whenever there is free capacity and a task is ready.\n\n        Exits when `is_finished_function` returns True.\n        
\"\"\"\n        finished = False\n\n        try:\n            while not (finished := await self._is_finished_function()) and not run.result.done():\n                run.worker_tasks_updated.clear()\n\n                current_status = self._system_status.get_current_system_info()\n                if not current_status.is_system_idle:\n                    logger.debug('Not scheduling new tasks - system is overloaded')\n                elif self._is_paused:\n                    logger.debug('Not scheduling new tasks - the autoscaled pool is paused')\n                elif self.current_concurrency >= self.desired_concurrency:\n                    logger.debug('Not scheduling new tasks - already running at desired concurrency')\n                elif not await self._is_task_ready_function():\n                    logger.debug('Not scheduling new task - no task is ready')\n                else:\n                    logger.debug('Scheduling a new task')\n                    worker_task = asyncio.create_task(self._worker_task(), name='autoscaled pool worker task')\n                    worker_task.add_done_callback(lambda task: self._reap_worker_task(task, run))\n                    run.worker_tasks.append(worker_task)\n\n                    if math.isfinite(self._max_tasks_per_minute):\n                        await asyncio.sleep(60 / self._max_tasks_per_minute)\n\n                    continue\n\n                with suppress(asyncio.TimeoutError):\n                    await asyncio.wait_for(run.worker_tasks_updated.wait(), timeout=0.5)\n        finally:\n            if finished:\n                logger.debug('`is_finished_function` reports that we are finished')\n            elif run.result.done() and run.result.exception() is not None:\n                logger.debug('Unhandled exception in `run_task_function`')\n\n            if run.worker_tasks:\n                logger.debug('Terminating - waiting for tasks to complete')\n                await asyncio.wait(run.worker_tasks, 
return_when=asyncio.ALL_COMPLETED)\n                logger.debug('Worker tasks finished')\n            else:\n                logger.debug('Terminating - no running tasks to wait for')\n\n            if not run.result.done():\n                run.result.set_result(object())\n\n    def _reap_worker_task(self, task: asyncio.Task, run: _AutoscaledPoolRun) -> None:\n        \"\"\"Handle cleanup and tracking of a completed worker task.\n\n        - Interrupt the run if the task encountered an exception.\n        - Update the list of tasks in progress.\n        - Notify the orchestrator about the task completion.\n        \"\"\"\n        run.worker_tasks_updated.set()\n        run.worker_tasks.remove(task)\n\n        if not task.cancelled() and (exception := task.exception()) and not run.result.done():\n            run.result.set_exception(exception)\n\n    async def _worker_task(self) -> None:\n        try:\n            await asyncio.wait_for(\n                self._run_task_function(),\n                timeout=self._TASK_TIMEOUT.total_seconds() if self._TASK_TIMEOUT is not None else None,\n            )\n        except asyncio.TimeoutError:\n            timeout_str = self._TASK_TIMEOUT.total_seconds() if self._TASK_TIMEOUT is not None else '*not set*'\n            logger.warning(f'Task timed out after {timeout_str} seconds')\n        finally:\n            logger.debug('Worker task finished')\n"
  },
  {
    "path": "src/crawlee/_autoscaling/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/_autoscaling/snapshotter.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/snapshotter.ts\n\nfrom __future__ import annotations\n\nimport functools\nfrom bisect import insort\nfrom datetime import datetime, timedelta, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, TypeVar, cast\n\nfrom crawlee import service_locator\nfrom crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Ratio, Snapshot\nfrom crawlee._utils.byte_size import ByteSize\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee._utils.system import MemoryInfo, MemoryUsageInfo, get_memory_info\nfrom crawlee.events._types import Event, EventSystemInfoData\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from crawlee.configuration import Configuration\n\nlogger = getLogger(__name__)\n\nT = TypeVar('T', bound=Snapshot)\n\n\n@functools.lru_cache\ndef _warn_once(warning_message: str) -> None:\n    \"\"\"Log a warning message only once.\"\"\"\n    logger.warning(warning_message)\n\n\nclass SortedSnapshotList(list[T]):\n    \"\"\"A list that maintains sorted order by `created_at` attribute for snapshot objects.\"\"\"\n\n    def add(self, item: T) -> None:\n        \"\"\"Add an item to the list maintaining sorted order by `created_at` using binary search.\"\"\"\n        insort(self, item, key=lambda item: item.created_at)\n\n\n@docs_group('Autoscaling')\nclass Snapshotter:\n    \"\"\"Monitors and logs system resource usage at predefined intervals for performance optimization.\n\n    The class monitors and records the state of various system resources (CPU, memory, event loop, and client API)\n    at predefined intervals. This continuous monitoring helps in identifying resource overloads and ensuring optimal\n    performance of the application. 
It is utilized in the `AutoscaledPool` module to adjust task allocation\n    dynamically based on the current demand and system load.\n    \"\"\"\n\n    _EVENT_LOOP_SNAPSHOT_INTERVAL = timedelta(milliseconds=500)\n    \"\"\"The interval at which the event loop is sampled.\"\"\"\n\n    _CLIENT_SNAPSHOT_INTERVAL = timedelta(milliseconds=1000)\n    \"\"\"The interval at which the client is sampled.\"\"\"\n\n    _SNAPSHOT_HISTORY = timedelta(seconds=30)\n    \"\"\"The time interval for which the snapshots are kept.\"\"\"\n\n    _RESERVE_MEMORY_RATIO = 0.5\n    \"\"\"Fraction of memory kept in reserve. Used to calculate critical memory overload threshold.\"\"\"\n\n    _MEMORY_WARNING_COOLDOWN_PERIOD = timedelta(milliseconds=10000)\n    \"\"\"Minimum time interval between logging successive critical memory overload warnings.\"\"\"\n\n    _CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2\n    \"\"\"Number of retries for a client request before considering it a failure due to rate limiting.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        max_used_cpu_ratio: float,\n        max_used_memory_ratio: float,\n        max_event_loop_delay: timedelta,\n        max_client_errors: int,\n        max_memory_size: ByteSize | Ratio,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        In most cases, you should use the `from_config` constructor to create a new instance based on\n        the provided configuration.\n\n        Args:\n            max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than\n                the provided ratio, the CPU is considered overloaded.\n            max_used_memory_ratio: Sets the ratio, defining the maximum ratio of memory usage. When the memory usage\n                is higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.\n            max_event_loop_delay: Sets the maximum delay of the event loop. 
When the delay is higher than the provided\n                value, the event loop is considered overloaded.\n            max_client_errors: Sets the maximum number of client errors (HTTP 429). When the number of client errors\n                is higher than the provided number, the client is considered overloaded.\n            max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. When of type\n                `ByteSize` then it is used as fixed memory size. When of type `Ratio` then it allows for dynamic memory\n                scaling based on the available system memory.\n        \"\"\"\n        self._max_used_cpu_ratio = max_used_cpu_ratio\n        self._max_used_memory_ratio = max_used_memory_ratio\n        self._max_event_loop_delay = max_event_loop_delay\n        self._max_client_errors = max_client_errors\n        self._max_memory_size = max_memory_size\n\n        self._cpu_snapshots = self._get_sorted_list_by_created_at(list[CpuSnapshot]())\n        self._event_loop_snapshots = self._get_sorted_list_by_created_at(list[EventLoopSnapshot]())\n        self._memory_snapshots = self._get_sorted_list_by_created_at(list[MemorySnapshot]())\n        self._client_snapshots = self._get_sorted_list_by_created_at(list[ClientSnapshot]())\n\n        self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._EVENT_LOOP_SNAPSHOT_INTERVAL)\n        self._snapshot_client_task = RecurringTask(self._snapshot_client, self._CLIENT_SNAPSHOT_INTERVAL)\n\n        self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1)\n\n        # Flag to indicate the context state.\n        self._active = False\n\n    @classmethod\n    def from_config(cls, config: Configuration | None = None) -> Snapshotter:\n        \"\"\"Initialize a new instance based on the provided `Configuration`.\n\n        Args:\n            config: The `Configuration` instance. 
Uses the global (default) one if not provided.\n        \"\"\"\n        config = config or service_locator.get_configuration()\n\n        # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,\n        # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's\n        # total available memory based on `available_memory_ratio`.\n        max_memory_size = (\n            ByteSize.from_mb(config.memory_mbytes)\n            if config.memory_mbytes\n            else Ratio(value=config.available_memory_ratio)\n        )\n\n        return cls(\n            max_used_cpu_ratio=config.max_used_cpu_ratio,\n            max_used_memory_ratio=config.max_used_memory_ratio,\n            max_event_loop_delay=config.max_event_loop_delay,\n            max_client_errors=config.max_client_errors,\n            max_memory_size=max_memory_size,\n        )\n\n    @staticmethod\n    def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnapshotList[T]:\n        \"\"\"Create a sorted list from the input list.\n\n        Returns a custom list that maintains sorted order by created_at when items are added.\n        \"\"\"\n        result = SortedSnapshotList[T]()\n        result.extend(input_list)\n        return result\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    async def __aenter__(self) -> Snapshotter:\n        \"\"\"Start capturing snapshots at configured intervals.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n        event_manager = service_locator.get_event_manager()\n        event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)\n        
event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)\n        self._snapshot_event_loop_task.start()\n        self._snapshot_client_task.start()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Stop all resource capturing.\n\n        This method stops capturing snapshots of system resources (CPU, memory, event loop, and client information).\n        It should be called to terminate resource capturing when it is no longer needed.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        event_manager = service_locator.get_event_manager()\n        event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)\n        event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)\n        await self._snapshot_event_loop_task.stop()\n        await self._snapshot_client_task.stop()\n        self._active = False\n\n    @ensure_context\n    def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot]:\n        \"\"\"Return a sample of the latest memory snapshots.\n\n        Args:\n            duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n\n        Returns:\n            A sample of memory snapshots.\n        \"\"\"\n        snapshots = cast('list[Snapshot]', self._memory_snapshots)\n        return self._get_sample(snapshots, duration)\n\n    @ensure_context\n    def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snapshot]:\n        \"\"\"Return a sample of the latest event loop snapshots.\n\n        Args:\n            duration: The duration of the sample from the latest snapshot. 
If omitted, it returns a full history.\n\n        Returns:\n            A sample of event loop snapshots.\n        \"\"\"\n        snapshots = cast('list[Snapshot]', self._event_loop_snapshots)\n        return self._get_sample(snapshots, duration)\n\n    @ensure_context\n    def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]:\n        \"\"\"Return a sample of the latest CPU snapshots.\n\n        Args:\n            duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n\n        Returns:\n            A sample of CPU snapshots.\n        \"\"\"\n        snapshots = cast('list[Snapshot]', self._cpu_snapshots)\n        return self._get_sample(snapshots, duration)\n\n    @ensure_context\n    def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot]:\n        \"\"\"Return a sample of the latest client snapshots.\n\n        Args:\n            duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history.\n\n        Returns:\n            A sample of client snapshots.\n        \"\"\"\n        snapshots = cast('list[Snapshot]', self._client_snapshots)\n        return self._get_sample(snapshots, duration)\n\n    @staticmethod\n    def _get_sample(snapshots: list[Snapshot], duration: timedelta | None = None) -> list[Snapshot]:\n        \"\"\"Return a time-limited sample from snapshots or full history if duration is None.\"\"\"\n        if not duration:\n            return snapshots\n\n        if not snapshots:\n            return []\n\n        latest_time = snapshots[-1].created_at\n        return [snapshot for snapshot in snapshots if latest_time - snapshot.created_at <= duration]\n\n    async def _snapshot_cpu(self, event_data: EventSystemInfoData) -> None:\n        \"\"\"Capture a snapshot of the current CPU usage.\n\n        This method does not perform CPU usage measurement. 
Instead, it just reads the data received through\n        the `event_data` parameter, which is expected to be supplied by the event manager.\n        Must be `async` to ensure it is not scheduled to be run in its own thread by the event manager, which could\n        cause race conditions in snapshots manipulation (sorting and pruning).\n\n        Args:\n            event_data: System info data from which CPU usage is read.\n        \"\"\"\n        snapshot = CpuSnapshot(\n            used_ratio=event_data.cpu_info.used_ratio,\n            max_used_ratio=self._max_used_cpu_ratio,\n            created_at=event_data.cpu_info.created_at,\n        )\n\n        snapshots = cast('list[Snapshot]', self._cpu_snapshots)\n        self._cpu_snapshots.add(snapshot)\n        self._prune_snapshots(snapshots, self._cpu_snapshots[-1].created_at)\n\n    async def _snapshot_memory(self, event_data: EventSystemInfoData) -> None:\n        \"\"\"Capture a snapshot of the current memory usage.\n\n        This method does not perform memory usage measurement. 
Instead, it just reads the data received through\n        the `event_data` parameter, which is expected to be supplied by the event manager.\n        Must be `async` to ensure it is not scheduled to be run in own thread by the event manager, which could cause\n        race conditions in snapshots manipulation(sorting and pruning).\n\n        Args:\n            event_data: System info data from which memory usage is read.\n        \"\"\"\n        match event_data.memory_info, self._max_memory_size:\n            case MemoryInfo() as memory_info, Ratio() as ratio:\n                max_memory_size = memory_info.total_size * ratio.value\n                system_wide_used_size = memory_info.system_wide_used_size\n                system_wide_memory_size = memory_info.total_size\n\n            case MemoryUsageInfo(), Ratio() as ratio:\n                # This is just hypothetical case, that will most likely not happen in practice.\n                # `LocalEventManager` should always provide `MemoryInfo` in the event data.\n                # When running on Apify, `self._max_memory_size` is always `ByteSize`, not `Ratio`.\n                _warn_once(\n                    'It is recommended that a custom implementation of `LocalEventManager` emits `SYSTEM_INFO` events '\n                    'with `MemoryInfo` and not just `MemoryUsageInfo`.'\n                )\n                max_memory_size = get_memory_info().total_size * ratio.value\n                system_wide_used_size = None\n                system_wide_memory_size = None\n\n            case MemoryInfo() as memory_info, ByteSize() as byte_size:\n                max_memory_size = byte_size\n                system_wide_used_size = memory_info.system_wide_used_size\n                system_wide_memory_size = memory_info.total_size\n\n            case MemoryUsageInfo(), ByteSize() as byte_size:\n                max_memory_size = byte_size\n                system_wide_used_size = None\n                system_wide_memory_size 
= None\n\n            case _, _:\n                raise NotImplementedError('Unsupported combination of memory info and max memory size types.')\n\n        snapshot = MemorySnapshot(\n            current_size=event_data.memory_info.current_size,\n            max_memory_size=max_memory_size,\n            max_used_memory_ratio=self._max_used_memory_ratio,\n            created_at=event_data.memory_info.created_at,\n            system_wide_used_size=system_wide_used_size,\n            system_wide_memory_size=system_wide_memory_size,\n        )\n\n        snapshots = cast('list[Snapshot]', self._memory_snapshots)\n        self._memory_snapshots.add(snapshot)\n        self._prune_snapshots(snapshots, self._memory_snapshots[-1].created_at)\n\n        self._evaluate_memory_load(\n            event_data.memory_info.current_size,\n            event_data.memory_info.created_at,\n            max_memory_size=max_memory_size,\n        )\n\n    async def _snapshot_event_loop(self) -> None:\n        \"\"\"Capture a snapshot of the current event loop usage.\n\n        This method evaluates the event loop's latency by comparing the expected time between snapshots to the actual\n        time elapsed since the last snapshot. The delay in the snapshot reflects the time deviation due to event loop\n        overhead - it's calculated by subtracting the expected interval between snapshots from the actual time elapsed\n        since the last snapshot. 
If there's no previous snapshot, the delay is considered zero.\n        Must be `async` to ensure it is not scheduled to run in its own thread by the event manager, which could cause\n        race conditions in snapshot manipulation (sorting and pruning).\n        \"\"\"\n        snapshot = EventLoopSnapshot(max_delay=self._max_event_loop_delay, delay=timedelta(seconds=0))\n        previous_snapshot = self._event_loop_snapshots[-1] if self._event_loop_snapshots else None\n\n        if previous_snapshot:\n            event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._EVENT_LOOP_SNAPSHOT_INTERVAL\n            snapshot.delay = event_loop_delay\n\n        snapshots = cast('list[Snapshot]', self._event_loop_snapshots)\n        self._event_loop_snapshots.add(snapshot)\n        self._prune_snapshots(snapshots, self._event_loop_snapshots[-1].created_at)\n\n    async def _snapshot_client(self) -> None:\n        \"\"\"Capture a snapshot of the current API state by checking for rate limit errors (HTTP 429).\n\n        Only errors produced by a 2nd retry of the API call are considered for snapshotting since earlier errors may\n        just be caused by a random spike in the number of requests and do not necessarily signify API overloading.\n        Must be `async` to ensure it is not scheduled to run in its own thread by the event manager, which could cause\n        race conditions in snapshot manipulation (sorting and pruning).\n        \"\"\"\n        client = service_locator.get_storage_client()\n\n        rate_limit_errors: dict[int, int] = client.get_rate_limit_errors()\n\n        error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0)\n        previous_error_count = self._client_snapshots[-1].error_count if self._client_snapshots else 0\n        snapshot = ClientSnapshot(\n            error_count=error_count,\n            new_error_count=error_count - previous_error_count,\n            
max_error_count=self._max_client_errors,\n        )\n\n        snapshots = cast('list[Snapshot]', self._client_snapshots)\n        self._client_snapshots.add(snapshot)\n        self._prune_snapshots(snapshots, self._client_snapshots[-1].created_at)\n\n    def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -> None:\n        \"\"\"Remove snapshots that are older than `self._SNAPSHOT_HISTORY`.\n\n        This method modifies the list of snapshots in place, removing all snapshots that are older than the defined\n        snapshot history relative to the `now` parameter.\n\n        Args:\n            snapshots: List of snapshots to be pruned in place.\n            now: The current date and time, used as the reference for pruning.\n        \"\"\"\n        # Find the index where snapshots start to be within the allowed history window.\n        # We'll keep snapshots from this index onwards.\n        keep_from_index = None\n        for i, snapshot in enumerate(snapshots):\n            if now - snapshot.created_at <= self._SNAPSHOT_HISTORY:\n                keep_from_index = i\n                break\n\n        # If all snapshots are old, keep_from_index will remain None, so we clear the list.\n        # Otherwise, we keep only the recent snapshots.\n        if keep_from_index is not None:\n            del snapshots[:keep_from_index]\n        else:\n            snapshots.clear()\n\n    def _evaluate_memory_load(\n        self, current_memory_usage_size: ByteSize, snapshot_timestamp: datetime, max_memory_size: ByteSize\n    ) -> None:\n        \"\"\"Evaluate and log critical memory load conditions based on the system information.\n\n        Args:\n            current_memory_usage_size: The current memory usage.\n            snapshot_timestamp: The time at which the memory snapshot was taken.\n            max_memory_size: The maximum memory size to be used for evaluation.\n        \"\"\"\n        # Check if the warning has been logged recently to avoid 
spamming\n        if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._MEMORY_WARNING_COOLDOWN_PERIOD:\n            return\n\n        threshold_memory_size = self._max_used_memory_ratio * max_memory_size\n        buffer_memory_size = max_memory_size * (1 - self._max_used_memory_ratio) * self._RESERVE_MEMORY_RATIO\n        overload_memory_threshold_size = threshold_memory_size + buffer_memory_size\n\n        # Log a warning if current memory usage exceeds the critical overload threshold\n        if current_memory_usage_size > overload_memory_threshold_size:\n            memory_usage_percentage = round((current_memory_usage_size.bytes / max_memory_size.bytes) * 100)\n            logger.warning(\n                f'Memory is critically overloaded. Using {current_memory_usage_size} of '\n                f'{max_memory_size} ({memory_usage_percentage}%). '\n                'Consider increasing available memory.'\n            )\n            self._timestamp_of_last_memory_warning = snapshot_timestamp\n"
  },
  {
    "path": "src/crawlee/_autoscaling/system_status.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/autoscaling/system_status.ts\n\nfrom __future__ import annotations\n\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom more_itertools import pairwise\n\nfrom crawlee._autoscaling._types import LoadRatioInfo, Snapshot, SystemInfo\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from crawlee._autoscaling import Snapshotter\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Autoscaling')\nclass SystemStatus:\n    \"\"\"Provides a simple interface for evaluating system resource usage from snapshots collected by `Snapshotter`.\n\n    This class aggregates and interprets snapshots from a Snapshotter instance to evaluate the current and historical\n    status of system resources like CPU, memory, event loop, and client API usage. It exposes two methods\n    `get_current_system_info` and `get_historical_system_info`. The system information is computed using a weighted\n    average of overloaded messages in the snapshots, with the weights being the time intervals between the snapshots.\n    Each resource is computed separately, and the system is considered as overloaded whenever at least one resource\n    is overloaded.\n\n    `get_current_system_info` returns a `SystemInfo` data structure that represents the current status\n    of the system. 
The length of the current timeframe in seconds is configurable by the `max_snapshot_age` option\n    and represents the max age of snapshots to be considered for the computation.\n\n    `SystemStatus.get_historical_system_info` returns a `SystemInfo` that represents the long-term status of the system.\n    It considers the full snapshot history available in the `Snapshotter` instance.\n    \"\"\"\n\n    def __init__(\n        self,\n        snapshotter: Snapshotter,\n        *,\n        max_snapshot_age: timedelta = timedelta(seconds=5),\n        cpu_overload_threshold: float = 0.4,\n        memory_overload_threshold: float = 0.2,\n        event_loop_overload_threshold: float = 0.6,\n        client_overload_threshold: float = 0.3,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            snapshotter: The `Snapshotter` instance to be queried for `SystemStatus`.\n            max_snapshot_age: Defines max age of snapshots used in the `SystemStatus.get_current_system_info`\n                measurement.\n            cpu_overload_threshold: Sets the threshold of overloaded snapshots in the CPU sample.\n                If the sample exceeds this threshold, the system will be considered overloaded.\n            memory_overload_threshold: Sets the threshold of overloaded snapshots in the memory sample.\n                If the sample exceeds this threshold, the system will be considered overloaded.\n            event_loop_overload_threshold: Sets the threshold of overloaded snapshots in the event loop sample.\n                If the sample exceeds this threshold, the system will be considered overloaded.\n            client_overload_threshold: Sets the threshold of overloaded snapshots in the Client sample.\n                If the sample exceeds this threshold, the system will be considered overloaded.\n        \"\"\"\n        self._snapshotter = snapshotter\n        self._max_snapshot_age = max_snapshot_age\n        self._cpu_overload_threshold 
= cpu_overload_threshold\n        self._memory_overload_threshold = memory_overload_threshold\n        self._event_loop_overload_threshold = event_loop_overload_threshold\n        self._client_overload_threshold = client_overload_threshold\n\n    def get_current_system_info(self) -> SystemInfo:\n        \"\"\"Retrieve and evaluate the current status of system resources.\n\n        Considers snapshots within the `_max_snapshot_age` timeframe and determines if the system is currently\n        overloaded based on predefined thresholds for each resource type.\n\n        Returns:\n            An object representing the current system status.\n        \"\"\"\n        return self._get_system_info(sample_duration=self._max_snapshot_age)\n\n    def get_historical_system_info(self) -> SystemInfo:\n        \"\"\"Retrieve and evaluate the historical status of system resources.\n\n        Considers the entire history of snapshots from the Snapshotter to assess long-term system performance and\n        determines if the system has been historically overloaded.\n\n        Returns:\n            An object representing the historical system status.\n        \"\"\"\n        return self._get_system_info()\n\n    def _get_system_info(self, *, sample_duration: timedelta | None = None) -> SystemInfo:\n        \"\"\"Get system information based on the overload state of different resources within a specified duration.\n\n        Args:\n            sample_duration: Specific duration for which to evaluate the system status. 
If None, evaluates across\n                the entire history available in the snapshotter.\n\n        Returns:\n            Aggregated system status indicating whether the system is idle or overloaded.\n        \"\"\"\n        mem_info = self._is_memory_overloaded(sample_duration)\n        event_loop_info = self._is_event_loop_overloaded(sample_duration)\n        cpu_info = self._is_cpu_overloaded(sample_duration)\n        client_info = self._is_client_overloaded(sample_duration)\n\n        return SystemInfo(\n            memory_info=mem_info,\n            event_loop_info=event_loop_info,\n            cpu_info=cpu_info,\n            client_info=client_info,\n        )\n\n    def _is_cpu_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:\n        \"\"\"Determine if the CPU has been overloaded within a specified time duration.\n\n        Args:\n            sample_duration: The duration within which to analyze CPU snapshots. If None, evaluates across\n                the entire history available in the snapshotter.\n\n        Returns:\n            CPU load ratio information.\n        \"\"\"\n        sample = self._snapshotter.get_cpu_sample(sample_duration)\n        return self._is_sample_overloaded(sample, self._cpu_overload_threshold)\n\n    def _is_memory_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:\n        \"\"\"Determine if memory has been overloaded within a specified time duration.\n\n        Args:\n            sample_duration: The duration within which to analyze memory snapshots. 
If None, evaluates across\n                the entire history available in the snapshotter.\n\n        Returns:\n            Memory load ratio information.\n        \"\"\"\n        sample = self._snapshotter.get_memory_sample(sample_duration)\n        return self._is_sample_overloaded(sample, self._memory_overload_threshold)\n\n    def _is_event_loop_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:\n        \"\"\"Determine if the event loop has been overloaded within a specified time duration.\n\n        Args:\n            sample_duration: The duration within which to analyze event loop snapshots. If None, evaluates across\n                the entire history available in the snapshotter.\n\n        Returns:\n            Event loop load ratio information.\n        \"\"\"\n        sample = self._snapshotter.get_event_loop_sample(sample_duration)\n        return self._is_sample_overloaded(sample, self._event_loop_overload_threshold)\n\n    def _is_client_overloaded(self, sample_duration: timedelta | None = None) -> LoadRatioInfo:\n        \"\"\"Determine if the client has been overloaded within a specified time duration.\n\n        Args:\n            sample_duration: The duration within which to analyze client snapshots. 
If None, evaluates across\n                the entire history available in the snapshotter.\n\n        Returns:\n            Client load ratio information.\n        \"\"\"\n        sample = self._snapshotter.get_client_sample(sample_duration)\n        return self._is_sample_overloaded(sample, self._client_overload_threshold)\n\n    def _is_sample_overloaded(self, sample: list[Snapshot], threshold: float) -> LoadRatioInfo:\n        \"\"\"Determine if a sample of snapshot data is overloaded based on a specified ratio.\n\n        Args:\n            sample: A list of snapshot data to analyze.\n            threshold: The threshold ratio to use for determining if the sample is overloaded.\n\n        Returns:\n            An object with an `is_overloaded` property set to `True` if the sample is considered overloaded based\n            on the specified threshold ratio. Otherwise, `is_overloaded` is set to `False`.\n        \"\"\"\n        if not sample:\n            return LoadRatioInfo(limit_ratio=threshold, actual_ratio=0)\n\n        if len(sample) == 1:\n            return LoadRatioInfo(limit_ratio=threshold, actual_ratio=float(sample[0].is_overloaded))\n\n        overloaded_time = 0.0\n        non_overloaded_time = 0.0\n\n        for previous, current in pairwise(sample):\n            time = (current.created_at - previous.created_at).total_seconds()\n            if time < 0:\n                raise ValueError('Negative time. Code assumptions are not valid. Expected time sorted samples.')\n            if current.is_overloaded:\n                overloaded_time += time\n            else:\n                non_overloaded_time += time\n\n        if (total_time := overloaded_time + non_overloaded_time) == 0:\n            overloaded_ratio = 0.0\n        else:\n            overloaded_ratio = overloaded_time / total_time\n\n        return LoadRatioInfo(limit_ratio=threshold, actual_ratio=round(overloaded_ratio, 3))\n"
  },
  {
    "path": "src/crawlee/_cli.py",
    "content": "# ruff: noqa: FBT002\nfrom __future__ import annotations\n\nimport importlib.resources\nimport json\nimport sys\nfrom pathlib import Path\nfrom typing import Annotated, cast\n\nfrom click import Choice\n\ntry:\n    import inquirer\n    import typer\n    from cookiecutter.main import cookiecutter\n    from inquirer.render.console import ConsoleRender\n    from rich.progress import Progress, SpinnerColumn, TextColumn\nexcept ModuleNotFoundError as exc:\n    raise ImportError(\n        \"Missing required dependencies for the Crawlee CLI. It looks like you're running 'crawlee' \"\n        \"without the CLI extra. Try using 'crawlee[cli]' instead.\"\n    ) from exc\n\ncli = typer.Typer(no_args_is_help=True)\n\ntemplate_directory = importlib.resources.files('crawlee') / 'project_template'\nwith (template_directory / 'cookiecutter.json').open() as f:\n    cookiecutter_json = json.load(f)\n\ncrawler_choices = cookiecutter_json['crawler_type']\nhttp_client_choices = cookiecutter_json['http_client']\npackage_manager_choices = cookiecutter_json['package_manager']\ndefault_start_url = cookiecutter_json['start_url']\ndefault_enable_apify_integration = cookiecutter_json['enable_apify_integration']\ndefault_install_project = cookiecutter_json['install_project']\n\n\n@cli.callback(invoke_without_command=True)\ndef callback(\n    version: Annotated[\n        bool,\n        typer.Option(\n            '-V',\n            '--version',\n            help='Print Crawlee version',\n        ),\n    ] = False,\n) -> None:\n    \"\"\"Crawlee is a web scraping and browser automation library.\"\"\"\n    if version:\n        from crawlee import __version__  # noqa: PLC0415\n\n        typer.echo(__version__)\n\n\ndef _prompt_for_project_name(initial_project_name: str | None) -> str:\n    \"\"\"Prompt the user for a non-empty project name that does not lead to an existing folder.\"\"\"\n    while True:\n        if initial_project_name is not None:\n            project_name = 
initial_project_name\n            initial_project_name = None\n        else:\n            project_name = ConsoleRender().render(\n                inquirer.Text(\n                    name='project_name',\n                    message='Name of the new project folder',\n                    validate=lambda _, value: bool(value.strip()),\n                ),\n            )\n\n        if not project_name:\n            typer.echo('Project name is required.', err=True)\n            continue\n\n        project_path = Path.cwd() / project_name\n\n        if project_path.exists():\n            typer.echo(f'Folder {project_path} already exists. Please choose another name.', err=True)\n            continue\n\n        return project_name\n\n\ndef _prompt_text(message: str, default: str) -> str:\n    return cast(\n        'str',\n        ConsoleRender().render(\n            inquirer.Text(\n                name='text',\n                message=message,\n                default=default,\n                validate=lambda _, value: bool(value.strip()),\n            ),\n        ),\n    )\n\n\ndef _prompt_choice(message: str, choices: list[str]) -> str:\n    \"\"\"Prompt the user to pick one from a list of choices.\"\"\"\n    return cast(\n        'str',\n        ConsoleRender().render(\n            inquirer.List(\n                name='choice',\n                message=message,\n                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],\n            ),\n        ),\n    )\n\n\ndef _prompt_bool(message: str, *, default: bool) -> bool:\n    return cast(\n        'bool',\n        ConsoleRender().render(\n            inquirer.Confirm(\n                name='confirm',\n                message=message,\n                default=default,\n            ),\n        ),\n    )\n\n\n@cli.command()\ndef create(\n    project_name: str | None = typer.Argument(\n        default=None,\n        show_default=False,\n        help='The name of the project and the directory that 
will be created to contain it. '\n        'If none is given, you will be prompted.',\n    ),\n    crawler_type: str | None = typer.Option(\n        None,\n        '--crawler-type',\n        '--template',\n        show_default=False,\n        click_type=Choice(crawler_choices),\n        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',\n    ),\n    http_client: str | None = typer.Option(\n        None,\n        show_default=False,\n        click_type=Choice(http_client_choices),\n        help='The library that will be used to make HTTP requests in your crawler. '\n        'If none is given, you will be prompted.',\n    ),\n    package_manager: str | None = typer.Option(\n        default=None,\n        show_default=False,\n        click_type=Choice(package_manager_choices),\n        help='Package manager to be used in the new project. If none is given, you will be prompted.',\n    ),\n    start_url: str | None = typer.Option(\n        default=None,\n        show_default=False,\n        metavar='[START_URL]',\n        help='The URL where crawling should start. If none is given, you will be prompted.',\n    ),\n    *,\n    enable_apify_integration: bool | None = typer.Option(\n        None,\n        '--apify/--no-apify',\n        show_default=False,\n        help='Should Apify integration be set up for you? If not given, you will be prompted.',\n    ),\n    install_project: bool | None = typer.Option(\n        None,\n        '--install/--no-install',\n        show_default=False,\n        help='Should the project be installed now? 
If not given, you will be prompted.',\n    ),\n) -> None:\n    \"\"\"Bootstrap a new Crawlee project.\"\"\"\n    try:\n        # Prompt for project name if not provided.\n        project_name = _prompt_for_project_name(project_name)\n\n        # Prompt for crawler_type if not provided.\n        if crawler_type is None:\n            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)\n\n        # Prompt for http_client if not provided.\n        if http_client is None:\n            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)\n\n        # Prompt for package manager if not provided.\n        if package_manager is None:\n            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)\n\n        # Prompt for start URL\n        if start_url is None:\n            start_url = _prompt_text('Please specify the start URL', default=default_start_url)\n\n        # Ask about Apify integration if not explicitly configured\n        if enable_apify_integration is None:\n            enable_apify_integration = _prompt_bool(\n                'Should Apify integration be set up for you?', default=default_enable_apify_integration\n            )\n\n        # Ask about installing the project\n        if install_project is None:\n            install_project = _prompt_bool('Should the project be installed now?', default=default_install_project)\n\n        if all(\n            [\n                project_name,\n                crawler_type,\n                http_client,\n                package_manager,\n                start_url,\n                enable_apify_integration is not None,\n                install_project is not None,\n            ]\n        ):\n            package_name = project_name.replace('-', '_')\n\n            # Start the bootstrap process.\n            with Progress(\n                SpinnerColumn(),\n                
TextColumn('[progress.description]{task.description}'),\n                transient=True,\n            ) as progress:\n                bootstrap_task = progress.add_task(description='Bootstrapping...', total=None)\n\n                try:\n                    cookiecutter(\n                        template=str(template_directory),\n                        no_input=True,\n                        extra_context={\n                            'project_name': project_name,\n                            'package_manager': package_manager,\n                            'crawler_type': crawler_type,\n                            'http_client': http_client,\n                            'enable_apify_integration': enable_apify_integration,\n                            'start_url': start_url,\n                            'install_project': install_project,\n                        },\n                    )\n                except Exception as exc:\n                    progress.update(bootstrap_task, visible=False)\n                    progress.refresh()\n\n                    # Print just the last line of the error message (the actual error without traceback)\n                    if 'Hook script failed' in str(exc):\n                        typer.echo('Project creation failed. 
Check the error message above.', err=True)\n                    else:\n                        typer.echo(f'Project creation failed: {exc!s}', err=True)\n\n                    sys.exit(1)\n\n            typer.echo(f'Your project \"{project_name}\" was created.')\n\n            if install_project:\n                if package_manager == 'pip':\n                    typer.echo(\n                        f'To run it, navigate to the directory: \"cd {project_name}\", '\n                        f'activate the virtual environment in \".venv\" (\"source .venv/bin/activate\") '\n                        f'and run your project using \"python -m {package_name}\".'\n                    )\n                else:\n                    typer.echo(\n                        f'To run it, navigate to the directory: \"cd {project_name}\", '\n                        f'and run it using \"{package_manager} run python -m {package_name}\".'\n                    )\n            elif package_manager == 'pip':\n                typer.echo(\n                    f'To run it, navigate to the directory: \"cd {project_name}\", '\n                    f'install the dependencies listed in \"requirements.txt\" '\n                    f'and run it using \"python -m {package_name}\".'\n                )\n            else:\n                install_command = 'sync' if package_manager == 'uv' else 'install'\n                typer.echo(\n                    f'To run it, navigate to the directory: \"cd {project_name}\", '\n                    f'install the project using \"{package_manager} {install_command}\", '\n                    f'and run it using \"{package_manager} run python -m {package_name}\".'\n                )\n\n            typer.echo(f'See the \"{project_name}/README.md\" for more information.')\n\n    except KeyboardInterrupt:\n        typer.echo('Operation cancelled by user.')\n"
  },
  {
    "path": "src/crawlee/_consts.py",
    "content": "from __future__ import annotations\n\nMETADATA_FILENAME = '__metadata__.json'\n\"\"\"The name of the metadata file for storage clients.\"\"\"\n"
  },
  {
    "path": "src/crawlee/_log_config.py",
    "content": "from __future__ import annotations\n\nimport json\nimport logging\nimport sys\nimport textwrap\nfrom typing import TYPE_CHECKING, Any\n\nfrom colorama import Fore, Style, just_fix_windows_console\nfrom typing_extensions import assert_never\n\nfrom crawlee import service_locator\n\nif TYPE_CHECKING:\n    from crawlee._types import LogLevel\n\njust_fix_windows_console()\n\n\n_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX\n\n_LOG_LEVEL_COLOR = {\n    logging.DEBUG: Fore.BLUE,\n    logging.INFO: Fore.GREEN,\n    logging.WARNING: Fore.YELLOW,\n    logging.ERROR: Fore.RED,\n    logging.CRITICAL: Fore.RED,\n}\n\n_LOG_LEVEL_SHORT_ALIAS = {\n    logging.DEBUG: 'DEBUG',\n    logging.INFO: 'INFO ',\n    logging.WARNING: 'WARN ',\n    logging.ERROR: 'ERROR',\n}\n\n# So that all the log messages have the same alignment\n_LOG_MESSAGE_INDENT = ' ' * 6\n\n\ndef string_to_log_level(level: LogLevel) -> int:\n    \"\"\"Convert a string representation of a log level to an integer log level.\"\"\"\n    if level == 'DEBUG':\n        return logging.DEBUG\n    if level == 'INFO':\n        return logging.INFO\n    if level == 'WARNING':\n        return logging.WARNING\n    if level == 'ERROR':\n        return logging.ERROR\n    if level == 'CRITICAL':\n        return logging.CRITICAL\n\n    assert_never(level)\n\n\ndef get_configured_log_level() -> int:\n    config = service_locator.get_configuration()\n\n    if 'log_level' in config.model_fields_set:\n        return string_to_log_level(config.log_level)\n\n    if sys.flags.dev_mode:\n        return logging.DEBUG\n\n    return logging.INFO\n\n\ndef configure_logger(logger: logging.Logger, *, remove_old_handlers: bool = False) -> None:\n    handler = logging.StreamHandler()\n    handler.setFormatter(CrawleeLogFormatter())\n\n    if remove_old_handlers:\n        for old_handler in logger.handlers[:]:\n            logger.removeHandler(old_handler)\n\n    logger.addHandler(handler)\n    logger.setLevel(get_configured_log_level())\n\n    
# Do not propagate the log messages to the parent logger to prevent duplicate log messages.\n    logger.propagate = False\n\n\nclass CrawleeLogFormatter(logging.Formatter):\n    \"\"\"Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields.\n\n    It formats the log records so that they:\n        - start with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n        - then have the actual log message, if it's multiline then it's nicely indented\n        - then have the stringified extra log fields\n        - then, if an exception is a part of the log record, prints the formatted exception.\n    \"\"\"\n\n    # The fields that are added to the log record with `logger.log(..., extra={...})` are just merged in the log record\n    # with the other log record properties, and you can't get them in some nice, isolated way. So, to get the extra\n    # fields, we just compare all the properties present in the log record with properties present in an empty log\n    # record, and extract all the extra ones not present in the empty log record.\n    empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)\n\n    def __init__(\n        self,\n        include_logger_name: bool = True,  # noqa: FBT001, FBT002\n        *args: Any,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            include_logger_name: Include logger name at the beginning of the log line.\n            args: Arguments passed to the parent class.\n            kwargs: Keyword arguments passed to the parent class.\n        \"\"\"\n        super().__init__(*args, **kwargs)\n        self.include_logger_name = include_logger_name\n\n    def _get_extra_fields(self, record: logging.LogRecord) -> dict[str, Any]:\n        extra_fields: dict[str, Any] = {}\n        for key, value in record.__dict__.items():\n            if key not in self.empty_record.__dict__:\n      
          extra_fields[key] = value  # noqa: PERF403\n\n        return extra_fields\n\n    def format(self, record: logging.LogRecord) -> str:\n        \"\"\"Format the log record nicely.\n\n        This formats the log record so that it:\n            - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned)\n            - then has the actual log message, if it's multiline then it's nicely indented\n            - then has the stringified extra log fields\n            - then, if an exception is a part of the log record, prints the formatted exception.\n        \"\"\"\n        logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} '\n\n        # Colorize the log level, and shorten it to 6 chars tops\n        level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '')\n        level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname)\n        level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} '\n\n        # Format the extra log record fields, if there were some\n        # Just stringify them to JSON and color them gray\n        extra_string = ''\n        extra = self._get_extra_fields(record)\n        if extra:\n            extra_string = (\n                f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}'\n            )\n\n        # Call the parent method so that it populates missing fields in the record\n        super().format(record)\n\n        # Format the actual log message\n        log_string = self.formatMessage(record)\n\n        # Format the exception, if there is some\n        # Basically just print the traceback and indent it a bit\n        exception_string = ''\n        if record.exc_text:\n            exception_string = '\\n' + textwrap.indent(record.exc_text.rstrip(), _LOG_MESSAGE_INDENT)\n        else:\n            exception_string = ''\n\n        if self.include_logger_name:\n            # Include logger name at the 
beginning of the log line\n            return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}'\n\n        return f'{level_string}{log_string}{extra_string}{exception_string}'\n"
  },
  {
    "path": "src/crawlee/_request.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Iterator, MutableMapping\nfrom datetime import datetime\nfrom enum import IntEnum\nfrom typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast\n\nfrom pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter\nfrom yarl import URL\n\nfrom crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.requests import compute_unique_key\nfrom crawlee._utils.urls import validate_http_url\n\nif TYPE_CHECKING:\n    from typing_extensions import NotRequired, Required, Self\n\n\nclass RequestState(IntEnum):\n    \"\"\"Crawlee-specific request handling state.\"\"\"\n\n    UNPROCESSED = 0\n    BEFORE_NAV = 1\n    AFTER_NAV = 2\n    REQUEST_HANDLER = 3\n    DONE = 4\n    ERROR_HANDLER = 5\n    ERROR = 6\n    SKIPPED = 7\n\n\nclass CrawleeRequestData(BaseModel):\n    \"\"\"Crawlee-specific configuration stored in the `user_data`.\"\"\"\n\n    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None\n    \"\"\"Maximum number of retries for this request. 
Allows to override the global `max_request_retries` option of\n    `BasicCrawler`.\"\"\"\n\n    enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None\n    \"\"\"The strategy that was used for enqueuing the request.\"\"\"\n\n    state: RequestState = RequestState.UNPROCESSED\n    \"\"\"Describes the request's current lifecycle state.\"\"\"\n\n    session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None\n    \"\"\"The number of finished session rotations for this request.\"\"\"\n\n    skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False\n\n    last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None\n    \"\"\"The last proxy tier used to process the request.\"\"\"\n\n    forefront: Annotated[bool, Field()] = False\n    \"\"\"Indicate whether the request should be enqueued at the front of the queue.\"\"\"\n\n    crawl_depth: Annotated[int, Field(alias='crawlDepth')] = 0\n    \"\"\"The depth of the request in the crawl tree.\"\"\"\n\n    session_id: Annotated[str | None, Field()] = None\n    \"\"\"ID of a session to which the request is bound.\"\"\"\n\n\nclass UserData(BaseModel, MutableMapping[str, JsonSerializable]):\n    \"\"\"Represents the `user_data` part of a Request.\n\n    Apart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible\n    values.\n    \"\"\"\n\n    model_config = ConfigDict(extra='allow')\n    __pydantic_extra__: dict[str, JsonSerializable] = Field(init=False)\n\n    crawlee_data: Annotated[CrawleeRequestData | None, Field(alias='__crawlee')] = None\n    \"\"\"Crawlee-specific configuration stored in the `user_data`.\"\"\"\n\n    label: Annotated[str | None, Field()] = None\n    \"\"\"Label used for request routing.\"\"\"\n\n    def __getitem__(self, key: str) -> JsonSerializable:\n        return self.__pydantic_extra__[key]\n\n    def __setitem__(self, key: str, value: JsonSerializable) -> 
None:\n        if key == 'label':\n            if value is not None and not isinstance(value, str):\n                raise ValueError('`label` must be str or None')\n\n            self.label = value\n\n        self.__pydantic_extra__[key] = value\n\n    def __delitem__(self, key: str) -> None:\n        del self.__pydantic_extra__[key]\n\n    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]\n        yield from self.__pydantic_extra__\n\n    def __len__(self) -> int:\n        return len(self.__pydantic_extra__)\n\n    def __eq__(self, other: object) -> bool:\n        if isinstance(other, BaseModel):\n            return super().__eq__(other)\n\n        if isinstance(other, dict):\n            return self.model_dump() == other\n\n        return NotImplemented\n\n    def __hash__(self) -> int:\n        \"\"\"Return hash based on the model fields.\"\"\"\n        data = self.model_dump()\n        return hash(tuple(sorted(data.items())))\n\n\nuser_data_adapter = TypeAdapter(UserData)\n\n\n@docs_group('Other')\nclass RequestOptions(TypedDict):\n    \"\"\"Options that can be used to customize request creation.\n\n    This type exactly matches the parameters of `Request.from_url` method.\n    \"\"\"\n\n    url: Required[str]\n    method: NotRequired[HttpMethod]\n    headers: NotRequired[HttpHeaders | dict[str, str] | None]\n    payload: NotRequired[HttpPayload | str | None]\n    label: NotRequired[str | None]\n    session_id: NotRequired[str | None]\n    unique_key: NotRequired[str | None]\n    id: NotRequired[str | None]\n    keep_url_fragment: NotRequired[bool]\n    use_extended_unique_key: NotRequired[bool]\n    always_enqueue: NotRequired[bool]\n    user_data: NotRequired[dict[str, JsonSerializable]]\n    no_retry: NotRequired[bool]\n    enqueue_strategy: NotRequired[EnqueueStrategy]\n    max_retries: NotRequired[int | None]\n\n\n@docs_group('Storage data')\nclass Request(BaseModel):\n    \"\"\"Represents a request in the Crawlee framework, 
containing the necessary information for crawling operations.\n\n    The `Request` class is one of the core components in Crawlee, utilized by various components such as request\n    providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,\n    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information\n    to be stored and persisted throughout the request lifecycle, including its retries.\n\n    Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used\n    for request deduplication, controlling retries, handling state management, and enabling configuration for session\n    rotation and proxy handling.\n\n    The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically\n    generates a unique key and identifier based on the URL and request parameters.\n\n    ### Usage\n\n    ```python\n    from crawlee import Request\n\n    request = Request.from_url('https://crawlee.dev')\n    ```\n    \"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]\n    \"\"\"A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing\n    to the same URL.\n\n    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.\n    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`\n    of `http://www.example.com/something`.\n\n    Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior\n    and specify which URLs shall be considered equal.\n    \"\"\"\n\n    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]\n    \"\"\"The URL of the web page to crawl. 
Must be a valid HTTP or HTTPS URL, and may include query parameters\n    and fragments.\"\"\"\n\n    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'\n    \"\"\"HTTP request method.\"\"\"\n\n    payload: Annotated[\n        HttpPayload | None,\n        BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),\n        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),\n        Field(frozen=True),\n    ] = None\n    \"\"\"HTTP request payload.\"\"\"\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        headers: HttpHeaders = HttpHeaders()\n        \"\"\"HTTP request headers.\"\"\"\n\n        user_data: dict[str, JsonSerializable] = {}\n        \"\"\"Custom user data assigned to the request. Use this to save any request related data to the\n        request's scope, keeping them accessible on retries, failures etc.\n        \"\"\"\n\n    else:\n        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]\n        \"\"\"HTTP request headers.\"\"\"\n\n        user_data: Annotated[\n            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience\n            Field(alias='userData', default_factory=UserData),\n            PlainValidator(user_data_adapter.validate_python),\n            PlainSerializer(\n                lambda instance: user_data_adapter.dump_python(\n                    instance,\n                    by_alias=True,\n                    exclude_none=False,\n                    exclude_unset=True,\n                    exclude_defaults=True,\n                )\n            ),\n        ]\n        \"\"\"Custom user data assigned to the request. 
Use this to save any request related data to the\n        request's scope, keeping them accessible on retries, failures etc.\n        \"\"\"\n\n    retry_count: Annotated[int, Field(alias='retryCount')] = 0\n    \"\"\"Number of times the request has been retried.\"\"\"\n\n    no_retry: Annotated[bool, Field(alias='noRetry')] = False\n    \"\"\"If set to `True`, the request will not be retried in case of failure.\"\"\"\n\n    loaded_url: Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')] = None\n    \"\"\"URL of the web page that was loaded. This can differ from the original URL in case of redirects.\"\"\"\n\n    handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None\n    \"\"\"Timestamp when the request was handled.\"\"\"\n\n    @classmethod\n    def from_url(\n        cls,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | str | None = None,\n        label: str | None = None,\n        session_id: str | None = None,\n        unique_key: str | None = None,\n        keep_url_fragment: bool = False,\n        use_extended_unique_key: bool = False,\n        always_enqueue: bool = False,\n        enqueue_strategy: EnqueueStrategy | None = None,\n        max_retries: int | None = None,\n        **kwargs: Any,\n    ) -> Self:\n        \"\"\"Create a new `Request` instance from a URL.\n\n        This is the recommended constructor for creating new `Request` instances. It generates a `Request` object from\n        a given URL with additional options to customize HTTP method, payload, unique key, and other request\n        properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,\n        method and payload. 
It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.\n\n        Args:\n            url: The URL of the request.\n            method: The HTTP method of the request.\n            headers: The HTTP headers of the request.\n            payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.\n            label: A custom label to differentiate between request types. This is stored in `user_data`, and it is\n                used for request routing (different requests go to different handlers).\n            session_id: ID of a specific `Session` to which the request will be strictly bound.\n                If the session becomes unavailable when the request is processed, a `RequestCollisionError` will be\n                raised.\n            unique_key: A unique key identifying the request. If not provided, it is automatically computed based on\n                the URL and other parameters. Requests with the same `unique_key` are treated as identical.\n            keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in\n                the `unique_key` computation. This is only relevant when `unique_key` is not provided.\n            use_extended_unique_key: Determines whether to include the HTTP method, session ID and payload in the\n                `unique_key` computation. This is only relevant when `unique_key` is not provided.\n            always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.\n                Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.\n            enqueue_strategy: The strategy that will be used for enqueuing the request.\n            max_retries: Maximum number of retries for this request. 
Allows to override the global `max_request_retries`\n                option of `BasicCrawler`.\n            **kwargs: Additional request properties.\n        \"\"\"\n        if unique_key is not None and always_enqueue:\n            raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`')\n\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        if isinstance(payload, str):\n            payload = payload.encode()\n\n        unique_key = unique_key or compute_unique_key(\n            url,\n            method=method,\n            headers=headers,\n            payload=payload,\n            session_id=session_id,\n            keep_url_fragment=keep_url_fragment,\n            use_extended_unique_key=use_extended_unique_key,\n        )\n\n        if always_enqueue:\n            unique_key = f'{crypto_random_object_id()}|{unique_key}'\n\n        user_data_dict = kwargs.pop('user_data', {}) or {}\n        crawlee_data_dict = user_data_dict.get('__crawlee', {})\n\n        if max_retries is not None:\n            crawlee_data_dict['maxRetries'] = max_retries\n\n        if enqueue_strategy is not None:\n            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy\n\n        crawlee_data = CrawleeRequestData(**crawlee_data_dict)\n\n        if crawlee_data:\n            user_data_dict['__crawlee'] = crawlee_data\n\n        request = cls(\n            url=url,\n            unique_key=unique_key,\n            method=method,\n            headers=headers,\n            payload=payload,\n            user_data=user_data_dict,\n            **kwargs,\n        )\n\n        if label is not None:\n            request.user_data['label'] = label\n\n        if session_id is not None:\n            request.crawlee_data.session_id = session_id\n\n        return request\n\n    def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:\n        \"\"\"Get the value of a 
specific query parameter from the URL.\"\"\"\n        query_params = URL(self.url).query\n        return query_params.get(param, default)\n\n    @property\n    def label(self) -> str | None:\n        \"\"\"A string used to differentiate between arbitrary request types.\"\"\"\n        return cast('UserData', self.user_data).label\n\n    @property\n    def session_id(self) -> str | None:\n        \"\"\"The ID of the bound session, if there is any.\"\"\"\n        return self.crawlee_data.session_id\n\n    @property\n    def crawlee_data(self) -> CrawleeRequestData:\n        \"\"\"Crawlee-specific configuration stored in the `user_data`.\"\"\"\n        user_data = cast('UserData', self.user_data)\n        if user_data.crawlee_data is None:\n            user_data.crawlee_data = CrawleeRequestData()\n\n        return user_data.crawlee_data\n\n    @property\n    def crawl_depth(self) -> int:\n        \"\"\"The depth of the request in the crawl tree.\"\"\"\n        return self.crawlee_data.crawl_depth\n\n    @crawl_depth.setter\n    def crawl_depth(self, new_value: int) -> None:\n        self.crawlee_data.crawl_depth = new_value\n\n    @property\n    def state(self) -> RequestState:\n        \"\"\"Crawlee-specific request handling state.\"\"\"\n        return self.crawlee_data.state\n\n    @state.setter\n    def state(self, new_state: RequestState) -> None:\n        self.crawlee_data.state = new_state\n\n    @property\n    def max_retries(self) -> int | None:\n        \"\"\"Crawlee-specific limit on the number of retries of the request.\"\"\"\n        return self.crawlee_data.max_retries\n\n    @property\n    def session_rotation_count(self) -> int | None:\n        \"\"\"Crawlee-specific number of finished session rotations for the request.\"\"\"\n        return self.crawlee_data.session_rotation_count\n\n    @session_rotation_count.setter\n    def session_rotation_count(self, new_session_rotation_count: int) -> None:\n        self.crawlee_data.session_rotation_count = 
new_session_rotation_count\n\n    @property\n    def enqueue_strategy(self) -> EnqueueStrategy:\n        \"\"\"The strategy that was used for enqueuing the request.\"\"\"\n        return self.crawlee_data.enqueue_strategy or 'all'\n\n    @enqueue_strategy.setter\n    def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> None:\n        self.crawlee_data.enqueue_strategy = new_enqueue_strategy\n\n    @property\n    def last_proxy_tier(self) -> int | None:\n        \"\"\"The last proxy tier used to process the request.\"\"\"\n        return self.crawlee_data.last_proxy_tier\n\n    @last_proxy_tier.setter\n    def last_proxy_tier(self, new_value: int) -> None:\n        self.crawlee_data.last_proxy_tier = new_value\n\n    @property\n    def forefront(self) -> bool:\n        \"\"\"Indicate whether the request should be enqueued at the front of the queue.\"\"\"\n        return self.crawlee_data.forefront\n\n    @forefront.setter\n    def forefront(self, new_value: bool) -> None:\n        self.crawlee_data.forefront = new_value\n\n    @property\n    def was_already_handled(self) -> bool:\n        \"\"\"Indicates whether the request was handled.\"\"\"\n        return self.handled_at is not None\n\n\nclass RequestWithLock(Request):\n    \"\"\"A crawling request with information about locks.\"\"\"\n\n    lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')]\n    \"\"\"The timestamp when the lock expires.\"\"\"\n"
  },
  {
    "path": "src/crawlee/_service_locator.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.configuration import Configuration\nfrom crawlee.errors import ServiceConflictError\nfrom crawlee.events import EventManager, LocalEventManager\nfrom crawlee.storage_clients import FileSystemStorageClient, StorageClient\n\nif TYPE_CHECKING:\n    from crawlee.storages._storage_instance_manager import StorageInstanceManager\n\nfrom logging import getLogger\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Configuration')\nclass ServiceLocator:\n    \"\"\"Service locator for managing the services used by Crawlee.\n\n    All services are initialized to its default value lazily.\n    \"\"\"\n\n    global_storage_instance_manager: StorageInstanceManager | None = None\n\n    def __init__(\n        self,\n        configuration: Configuration | None = None,\n        event_manager: EventManager | None = None,\n        storage_client: StorageClient | None = None,\n    ) -> None:\n        self._configuration = configuration\n        self._event_manager = event_manager\n        self._storage_client = storage_client\n\n    def get_configuration(self) -> Configuration:\n        \"\"\"Get the configuration.\"\"\"\n        if self._configuration is None:\n            logger.debug('No configuration set, implicitly creating and using default Configuration.')\n            self._configuration = Configuration()\n\n        return self._configuration\n\n    def set_configuration(self, configuration: Configuration) -> None:\n        \"\"\"Set the configuration.\n\n        Args:\n            configuration: The configuration to set.\n\n        Raises:\n            ServiceConflictError: If the configuration has already been retrieved before.\n        \"\"\"\n        if self._configuration is configuration:\n            # Same instance, no need to anything\n            return\n        if self._configuration:\n            raise 
ServiceConflictError(Configuration, configuration, self._configuration)\n\n        self._configuration = configuration\n\n    def get_event_manager(self) -> EventManager:\n        \"\"\"Get the event manager.\"\"\"\n        if self._event_manager is None:\n            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')\n            if self._configuration is None:\n                logger.warning(\n                    'Implicit creation of event manager will implicitly set configuration as side effect. '\n                    'It is advised to explicitly first set the configuration instead.'\n                )\n            self._event_manager = LocalEventManager().from_config(config=self._configuration)\n\n        return self._event_manager\n\n    def set_event_manager(self, event_manager: EventManager) -> None:\n        \"\"\"Set the event manager.\n\n        Args:\n            event_manager: The event manager to set.\n\n        Raises:\n            ServiceConflictError: If the event manager has already been retrieved before.\n        \"\"\"\n        if self._event_manager is event_manager:\n            # Same instance, no need to anything\n            return\n        if self._event_manager:\n            raise ServiceConflictError(EventManager, event_manager, self._event_manager)\n\n        self._event_manager = event_manager\n\n    def get_storage_client(self) -> StorageClient:\n        \"\"\"Get the storage client.\"\"\"\n        if self._storage_client is None:\n            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')\n            if self._configuration is None:\n                logger.warning(\n                    'Implicit creation of storage client will implicitly set configuration as side effect. 
'\n                    'It is advised to explicitly first set the configuration instead.'\n                )\n            self._storage_client = FileSystemStorageClient()\n\n        return self._storage_client\n\n    def set_storage_client(self, storage_client: StorageClient) -> None:\n        \"\"\"Set the storage client.\n\n        Args:\n            storage_client: The storage client to set.\n\n        Raises:\n            ServiceConflictError: If the storage client has already been retrieved before.\n        \"\"\"\n        if self._storage_client is storage_client:\n            # Same instance, no need to do anything\n            return\n        if self._storage_client:\n            raise ServiceConflictError(StorageClient, storage_client, self._storage_client)\n\n        self._storage_client = storage_client\n\n    @property\n    def storage_instance_manager(self) -> StorageInstanceManager:\n        \"\"\"Get the storage instance manager. It is a global manager shared by all instances of ServiceLocator.\"\"\"\n        if ServiceLocator.global_storage_instance_manager is None:\n            # Import here to avoid circular imports.\n            from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415\n\n            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()\n\n        return ServiceLocator.global_storage_instance_manager\n\n\nservice_locator = ServiceLocator()\n"
  },
  {
    "path": "src/crawlee/_types.py",
    "content": "from __future__ import annotations\n\nimport dataclasses\nfrom collections.abc import Callable, Iterator, Mapping\nfrom copy import deepcopy\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload\n\nfrom pydantic import ConfigDict, Field, PlainValidator, RootModel\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    import json\n    import logging\n    import re\n    from collections.abc import Callable, Coroutine, Sequence\n\n    from typing_extensions import NotRequired, Required, Self, Unpack\n\n    from crawlee import Glob, Request\n    from crawlee._request import RequestOptions\n    from crawlee.configuration import Configuration\n    from crawlee.http_clients import HttpResponse\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.storage_clients import StorageClient\n    from crawlee.storages import KeyValueStore\n\n    # Workaround for https://github.com/pydantic/pydantic/issues/9445\n    J = TypeVar('J', bound='JsonSerializable')\n    JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None\nelse:\n    from pydantic import JsonValue as JsonSerializable\n\nT = TypeVar('T')\n\nHttpMethod = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']\n\nHttpPayload = bytes\n\nRequestTransformAction = Literal['skip', 'unchanged']\n\nEnqueueStrategy = Literal['all', 'same-domain', 'same-hostname', 'same-origin']\n\"\"\"Enqueue strategy to be used for determining which links to extract and enqueue.\"\"\"\n\nSkippedReason = Literal['robots_txt']\n\nLogLevel = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']\n\n\ndef _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:\n    \"\"\"Convert all header keys to lowercase, strips whitespace, and returns them sorted by key.\"\"\"\n    normalized_headers = {k.lower().strip(): 
v.strip() for k, v in headers.items()}\n    sorted_headers = sorted(normalized_headers.items())\n    return dict(sorted_headers)\n\n\n@docs_group('Other')\nclass HttpHeaders(RootModel, Mapping[str, str]):\n    \"\"\"A dictionary-like object representing HTTP headers.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        root: dict[str, str] = {}\n    else:\n        root: Annotated[\n            dict[str, str],\n            PlainValidator(_normalize_headers),\n            Field(default_factory=dict),\n        ]\n\n    def __getitem__(self, key: str) -> str:\n        return self.root[key.lower()]\n\n    def __setitem__(self, key: str, value: str) -> None:\n        raise TypeError(f'{self.__class__.__name__} is immutable')\n\n    def __delitem__(self, key: str) -> None:\n        raise TypeError(f'{self.__class__.__name__} is immutable')\n\n    def __or__(self, other: HttpHeaders) -> HttpHeaders:\n        \"\"\"Return a new instance of `HttpHeaders` combining this one with another one.\"\"\"\n        combined_headers = {**self.root, **other}\n        return HttpHeaders(combined_headers)\n\n    def __ror__(self, other: HttpHeaders) -> HttpHeaders:\n        \"\"\"Support reversed | operation (other | self).\"\"\"\n        combined_headers = {**other, **self.root}\n        return HttpHeaders(combined_headers)\n\n    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]\n        yield from self.root\n\n    def __len__(self) -> int:\n        return len(self.root)\n\n\n@docs_group('Configuration')\nclass ConcurrencySettings:\n    \"\"\"Concurrency settings for AutoscaledPool.\"\"\"\n\n    def __init__(\n        self,\n        min_concurrency: int = 1,\n        max_concurrency: int = 100,\n        max_tasks_per_minute: float = float('inf'),\n        desired_concurrency: int = 10,\n    ) -> None:\n    
    \"\"\"Initialize a new instance.\n\n        Args:\n            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high\n                with respect to the available system memory and CPU, your code might run extremely slow or crash.\n            max_concurrency: The maximum number of tasks running in parallel.\n            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set\n                to infinity, but you can pass any positive, non-zero number.\n            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,\n                if there is a large enough supply of them. By default, it is 10.\n        \"\"\"\n        if min_concurrency < 1:\n            raise ValueError('min_concurrency must be 1 or larger')\n\n        if max_concurrency < min_concurrency:\n            raise ValueError('max_concurrency cannot be less than min_concurrency')\n\n        if desired_concurrency < min_concurrency:\n            raise ValueError('desired_concurrency cannot be less than min_concurrency')\n\n        if desired_concurrency > max_concurrency:\n            raise ValueError('desired_concurrency cannot be greater than max_concurrency')\n\n        if max_tasks_per_minute <= 0:\n            raise ValueError('max_tasks_per_minute must be positive')\n\n        self.min_concurrency = min_concurrency\n        self.max_concurrency = max_concurrency\n        self.desired_concurrency = desired_concurrency\n        self.max_tasks_per_minute = max_tasks_per_minute\n\n\nclass EnqueueLinksKwargs(TypedDict):\n    \"\"\"Keyword arguments for the `enqueue_links` methods.\"\"\"\n\n    limit: NotRequired[int]\n    \"\"\"Maximum number of requests to be enqueued.\"\"\"\n\n    base_url: NotRequired[str]\n    \"\"\"Base URL to be used for relative URLs.\"\"\"\n\n    strategy: NotRequired[EnqueueStrategy]\n    \"\"\"Enqueue strategy to 
be used for determining which links to extract and enqueue.\n\n    Options:\n        all: Enqueue every link encountered, regardless of the target domain. Use this option to ensure that all\n            links, including those leading to external websites, are followed.\n        same-domain: Enqueue links that share the same domain name as the current page, including any subdomains.\n            This strategy is ideal for crawling within the same top-level domain while still allowing for subdomain\n            exploration.\n        same-hostname: Enqueue links only if they match the exact hostname of the current page. This is the default\n            behavior and restricts the crawl to the current hostname, excluding subdomains.\n        same-origin: Enqueue links that share the same origin as the current page. The origin is defined by the\n            combination of protocol, domain, and port, ensuring a strict scope for the crawl.\n    \"\"\"\n\n    include: NotRequired[Sequence[re.Pattern | Glob]]\n    \"\"\"List of regular expressions or globs that URLs must match to be enqueued.\"\"\"\n\n    exclude: NotRequired[Sequence[re.Pattern | Glob]]\n    \"\"\"List of regular expressions or globs that URLs must not match to be enqueued.\"\"\"\n\n\nclass AddRequestsKwargs(EnqueueLinksKwargs):\n    \"\"\"Keyword arguments for the `add_requests` methods.\"\"\"\n\n    requests: Sequence[str | Request]\n    \"\"\"Requests to be added to the `RequestManager`.\"\"\"\n\n    rq_id: str | None\n    \"\"\"ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.\"\"\"\n\n    rq_name: str | None\n    \"\"\"Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.\n    \"\"\"\n\n    rq_alias: str | None\n    \"\"\"Alias of the `RequestQueue` to add the requests to. 
Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.\n    \"\"\"\n\n\nclass PushDataKwargs(TypedDict):\n    \"\"\"Keyword arguments for dataset's `push_data` method.\"\"\"\n\n\nclass PushDataFunctionCall(PushDataKwargs):\n    data: list[dict[str, Any]] | dict[str, Any]\n    dataset_id: str | None\n    dataset_name: str | None\n    dataset_alias: str | None\n\n\nclass KeyValueStoreInterface(Protocol):\n    \"\"\"The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler.\"\"\"\n\n    @overload\n    async def get_value(self, key: str) -> Any: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T) -> T: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...\n\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...\n\n    async def set_value(\n        self,\n        key: str,\n        value: Any,\n        content_type: str | None = None,\n    ) -> None: ...\n\n\n@dataclass()\nclass KeyValueStoreValue:\n    content: Any\n    content_type: str | None\n\n\nclass KeyValueStoreChangeRecords:\n    def __init__(self, actual_key_value_store: KeyValueStore) -> None:\n        self.updates = dict[str, KeyValueStoreValue]()\n        self._actual_key_value_store = actual_key_value_store\n\n    async def set_value(\n        self,\n        key: str,\n        value: Any,\n        content_type: str | None = None,\n    ) -> None:\n        self.updates[key] = KeyValueStoreValue(value, content_type)\n\n    @overload\n    async def get_value(self, key: str) -> Any: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T) -> T: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...\n\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None:\n        if key in self.updates:\n            return cast('T', 
self.updates[key].content)\n\n        return await self._actual_key_value_store.get_value(key, default_value)\n\n\nclass RequestHandlerRunResult:\n    \"\"\"Record of calls to storage-related context helpers.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        key_value_store_getter: GetKeyValueStoreFunction,\n        request: Request,\n    ) -> None:\n        self._key_value_store_getter = key_value_store_getter\n        self.add_requests_calls = list[AddRequestsKwargs]()\n        self.push_data_calls = list[PushDataFunctionCall]()\n        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()\n\n        # Isolated copies for handler execution\n        self._request = deepcopy(request)\n\n    @property\n    def request(self) -> Request:\n        return self._request\n\n    async def add_requests(\n        self,\n        requests: Sequence[str | Request],\n        rq_id: str | None = None,\n        rq_name: str | None = None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> None:\n        \"\"\"Track a call to the `add_requests` context helper.\"\"\"\n        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)\n        if specified_params > 1:\n            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')\n        self.add_requests_calls.append(\n            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)\n        )\n\n    async def push_data(\n        self,\n        data: list[dict[str, Any]] | dict[str, Any],\n        dataset_id: str | None = None,\n        dataset_name: str | None = None,\n        dataset_alias: str | None = None,\n        **kwargs: Unpack[PushDataKwargs],\n    ) -> None:\n        \"\"\"Track a call to the `push_data` context helper.\"\"\"\n        self.push_data_calls.append(\n            PushDataFunctionCall(\n           
     data=data,\n                dataset_id=dataset_id,\n                dataset_name=dataset_name,\n                dataset_alias=dataset_alias,\n                **kwargs,\n            )\n        )\n\n    async def get_key_value_store(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n    ) -> KeyValueStoreInterface:\n        if (id, name, alias) not in self.key_value_store_changes:\n            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(\n                await self._key_value_store_getter(id=id, name=name, alias=alias)\n            )\n\n        return self.key_value_store_changes[id, name, alias]\n\n    def apply_request_changes(self, target: Request) -> None:\n        \"\"\"Apply tracked changes from handler copy to original request.\"\"\"\n        if self.request.user_data != target.user_data:\n            target.user_data = self.request.user_data\n\n        if self.request.headers != target.headers:\n            target.headers = self.request.headers\n\n\n@docs_group('Functions')\nclass AddRequestsFunction(Protocol):\n    \"\"\"Function for adding requests to the `RequestManager`, with optional filtering.\n\n    It simplifies the process of adding requests to the `RequestManager`. It automatically opens\n    the specified one and adds the provided requests.\n    \"\"\"\n\n    def __call__(\n        self,\n        requests: Sequence[str | Request],\n        rq_id: str | None = None,\n        rq_name: str | None = None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> Coroutine[None, None, None]:\n        \"\"\"Call dunder method.\n\n        Args:\n            requests: Requests to be added to the `RequestManager`.\n            rq_id: ID of the `RequestQueue` to add the requests to. 
Only one of `rq_id`, `rq_name` or `rq_alias` can be\n                provided.\n            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\n                can be provided.\n            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\n                can be provided.\n            **kwargs: Additional keyword arguments.\n        \"\"\"\n\n\n@docs_group('Functions')\nclass EnqueueLinksFunction(Protocol):\n    \"\"\"A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.\n\n    It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues\n    them for further crawling. It allows filtering through selectors and other options. You can also specify labels and\n    user data to be associated with the newly created `Request` objects.\n\n    It should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together\n    with `requests` argument.\n\n    For even more control over the enqueued links you can use combination of `ExtractLinksFunction` and\n    `AddRequestsFunction`.\n    \"\"\"\n\n    @overload\n    def __call__(\n        self,\n        *,\n        selector: str | None = None,\n        attribute: str | None = None,\n        label: str | None = None,\n        user_data: dict[str, Any] | None = None,\n        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,\n        rq_id: str | None = None,\n        rq_name: str | None = None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> Coroutine[None, None, None]: ...\n\n    @overload\n    def __call__(\n        self,\n        *,\n        requests: Sequence[str | Request] | None = None,\n        rq_id: str | None = None,\n        rq_name: str | None = 
None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> Coroutine[None, None, None]: ...\n\n    def __call__(\n        self,\n        *,\n        selector: str | None = None,\n        attribute: str | None = None,\n        label: str | None = None,\n        user_data: dict[str, Any] | None = None,\n        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,\n        requests: Sequence[str | Request] | None = None,\n        rq_id: str | None = None,\n        rq_name: str | None = None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> Coroutine[None, None, None]:\n        \"\"\"Call enqueue links function.\n\n        Args:\n            selector: A selector used to find the elements containing the links. The behaviour differs based\n                on the crawler used:\n                - `PlaywrightCrawler` supports CSS and XPath selectors.\n                - `ParselCrawler` supports CSS selectors.\n                - `BeautifulSoupCrawler` supports CSS selectors.\n            attribute: Which node attribute to extract the links from.\n            label: Label for the newly created `Request` objects, used for request routing.\n            user_data: User data to be provided to the newly created `Request` objects.\n            transform_request_function: A function that takes `RequestOptions` and returns either:\n                - Modified `RequestOptions` to update the request configuration,\n                - `'skip'` to exclude the request from being enqueued,\n                - `'unchanged'` to use the original request options without modification.\n            requests: Requests to be added to the `RequestManager`.\n            rq_id: ID of the `RequestQueue` to add the requests to. 
Only one of `rq_id`, `rq_name` or `rq_alias` can be\n                provided.\n            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\n                can be provided.\n            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`\n                can be provided.\n            **kwargs: Additional keyword arguments.\n        \"\"\"\n\n\n@docs_group('Functions')\nclass ExtractLinksFunction(Protocol):\n    \"\"\"A function for extracting URLs to crawl based on elements selected by a given selector.\n\n    It extracts URLs from the current page and allows filtering through selectors and other options. You can also\n    specify labels and user data to be associated with the newly created `Request` objects.\n    \"\"\"\n\n    def __call__(\n        self,\n        *,\n        selector: str = 'a',\n        attribute: str = 'href',\n        label: str | None = None,\n        user_data: dict[str, Any] | None = None,\n        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> Coroutine[None, None, list[Request]]:\n        \"\"\"Call extract links function.\n\n        Args:\n            selector: A selector used to find the elements containing the links. 
The behaviour differs based\n                on the crawler used:\n                - `PlaywrightCrawler` supports CSS and XPath selectors.\n                - `ParselCrawler` supports CSS selectors.\n                - `BeautifulSoupCrawler` supports CSS selectors.\n            attribute: Which node attribute to extract the links from.\n            label: Label for the newly created `Request` objects, used for request routing.\n            user_data: User data to be provided to the newly created `Request` objects.\n            transform_request_function: A function that takes `RequestOptions` and returns either:\n                - Modified `RequestOptions` to update the request configuration,\n                - `'skip'` to exclude the request from being enqueued,\n                - `'unchanged'` to use the original request options without modification.\n            **kwargs: Additional keyword arguments.\n        \"\"\"\n\n\n@docs_group('Functions')\nclass GetKeyValueStoreFunction(Protocol):\n    \"\"\"A function for accessing a `KeyValueStore`.\n\n    It retrieves an instance of a `KeyValueStore` based on its ID or name.\n    \"\"\"\n\n    def __call__(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n    ) -> Coroutine[None, None, KeyValueStore]:\n        \"\"\"Call dunder method.\n\n        Args:\n            id: The ID of the `KeyValueStore` to get.\n            name: The name of the `KeyValueStore` to get (global scope, named storage).\n            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).\n        \"\"\"\n\n\nclass GetKeyValueStoreFromRequestHandlerFunction(Protocol):\n    \"\"\"A function for accessing a `KeyValueStore`.\n\n    It retrieves an instance of a `KeyValueStore` based on its ID or name.\n    \"\"\"\n\n    def __call__(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None 
= None,\n    ) -> Coroutine[None, None, KeyValueStoreInterface]:\n        \"\"\"Call dunder method.\n\n        Args:\n            id: The ID of the `KeyValueStore` to get.\n            name: The name of the `KeyValueStore` to get (global scope, named storage).\n            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).\n        \"\"\"\n\n\n@docs_group('Functions')\nclass PushDataFunction(Protocol):\n    \"\"\"A function for pushing data to a `Dataset`.\n\n    It simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes\n    the provided data to it.\n    \"\"\"\n\n    def __call__(\n        self,\n        data: list[dict[str, Any]] | dict[str, Any],\n        dataset_id: str | None = None,\n        dataset_name: str | None = None,\n        dataset_alias: str | None = None,\n        **kwargs: Unpack[PushDataKwargs],\n    ) -> Coroutine[None, None, None]:\n        \"\"\"Call dunder method.\n\n        Args:\n            data: The data to push to the `Dataset`.\n            dataset_id: The ID of the `Dataset` to push the data to.\n            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).\n            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).\n            **kwargs: Additional keyword arguments.\n        \"\"\"\n\n\n@docs_group('Functions')\nclass SendRequestFunction(Protocol):\n    \"\"\"A function for sending HTTP requests.\n\n    It simplifies the process of sending HTTP requests. 
It is implemented by the crawling context and is used\n    within request handlers to send additional HTTP requests to target URLs.\n    \"\"\"\n\n    def __call__(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        payload: HttpPayload | None = None,\n        headers: HttpHeaders | dict[str, str] | None = None,\n    ) -> Coroutine[None, None, HttpResponse]:\n        \"\"\"Call send request function.\n\n        Args:\n            url: The URL to send the request to.\n            method: The HTTP method to use.\n            headers: The headers to include in the request.\n            payload: The payload to include in the request.\n\n        Returns:\n            The HTTP response received from the server.\n        \"\"\"\n\n\n@docs_group('Other')\n@dataclasses.dataclass\nclass PageSnapshot:\n    \"\"\"Snapshot of a crawled page.\"\"\"\n\n    screenshot: bytes | None = None\n    \"\"\"Screenshot of the page format.\"\"\"\n\n    html: str | None = None\n    \"\"\"HTML content of the page.\"\"\"\n\n    def __bool__(self) -> bool:\n        return bool(self.screenshot or self.html)\n\n\n@docs_group('Functions')\nclass UseStateFunction(Protocol):\n    \"\"\"A function for managing state within the crawling context.\n\n    It allows the use of persistent state across multiple crawls.\n\n    Warning:\n        This is an experimental feature. 
The behavior and interface may change in future versions.\n    \"\"\"\n\n    def __call__(\n        self,\n        default_value: dict[str, JsonSerializable] | None = None,\n    ) -> Coroutine[None, None, dict[str, JsonSerializable]]:\n        \"\"\"Call dunder method.\n\n        Args:\n            default_value: The default value to initialize the state if it is not already set.\n\n        Returns:\n            The current state.\n        \"\"\"\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass BasicCrawlingContext:\n    \"\"\"Basic crawling context.\n\n    It represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more\n    specific crawlers to provide additional functionality.\n    \"\"\"\n\n    request: Request\n    \"\"\"Request object for the current page being processed.\"\"\"\n\n    session: Session | None\n    \"\"\"Session object for the current page being processed.\"\"\"\n\n    proxy_info: ProxyInfo | None\n    \"\"\"Proxy information for the current page being processed.\"\"\"\n\n    send_request: SendRequestFunction\n    \"\"\"Send request crawling context helper function.\"\"\"\n\n    add_requests: AddRequestsFunction\n    \"\"\"Add requests crawling context helper function.\"\"\"\n\n    push_data: PushDataFunction\n    \"\"\"Push data crawling context helper function.\"\"\"\n\n    use_state: UseStateFunction\n    \"\"\"Use state crawling context helper function.\"\"\"\n\n    get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction\n    \"\"\"Get key-value store crawling context helper function.\"\"\"\n\n    log: logging.Logger\n    \"\"\"Logger instance.\"\"\"\n\n    async def get_snapshot(self) -> PageSnapshot:\n        \"\"\"Get snapshot of crawled page.\"\"\"\n        return PageSnapshot()\n\n    def __hash__(self) -> int:\n        \"\"\"Return hash of the context. 
Each context is considered unique.\"\"\"\n        return id(self)\n\n    def create_modified_copy(\n        self,\n        push_data: PushDataFunction | None = None,\n        add_requests: AddRequestsFunction | None = None,\n        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,\n    ) -> Self:\n        \"\"\"Create a modified copy of the crawling context with specified changes.\"\"\"\n        modifications = dict[str, Any]()\n\n        if push_data is not None:\n            modifications['push_data'] = push_data\n        if add_requests is not None:\n            modifications['add_requests'] = add_requests\n        if get_key_value_store is not None:\n            modifications['get_key_value_store'] = get_key_value_store\n\n        return dataclasses.replace(self, **modifications)\n\n\nclass GetDataKwargs(TypedDict):\n    \"\"\"Keyword arguments for dataset's `get_data` method.\"\"\"\n\n    offset: NotRequired[int]\n    \"\"\"Skips the specified number of items at the start.\"\"\"\n\n    limit: NotRequired[int | None]\n    \"\"\"The maximum number of items to retrieve. Unlimited if None.\"\"\"\n\n    clean: NotRequired[bool]\n    \"\"\"Return only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`.\"\"\"\n\n    desc: NotRequired[bool]\n    \"\"\"Set to True to sort results in descending order.\"\"\"\n\n    fields: NotRequired[list[str]]\n    \"\"\"Fields to include in each item. 
Sorts fields as specified if provided.\"\"\"\n\n    omit: NotRequired[list[str]]\n    \"\"\"Fields to exclude from each item.\"\"\"\n\n    unwind: NotRequired[list[str]]\n    \"\"\"Unwinds items by a specified array field, turning each element into a separate item.\"\"\"\n\n    skip_empty: NotRequired[bool]\n    \"\"\"Excludes empty items from the results if True.\"\"\"\n\n    skip_hidden: NotRequired[bool]\n    \"\"\"Excludes fields starting with '#' if True.\"\"\"\n\n    flatten: NotRequired[list[str]]\n    \"\"\"Fields to be flattened in returned items.\"\"\"\n\n    view: NotRequired[str]\n    \"\"\"Specifies the dataset view to be used.\"\"\"\n\n\nclass ExportToKwargs(TypedDict):\n    \"\"\"Keyword arguments for dataset's `export_to` method.\"\"\"\n\n    key: Required[str]\n    \"\"\"The key under which to save the data.\"\"\"\n\n    content_type: NotRequired[Literal['json', 'csv']]\n    \"\"\"The format in which to export the data. Either 'json' or 'csv'.\"\"\"\n\n    to_kvs_id: NotRequired[str]\n    \"\"\"ID of the key-value store to save the exported file.\"\"\"\n\n    to_kvs_name: NotRequired[str]\n    \"\"\"Name of the key-value store to save the exported file.\"\"\"\n\n    to_kvs_storage_client: NotRequired[StorageClient]\n    \"\"\"The storage client to use for saving the exported file.\"\"\"\n\n    to_kvs_configuration: NotRequired[Configuration]\n    \"\"\"The configuration to use for saving the exported file.\"\"\"\n\n\nclass ExportDataJsonKwargs(TypedDict):\n    \"\"\"Keyword arguments for dataset's `export_data_json` method.\"\"\"\n\n    skipkeys: NotRequired[bool]\n    \"\"\"If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped\n    instead of raising a `TypeError`.\"\"\"\n\n    ensure_ascii: NotRequired[bool]\n    \"\"\"Determines if non-ASCII characters should be escaped in the output JSON string.\"\"\"\n\n    check_circular: NotRequired[bool]\n    \"\"\"If False (default: True), skips the 
circular reference check for container types. A circular reference will\n    result in a `RecursionError` or worse if unchecked.\"\"\"\n\n    allow_nan: NotRequired[bool]\n    \"\"\"If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply\n    with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).\"\"\"\n\n    cls: NotRequired[type[json.JSONEncoder]]\n    \"\"\"Allows specifying a custom JSON encoder.\"\"\"\n\n    indent: NotRequired[int]\n    \"\"\"Specifies the number of spaces for indentation in the pretty-printed JSON output.\"\"\"\n\n    separators: NotRequired[tuple[str, str]]\n    \"\"\"A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')\n    otherwise.\"\"\"\n\n    default: NotRequired[Callable]\n    \"\"\"A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version\n    of the object or raise a `TypeError`.\"\"\"\n\n    sort_keys: NotRequired[bool]\n    \"\"\"Specifies whether the output JSON object should have keys sorted alphabetically.\"\"\"\n\n\nclass ExportDataCsvKwargs(TypedDict):\n    \"\"\"Keyword arguments for dataset's `export_data_csv` method.\"\"\"\n\n    dialect: NotRequired[str]\n    \"\"\"Specifies a dialect to be used in CSV parsing and writing.\"\"\"\n\n    delimiter: NotRequired[str]\n    \"\"\"A one-character string used to separate fields. Defaults to ','.\"\"\"\n\n    doublequote: NotRequired[bool]\n    \"\"\"Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;\n    when False, the `escapechar` is used as a prefix. Defaults to True.\"\"\"\n\n    escapechar: NotRequired[str]\n    \"\"\"A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`\n    if `doublequote` is False. 
Defaults to None, disabling escaping.\"\"\"\n\n    lineterminator: NotRequired[str]\n    \"\"\"The string used to terminate lines produced by the writer. Defaults to '\\\\r\\\\n'.\"\"\"\n\n    quotechar: NotRequired[str]\n    \"\"\"A one-character string used to quote fields containing special characters, like the delimiter or quotechar,\n    or fields containing new-line characters. Defaults to '\\\"'.\"\"\"\n\n    quoting: NotRequired[int]\n    \"\"\"Controls when quotes should be generated by the writer and recognized by the reader. Can take any of\n    the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.\"\"\"\n\n    skipinitialspace: NotRequired[bool]\n    \"\"\"When True, spaces immediately following the delimiter are ignored. Defaults to False.\"\"\"\n\n    strict: NotRequired[bool]\n    \"\"\"When True, raises an exception on bad CSV input. Defaults to False.\"\"\"\n"
  },
  {
    "path": "src/crawlee/_utils/__init__.py",
    "content": ""
  },
  {
    "path": "src/crawlee/_utils/blocked.py",
    "content": "from __future__ import annotations\n\n# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts\n\nCLOUDFLARE_RETRY_CSS_SELECTORS = [\n    '#turnstile-wrapper iframe[src^=\"https://challenges.cloudflare.com\"]',\n]\n\nRETRY_CSS_SELECTORS = [\n    *CLOUDFLARE_RETRY_CSS_SELECTORS,\n    'div#infoDiv0 a[href*=\"//www.google.com/policies/terms/\"]',\n    'iframe[src*=\"_Incapsula_Resource\"]',\n]\n\"\"\"\nCSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.\n\"\"\"\n\nROTATE_PROXY_ERRORS = [\n    'ECONNRESET',\n    'ECONNREFUSED',\n    'ERR_PROXY_CONNECTION_FAILED',\n    'ERR_TUNNEL_CONNECTION_FAILED',\n    'Proxy responded with',\n    'unsuccessful tunnel',\n    'TunnelUnsuccessful',\n]\n\"\"\"\nContent of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning.\n\"\"\"\n"
  },
  {
    "path": "src/crawlee/_utils/byte_size.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import Any\n\n_BYTES_PER_KB = 1024\n_BYTES_PER_MB = _BYTES_PER_KB**2\n_BYTES_PER_GB = _BYTES_PER_KB**3\n_BYTES_PER_TB = _BYTES_PER_KB**4\n\n\n@dataclass(frozen=True)\nclass ByteSize:\n    \"\"\"Represents a byte size.\"\"\"\n\n    bytes: int\n\n    def __post_init__(self) -> None:\n        if self.bytes < 0:\n            raise ValueError('ByteSize cannot be negative')\n\n    @classmethod\n    def validate(cls, value: Any) -> ByteSize:\n        if isinstance(value, ByteSize):\n            return value\n\n        if not isinstance(value, (float, int)):\n            raise TypeError('Value must be numeric')\n\n        return cls(int(value))\n\n    @classmethod\n    def from_kb(cls, kb: float) -> ByteSize:\n        return cls(int(kb * _BYTES_PER_KB))\n\n    @classmethod\n    def from_mb(cls, mb: float) -> ByteSize:\n        return cls(int(mb * _BYTES_PER_MB))\n\n    @classmethod\n    def from_gb(cls, gb: float) -> ByteSize:\n        return cls(int(gb * _BYTES_PER_GB))\n\n    @classmethod\n    def from_tb(cls, tb: float) -> ByteSize:\n        return cls(int(tb * _BYTES_PER_TB))\n\n    def to_kb(self) -> float:\n        return self.bytes / _BYTES_PER_KB\n\n    def to_mb(self) -> float:\n        return self.bytes / _BYTES_PER_MB\n\n    def to_gb(self) -> float:\n        return self.bytes / _BYTES_PER_GB\n\n    def to_tb(self) -> float:\n        return self.bytes / _BYTES_PER_TB\n\n    def __str__(self) -> str:\n        if self.bytes >= _BYTES_PER_TB:\n            return f'{self.to_tb():.2f} TB'\n        if self.bytes >= _BYTES_PER_GB:\n            return f'{self.to_gb():.2f} GB'\n        if self.bytes >= _BYTES_PER_MB:\n            return f'{self.to_mb():.2f} MB'\n        if self.bytes >= _BYTES_PER_KB:\n            return f'{self.to_kb():.2f} KB'\n        return f'{self.bytes} B'\n\n    def __eq__(self, other: object) -> bool:\n        if isinstance(other, ByteSize):\n     
       return self.bytes == other.bytes\n        return NotImplemented\n\n    def __hash__(self) -> int:\n        \"\"\"Return hash based on the bytes value.\"\"\"\n        return hash(self.bytes)\n\n    def __lt__(self, other: object) -> bool:\n        if isinstance(other, ByteSize):\n            return self.bytes < other.bytes\n        return NotImplemented\n\n    def __le__(self, other: object) -> bool:\n        if isinstance(other, ByteSize):\n            return self.bytes <= other.bytes\n        return NotImplemented\n\n    def __gt__(self, other: object) -> bool:\n        if isinstance(other, ByteSize):\n            return self.bytes > other.bytes\n        return NotImplemented\n\n    def __ge__(self, other: object) -> bool:\n        if isinstance(other, ByteSize):\n            return self.bytes >= other.bytes\n        return NotImplemented\n\n    def __add__(self, other: object) -> ByteSize:\n        if isinstance(other, ByteSize):\n            return ByteSize(self.bytes + other.bytes)\n        return NotImplemented\n\n    def __sub__(self, other: object) -> ByteSize:\n        if isinstance(other, ByteSize):\n            result = self.bytes - other.bytes\n            if result < 0:\n                raise ValueError('Resulting ByteSize cannot be negative')\n            return ByteSize(result)\n        return NotImplemented\n\n    def __mul__(self, other: object) -> ByteSize:\n        if isinstance(other, (int, float)):\n            return ByteSize(int(self.bytes * other))\n\n        return NotImplemented\n\n    def __truediv__(self, other: object) -> float:\n        if isinstance(other, ByteSize):\n            if other.bytes == 0:\n                raise ZeroDivisionError('Cannot divide by zero')\n            return self.bytes / other.bytes\n\n        return NotImplemented\n\n    def __rmul__(self, other: object) -> ByteSize:\n        return self.__mul__(other)\n"
  },
  {
    "path": "src/crawlee/_utils/console.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\nBORDER = {'TL': '┌', 'TR': '┐', 'BL': '└', 'BR': '┘', 'H': '─', 'V': '│', 'TM': '┬', 'BM': '┴'}\n\n\ndef make_table(rows: Sequence[Sequence[str]], width: int = 100) -> str:\n    \"\"\"Create a text table using Unicode characters.\n\n    Args:\n        rows: A list of tuples/lists to be displayed in the table.\n        width: Maximum width of the table.\n    \"\"\"\n    if not rows:\n        return ''\n\n    num_cols = max(len(row) for row in rows)\n\n    if num_cols == 0:\n        return ''\n\n    # Normalize the row size by filling missing columns with empty values\n    normalized_rows = [list(row) + [''] * (num_cols - len(row)) for row in rows]\n    col_widths = [max(len(str(row[i])) for row in normalized_rows) for i in range(num_cols)]\n    total_width = sum(col_widths) + (3 * num_cols) + 1\n\n    # If the table size is larger than `width`, set all columns to the same length\n    col_widths = col_widths if total_width <= width else [max(3, (width - (3 * num_cols) - 1) // num_cols)] * num_cols\n\n    # Initialize borders\n    top_parts, bottom_parts = [BORDER['TL']], [BORDER['BL']]\n\n    for i in range(num_cols):\n        h_border = BORDER['H'] * (col_widths[i] + 2)\n        top_parts.append(h_border)\n        bottom_parts.append(h_border)\n\n        if i < num_cols - 1:\n            top_parts.append(BORDER['TM'])\n            bottom_parts.append(BORDER['BM'])\n        else:\n            top_parts.append(BORDER['TR'])\n            bottom_parts.append(BORDER['BR'])\n\n    top_border, bottom_border = ''.join(top_parts), ''.join(bottom_parts)\n\n    result = [top_border]\n\n    for row in normalized_rows:\n        cells = []\n\n        for i, cell in enumerate(row):\n            # Trim the content if the length exceeds the widths of the column\n            norm_cell = f'{cell[: col_widths[i] - 3]}...' 
if len(cell) > col_widths[i] else cell.ljust(col_widths[i])\n            cells.append(norm_cell)\n\n        # row: │ cell1 │ cell2 │ ...\n        row_str = BORDER['V'] + ''.join(f' {cell} {BORDER[\"V\"]}' for cell in cells)\n        result.append(row_str)\n\n    result.append(bottom_border)\n\n    return '\\n'.join(result)\n"
  },
  {
    "path": "src/crawlee/_utils/context.py",
    "content": "from __future__ import annotations\n\nimport inspect\nfrom collections.abc import Callable\nfrom functools import wraps\nfrom typing import Any, TypeVar, cast\n\nT = TypeVar('T', bound=Callable[..., Any])\n\n\ndef ensure_context(method: T) -> T:\n    \"\"\"Ensure the (async) context manager is initialized before executing the method.\n\n    This decorator checks if the calling instance has an `active` attribute and verifies that it is set to `True`.\n    If the instance is inactive, it raises a `RuntimeError`. Works for both synchronous and asynchronous methods.\n\n    Args:\n        method: The method to wrap.\n\n    Returns:\n        The wrapped method with context checking applied.\n\n    Raises:\n        RuntimeError: If the instance lacks an `active` attribute or is not active.\n    \"\"\"\n\n    @wraps(method)\n    def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:\n        if not hasattr(self, 'active'):\n            raise RuntimeError(f'The {self.__class__.__name__} does not have the \"active\" attribute.')\n\n        if not self.active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active. Use it within the context.')\n\n        return method(self, *args, **kwargs)\n\n    @wraps(method)\n    async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:\n        if not hasattr(self, 'active'):\n            raise RuntimeError(f'The {self.__class__.__name__} does not have the \"active\" attribute.')\n\n        if not self.active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active. Use it within the async context.')\n\n        return await method(self, *args, **kwargs)\n\n    return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)\n"
  },
  {
    "path": "src/crawlee/_utils/crypto.py",
    "content": "from __future__ import annotations\n\nimport secrets\nfrom hashlib import sha256\n\n\ndef compute_short_hash(data: bytes, *, length: int = 8) -> str:\n    \"\"\"Compute a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.\n\n    Args:\n        data: The binary data to be hashed.\n        length: The length of the hash to be returned.\n\n    Returns:\n        A substring (prefix) of the hexadecimal hash of the data.\n    \"\"\"\n    hash_object = sha256(data)\n    return hash_object.hexdigest()[:length]\n\n\ndef crypto_random_object_id(length: int = 17) -> str:\n    \"\"\"Generate a random object ID.\"\"\"\n    chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789'\n    return ''.join(secrets.choice(chars) for _ in range(length))\n"
  },
  {
    "path": "src/crawlee/_utils/docs.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Callable\nfrom typing import Any, Literal, TypeVar\n\n# The order of the rendered API groups is defined in the website/docusaurus.config.js file.\nGroupName = Literal[\n    'Autoscaling',\n    'Browser management',\n    'Configuration',\n    'Crawlers',\n    'Crawling contexts',\n    'Errors',\n    'Event data',\n    'Event managers',\n    'Functions',\n    'HTTP clients',\n    'HTTP parsers',\n    'Request loaders',\n    'Session management',\n    'Statistics',\n    'Storage clients',\n    'Storage data',\n    'Storages',\n    'Other',\n]\n\nT = TypeVar('T', bound=Callable[..., Any])\n\n\ndef docs_group(group_name: GroupName) -> Callable[[T], T]:  # noqa: ARG001\n    \"\"\"Mark a symbol for rendering and grouping in documentation.\n\n    This decorator is used solely for documentation purposes and does not modify the behavior\n    of the decorated callable.\n\n    Args:\n        group_name: The documentation group to which the symbol belongs.\n\n    Returns:\n        The original callable without modification.\n    \"\"\"\n\n    def wrapper(func: T) -> T:\n        return func\n\n    return wrapper\n"
  },
  {
    "path": "src/crawlee/_utils/file.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport csv\nimport json\nimport os\nimport sys\nimport tempfile\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, overload\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n    from typing import Any, TextIO\n\n    from typing_extensions import Unpack\n\n    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs\n\nif sys.platform == 'win32':\n\n    def _write_file(path: Path, data: str | bytes) -> None:\n        \"\"\"Windows-specific file write implementation.\n\n        This implementation writes directly to the file without using a temporary file, because\n        they are problematic due to permissions issues on Windows.\n        \"\"\"\n        if isinstance(data, bytes):\n            path.write_bytes(data)\n        elif isinstance(data, str):\n            path.write_text(data, encoding='utf-8')\n        else:\n            raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.')\nelse:\n\n    def _write_file(path: Path, data: str | bytes) -> None:\n        \"\"\"Linux/Unix-specific file write implementation using temporary files.\"\"\"\n        dir_path = path.parent\n        fd, tmp_path = tempfile.mkstemp(\n            suffix=f'{path.suffix}.tmp',\n            prefix=f'{path.name}.',\n            dir=str(dir_path),\n        )\n\n        if not isinstance(data, (str, bytes)):\n            raise TypeError(f'Unsupported data type: {type(data)}. 
Expected str or bytes.')\n\n        try:\n            if isinstance(data, bytes):\n                with os.fdopen(fd, 'wb') as tmp_file:\n                    tmp_file.write(data)\n            else:\n                with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file:\n                    tmp_file.write(data)\n\n            # Atomically replace the destination file with the temporary file\n            Path(tmp_path).replace(path)\n        except Exception:\n            Path(tmp_path).unlink(missing_ok=True)\n            raise\n\n\ndef infer_mime_type(value: Any) -> str:\n    \"\"\"Infer the MIME content type from the value.\n\n    Args:\n        value: The value to infer the content type from.\n\n    Returns:\n        The inferred MIME content type.\n    \"\"\"\n    # If the value is bytes (or bytearray), return binary content type.\n    if isinstance(value, (bytes, bytearray)):\n        return 'application/octet-stream'\n\n    # If the value is a dict or list, assume JSON.\n    if isinstance(value, (dict, list)):\n        return 'application/json; charset=utf-8'\n\n    # If the value is a string, number or boolean, assume plain text.\n    if isinstance(value, (str, int, float, bool)):\n        return 'text/plain; charset=utf-8'\n\n    # Default fallback.\n    return 'application/octet-stream'\n\n\nasync def json_dumps(obj: Any) -> str:\n    \"\"\"Serialize an object to a JSON-formatted string with specific settings.\n\n    Args:\n        obj: The object to serialize.\n\n    Returns:\n        A string containing the JSON representation of the input object.\n    \"\"\"\n    return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str)\n\n\n@overload\nasync def atomic_write(\n    path: Path,\n    data: str,\n    *,\n    retry_count: int = 0,\n) -> None: ...\n\n\n@overload\nasync def atomic_write(\n    path: Path,\n    data: bytes,\n    *,\n    retry_count: int = 0,\n) -> None: ...\n\n\nasync def atomic_write(\n    path: Path,\n    data: 
str | bytes,\n    *,\n    retry_count: int = 0,\n) -> None:\n    \"\"\"Write data to a file atomically to prevent data corruption or partial writes.\n\n    This function handles both text and binary data. The binary mode is automatically\n    detected based on the data type (bytes = binary, str = text). It ensures atomic\n    writing by creating a temporary file and then atomically replacing the target file,\n    which prevents data corruption if the process is interrupted during the write operation.\n\n    Args:\n        path: The path to the destination file.\n        data: The data to write to the file (string or bytes).\n        retry_count: Internal parameter to track the number of retry attempts (default: 0).\n    \"\"\"\n    max_retries = 3\n\n    try:\n        # Use the platform-specific write function resolved at import time.\n        await asyncio.to_thread(_write_file, path, data)\n    except (FileNotFoundError, PermissionError):\n        if retry_count < max_retries:\n            return await atomic_write(\n                path,\n                data,\n                retry_count=retry_count + 1,\n            )\n        # If we reach the maximum number of retries, raise the exception.\n        raise\n\n\nasync def export_json_to_stream(\n    iterator: AsyncIterator[dict[str, Any]],\n    dst: TextIO,\n    **kwargs: Unpack[ExportDataJsonKwargs],\n) -> None:\n    items = [item async for item in iterator]\n    json.dump(items, dst, **kwargs)\n\n\nasync def export_csv_to_stream(\n    iterator: AsyncIterator[dict[str, Any]],\n    dst: TextIO,\n    **kwargs: Unpack[ExportDataCsvKwargs],\n) -> None:\n    # Set lineterminator to '\\n' if not explicitly provided. This prevents double line endings on Windows.\n    # The csv.writer default is '\\r\\n', which when written to a file in text mode on Windows gets converted\n    # to '\\r\\r\\n' due to newline translation. 
By using '\\n', we let the platform handle the line ending\n    # conversion: '\\n' stays as '\\n' on Unix, and becomes '\\r\\n' on Windows.\n    if 'lineterminator' not in kwargs:\n        kwargs['lineterminator'] = '\\n'\n\n    writer = csv.writer(dst, **kwargs)\n    write_header = True\n\n    # Iterate over the dataset and write to CSV.\n    async for item in iterator:\n        if not item:\n            continue\n\n        if write_header:\n            writer.writerow(item.keys())\n            write_header = False\n\n        writer.writerow(item.values())\n"
  },
  {
    "path": "src/crawlee/_utils/globs.py",
    "content": "from __future__ import annotations\n\nimport os\nimport re\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n\nclass Glob:\n    \"\"\"Wraps a glob pattern (supports the `*`, `**`, `?` wildcards).\"\"\"\n\n    def __init__(self, glob: str) -> None:\n        self.glob = glob\n        self.regexp = re.compile(_translate(self.glob, recursive=True))\n\n\ndef _translate(\n    pat: str, *, recursive: bool = False, include_hidden: bool = False, seps: Sequence[str] | None = None\n) -> str:\n    \"\"\"Translate a pathname with shell wildcards to a regular expression.\n\n    If `recursive` is true, the pattern segment '**' will match any number of\n    path segments.\n\n    If `include_hidden` is true, wildcards can match path segments beginning\n    with a dot ('.').\n\n    If a sequence of separator characters is given to `seps`, they will be\n    used to split the pattern into segments and match path separators. If not\n    given, os.path.sep and os.path.altsep (where available) are used.\n\n    HACK: This function is copied from CPython stdlib source. 
It will be released in Python 3.13 as `glob.translate`\n    \"\"\"\n    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps\n\n    escaped_seps = ''.join(map(re.escape, _seps))\n    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps\n    not_sep = f'[^{escaped_seps}]'\n\n    if include_hidden:\n        one_last_segment = f'{not_sep}+'\n        one_segment = f'{one_last_segment}{any_sep}'\n        any_segments = f'(?:.+{any_sep})?'\n        any_last_segments = '.*'\n    else:\n        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'\n        one_segment = f'{one_last_segment}{any_sep}'\n        any_segments = f'(?:{one_segment})*'\n        any_last_segments = f'{any_segments}(?:{one_last_segment})?'\n\n    results = []\n    parts = re.split(any_sep, pat)\n    last_part_idx = len(parts) - 1\n    for idx, part in enumerate(parts):\n        if part == '*':\n            results.append(one_segment if idx < last_part_idx else one_last_segment)\n        elif recursive and part == '**':\n            if idx < last_part_idx:\n                if parts[idx + 1] != '**':\n                    results.append(any_segments)\n            else:\n                results.append(any_last_segments)\n        else:\n            if part:\n                if not include_hidden and part[0] in '*?':\n                    results.append(r'(?!\\.)')\n                results.extend(_fnmatch_translate(part, f'{not_sep}*', not_sep))\n            if idx < last_part_idx:\n                results.append(any_sep)\n    res = ''.join(results)\n    return rf'(?s:{res})\\Z'\n\n\ndef _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[str]:\n    \"\"\"Copy of fnmatch._translate from Python 3.13.\"\"\"\n    res = list[str]()\n    add = res.append\n    i, n = 0, len(pat)\n    while i < n:\n        c = pat[i]\n        i = i + 1\n        if c == '*':\n            # compress consecutive `*` into one\n            if (not res) or 
res[-1] is not star:\n                add(star)\n        elif c == '?':\n            add(question_mark)\n        elif c == '[':\n            j = i\n            if j < n and pat[j] == '!':\n                j = j + 1\n            if j < n and pat[j] == ']':\n                j = j + 1\n            while j < n and pat[j] != ']':\n                j = j + 1\n            if j >= n:\n                add('\\\\[')\n            else:\n                stuff = pat[i:j]\n                if '-' not in stuff:\n                    stuff = stuff.replace('\\\\', r'\\\\')\n                else:\n                    chunks = []\n                    k = i + 2 if pat[i] == '!' else i + 1\n                    while True:\n                        k = pat.find('-', k, j)\n                        if k < 0:\n                            break\n                        chunks.append(pat[i:k])\n                        i = k + 1\n                        k = k + 3\n                    chunk = pat[i:j]\n                    if chunk:\n                        chunks.append(chunk)\n                    else:\n                        chunks[-1] += '-'\n                    # Remove empty ranges -- invalid in RE.\n                    for k in range(len(chunks) - 1, 0, -1):\n                        if chunks[k - 1][-1] > chunks[k][0]:\n                            chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]\n                            del chunks[k]\n                    # Escape backslashes and hyphens for set difference (--).\n                    # Hyphens that create ranges shouldn't be escaped.\n                    stuff = '-'.join(s.replace('\\\\', r'\\\\').replace('-', r'\\-') for s in chunks)\n                # Escape set operations (&&, ~~ and ||).\n                stuff = re.sub(r'([&~|])', r'\\\\\\1', stuff)\n                i = j + 1\n                if not stuff:\n                    # Empty range: never match.\n                    add('(?!)')\n                elif stuff == '!':\n           
         # Negated empty range: match any character.\n                    add('.')\n                else:\n                    if stuff[0] == '!':\n                        stuff = '^' + stuff[1:]\n                    elif stuff[0] in ('^', '['):\n                        stuff = '\\\\' + stuff\n                    add(f'[{stuff}]')\n        else:\n            add(re.escape(c))\n    return res\n"
  },
  {
    "path": "src/crawlee/_utils/html_to_text.py",
    "content": "# This file contains shared constants used by different implementations of html_to_text function.\nfrom __future__ import annotations\n\nimport re\n\n# Tags based on Javascript implementation of htmlToText from:\n# https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11\n# Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9\nSKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}\nBLOCK_TAGS = {\n    'p',\n    'h1',\n    'h2',\n    'h3',\n    'h4',\n    'h5',\n    'h6',\n    'ol',\n    'ul',\n    'li',\n    'pre',\n    'address',\n    'blockquote',\n    'dl',\n    'div',\n    'fieldset',\n    'form',\n    'table',\n    'tr',\n    'select',\n    'option',\n}\n\n_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\\s)$')\n_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\\n)$')\n_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\\s+')\n"
  },
  {
    "path": "src/crawlee/_utils/models.py",
    "content": "from __future__ import annotations\n\nfrom contextlib import suppress\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING, Annotated, Any\n\nfrom pydantic import PlainSerializer, TypeAdapter, ValidationError, WrapValidator\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n\n\"\"\"Utility types for Pydantic models.\"\"\"\n\n\ndef _timedelta_to_ms(td: timedelta | None) -> float | None:\n    if td == timedelta.max:\n        return float('inf')\n    if td is None:\n        return td\n    return round(td.total_seconds() * 1000)\n\n\ndef _timedelta_to_secs(td: timedelta | None) -> float | None:\n    if td == timedelta.max:\n        return float('inf')\n    if td is None:\n        return td\n    return td.total_seconds()\n\n\n_number_parser = TypeAdapter(float)\n\n\ndef _timedelta_from_ms(value: float | timedelta | Any | None, handler: Callable[[Any], timedelta]) -> timedelta | None:\n    if value == float('inf'):\n        return timedelta.max\n\n    # If the value is a string-encoded number, decode it\n    if isinstance(value, str):\n        with suppress(ValidationError):\n            value = _number_parser.validate_python(value)\n\n    if not isinstance(value, (int, float)):\n        return handler(value)\n\n    return timedelta(milliseconds=value)\n\n\ndef _timedelta_from_secs(\n    value: float | timedelta | Any | None,\n    handler: Callable[[Any], timedelta],\n) -> timedelta | None:\n    if value == float('inf'):\n        return timedelta.max\n\n    # If the value is a string-encoded number, decode it\n    if isinstance(value, str):\n        with suppress(ValidationError):\n            value = _number_parser.validate_python(value)\n\n    if not isinstance(value, (int, float)):\n        return handler(value)\n\n    return timedelta(seconds=value)\n\n\ntimedelta_ms = Annotated[timedelta, PlainSerializer(_timedelta_to_ms), WrapValidator(_timedelta_from_ms)]\ntimedelta_secs = Annotated[timedelta, 
PlainSerializer(_timedelta_to_secs), WrapValidator(_timedelta_from_secs)]\n"
  },
  {
    "path": "src/crawlee/_utils/raise_if_too_many_kwargs.py",
    "content": "from typing import Any\n\n\ndef raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:\n    \"\"\"Raise ValueError if there are more non-None kwargs than max_kwargs.\"\"\"\n    none_kwargs_names = [f'\"{kwarg_name}\"' for kwarg_name, value in kwargs.items() if value is not None]\n    if len(none_kwargs_names) > max_kwargs:\n        all_kwargs_names = [f'\"{kwarg_name}\"' for kwarg_name in kwargs]\n        raise ValueError(\n            f'Only one of {\", \".join(all_kwargs_names)} can be specified, but following arguments were '\n            f'specified: {\", \".join(none_kwargs_names)}.'\n        )\n"
  },
  {
    "path": "src/crawlee/_utils/recoverable_state.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Generic, Literal, TypeVar\n\nfrom pydantic import BaseModel\n\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.events._types import Event, EventPersistStateData\n\nif TYPE_CHECKING:\n    import logging\n    from collections.abc import Callable, Coroutine\n\n    from crawlee.storages import KeyValueStore\n\nTStateModel = TypeVar('TStateModel', bound=BaseModel)\n\n\nclass RecoverableState(Generic[TStateModel]):\n    \"\"\"A class for managing persistent recoverable state using a Pydantic model.\n\n    This class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved\n    across migrations or restarts. It manages the loading, saving, and resetting of state data,\n    with optional persistence capabilities.\n\n    The state is represented by a Pydantic model that can be serialized to and deserialized from JSON.\n    The class automatically hooks into the event system to persist state when needed.\n\n    Type Parameters:\n        TStateModel: A Pydantic BaseModel type that defines the structure of the state data.\n                     Typically, it should be inferred from the `default_state` constructor parameter.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        default_state: TStateModel,\n        persist_state_key: str,\n        persistence_enabled: Literal[True, False, 'explicit_only'] = False,\n        persist_state_kvs_name: str | None = None,\n        persist_state_kvs_id: str | None = None,\n        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,\n        logger: logging.Logger,\n    ) -> None:\n        \"\"\"Initialize a new recoverable state object.\n\n        Args:\n            default_state: The default state model instance to use when no persisted state is found.\n                A deep copy is made each time the state is used.\n 
           persist_state_key: The key under which the state is stored in the KeyValueStore\n            persistence_enabled: Flag to enable or disable state persistence. Use 'explicit_only' if you want to be able\n                to save the state manually, but without any automatic persistence.\n            persist_state_kvs_name: The name of the KeyValueStore to use for persistence.\n                If neither a name nor and id are supplied, the default store will be used.\n            persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.\n                If neither a name nor and id are supplied, the default store will be used.\n            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If\n                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.\n            logger: A logger instance for logging operations related to state persistence\n        \"\"\"\n        raise_if_too_many_kwargs(\n            persist_state_kvs_name=persist_state_kvs_name,\n            persist_state_kvs_id=persist_state_kvs_id,\n            persist_state_kvs_factory=persist_state_kvs_factory,\n        )\n        if not persist_state_kvs_factory:\n            logger.debug(\n                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '\n                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '\n                'process. 
It is recommended to initialize RecoverableState with explicit key_value_store to avoid '\n                'global side effects.'\n            )\n\n        self._default_state = default_state\n        self._state_type: type[TStateModel] = self._default_state.__class__\n        self._state: TStateModel | None = None\n        self._persistence_enabled = persistence_enabled\n        self._persist_state_key = persist_state_key\n        if persist_state_kvs_factory is None:\n\n            async def kvs_factory() -> KeyValueStore:\n                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import\n\n                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)\n\n            self._persist_state_kvs_factory = kvs_factory\n        else:\n            self._persist_state_kvs_factory = persist_state_kvs_factory\n\n        self._key_value_store: KeyValueStore | None = None\n        self._log = logger\n\n    async def initialize(self) -> TStateModel:\n        \"\"\"Initialize the recoverable state.\n\n        This method must be called before using the recoverable state. 
It loads the saved state\n        if persistence is enabled and registers the object to listen for PERSIST_STATE events.\n\n        Returns:\n            The loaded state model\n        \"\"\"\n        if self._persistence_enabled is False:\n            self._state = self._default_state.model_copy(deep=True)\n            return self.current_value\n\n        # Import here to avoid circular imports.\n\n        self._key_value_store = await self._persist_state_kvs_factory()\n\n        await self._load_saved_state()\n\n        if self._persistence_enabled is True:\n            # Import here to avoid circular imports.\n            from crawlee import service_locator  # noqa: PLC0415\n\n            event_manager = service_locator.get_event_manager()\n            event_manager.on(event=Event.PERSIST_STATE, listener=self.persist_state)\n\n        return self.current_value\n\n    async def teardown(self) -> None:\n        \"\"\"Clean up resources used by the recoverable state.\n\n        If persistence is enabled, this method deregisters the object from PERSIST_STATE events\n        and persists the current state one last time.\n        \"\"\"\n        if not self._persistence_enabled:\n            return\n\n        if self._persistence_enabled is True:\n            # Import here to avoid circular imports.\n            from crawlee import service_locator  # noqa: PLC0415\n\n            event_manager = service_locator.get_event_manager()\n            event_manager.off(event=Event.PERSIST_STATE, listener=self.persist_state)\n            await self.persist_state()\n\n    @property\n    def current_value(self) -> TStateModel:\n        \"\"\"Get the current state.\"\"\"\n        if self._state is None:\n            raise RuntimeError('Recoverable state has not yet been loaded')\n\n        return self._state\n\n    @property\n    def is_initialized(self) -> bool:\n        \"\"\"Check if the state has already been initialized.\"\"\"\n        return self._state is not None\n\n    
async def has_persisted_state(self) -> bool:\n        \"\"\"Check if there is any persisted state in the key-value store.\"\"\"\n        if not self._persistence_enabled:\n            return False\n\n        if self._key_value_store is None:\n            raise RuntimeError('Recoverable state has not yet been initialized')\n\n        return await self._key_value_store.record_exists(self._persist_state_key)\n\n    async def reset(self) -> None:\n        \"\"\"Reset the state to the default values and clear any persisted state.\n\n        Resets the current state to the default state and, if persistence is enabled,\n        clears the persisted state from the KeyValueStore.\n        \"\"\"\n        self._state = self._default_state.model_copy(deep=True)\n\n        if self._persistence_enabled:\n            if self._key_value_store is None:\n                raise RuntimeError('Recoverable state has not yet been initialized')\n\n            await self._key_value_store.set_value(self._persist_state_key, None)\n\n    async def persist_state(self, event_data: EventPersistStateData | None = None) -> None:\n        \"\"\"Persist the current state to the KeyValueStore.\n\n        This method is typically called in response to a PERSIST_STATE event, but can also be called\n        directly when needed.\n\n        Args:\n            event_data: Optional data associated with a PERSIST_STATE event\n        \"\"\"\n        self._log.debug(\n            f'Persisting RecoverableState (model={self._default_state.__class__.__name__}, event_data={event_data}).'\n        )\n\n        if self._key_value_store is None or self._state is None:\n            raise RuntimeError('Recoverable state has not yet been initialized')\n\n        if self._persistence_enabled is True or self._persistence_enabled == 'explicit_only':\n            await self._key_value_store.set_value(\n                self._persist_state_key,\n                self._state.model_dump(mode='json', by_alias=True),\n           
     'application/json',\n            )\n        else:\n            self._log.debug('Persistence is not enabled - not doing anything')\n\n    async def _load_saved_state(self) -> None:\n        if self._key_value_store is None:\n            raise RuntimeError('Recoverable state has not yet been initialized')\n\n        stored_state = await self._key_value_store.get_value(self._persist_state_key)\n        if stored_state is None:\n            self._state = self._default_state.model_copy(deep=True)\n        else:\n            self._state = self._state_type.model_validate(stored_state)\n"
  },
  {
    "path": "src/crawlee/_utils/recurring_task.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport inspect\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n    from datetime import timedelta\n    from types import TracebackType\n\n    from typing_extensions import Self\n\nlogger = getLogger(__name__)\n\n\nclass RecurringTask:\n    \"\"\"Class for creating and managing recurring tasks.\n\n    Attributes:\n        func: The function to be executed repeatedly.\n        delay: The time delay (in seconds) between function calls.\n        task: The underlying task object.\n    \"\"\"\n\n    def __init__(self, func: Callable, delay: timedelta) -> None:\n        logger.debug(\n            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',\n            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,\n            delay,\n        )\n        self.func = func\n        self.delay = delay\n        self.task: asyncio.Task | None = None\n\n    async def __aenter__(self) -> Self:\n        self.start()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        await self.stop()\n\n    async def _wrapper(self) -> None:\n        \"\"\"Continuously execute the provided function with the specified delay.\n\n        Run the function in a loop, waiting for the configured delay between executions.\n        Supports both synchronous and asynchronous functions.\n        \"\"\"\n        sleep_time_secs = self.delay.total_seconds()\n        while True:\n            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()\n            await asyncio.sleep(sleep_time_secs)\n\n    def start(self) -> None:\n        \"\"\"Start the recurring task execution.\"\"\"\n        name = self.func.__name__ if hasattr(self.func, 
'__name__') else self.func.__class__.__name__\n        self.task = asyncio.create_task(\n            self._wrapper(),\n            name=f'Task-recurring-{name}',\n        )\n\n    async def stop(self) -> None:\n        \"\"\"Stop the recurring task execution.\"\"\"\n        if self.task:\n            self.task.cancel()\n            # Ensure the task has a chance to properly handle the cancellation and any potential exceptions.\n            await asyncio.gather(self.task, return_exceptions=True)\n"
  },
  {
    "path": "src/crawlee/_utils/requests.py",
    "content": "from __future__ import annotations\n\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom yarl import URL\n\nfrom crawlee._utils.crypto import compute_short_hash\n\nif TYPE_CHECKING:\n    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload\n\nlogger = getLogger(__name__)\n\n\ndef normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:\n    \"\"\"Normalize a URL.\n\n    This function cleans and standardizes a URL by removing leading and trailing whitespaces,\n    converting the scheme and netloc to lower case, stripping unwanted tracking parameters\n    (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically,\n    and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally\n    identical but differ in trivial ways (such as parameter order or casing) are treated as the same.\n\n    Args:\n        url: The URL to be normalized.\n        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.\n\n    Returns:\n        A string containing the normalized URL.\n    \"\"\"\n    # Parse the URL\n    parsed_url = URL(url.strip())\n\n    # Remove any 'utm_' parameters\n    search_params = [(k, v) for k, v in parsed_url.query.items() if not k.startswith('utm_')]\n\n    # Construct the new query string\n    sorted_search_params = sorted(search_params)\n\n    # Construct the final URL\n    yarl_new_url = parsed_url.with_query(sorted_search_params)\n    yarl_new_url = yarl_new_url.with_path(\n        yarl_new_url.path.removesuffix('/'), keep_query=True, keep_fragment=keep_url_fragment\n    )\n\n    return str(yarl_new_url).lower()\n\n\ndef compute_unique_key(\n    url: str,\n    method: HttpMethod = 'GET',\n    headers: HttpHeaders | None = None,\n    payload: HttpPayload | None = None,\n    session_id: str | None = None,\n    *,\n    keep_url_fragment: bool = False,\n    use_extended_unique_key: bool = 
False,\n) -> str:\n    \"\"\"Compute a unique key for caching & deduplication of requests.\n\n    This function computes a unique key by normalizing the provided URL and method. If `use_extended_unique_key`\n    is True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key\n    is just the normalized URL. Additionally, if HTTP headers are provided, the whitelisted headers are hashed\n    and included in the key.\n\n    Args:\n        url: The request URL.\n        method: The HTTP method.\n        headers: The HTTP headers.\n        payload: The data to be sent as the request body.\n        keep_url_fragment: A flag indicating whether to keep the URL fragment.\n        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key.\n        session_id: The ID of a specific `Session` to which the request will be strictly bound\n\n    Returns:\n        A string representing the unique key for the request.\n    \"\"\"\n    # Normalize the URL.\n    try:\n        normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)\n    except Exception as exc:\n        logger.warning(f'Failed to normalize URL: {exc}')\n        normalized_url = url\n\n    # Normalize the method.\n    normalized_method = method.upper()\n\n    # Compute and return the extended unique key if required.\n    if use_extended_unique_key:\n        payload_hash = _get_payload_hash(payload)\n        headers_hash = _get_headers_hash(headers)\n        normalized_session = '' if session_id is None else session_id.lower()\n\n        # Return the extended unique key. 
Use pipe as a separator of the different parts of the unique key.\n        extended_part = f'{normalized_method}|{headers_hash}|{payload_hash}'\n        if normalized_session:\n            extended_part = f'{extended_part}|{normalized_session}'\n        return f'{extended_part}|{normalized_url}'\n\n    # Log information if there is a non-GET request with a payload.\n    if normalized_method != 'GET' and payload:\n        logger.info(\n            f'{normalized_method} request with a payload detected. By default, requests to the same URL with '\n            'different methods or payloads will be deduplicated. Use \"use_extended_unique_key\" to include payload '\n            'and headers in the unique key and avoid deduplication in these cases.'\n        )\n\n    # Return the normalized URL as the unique key.\n    return normalized_url\n\n\ndef _get_payload_hash(payload: HttpPayload | None) -> str:\n    payload_in_bytes = b'' if payload is None else payload\n    return compute_short_hash(payload_in_bytes)\n\n\ndef _get_headers_hash(headers: HttpHeaders | None) -> str:\n    # HTTP headers which will be included in the hash computation.\n    whitelisted_headers = {'accept', 'accept-language', 'authorization', 'content-type'}\n\n    if headers is None:\n        normalized_headers = b''\n    else:\n        filtered_headers = {key: value for key, value in headers.items() if key in whitelisted_headers}\n        normalized_headers = '|'.join(f'{k}:{v}' for k, v in filtered_headers.items()).encode('utf-8')\n\n    return compute_short_hash(normalized_headers)\n"
  },
  {
    "path": "src/crawlee/_utils/robots.py",
    "content": "from __future__ import annotations\n\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom protego import Protego\nfrom yarl import URL\n\nfrom crawlee._utils.sitemap import Sitemap\nfrom crawlee._utils.web import is_status_code_client_error\n\nif TYPE_CHECKING:\n    from typing_extensions import Self\n\n    from crawlee.http_clients import HttpClient\n    from crawlee.proxy_configuration import ProxyInfo\n\n\nlogger = getLogger(__name__)\n\n\nclass RobotsTxtFile:\n    def __init__(\n        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None\n    ) -> None:\n        self._robots = robots\n        self._original_url = URL(url).origin()\n        self._http_client = http_client\n        self._proxy_info = proxy_info\n\n    @classmethod\n    async def from_content(cls, url: str, content: str) -> Self:\n        \"\"\"Create a `RobotsTxtFile` instance from the given content.\n\n        Args:\n            url: The URL associated with the robots.txt file.\n            content: The raw string content of the robots.txt file to be parsed.\n        \"\"\"\n        robots = Protego.parse(content)\n        return cls(url, robots)\n\n    @classmethod\n    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:\n        \"\"\"Determine the location of a robots.txt file for a URL and fetch it.\n\n        Args:\n            url: The URL whose domain will be used to find the corresponding robots.txt file.\n            http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. 
If None, no proxy is used.\n            proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.\n        \"\"\"\n        robots_url = URL(url).with_path('/robots.txt')\n        return await cls.load(str(robots_url), http_client, proxy_info)\n\n    @classmethod\n    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:\n        \"\"\"Load the robots.txt file for a given URL.\n\n        Args:\n            url: The direct URL of the robots.txt file to be loaded.\n            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.\n            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.\n        \"\"\"\n        try:\n            response = await http_client.send_request(url, proxy_info=proxy_info)\n\n            body = (\n                b'User-agent: *\\nAllow: /'\n                if is_status_code_client_error(response.status_code)\n                else await response.read()\n            )\n            robots = Protego.parse(body.decode('utf-8'))\n\n        except Exception as e:\n            logger.warning(f'Failed to fetch from robots.txt from \"{url}\" with error: \"{e}\"')\n\n            robots = Protego.parse('User-agent: *\\nAllow: /')\n\n        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)\n\n    def is_allowed(self, url: str, user_agent: str = '*') -> bool:\n        \"\"\"Check if the given URL is allowed for the given user agent.\n\n        Args:\n            url: The URL to check against the robots.txt rules.\n            user_agent: The user-agent string to check permissions for. 
Defaults to '*' which matches any user-agent.\n        \"\"\"\n        check_url = URL(url)\n        if check_url.origin() != self._original_url:\n            return True\n        return bool(self._robots.can_fetch(str(check_url), user_agent))\n\n    def get_sitemaps(self) -> list[str]:\n        \"\"\"Get the list of sitemaps urls from the robots.txt file.\"\"\"\n        return list(self._robots.sitemaps)\n\n    def get_crawl_delay(self, user_agent: str = '*') -> int | None:\n        \"\"\"Get the crawl delay for the given user agent.\n\n        Args:\n            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any\n                user-agent.\n        \"\"\"\n        crawl_delay = self._robots.crawl_delay(user_agent)\n        return int(crawl_delay) if crawl_delay is not None else None\n\n    async def parse_sitemaps(self) -> Sitemap:\n        \"\"\"Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.\"\"\"\n        sitemaps = self.get_sitemaps()\n        if not self._http_client:\n            raise ValueError('HTTP client is required to parse sitemaps.')\n\n        return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)\n\n    async def parse_urls_from_sitemaps(self) -> list[str]:\n        \"\"\"Parse the sitemaps in the robots.txt file and return a list URLs.\"\"\"\n        sitemap = await self.parse_sitemaps()\n        return sitemap.urls\n"
  },
  {
    "path": "src/crawlee/_utils/sitemap.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport re\nimport zlib\nfrom codecs import getincrementaldecoder\nfrom collections import defaultdict\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom datetime import datetime, timedelta\nfrom hashlib import sha256\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Literal, TypedDict\nfrom xml.sax import SAXParseException\nfrom xml.sax.expatreader import ExpatParser\nfrom xml.sax.handler import ContentHandler\n\nfrom typing_extensions import NotRequired, override\nfrom yarl import URL\n\nfrom crawlee._utils.web import is_status_code_successful\nfrom crawlee.errors import ProxyError\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from xml.sax.xmlreader import AttributesImpl\n\n    from crawlee.http_clients import HttpClient\n    from crawlee.proxy_configuration import ProxyInfo\n\nlogger = getLogger(__name__)\n\nVALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}\nSITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}\nSITEMAP_URL_PATTERN = re.compile(r'\\/sitemap\\.(?:xml|txt)(?:\\.gz)?$', re.IGNORECASE)\nCOMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']\n\n\n@dataclass()\nclass SitemapUrl:\n    loc: str\n    lastmod: datetime | None = None\n    changefreq: str | None = None\n    priority: float | None = None\n    origin_sitemap_url: str | None = None\n\n\n@dataclass()\nclass NestedSitemap:\n    loc: str\n    origin_sitemap_url: str | None = None\n\n\nclass ParseSitemapOptions(TypedDict, total=False):\n    emit_nested_sitemaps: bool\n    max_depth: int\n    sitemap_retries: int\n    timeout: timedelta | None\n\n\nclass SitemapSource(TypedDict):\n    type: Literal['url', 'raw']\n    url: NotRequired[str]\n    content: NotRequired[str]\n    depth: NotRequired[int]\n\n\nclass _SitemapItem(TypedDict, total=False):\n    type: 
Literal['url', 'sitemap_url']\n    loc: str\n    url: str\n    lastmod: datetime | None\n    changefreq: str | None\n    priority: float | None\n\n\nclass _XMLSaxSitemapHandler(ContentHandler):\n    def __init__(self) -> None:\n        super().__init__()\n        self._root_tag_name: str | None = None\n        self._current_tag: str | None = None\n        self._current_url: _SitemapItem = {}\n        self._buffer: str = ''\n        self._items: list[_SitemapItem] = []\n\n    @property\n    def items(self) -> list[_SitemapItem]:\n        return self._items\n\n    @override\n    def startElement(self, name: str, attrs: AttributesImpl) -> None:\n        if self._root_tag_name is None and name in ('urlset', 'sitemapindex'):\n            self._root_tag_name = name\n\n        if name in ('loc', 'lastmod', 'changefreq', 'priority'):\n            self._current_tag = name\n            self._buffer = ''\n\n    def characters(self, content: str) -> None:\n        if self._current_tag:\n            self._buffer += content\n\n    @override\n    def endElement(self, name: str) -> None:\n        if name == self._current_tag:\n            text = self._buffer.strip()\n\n            if name == 'loc':\n                if self._root_tag_name == 'sitemapindex':\n                    self._items.append({'type': 'sitemap_url', 'url': text})\n                else:\n                    self._current_url['loc'] = text\n\n            elif name == 'lastmod' and text:\n                with suppress(ValueError):\n                    self._current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00'))\n\n            elif name == 'priority' and text:\n                with suppress(ValueError):\n                    self._current_url['priority'] = float(text)\n\n            elif name == 'changefreq' and text in VALID_CHANGE_FREQS:\n                self._current_url['changefreq'] = text\n\n            self.current_tag = None\n\n        if name == 'url' and 'loc' in self._current_url:\n  
          self.items.append({'type': 'url', **self._current_url})\n            self._current_url = {}\n\n\nclass _TxtSitemapParser:\n    \"\"\"Parser for plaintext sitemaps that processes data as a stream.\"\"\"\n\n    def __init__(self) -> None:\n        self._buffer = ''\n\n    async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:\n        \"\"\"Process a chunk of text data and yield items one by one.\"\"\"\n        self._buffer += chunk\n\n        # Process complete lines\n        if '\\n' in self._buffer:\n            lines = self._buffer.split('\\n')\n            # Last element might be incomplete, save for next chunk\n            self._buffer = lines.pop()\n\n            for line in lines:\n                url = line.strip()\n                if url:\n                    yield {'type': 'url', 'loc': url}\n\n    async def flush(self) -> AsyncGenerator[_SitemapItem, None]:\n        \"\"\"Process any remaining data in the buffer, yielding items one by one.\"\"\"\n        if self._buffer:\n            url = self._buffer.strip()\n            if url:\n                yield {'type': 'url', 'loc': url}\n            self.buffer = ''\n\n    def close(self) -> None:\n        \"\"\"Clean up resources.\"\"\"\n        self._buffer = ''\n\n\nclass _XmlSitemapParser:\n    \"\"\"Parser for XML sitemaps using SAX to process data as a stream.\"\"\"\n\n    def __init__(self) -> None:\n        self._parser = ExpatParser()\n        self._handler = _XMLSaxSitemapHandler()\n        self._parser.setContentHandler(self._handler)\n\n    async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:\n        \"\"\"Process a chunk of XML data and yield items one by one.\"\"\"\n        try:\n            self._parser.feed(chunk)\n\n            # If we get here, the XML was valid and complete\n            for item in self._handler.items:\n                yield item\n\n            self._handler.items.clear()\n\n        except Exception as e:\n      
      logger.warning(f'Failed to parse XML data chunk: {e}', exc_info=True)\n\n    async def flush(self) -> AsyncGenerator[_SitemapItem, None]:\n        \"\"\"Process any remaining data in the buffer, yielding items one by one.\"\"\"\n        try:\n            self._parser.flush()\n\n            for item in self._handler.items:\n                yield item\n\n            self._handler.items.clear()\n\n        except Exception as e:\n            logger.warning(f'Failed to parse remaining XML data: {e}')\n\n    def close(self) -> None:\n        \"\"\"Clean up resources.\"\"\"\n        with suppress(SAXParseException):\n            self._parser.close()\n\n\ndef _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:\n    \"\"\"Create appropriate parser based on content type and URL.\"\"\"\n    if 'text/plain' in content_type.lower() or (url and URL(url).path.endswith('.txt')):\n        return _TxtSitemapParser()\n    # Default to XML parser for most cases\n    return _XmlSitemapParser()\n\n\ndef _get_origin_url(source: SitemapSource) -> str:\n    \"\"\"Determine the origin URL for a sitemap source.\"\"\"\n    if source['type'] == 'url' and 'url' in source:\n        return source['url']\n    if source['type'] == 'raw' and 'content' in source:\n        # For raw content sources, create a consistent identifier\n        return f'raw://{sha256(source[\"content\"].encode()).hexdigest()}'\n    return ''\n\n\nasync def _process_sitemap_item(\n    item: _SitemapItem,\n    source: SitemapSource,\n    depth: int,\n    visited_sitemap_urls: set[str],\n    sources: list[SitemapSource],\n    *,\n    emit_nested_sitemaps: bool,\n) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]:\n    \"\"\"Process a sitemap item and yield appropriate results.\"\"\"\n    item_copy = item.copy()  # Work with a copy to avoid modifying the original\n\n    if 'type' not in item_copy:\n        return\n\n    item_type = item_copy.pop('type')\n\n    # 
Handle sitemap URL references (nested sitemaps)\n    if item_type == 'sitemap_url' and 'url' in item_copy:\n        sitemap_url = item_copy['url']\n        if sitemap_url and sitemap_url not in visited_sitemap_urls:\n            # Add to processing queue\n            sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1))\n\n            # Output the nested sitemap reference if requested\n            if emit_nested_sitemaps:\n                yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None)\n\n    # Handle individual URL entries\n    elif item_type == 'url' and 'loc' in item_copy:\n        # Determine the origin sitemap URL for tracking purposes\n        origin_url = _get_origin_url(source)\n\n        # Create and yield the sitemap URL object\n        yield SitemapUrl(\n            loc=item_copy['loc'],\n            lastmod=item_copy.get('lastmod'),\n            changefreq=item_copy.get('changefreq'),\n            priority=item_copy.get('priority'),\n            origin_sitemap_url=origin_url,\n        )\n\n\nasync def _process_raw_source(\n    source: SitemapSource,\n    depth: int,\n    visited_sitemap_urls: set[str],\n    sources: list[SitemapSource],\n    *,\n    emit_nested_sitemaps: bool,\n) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:\n    \"\"\"Process a raw content sitemap source.\"\"\"\n    if 'content' not in source:\n        logger.warning(f'Raw source missing content: {source}')\n        return\n\n    content = source['content']\n    parser = _get_parser('text/xml')\n\n    try:\n        # Process the content\n        async for item in parser.process_chunk(content):\n            async for result in _process_sitemap_item(\n                item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps\n            ):\n                if result:\n                    yield result\n\n        # Process any remaining content\n        async for item in parser.flush():\n            async for 
result in _process_sitemap_item(\n                item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps\n            ):\n                if result:\n                    yield result\n    except Exception as e:\n        logger.warning(f'Failed to parse raw sitemap content: {e}')\n    finally:\n        parser.close()\n\n\nasync def _fetch_and_process_sitemap(\n    http_client: HttpClient,\n    source: SitemapSource,\n    depth: int,\n    visited_sitemap_urls: set[str],\n    sources: list[SitemapSource],\n    retries_left: int,\n    *,\n    proxy_info: ProxyInfo | None = None,\n    timeout: timedelta | None = None,\n    emit_nested_sitemaps: bool,\n) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:\n    \"\"\"Fetch a sitemap from a URL and process its content.\"\"\"\n    if 'url' not in source:\n        return\n\n    sitemap_url = source['url']\n\n    try:\n        while retries_left > 0:\n            retries_left -= 1\n            async with http_client.stream(\n                sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout\n            ) as response:\n                # Determine content type and compression\n                content_type = response.headers.get('content-type', '')\n\n                decoder = getincrementaldecoder('utf-8')(errors='replace')\n\n                # Create appropriate parser\n                parser = _get_parser(content_type, sitemap_url)\n                decompressor = None\n                try:\n                    # Process chunks as they arrive\n                    first_chunk = True\n                    async for raw_chunk in response.read_stream():\n                        # Check if the first chunk is a valid gzip header\n                        if first_chunk and raw_chunk.startswith(b'\\x1f\\x8b'):\n                            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)\n                        first_chunk = False\n\n                 
       chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk\n                        text_chunk = decoder.decode(chunk)\n                        async for item in parser.process_chunk(text_chunk):\n                            async for result in _process_sitemap_item(\n                                item,\n                                source,\n                                depth,\n                                visited_sitemap_urls,\n                                sources,\n                                emit_nested_sitemaps=emit_nested_sitemaps,\n                            ):\n                                if result:\n                                    yield result\n\n                    # Process any remaining content\n                    async for item in parser.flush():\n                        async for result in _process_sitemap_item(\n                            item,\n                            source,\n                            depth,\n                            visited_sitemap_urls,\n                            sources,\n                            emit_nested_sitemaps=emit_nested_sitemaps,\n                        ):\n                            if result:\n                                yield result\n                finally:\n                    parser.close()\n                break\n\n    except Exception as e:\n        if retries_left > 0:\n            logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. 
Retries left: {retries_left}')\n            await asyncio.sleep(1)  # Brief pause before retry\n\n\nclass Sitemap:\n    def __init__(self, urls: list[str]) -> None:\n        self._urls = urls\n\n    @property\n    def urls(self) -> list[str]:\n        return self._urls\n\n    @classmethod\n    async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:\n        base_url = URL(url)\n        sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS]\n        return await cls.load(sitemap_urls, http_client, proxy_info)\n\n    @classmethod\n    async def load(\n        cls,\n        urls: str | list[str],\n        http_client: HttpClient,\n        proxy_info: ProxyInfo | None = None,\n        parse_sitemap_options: ParseSitemapOptions | None = None,\n    ) -> Sitemap:\n        if isinstance(urls, str):\n            urls = [urls]\n        return await cls.parse(\n            [SitemapSource(type='url', url=url) for url in urls], http_client, proxy_info, parse_sitemap_options\n        )\n\n    @classmethod\n    async def from_xml_string(cls, content: str) -> Sitemap:\n        return await cls.parse([SitemapSource(type='raw', content=content)])\n\n    @classmethod\n    async def parse(\n        cls,\n        sources: list[SitemapSource],\n        http_client: HttpClient | None = None,\n        proxy_info: ProxyInfo | None = None,\n        parse_sitemap_options: ParseSitemapOptions | None = None,\n    ) -> Sitemap:\n        urls = [item.loc async for item in parse_sitemap(sources, http_client, proxy_info, parse_sitemap_options)]\n        return cls(urls)\n\n\nasync def parse_sitemap(\n    initial_sources: list[SitemapSource],\n    http_client: HttpClient | None = None,\n    proxy_info: ProxyInfo | None = None,\n    options: ParseSitemapOptions | None = None,\n) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:\n    \"\"\"Parse sitemap(s) and yield URLs found in them.\n\n    This function 
coordinates the process of fetching and parsing sitemaps,\n    handling both URL-based and raw content sources. It follows nested sitemaps\n    up to the specified maximum depth.\n    \"\"\"\n    # Set default options\n    default_timeout = timedelta(seconds=30)\n    if options:\n        emit_nested_sitemaps = options['emit_nested_sitemaps']\n        max_depth = options['max_depth']\n        sitemap_retries = options['sitemap_retries']\n        timeout = options.get('timeout', default_timeout)\n    else:\n        emit_nested_sitemaps = False\n        max_depth = float('inf')\n        sitemap_retries = 3\n        timeout = default_timeout\n\n    # Setup working state\n    sources = list(initial_sources)\n    visited_sitemap_urls: set[str] = set()\n\n    # Process sources until the queue is empty\n    while sources:\n        source = sources.pop(0)\n        depth = source.get('depth', 0)\n\n        # Skip if we've reached max depth\n        if depth > max_depth:\n            logger.debug(f'Skipping sitemap {source.get(\"url\", \"\")} - exceeded max depth {max_depth}')\n            continue\n\n        # Process based on source type\n        if source['type'] == 'raw':\n            async for result in _process_raw_source(\n                source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps\n            ):\n                yield result\n\n        elif source['type'] == 'url' and 'url' in source:\n            # Add to visited set before processing to avoid duplicates\n            if http_client is None:\n                raise RuntimeError('HttpClient must be provided for URL-based sitemap sources.')\n\n            visited_sitemap_urls.add(source['url'])\n\n            async for result in _fetch_and_process_sitemap(\n                http_client,\n                source,\n                depth,\n                visited_sitemap_urls,\n                sources,\n                sitemap_retries,\n                
emit_nested_sitemaps=emit_nested_sitemaps,\n                proxy_info=proxy_info,\n                timeout=timeout,\n            ):\n                yield result\n        else:\n            logger.warning(f'Invalid source configuration: {source}')\n\n\nasync def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator:\n    queue: asyncio.Queue = asyncio.Queue()\n\n    end_feed = object()\n\n    async def feed(gen: AsyncGenerator) -> None:\n        try:\n            async for item in gen:\n                await queue.put(item)\n        except Exception:\n            logger.warning(f'Error in generator: {gen}', exc_info=True)\n        finally:\n            await queue.put(end_feed)\n\n    tasks = [asyncio.create_task(feed(gen)) for gen in generators]\n    remaining_tasks = len(tasks)\n\n    try:\n        while remaining_tasks > 0:\n            item = await queue.get()\n            if item is end_feed:\n                remaining_tasks -= 1\n            else:\n                yield item\n    finally:\n        for task in tasks:\n            task.cancel()\n        await asyncio.gather(*tasks, return_exceptions=True)\n\n\nasync def _discover_for_hostname(\n    hostname: str,\n    hostname_urls: list[str],\n    *,\n    http_client: HttpClient,\n    proxy_info: ProxyInfo | None = None,\n    request_timeout: timedelta,\n    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',\n) -> AsyncGenerator[str, None]:\n    # Import here to avoid circular imports.\n    from crawlee._utils.robots import RobotsTxtFile  # noqa: PLC0415\n\n    domain_seen: set[str] = set()\n    hostname_urls = list(set(hostname_urls))  # Remove duplicates\n\n    def _check_and_add(url: str) -> bool:\n        if url in domain_seen:\n            return False\n        domain_seen.add(url)\n        return True\n\n    # Try getting sitemaps from robots.txt first\n    robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)\n    for sitemap_url in 
robots.get_sitemaps():\n        if _check_and_add(sitemap_url):\n            yield sitemap_url\n\n    # Check maybe provided URLs have sitemap url\n    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]\n\n    if matching_sitemap_urls:\n        for sitemap_url in matching_sitemap_urls:\n            if _check_and_add(sitemap_url):\n                yield sitemap_url\n    else:\n        # Check common sitemap locations\n        base_url = URL(hostname_urls[0])\n        for path in COMMON_SITEMAP_PATHS:\n            candidate = str(base_url.with_path(path))\n            if candidate in domain_seen:\n                continue\n            try:\n                response = await http_client.send_request(\n                    candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout\n                )\n                if is_status_code_successful(response.status_code) and _check_and_add(candidate):\n                    yield candidate\n            except ProxyError:\n                logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')\n            except asyncio.TimeoutError:\n                logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')\n            except Exception:\n                logger.warning(f'Error when checking {candidate} with sitemap discovery for {hostname}', exc_info=True)\n\n\nasync def discover_valid_sitemaps(\n    urls: list[str],\n    *,\n    http_client: HttpClient,\n    proxy_info: ProxyInfo | None = None,\n    request_timeout: timedelta = timedelta(seconds=20),\n    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',\n) -> AsyncGenerator[str, None]:\n    \"\"\"Discover related sitemaps for the given URLs.\n\n    Args:\n        urls: List of URLs to discover sitemaps for.\n        http_client: `HttpClient` to use for making requests.\n        proxy_info: Proxy configuration to use for requests.\n      
  request_timeout: Timeout for each request when checking for sitemaps.\n        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).\n    \"\"\"\n    # Use a set to track seen sitemap URLs and avoid duplicates\n    seen = set()\n\n    grouped_urls = defaultdict(list)\n    for url in urls:\n        try:\n            hostname = URL(url).host\n        except ValueError:\n            logger.warning(f'Invalid URL {url} skipped')\n            continue\n\n        if not hostname:\n            logger.warning(f'URL {url} without host skipped')\n            continue\n\n        grouped_urls[hostname].append(url)\n\n    generators = [\n        _discover_for_hostname(\n            hostname,\n            hostname_urls,\n            http_client=http_client,\n            proxy_info=proxy_info,\n            request_timeout=request_timeout,\n            method_for_checking=method_for_checking,\n        )\n        for hostname, hostname_urls in grouped_urls.items()\n    ]\n\n    async for sitemap_url in _merge_async_generators(*generators):\n        if sitemap_url not in seen:\n            seen.add(sitemap_url)\n            yield sitemap_url\n"
  },
  {
    "path": "src/crawlee/_utils/system.py",
    "content": "from __future__ import annotations\n\nimport os\nimport sys\nfrom contextlib import suppress\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Annotated\n\nimport psutil\nfrom pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator\n\nfrom crawlee._utils.byte_size import ByteSize\n\nlogger = getLogger(__name__)\n\nif sys.platform == 'linux':\n    \"\"\"Get the most suitable available used memory metric.\n\n    `Proportional Set Size (PSS)`, is the amount of own memory and memory shared with other processes, accounted in a\n    way that the shared amount is divided evenly between the processes that share it. Available on Linux. Suitable for\n    avoiding overestimation by counting the same shared memory used by children processes multiple times.\n\n    `Resident Set Size (RSS)` is the non-swapped physical memory a process has used; it includes shared memory. It\n    should be available everywhere.\n    \"\"\"\n\n    def _get_used_memory(process: psutil.Process) -> int:\n        return int(process.memory_full_info().pss)\nelse:\n\n    def _get_used_memory(process: psutil.Process) -> int:\n        return int(process.memory_info().rss)\n\n\nclass CpuInfo(BaseModel):\n    \"\"\"Information about the CPU usage.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    used_ratio: Annotated[float, Field(alias='usedRatio')]\n    \"\"\"The ratio of CPU currently in use, represented as a float between 0 and 1.\"\"\"\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        created_at: datetime = datetime.now(timezone.utc)\n        \"\"\"The time at which the measurement was taken.\"\"\"\n    else:\n        created_at: Annotated[\n            datetime,\n            Field(\n                alias='createdAt',\n                default_factory=lambda: datetime.now(timezone.utc),\n       
     ),\n        ]\n        \"\"\"The time at which the measurement was taken.\"\"\"\n\n\nclass MemoryUsageInfo(BaseModel):\n    \"\"\"Information about the memory usage.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    current_size: Annotated[\n        ByteSize,\n        PlainValidator(ByteSize.validate),\n        PlainSerializer(lambda size: size.bytes),\n        Field(alias='currentSize'),\n    ]\n    \"\"\"Memory usage of the current Python process and its children.\"\"\"\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        created_at: datetime = datetime.now(timezone.utc)\n        \"\"\"The time at which the measurement was taken.\"\"\"\n    else:\n        created_at: Annotated[\n            datetime,\n            Field(\n                alias='createdAt',\n                default_factory=lambda: datetime.now(timezone.utc),\n            ),\n        ]\n        \"\"\"The time at which the measurement was taken.\"\"\"\n\n\nclass MemoryInfo(MemoryUsageInfo):\n    \"\"\"Information about system memory.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    total_size: Annotated[\n        ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')\n    ]\n    \"\"\"Total memory available in the system.\"\"\"\n\n    system_wide_used_size: Annotated[\n        ByteSize,\n        PlainValidator(ByteSize.validate),\n        PlainSerializer(lambda size: size.bytes),\n        Field(alias='systemWideUsedSize'),\n    ]\n    \"\"\"Total memory used by all processes system-wide (including non-crawlee processes).\"\"\"\n\n\ndef get_cpu_info() -> CpuInfo:\n    \"\"\"Retrieve the current CPU usage.\n\n    It utilizes the `psutil` library. 
Function `psutil.cpu_percent()` returns a float representing the current\n    system-wide CPU utilization as a percentage.\n    \"\"\"\n    logger.debug('Calling get_cpu_info()...')\n    cpu_percent = psutil.cpu_percent(interval=0.1)\n    return CpuInfo(used_ratio=cpu_percent / 100)\n\n\ndef get_memory_info() -> MemoryInfo:\n    \"\"\"Retrieve the current memory usage of the process and its children.\n\n    It utilizes the `psutil` library.\n    \"\"\"\n    logger.debug('Calling get_memory_info()...')\n    current_process = psutil.Process(os.getpid())\n\n    # Retrieve estimated memory usage of the current process.\n    current_size_bytes = _get_used_memory(current_process)\n\n    # Sum memory usage by all children processes, try to exclude shared memory from the sum if allowed by OS.\n    for child in current_process.children(recursive=True):\n        # Ignore any NoSuchProcess exception that might occur if a child process ends before we retrieve\n        # its memory usage.\n        with suppress(psutil.NoSuchProcess):\n            current_size_bytes += _get_used_memory(child)\n\n    vm = psutil.virtual_memory()\n\n    return MemoryInfo(\n        total_size=ByteSize(vm.total),\n        current_size=ByteSize(current_size_bytes),\n        system_wide_used_size=ByteSize(vm.total - vm.available),\n    )\n"
  },
  {
    "path": "src/crawlee/_utils/time.py",
    "content": "from __future__ import annotations\n\nimport time\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\n\nfrom async_timeout import Timeout, timeout\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n    from types import TracebackType\n\n_SECONDS_PER_MINUTE = 60\n_SECONDS_PER_HOUR = 3600\n\n\n@dataclass\nclass TimerResult:\n    wall: float | None = None\n    cpu: float | None = None\n\n\n@contextmanager\ndef measure_time() -> Iterator[TimerResult]:\n    \"\"\"Measure the execution time (wall-clock and CPU) between the start and end of the with-block.\"\"\"\n    result = TimerResult()\n    before_wall = time.monotonic()\n    before_cpu = time.thread_time()\n\n    try:\n        yield result\n    finally:\n        after_wall = time.monotonic()\n        after_cpu = time.thread_time()\n        result.wall = after_wall - before_wall\n        result.cpu = after_cpu - before_cpu\n\n\nclass SharedTimeout:\n    \"\"\"Keeps track of a time budget shared by multiple independent async operations.\n\n    Provides a reusable, non-reentrant context manager interface.\n    \"\"\"\n\n    def __init__(self, timeout: timedelta) -> None:\n        self._remaining_timeout = timeout\n        self._active_timeout: Timeout | None = None\n        self._activation_timestamp: float | None = None\n\n    async def __aenter__(self) -> timedelta:\n        if self._active_timeout is not None or self._activation_timestamp is not None:\n            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')\n\n        self._activation_timestamp = time.monotonic()\n        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())\n        await new_timeout.__aenter__()\n        return self._remaining_timeout\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | 
None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        if self._active_timeout is None or self._activation_timestamp is None:\n            raise RuntimeError('Logic error')\n\n        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)\n        elapsed = time.monotonic() - self._activation_timestamp\n        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)\n\n        self._active_timeout = None\n        self._activation_timestamp = None\n\n\ndef format_duration(duration: timedelta | None) -> str:\n    \"\"\"Format a timedelta into a human-readable string with appropriate units.\"\"\"\n    if duration is None:\n        return 'None'\n\n    total_seconds = duration.total_seconds()\n\n    if total_seconds == 0:\n        return '0s'\n\n    # For very small durations, show in milliseconds\n    if total_seconds < 1:\n        milliseconds = total_seconds * 1000\n        if milliseconds < 1:\n            microseconds = total_seconds * 1_000_000\n            return f'{microseconds:.1f}μs'\n        return f'{milliseconds:.1f}ms'\n\n    # For durations less than 60 seconds, show in seconds\n    if total_seconds < _SECONDS_PER_MINUTE:\n        return f'{total_seconds:.2f}s'\n\n    # For durations less than 1 hour, show in minutes and seconds\n    if total_seconds < _SECONDS_PER_HOUR:\n        minutes = int(total_seconds // _SECONDS_PER_MINUTE)\n        seconds = total_seconds % _SECONDS_PER_MINUTE\n        if seconds == 0:\n            return f'{minutes}min'\n        return f'{minutes}min {seconds:.1f}s'\n\n    # For longer durations, show in hours, minutes, and seconds\n    hours = int(total_seconds // _SECONDS_PER_HOUR)\n    remaining_seconds = total_seconds % _SECONDS_PER_HOUR\n    minutes = int(remaining_seconds // _SECONDS_PER_MINUTE)\n    seconds = remaining_seconds % _SECONDS_PER_MINUTE\n\n    result = f'{hours}h'\n    if minutes > 0:\n        result += f' {minutes}min'\n    if seconds > 0:\n       
 result += f' {seconds:.1f}s'\n\n    return result\n"
  },
  {
    "path": "src/crawlee/_utils/try_import.py",
    "content": "import sys\nfrom collections.abc import Iterator\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass\nfrom types import ModuleType\nfrom typing import Any\n\n\n@contextmanager\ndef try_import(module_name: str, *symbol_names: str) -> Iterator[None]:\n    \"\"\"Context manager to attempt importing symbols into a module.\n\n    If an `ImportError` is raised during the import, the symbol will be replaced with a `FailedImport` object.\n    \"\"\"\n    try:\n        yield\n    except ImportError as e:\n        for symbol_name in symbol_names:\n            setattr(sys.modules[module_name], symbol_name, FailedImport(e.args[0]))\n\n\ndef install_import_hook(module_name: str) -> None:\n    \"\"\"Install an import hook for a specified module.\"\"\"\n    sys.modules[module_name].__class__ = ImportWrapper\n\n\n@dataclass\nclass FailedImport:\n    \"\"\"Represent a placeholder for a failed import.\"\"\"\n\n    message: str\n    \"\"\"The error message associated with the failed import.\"\"\"\n\n\nclass ImportWrapper(ModuleType):\n    \"\"\"A wrapper class for modules to handle attribute access for failed imports.\"\"\"\n\n    def __getattribute__(self, name: str) -> Any:\n        result = super().__getattribute__(name)\n\n        if isinstance(result, FailedImport):\n            raise ImportError(result.message)  # noqa: TRY004\n\n        return result\n"
  },
  {
    "path": "src/crawlee/_utils/urls.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom pydantic import AnyHttpUrl, TypeAdapter\nfrom yarl import URL\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n    from logging import Logger\n\n\ndef is_url_absolute(url: str) -> bool:\n    \"\"\"Check if a URL is absolute.\"\"\"\n    url_parsed = URL(url)\n\n    # We don't use .absolute because in yarl.URL, it is always True for links that start with '//'\n    return bool(url_parsed.scheme) and bool(url_parsed.raw_authority)\n\n\ndef convert_to_absolute_url(base_url: str, relative_url: str) -> str:\n    \"\"\"Convert a relative URL to an absolute URL using a base URL.\"\"\"\n    return str(URL(base_url).join(URL(relative_url)))\n\n\ndef to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:\n    \"\"\"Convert an iterator of relative URLs to absolute URLs using a base URL.\"\"\"\n    for url in urls:\n        if is_url_absolute(url):\n            yield url\n        else:\n            converted_url = convert_to_absolute_url(base_url, url)\n            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.\n            if not is_url_absolute(converted_url):\n                if logger:\n                    logger.debug(f'Could not convert URL \"{url}\" to absolute using base URL \"{base_url}\". Skipping it.')\n                continue\n            yield converted_url\n\n\n_http_url_adapter = TypeAdapter(AnyHttpUrl)\n\n\ndef validate_http_url(value: str | None) -> str | None:\n    \"\"\"Validate the given HTTP URL.\n\n    Raises:\n        pydantic.ValidationError: If the URL is not valid.\n    \"\"\"\n    if value is not None:\n        _http_url_adapter.validate_python(value)\n\n    return value\n"
  },
  {
    "path": "src/crawlee/_utils/wait.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import suppress\nfrom typing import TYPE_CHECKING, TypeVar\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable, Sequence\n    from datetime import timedelta\n    from logging import Logger\n\nT = TypeVar('T')\n\n\nasync def wait_for(\n    operation: Callable[[], Awaitable[T]],\n    *,\n    timeout: timedelta,\n    timeout_message: str | None = None,\n    max_retries: int = 1,\n    logger: Logger,\n) -> T:\n    \"\"\"Wait for an async operation to complete.\n\n    If the wait times out, `TimeoutError` is raised and the future is cancelled.\n    Optionally retry on error.\n\n    Args:\n        operation: A function that returns the future to wait for.\n        timeout: How long should we wait before cancelling the future.\n        timeout_message: Message to be included in the `TimeoutError` in case of timeout.\n        max_retries: How many times should the operation be attempted.\n        logger: Used to report information about retries as they happen.\n    \"\"\"\n    for iteration in range(1, max_retries + 1):\n        try:\n            return await asyncio.wait_for(operation(), timeout.total_seconds())\n        except asyncio.TimeoutError as ex:  # noqa: PERF203\n            raise asyncio.TimeoutError(timeout_message) from ex\n        except Exception as e:\n            if iteration == max_retries:\n                raise\n\n            logger.warning(f'{e!s}: retrying ({iteration}/{max_retries})')\n\n    raise RuntimeError('Unreachable code')\n\n\nasync def wait_for_all_tasks_for_finish(\n    tasks: Sequence[asyncio.Task],\n    *,\n    logger: Logger,\n    timeout: timedelta | None = None,\n) -> None:\n    \"\"\"Wait for all tasks to finish or until the timeout is reached.\n\n    Args:\n        tasks: A sequence of asyncio tasks to wait for.\n        logger: Logger to use for reporting.\n        timeout: How long should we wait before cancelling the 
tasks.\n    \"\"\"\n    if not tasks:\n        return\n\n    timeout_secs = timeout.total_seconds() if timeout else None\n    try:\n        _, pending = await asyncio.wait(tasks, timeout=timeout_secs)\n        if pending:\n            logger.warning('Waiting timeout reached; canceling unfinished tasks.')\n    except asyncio.CancelledError:\n        logger.warning('Asyncio wait was cancelled; canceling unfinished tasks.')\n        raise\n    finally:\n        for task in tasks:\n            if not task.done():\n                task.cancel()\n                with suppress(asyncio.CancelledError):\n                    await task\n            # If task is done, access the result to clear any exceptions\n            else:\n                try:\n                    task.result()\n                except asyncio.CancelledError:\n                    pass\n                except Exception as e:\n                    logger.warning(f'Task raised an exception: {e}')\n"
  },
  {
    "path": "src/crawlee/_utils/web.py",
    "content": "from __future__ import annotations\n\nfrom http import HTTPStatus\n\n\ndef is_status_code_client_error(value: int) -> bool:\n    \"\"\"Return `True` for 4xx status codes, `False` otherwise.\"\"\"\n    return HTTPStatus.BAD_REQUEST <= value < HTTPStatus.INTERNAL_SERVER_ERROR\n\n\ndef is_status_code_server_error(value: int) -> bool:\n    \"\"\"Return `True` for status codes of 500 and above (5xx and any higher value), `False` otherwise.\"\"\"\n    return value >= HTTPStatus.INTERNAL_SERVER_ERROR\n\n\ndef is_status_code_successful(value: int) -> bool:\n    \"\"\"Return `True` for 2xx and 3xx status codes, `False` otherwise.\"\"\"\n    return HTTPStatus.OK <= value < HTTPStatus.BAD_REQUEST\n"
  },
  {
    "path": "src/crawlee/browsers/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\nfrom ._types import BrowserType, CrawleePage\n\n_install_import_hook(__name__)\n\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'BrowserPool'):\n    from ._browser_pool import BrowserPool\nwith _try_import(__name__, 'PlaywrightBrowserController'):\n    from ._playwright_browser_controller import PlaywrightBrowserController\nwith _try_import(__name__, 'PlaywrightBrowserPlugin'):\n    from ._playwright_browser_plugin import PlaywrightBrowserPlugin\nwith _try_import(__name__, 'PlaywrightPersistentBrowser'):\n    from ._playwright_browser import PlaywrightPersistentBrowser\n\n\n__all__ = [\n    'BrowserPool',\n    'BrowserType',\n    'CrawleePage',\n    'PlaywrightBrowserController',\n    'PlaywrightBrowserPlugin',\n    'PlaywrightPersistentBrowser',\n]\n"
  },
  {
    "path": "src/crawlee/browsers/_browser_controller.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/abstract-classes/browser-controller.ts\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n    from datetime import datetime, timedelta\n\n    from playwright.async_api import Page\n\n    from crawlee.browsers._types import BrowserType\n    from crawlee.proxy_configuration import ProxyInfo\n\n\n@docs_group('Browser management')\nclass BrowserController(ABC):\n    \"\"\"An abstract base class for managing a browser instance and its pages.\"\"\"\n\n    AUTOMATION_LIBRARY: str | None = None\n    \"\"\"The name of the automation library that the controller is using.\"\"\"\n\n    @property\n    @abstractmethod\n    def pages(self) -> list[Page]:\n        \"\"\"Return the list of opened pages.\"\"\"\n\n    @property\n    @abstractmethod\n    def total_opened_pages(self) -> int:\n        \"\"\"Return the total number of pages opened since the browser was launched.\"\"\"\n\n    @property\n    @abstractmethod\n    def pages_count(self) -> int:\n        \"\"\"Return the number of currently open pages.\"\"\"\n\n    @property\n    @abstractmethod\n    def last_page_opened_at(self) -> datetime:\n        \"\"\"Return the time when the last page was opened.\"\"\"\n\n    @property\n    @abstractmethod\n    def idle_time(self) -> timedelta:\n        \"\"\"Return the idle time of the browser controller.\"\"\"\n\n    @property\n    @abstractmethod\n    def has_free_capacity(self) -> bool:\n        \"\"\"Return if the browser has free capacity to open a new page.\"\"\"\n\n    @property\n    @abstractmethod\n    def is_browser_connected(self) -> bool:\n        \"\"\"Return if the browser is connected.\"\"\"\n\n    @property\n    @abstractmethod\n    def browser_type(self) -> BrowserType:\n        \"\"\"Return the type 
of the browser.\"\"\"\n\n    @abstractmethod\n    async def new_page(\n        self,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n        proxy_info: ProxyInfo | None = None,\n    ) -> Page:\n        \"\"\"Create a new page with the given context options.\n\n        Args:\n            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options\n                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n            proxy_info: The proxy configuration to use for the new page.\n\n        Returns:\n            Page: The newly created page.\n\n        Raises:\n            ValueError: If the browser has reached the maximum number of open pages.\n        \"\"\"\n\n    @abstractmethod\n    async def close(self, *, force: bool = False) -> None:\n        \"\"\"Close the browser.\n\n        Args:\n            force: Whether to force close all open pages before closing the browser.\n\n        Raises:\n            ValueError: If there are still open pages when trying to close the browser.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/browsers/_browser_plugin.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/abstract-classes/browser-plugin.ts\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n    from types import TracebackType\n\n    from crawlee.browsers._browser_controller import BrowserController\n    from crawlee.browsers._types import BrowserType\n\n\n@docs_group('Browser management')\nclass BrowserPlugin(ABC):\n    \"\"\"An abstract base class for browser plugins.\n\n    Browser plugins act as wrappers around browser automation tools like Playwright,\n    providing a unified interface for interacting with browsers.\n    \"\"\"\n\n    AUTOMATION_LIBRARY: str | None = None\n    \"\"\"The name of the automation library that the plugin is managing.\"\"\"\n\n    @property\n    @abstractmethod\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n\n    @property\n    @abstractmethod\n    def browser_type(self) -> BrowserType:\n        \"\"\"Return the browser type name.\"\"\"\n\n    @property\n    @abstractmethod\n    def browser_launch_options(self) -> Mapping[str, Any]:\n        \"\"\"Return the options for the `browser.launch` method.\n\n        Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n        `browser_type.launch` method. For more details, refer to the Playwright documentation:\n         https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\n        \"\"\"\n\n    @property\n    @abstractmethod\n    def browser_new_context_options(self) -> Mapping[str, Any]:\n        \"\"\"Return the options for the `browser.new_context` method.\n\n        Keyword arguments to pass to the browser new context method. 
These options are provided directly to Playwright's\n        `browser.new_context` method. For more details, refer to the Playwright documentation:\n        https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n        \"\"\"\n\n    @property\n    @abstractmethod\n    def max_open_pages_per_browser(self) -> int:\n        \"\"\"Return the maximum number of pages that can be opened in a single browser.\"\"\"\n\n    @abstractmethod\n    async def __aenter__(self) -> BrowserPlugin:\n        \"\"\"Enter the context manager and initialize the browser plugin.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n\n    @abstractmethod\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Exit the context manager and close the browser plugin.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n\n    @abstractmethod\n    async def new_browser(self) -> BrowserController:\n        \"\"\"Create a new browser instance.\n\n        Returns:\n            A new browser instance wrapped in a controller.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/browsers/_browser_pool.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/tree/v3.10.1/packages/browser-pool/\n\nfrom __future__ import annotations\n\nimport asyncio\nimport itertools\nfrom collections import defaultdict\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any\nfrom weakref import WeakValueDictionary\n\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee.browsers._browser_controller import BrowserController\nfrom crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin\nfrom crawlee.browsers._types import BrowserType, CrawleePage\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable, Mapping, Sequence\n    from pathlib import Path\n    from types import TracebackType\n\n    from crawlee.browsers._browser_plugin import BrowserPlugin\n    from crawlee.fingerprint_suite import FingerprintGenerator\n    from crawlee.proxy_configuration import ProxyInfo\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Browser management')\nclass BrowserPool:\n    \"\"\"Manage a pool of browsers and pages, handling their lifecycle and resource allocation.\n\n    The `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers,\n    and handling the overall lifecycle of these resources. 
It provides flexible configuration via\n    constructor options, which include various hooks that allow for the insertion of custom behavior\n    at different stages of the browser and page lifecycles.\n\n    The browsers in the pool can be in one of three states: active, inactive, or closed.\n    \"\"\"\n\n    _GENERATED_PAGE_ID_LENGTH = 8\n    \"\"\"The length of the newly generated page ID.\"\"\"\n\n    def __init__(\n        self,\n        plugins: Sequence[BrowserPlugin] | None = None,\n        *,\n        operation_timeout: timedelta = timedelta(seconds=15),\n        browser_inactive_threshold: timedelta = timedelta(seconds=10),\n        identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),\n        close_inactive_browsers_interval: timedelta = timedelta(seconds=30),\n        retire_browser_after_page_count: int = 100,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            plugins: Browser plugins serve as wrappers around various browser automation libraries,\n                providing a consistent interface across different libraries.\n            operation_timeout: Operations of the underlying automation libraries, such as launching a browser\n                or opening a new page, can sometimes get stuck. To prevent `BrowserPool` from becoming unresponsive,\n                we add a timeout to these operations.\n            browser_inactive_threshold: The period of inactivity after which a browser is considered as inactive.\n            identify_inactive_browsers_interval: The interval at which the pool checks for browsers that have been\n                idle for at least `browser_inactive_threshold` and moves them to the inactive list.\n            close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers\n                and closes them. The browser is considered as inactive if it has no active pages and has been idle\n                for the specified period. 
The browser is considered as retired if it has no active pages and has total\n                pages count greater than or equal to `retire_browser_after_page_count`.\n            retire_browser_after_page_count: The maximum number of processed pages after which the browser is considered\n                as retired.\n        \"\"\"\n        self._plugins = plugins or [PlaywrightBrowserPlugin()]\n        self._operation_timeout = operation_timeout\n        self._browser_inactive_threshold = browser_inactive_threshold\n\n        self._active_browsers = list[BrowserController]()\n        \"\"\"A list of browsers currently active and being used to open pages.\"\"\"\n\n        self._inactive_browsers = list[BrowserController]()\n        \"\"\"A list of browsers currently inactive and not being used to open new pages,\n        but may still contain open pages.\"\"\"\n\n        self._identify_inactive_browsers_task = RecurringTask(\n            self._identify_inactive_browsers,\n            identify_inactive_browsers_interval,\n        )\n\n        self._close_inactive_browsers_task = RecurringTask(\n            self._close_inactive_browsers,\n            close_inactive_browsers_interval,\n        )\n\n        self._total_pages_count = 0\n        self._retire_browser_after_page_count = retire_browser_after_page_count\n        self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool\n        self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins\n\n        self._pre_page_create_hooks: list[\n            Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]\n        ] = []\n        self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []\n        self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []\n        self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []\n\n 
       # Flag to indicate the context state.\n        self._active = False\n\n    @classmethod\n    def with_default_plugin(\n        cls,\n        *,\n        browser_type: BrowserType | None = None,\n        user_data_dir: str | Path | None = None,\n        browser_launch_options: Mapping[str, Any] | None = None,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n        headless: bool | None = None,\n        fingerprint_generator: FingerprintGenerator | None = None,\n        use_incognito_pages: bool | None = False,\n        **kwargs: Any,\n    ) -> BrowserPool:\n        \"\"\"Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.\n\n        Args:\n            browser_type: The type of browser to launch:\n                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\n                    the system.\n            user_data_dir: Path to a user data directory, which stores browser session data like cookies\n                and local storage.\n            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided\n                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\n                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\n            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options\n                are provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\n                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n            headless: Whether to run the browser in headless mode.\n            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used\n                to generate browser fingerprints together with consistent headers.\n            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its\n                own context that is destroyed once the page is closed or crashes.\n            kwargs: Additional arguments for default constructor.\n        \"\"\"\n        plugin_options: dict = defaultdict(dict)\n        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}\n        plugin_options['browser_new_context_options'] = browser_new_context_options or {}\n\n        if headless is not None:\n            plugin_options['browser_launch_options']['headless'] = headless\n\n        if use_incognito_pages is not None:\n            plugin_options['use_incognito_pages'] = use_incognito_pages\n\n        if browser_type:\n            plugin_options['browser_type'] = browser_type\n\n        if user_data_dir:\n            plugin_options['user_data_dir'] = user_data_dir\n\n        plugin = PlaywrightBrowserPlugin(\n            **plugin_options,\n            fingerprint_generator=fingerprint_generator,\n        )\n        return cls(plugins=[plugin], **kwargs)\n\n    @property\n    def plugins(self) -> Sequence[BrowserPlugin]:\n        \"\"\"Return the browser plugins.\"\"\"\n        return self._plugins\n\n    @property\n    def active_browsers(self) -> Sequence[BrowserController]:\n        \"\"\"Return the active browsers in the pool.\"\"\"\n        return self._active_browsers\n\n    @property\n    def inactive_browsers(self) -> Sequence[BrowserController]:\n        \"\"\"Return the inactive 
browsers in the pool.\"\"\"\n        return self._inactive_browsers\n\n    @property\n    def pages(self) -> Mapping[str, CrawleePage]:\n        \"\"\"Return the pages in the pool.\"\"\"\n        return self._pages\n\n    @property\n    def total_pages_count(self) -> int:\n        \"\"\"Return the total number of pages opened since the browser pool was launched.\"\"\"\n        return self._total_pages_count\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    async def __aenter__(self) -> BrowserPool:\n        \"\"\"Enter the context manager and initialize all browser plugins.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n        # Start the recurring tasks for identifying and closing inactive browsers\n        self._identify_inactive_browsers_task.start()\n        self._close_inactive_browsers_task.start()\n\n        timeout = self._operation_timeout.total_seconds()\n\n        try:\n            for plugin in self._plugins:\n                await asyncio.wait_for(plugin.__aenter__(), timeout)\n        except asyncio.TimeoutError:\n            logger.warning(f'Initializing of the browser plugin {plugin} timed out, will be skipped.')\n\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Exit the context manager and close all browser plugins.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        await 
self._identify_inactive_browsers_task.stop()\n        await self._close_inactive_browsers_task.stop()\n\n        for browser in self._active_browsers + self._inactive_browsers:\n            await browser.close(force=True)\n        self._active_browsers.clear()\n        self._inactive_browsers.clear()\n\n        for plugin in self._plugins:\n            await plugin.__aexit__(exc_type, exc_value, exc_traceback)\n\n        self._active = False\n\n    @ensure_context\n    async def new_page(\n        self,\n        *,\n        page_id: str | None = None,\n        browser_plugin: BrowserPlugin | None = None,\n        proxy_info: ProxyInfo | None = None,\n    ) -> CrawleePage:\n        \"\"\"Open a new page in a browser using the specified browser plugin or the next one in the rotation.\n\n        Args:\n            page_id: The ID to assign to the new page. If not provided, a random ID is generated.\n            browser_plugin: The browser plugin to use for creating the new page.\n                If not provided, the next plugin in the rotation is used.\n            proxy_info: The proxy configuration to use for the new page.\n\n        Returns:\n            The newly created browser page.\n        \"\"\"\n        if page_id in self.pages:\n            raise ValueError(f'Page with ID: {page_id} already exists.')\n\n        if browser_plugin and browser_plugin not in self.plugins:\n            raise ValueError('Provided browser_plugin is not one of the plugins used by BrowserPool.')\n\n        page_id = page_id or crypto_random_object_id(self._GENERATED_PAGE_ID_LENGTH)\n        plugin = browser_plugin or next(self._plugins_cycle)\n\n        return await self._get_new_page(page_id, plugin, proxy_info)\n\n    @ensure_context\n    async def new_page_with_each_plugin(self) -> Sequence[CrawleePage]:\n        \"\"\"Create a new page with each browser plugin in the pool.\n\n        This method is useful for running scripts in multiple environments simultaneously, typically for 
testing\n        or website analysis. Each page is created using a different browser plugin, allowing you to interact\n        with various browser types concurrently.\n\n        Returns:\n            A list of newly created pages, one for each plugin in the pool.\n        \"\"\"\n        pages_coroutines = [self.new_page(browser_plugin=plugin) for plugin in self._plugins]\n        return await asyncio.gather(*pages_coroutines)\n\n    async def _get_new_page(\n        self,\n        page_id: str,\n        plugin: BrowserPlugin,\n        proxy_info: ProxyInfo | None,\n    ) -> CrawleePage:\n        \"\"\"Initialize a new browser page using the specified plugin.\n\n        Select a browser with available capacity or launch a new one if needed. Create a new page in the selected\n        browser with the provided proxy settings.\n        \"\"\"\n        timeout = self._operation_timeout.total_seconds()\n        browser_controller = self._pick_browser_with_free_capacity(plugin)\n\n        try:\n            if not browser_controller:\n                browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)\n            browser_new_context_options = dict(plugin.browser_new_context_options)\n\n            await self._execute_hooks(\n                self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info\n            )\n\n            page = await asyncio.wait_for(\n                browser_controller.new_page(\n                    browser_new_context_options=browser_new_context_options,\n                    proxy_info=proxy_info,\n                ),\n                timeout,\n            )\n        except asyncio.TimeoutError as exc:\n            raise TimeoutError(f'Creating a new page with plugin {plugin} timed out.') from exc\n        except RuntimeError as exc:\n            raise RuntimeError('Browser pool is not initialized.') from exc\n\n        if browser_controller.total_opened_pages >= 
self._retire_browser_after_page_count:\n            self._retire_browser(browser_controller)\n\n        crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)\n        self._pages[page_id] = crawlee_page\n        self._total_pages_count += 1\n\n        await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)\n\n        self._override_page_close(crawlee_page, browser_controller)\n\n        return crawlee_page\n\n    def _pick_browser_with_free_capacity(\n        self,\n        browser_plugin: BrowserPlugin,\n    ) -> BrowserController | None:\n        \"\"\"Pick a browser with free capacity that matches the specified plugin.\"\"\"\n        for browser in self._active_browsers:\n            if browser.has_free_capacity and browser.AUTOMATION_LIBRARY == browser_plugin.AUTOMATION_LIBRARY:\n                return browser\n\n        return None\n\n    def _retire_browser(self, browser: BrowserController) -> None:\n        \"\"\"Retire a browser by moving it to the inactive list.\"\"\"\n        if browser in self._active_browsers:\n            self._active_browsers.remove(browser)\n            self._inactive_browsers.append(browser)\n\n    async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:\n        \"\"\"Launch a new browser instance using the specified plugin.\"\"\"\n        browser = await plugin.new_browser()\n        self._active_browsers.append(browser)\n        return browser\n\n    def _identify_inactive_browsers(self) -> None:\n        \"\"\"Identify inactive browsers and move them to the inactive list if their idle time exceeds the threshold.\"\"\"\n        for browser in list(self._active_browsers):\n            if browser.idle_time >= self._browser_inactive_threshold:\n                self._active_browsers.remove(browser)\n                self._inactive_browsers.append(browser)\n\n    async def _close_inactive_browsers(self) -> None:\n        \"\"\"Close the browsers 
that have no active pages and have been idle for a certain period.\"\"\"\n        for browser in list(self._inactive_browsers):\n            if not browser.pages:\n                await browser.close()\n                self._inactive_browsers.remove(browser)\n\n    async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:\n        \"\"\"Execute the provided hooks with the given arguments.\"\"\"\n        for hook in hooks:\n            await hook(*args)\n\n    def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:\n        \"\"\"Override the page's close method to execute pre and post close hooks.\"\"\"\n        if self._pre_page_close_hooks or self._post_page_close_hooks:\n            original_close = crawlee_page.page.close\n\n            async def close_with_hooks(*args: Any, **kwargs: Any) -> None:\n                try:\n                    await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)\n                finally:\n                    await original_close(*args, **kwargs)\n                await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)\n\n            crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks\n\n    def pre_page_create_hook(\n        self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]\n    ) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:\n        \"\"\"Register a hook to be called just before a new page is created.\n\n        The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.\n        Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not\n        apply to every page individually. 
For example, `PlaywrightBrowserController` with\n        ``use_incognito_pages=False`` shares a single context across all pages, so the options are applied\n        only when the context is first created.\n        \"\"\"\n        self._pre_page_create_hooks.append(hook)\n\n        return hook\n\n    def post_page_create_hook(\n        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]\n    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:\n        \"\"\"Register a hook to be called right after a new page is created.\n\n        The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply\n        changes to all pages, such as injecting scripts or configuring request interception.\n        \"\"\"\n        self._post_page_create_hooks.append(hook)\n\n        return hook\n\n    def pre_page_close_hook(\n        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]\n    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:\n        \"\"\"Register a hook to be called just before a page is closed.\n\n        The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,\n        such as taking a screenshot or saving page state before the page is destroyed.\n        \"\"\"\n        self._pre_page_close_hooks.append(hook)\n\n        return hook\n\n    def post_page_close_hook(\n        self, hook: Callable[[str, BrowserController], Awaitable[None]]\n    ) -> Callable[[str, BrowserController], Awaitable[None]]:\n        \"\"\"Register a hook to be called right after a page is closed.\n\n        The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging\n        after a page's lifecycle ends.\n        \"\"\"\n        self._post_page_close_hooks.append(hook)\n\n        return hook\n"
  },
  {
    "path": "src/crawlee/browsers/_playwright_browser.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport shutil\nimport tempfile\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nfrom playwright.async_api import Browser\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from playwright.async_api import BrowserContext, BrowserType, CDPSession, Page\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Browser management')\nclass PlaywrightPersistentBrowser(Browser):\n    \"\"\"A wrapper for Playwright's `Browser` that operates with a persistent context.\n\n    It utilizes Playwright's persistent browser context feature, maintaining user data across sessions.\n    While it follows the same interface as Playwright's `Browser` class, there is no abstract base class\n    enforcing this. There is a limitation that only a single persistent context is allowed.\n    \"\"\"\n\n    _TMP_DIR_PREFIX = 'apify-playwright-firefox-taac-'\n\n    def __init__(\n        self,\n        browser_type: BrowserType,\n        user_data_dir: str | Path | None,\n        browser_launch_options: dict[str, Any],\n    ) -> None:\n        self._browser_type = browser_type\n        self._browser_launch_options = browser_launch_options\n        self._user_data_dir = user_data_dir\n        self._temp_dir: Path | None = None\n\n        self._context: BrowserContext | None = None\n        self._is_connected = True\n\n    @property\n    def browser_type(self) -> BrowserType:\n        return self._browser_type\n\n    @property\n    def contexts(self) -> list[BrowserContext]:\n        return [self._context] if self._context else []\n\n    def is_connected(self) -> bool:\n        return self._is_connected\n\n    async def new_context(self, **context_options: Any) -> BrowserContext:\n        \"\"\"Create persistent context instead of regular one. 
Merge launch options with context options.\"\"\"\n        if self._context:\n            raise RuntimeError('Persistent browser can have only one context')\n\n        launch_options = self._browser_launch_options | context_options\n\n        if self._user_data_dir:\n            user_data_dir = self._user_data_dir\n        else:\n            user_data_dir = tempfile.mkdtemp(prefix=self._TMP_DIR_PREFIX)\n            self._temp_dir = Path(user_data_dir)\n\n        self._context = await self._browser_type.launch_persistent_context(\n            user_data_dir=user_data_dir, **launch_options\n        )\n\n        if self._temp_dir:\n            self._context.on('close', self._delete_temp_dir)\n\n        return self._context\n\n    async def _delete_temp_dir(self, _: BrowserContext | None) -> None:\n        if self._temp_dir and self._temp_dir.exists():\n            temp_dir = self._temp_dir\n            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)\n\n    @override\n    async def close(self, **kwargs: Any) -> None:\n        \"\"\"Close browser by closing its context.\"\"\"\n        if self._context:\n            await self._context.close()\n            self._context = None\n        self._is_connected = False\n        await asyncio.sleep(0.1)\n        await self._delete_temp_dir(self._context)\n\n    @property\n    @override\n    def version(self) -> str:\n        raise NotImplementedError('Persistent browser does not support version.')\n\n    async def new_page(self, **kwargs: Any) -> Page:\n        raise NotImplementedError('Persistent browser does not support new page.')\n\n    @override\n    async def new_browser_cdp_session(self) -> CDPSession:\n        raise NotImplementedError('Persistent browser does not support new browser CDP session.')\n\n    async def start_tracing(self, **kwargs: Any) -> None:\n        raise NotImplementedError('Persistent browser does not support tracing.')\n\n    async def stop_tracing(self, **kwargs: Any) -> bytes:\n 
       raise NotImplementedError('Persistent browser does not support tracing.')\n"
  },
  {
    "path": "src/crawlee/browsers/_playwright_browser_controller.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/playwright/playwright-controller.ts\n\nfrom __future__ import annotations\n\nfrom asyncio import Lock\nfrom datetime import datetime, timedelta, timezone\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom browserforge.injectors.playwright import AsyncNewContext\nfrom playwright.async_api import Browser, BrowserContext, Page, ProxySettings\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.browsers._browser_controller import BrowserController\nfrom crawlee.fingerprint_suite import HeaderGenerator\nfrom crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n\n    from crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser\n    from crawlee.browsers._types import BrowserType\n    from crawlee.fingerprint_suite import FingerprintGenerator\n    from crawlee.proxy_configuration import ProxyInfo\n\nfrom logging import getLogger\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Browser management')\nclass PlaywrightBrowserController(BrowserController):\n    \"\"\"Controller for managing Playwright browser instances and their pages.\n\n    It provides methods to control browser instances, manage their pages, and handle context-specific\n    configurations. 
It enforces limits on the number of open pages and tracks their state.\n    \"\"\"\n\n    AUTOMATION_LIBRARY = 'playwright'\n    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()\n\n    def __init__(\n        self,\n        browser: Browser | PlaywrightPersistentBrowser,\n        *,\n        max_open_pages_per_browser: int = 20,\n        use_incognito_pages: bool = False,\n        header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,\n        fingerprint_generator: FingerprintGenerator | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            browser: The browser instance to control.\n            max_open_pages_per_browser: The maximum number of pages that can be open at the same time.\n            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its\n                own context that is destroyed once the page is closed or crashes.\n            header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for\n                requests made by the browser. By default, a predefined header generator is used. Set to `None` to\n                disable automatic header modifications.\n            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used\n                to generate browser fingerprints together with consistent headers.\n        \"\"\"\n        if fingerprint_generator and header_generator is not self._DEFAULT_HEADER_GENERATOR:\n            raise ValueError(\n                'Do not use `header_generator` and `fingerprint_generator` arguments at the same time. '\n                'Choose only one. 
`fingerprint_generator` generates headers as well.'\n            )\n        self._browser = browser\n        self._max_open_pages_per_browser = max_open_pages_per_browser\n        self._header_generator = header_generator\n        self._fingerprint_generator = fingerprint_generator\n        self._use_incognito_pages = use_incognito_pages\n\n        self._browser_context: BrowserContext | None = (\n            self._browser.contexts[0] if len(self._browser.contexts) > 0 else None\n        )\n        self._pages = list[Page]()\n        self._last_page_opened_at = datetime.now(timezone.utc)\n\n        self._total_opened_pages = 0\n        self._opening_pages_count = 0\n\n        self._context_creation_lock: Lock | None = None\n\n    async def _get_context_creation_lock(self) -> Lock:\n        \"\"\"Get context checking and creation lock.\n\n        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to\n        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.\n        \"\"\"\n        if self._context_creation_lock:\n            return self._context_creation_lock\n        self._context_creation_lock = Lock()\n        return self._context_creation_lock\n\n    @property\n    @override\n    def pages(self) -> list[Page]:\n        return self._pages\n\n    @property\n    @override\n    def total_opened_pages(self) -> int:\n        return self._total_opened_pages\n\n    @property\n    @override\n    def pages_count(self) -> int:\n        return len(self._pages)\n\n    @property\n    @override\n    def last_page_opened_at(self) -> datetime:\n        return self._last_page_opened_at\n\n    @property\n    @override\n    def idle_time(self) -> timedelta:\n        return datetime.now(timezone.utc) - self._last_page_opened_at\n\n    @property\n    @override\n    def has_free_capacity(self) -> bool:\n        return (self.pages_count + self._opening_pages_count) < 
self._max_open_pages_per_browser\n\n    @property\n    @override\n    def is_browser_connected(self) -> bool:\n        return self._browser.is_connected()\n\n    @property\n    @override\n    def browser_type(self) -> BrowserType:\n        return cast('BrowserType', self._browser.browser_type.name)\n\n    @override\n    async def new_page(\n        self,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n        proxy_info: ProxyInfo | None = None,\n    ) -> Page:\n        \"\"\"Create a new page with the given context options.\n\n        Args:\n            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options\n                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n            proxy_info: The proxy configuration to use for the new page.\n\n        Returns:\n            Page: The newly created page.\n\n        Raises:\n            ValueError: If the browser has reached the maximum number of open pages.\n        \"\"\"\n        if not self.has_free_capacity:\n            raise ValueError('Cannot open more pages in this browser.')\n\n        self._opening_pages_count += 1\n\n        try:\n            if self._use_incognito_pages:\n                # In incognito there is exactly one context per one page. 
Create new context for each new page.\n                new_context = await self._create_browser_context(\n                    browser_new_context_options=browser_new_context_options,\n                    proxy_info=proxy_info,\n                )\n                page = await new_context.new_page()\n            else:\n                async with await self._get_context_creation_lock():\n                    if not self._browser_context:\n                        self._browser_context = await self._create_browser_context(\n                            browser_new_context_options=browser_new_context_options,\n                            proxy_info=proxy_info,\n                        )\n                page = await self._browser_context.new_page()\n\n            # Handle page close event\n            page.on(event='close', f=self._on_page_close)\n\n            # Update internal state\n            self._pages.append(page)\n            self._last_page_opened_at = datetime.now(timezone.utc)\n\n            self._total_opened_pages += 1\n        finally:\n            self._opening_pages_count -= 1\n        return page\n\n    @override\n    async def close(self, *, force: bool = False) -> None:\n        \"\"\"Close the browser.\n\n        Args:\n            force: Whether to force close all open pages before closing the browser.\n\n        Raises:\n            ValueError: If there are still open pages when trying to close the browser.\n        \"\"\"\n        if self.pages_count > 0 and not force:\n            raise ValueError('Cannot close the browser while there are open pages.')\n\n        if self._browser_context:\n            await self._browser_context.close()\n        await self._browser.close()\n\n    def _on_page_close(self, page: Page) -> None:\n        \"\"\"Handle actions after a page is closed.\"\"\"\n        self._pages.remove(page)\n\n    async def _create_browser_context(\n        self,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n     
   proxy_info: ProxyInfo | None = None,\n    ) -> BrowserContext:\n        \"\"\"Create a new browser context with the specified proxy settings.\n\n        Create context with fingerprints and headers using with `self._fingerprint_generator` if available.\n        Create context without fingerprints, but with headers based on `self._header_generator` if available.\n        Create context without headers and without fingerprints if neither `self._header_generator` nor\n        `self._fingerprint_generator` is available.\n        \"\"\"\n        browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}\n        if proxy_info:\n            if browser_new_context_options.get('proxy'):\n                logger.warning(\"browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.\")\n\n            browser_new_context_options['proxy'] = ProxySettings(\n                server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',\n                username=proxy_info.username,\n                password=proxy_info.password,\n            )\n\n        if self._fingerprint_generator:\n            return await AsyncNewContext(\n                browser=self._browser,\n                fingerprint=self._fingerprint_generator.generate(),\n                **browser_new_context_options,\n            )\n\n        if self._header_generator:\n            extra_http_headers = dict(\n                self._header_generator.get_specific_headers(\n                    header_names={\n                        'Accept',\n                        'Accept-Language',\n                        'User-Agent',\n                        'sec-ch-ua',\n                        'sec-ch-ua-mobile',\n                        'sec-ch-ua-platform',\n                    },\n                    browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),\n                )\n            )\n        else:\n            
extra_http_headers = None\n\n        browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(\n            'extra_http_headers', extra_http_headers\n        )\n        return await self._browser.new_context(**browser_new_context_options)\n"
  },
  {
    "path": "src/crawlee/browsers/_playwright_browser_plugin.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.10.1/packages/browser-pool/src/playwright/playwright-plugin.ts\n\nfrom __future__ import annotations\n\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any\n\nfrom playwright.async_api import Playwright, async_playwright\nfrom typing_extensions import override\n\nfrom crawlee import service_locator\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.browsers._browser_plugin import BrowserPlugin\nfrom crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser\nfrom crawlee.browsers._playwright_browser_controller import PlaywrightBrowserController\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n    from pathlib import Path\n    from types import TracebackType\n\n    from playwright.async_api._generated import Browser\n\n    from crawlee.browsers._types import BrowserType\n    from crawlee.fingerprint_suite import FingerprintGenerator\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Browser management')\nclass PlaywrightBrowserPlugin(BrowserPlugin):\n    \"\"\"A plugin for managing Playwright automation library.\n\n    It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory\n    for creating new browser instances and provides a unified interface for interacting with different browser types\n    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless\n    mode, executable paths, sandboxing, ...). 
It also manages browser contexts and the number of pages open within each\n    browser instance, ensuring that resource limits are respected.\n    \"\"\"\n\n    AUTOMATION_LIBRARY = 'playwright'\n\n    def __init__(\n        self,\n        *,\n        browser_type: BrowserType = 'chromium',\n        user_data_dir: str | Path | None = None,\n        browser_launch_options: dict[str, Any] | None = None,\n        browser_new_context_options: dict[str, Any] | None = None,\n        max_open_pages_per_browser: int = 20,\n        use_incognito_pages: bool = False,\n        fingerprint_generator: FingerprintGenerator | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            browser_type: The type of browser to launch:\n                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\n                    the system.\n            user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local\n                storage.\n            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided\n                directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\n                documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\n            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options\n                are provided directly to Playwright's `browser.new_context` method. 
For more details, refer to the\n                Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n            max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.\n                Once reached, a new browser instance will be launched to handle the excess.\n            use_incognito_pages: By default pages share the same browser context. If set to True each page uses its\n                own context that is destroyed once the page is closed or crashes.\n            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used\n                to generate browser fingerprints together with consistent headers.\n        \"\"\"\n        config = service_locator.get_configuration()\n\n        # Default browser launch options are based on the configuration.\n        default_launch_browser_options: dict[str, Any] = {\n            'headless': config.headless,\n            'executable_path': config.default_browser_path,\n            'chromium_sandbox': not config.disable_browser_sandbox,\n        }\n        explicit_browser_launch_options = browser_launch_options or {}\n\n        # Map 'chrome' to 'chromium' with the 'chrome' channel.\n        if browser_type == 'chrome':\n            browser_type = 'chromium'\n            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.\n            default_launch_browser_options['channel'] = 'chrome'\n\n            if executable_path := explicit_browser_launch_options.get(\n                'executable_path', default_launch_browser_options.get('executable_path')\n            ):\n                logger.debug(\n                    f\"Using browser executable from {executable_path}, which takes precedence over 'chrome' channel.\"\n                )\n\n        self._browser_type: BrowserType = browser_type\n        self._browser_launch_options: dict[str, Any] = 
default_launch_browser_options | explicit_browser_launch_options\n        self._browser_new_context_options = browser_new_context_options or {}\n        self._max_open_pages_per_browser = max_open_pages_per_browser\n        self._use_incognito_pages = use_incognito_pages\n        self._user_data_dir = user_data_dir\n\n        self._playwright_context_manager = async_playwright()\n        self._playwright: Playwright | None = None\n\n        # Flag to indicate the context state.\n        self._active = False\n\n        self._fingerprint_generator = fingerprint_generator\n\n    @property\n    @override\n    def active(self) -> bool:\n        return self._active\n\n    @property\n    @override\n    def browser_type(self) -> BrowserType:\n        return self._browser_type\n\n    @property\n    @override\n    def browser_launch_options(self) -> Mapping[str, Any]:\n        \"\"\"Return the options for the `browser.launch` method.\n\n        Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's\n        `browser_type.launch` method. For more details, refer to the Playwright documentation:\n         https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\n        \"\"\"\n        return self._browser_launch_options\n\n    @property\n    @override\n    def browser_new_context_options(self) -> Mapping[str, Any]:\n        \"\"\"Return the options for the `browser.new_context` method.\n\n        Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n        `browser.new_context` method. 
For more details, refer to the Playwright documentation:\n        https://playwright.dev/python/docs/api/class-browser#browser-new-context.\n        \"\"\"\n        return self._browser_new_context_options\n\n    @property\n    @override\n    def max_open_pages_per_browser(self) -> int:\n        return self._max_open_pages_per_browser\n\n    @override\n    async def __aenter__(self) -> PlaywrightBrowserPlugin:\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n        self._playwright = await self._playwright_context_manager.__aenter__()\n        return self\n\n    @override\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)\n        self._playwright_context_manager = async_playwright()\n        self._active = False\n\n    @override\n    @ensure_context\n    async def new_browser(self) -> PlaywrightBrowserController:\n        if not self._playwright:\n            raise RuntimeError('Playwright browser plugin is not initialized.')\n\n        if self._browser_type == 'chromium':\n            browser_type = self._playwright.chromium\n        elif self._browser_type == 'firefox':\n            browser_type = self._playwright.firefox\n        elif self._browser_type == 'webkit':\n            browser_type = self._playwright.webkit\n        else:\n            raise ValueError(f'Invalid browser type: {self._browser_type}')\n\n        if self._use_incognito_pages:\n            browser: Browser | PlaywrightPersistentBrowser = await browser_type.launch(**self._browser_launch_options)\n        else:\n            browser = 
PlaywrightPersistentBrowser(browser_type, self._user_data_dir, self._browser_launch_options)\n\n        return PlaywrightBrowserController(\n            browser,\n            use_incognito_pages=self._use_incognito_pages,\n            max_open_pages_per_browser=self._max_open_pages_per_browser,\n            fingerprint_generator=self._fingerprint_generator,\n        )\n"
  },
  {
    "path": "src/crawlee/browsers/_types.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Literal\n\nif TYPE_CHECKING:\n    from playwright.async_api import Page\n\nBrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']\n\n\n@dataclass\nclass CrawleePage:\n    \"\"\"Represents a page object within a browser, with additional metadata for tracking and management.\"\"\"\n\n    id: str\n    browser_type: BrowserType\n    page: Page\n"
  },
  {
    "path": "src/crawlee/browsers/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/configuration.py",
    "content": "from __future__ import annotations\n\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING, Annotated\n\nfrom pydantic import AliasChoices, BeforeValidator, Field\nfrom pydantic_settings import BaseSettings, SettingsConfigDict\n\nfrom crawlee._types import LogLevel\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.models import timedelta_ms\n\nif TYPE_CHECKING:\n    from typing_extensions import Self\n\n__all__ = ['Configuration']\n\n\n@docs_group('Configuration')\nclass Configuration(BaseSettings):\n    \"\"\"Configuration settings for the Crawlee project.\n\n    This class stores common configurable parameters for Crawlee. Default values are provided for all settings,\n    so typically, no adjustments are necessary. However, you may modify settings for specific use cases,\n    such as changing the default storage directory, the default storage IDs, the timeout for internal\n    operations, and more.\n\n    Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.\n    \"\"\"\n\n    # TODO: https://github.com/pydantic/pydantic-settings/issues/706\n    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.\n    model_config = SettingsConfigDict(populate_by_name=True)\n\n    internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None\n    \"\"\"Timeout for the internal asynchronous operations.\"\"\"\n\n    default_browser_path: Annotated[\n        str | None,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_default_browser_path',\n                'crawlee_default_browser_path',\n            )\n        ),\n    ] = None\n    \"\"\"Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option\n    is passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. 
For more details,\n    refer to the Playwright documentation:\n    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.\n    \"\"\"\n\n    disable_browser_sandbox: Annotated[\n        bool,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_disable_browser_sandbox',\n                'crawlee_disable_browser_sandbox',\n            )\n        ),\n    ] = False\n    \"\"\"Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option\n    is passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details,\n    refer to the Playwright documentation:\n    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.\"\"\"\n\n    log_level: Annotated[\n        LogLevel,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_log_level',\n                'crawlee_log_level',\n            )\n        ),\n        BeforeValidator(lambda value: str(value).upper()),\n    ] = 'INFO'\n    \"\"\"The logging level.\"\"\"\n\n    purge_on_start: Annotated[\n        bool,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_purge_on_start',\n                'crawlee_purge_on_start',\n            )\n        ),\n    ] = True\n    \"\"\"Whether to purge the storage on the start. This option is utilized by the storage clients.\"\"\"\n\n    persist_state_interval: Annotated[\n        timedelta_ms,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_persist_state_interval_millis',\n                'crawlee_persist_state_interval_millis',\n            )\n        ),\n    ] = timedelta(minutes=1)\n    \"\"\"Interval at which `PersistState` events are emitted. The event ensures the state persistence during\n    the crawler run. 
This option is utilized by the `EventManager`.\"\"\"\n\n    system_info_interval: Annotated[\n        timedelta_ms,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_system_info_interval_millis',\n                'crawlee_system_info_interval_millis',\n            )\n        ),\n    ] = timedelta(seconds=1)\n    \"\"\"Interval at which `SystemInfo` events are emitted. The event represents the current status of the system.\n    This option is utilized by the `LocalEventManager`.\"\"\"\n\n    max_used_cpu_ratio: Annotated[\n        float,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_max_used_cpu_ratio',\n                'crawlee_max_used_cpu_ratio',\n            )\n        ),\n    ] = 0.95\n    \"\"\"The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded.\n    This option is used by the `Snapshotter`.\"\"\"\n\n    max_used_memory_ratio: Annotated[\n        float,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_max_used_memory_ratio',\n                'crawlee_max_used_memory_ratio',\n            )\n        ),\n    ] = 0.9\n    \"\"\"The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded.\n    This option is used by the `Snapshotter`.\"\"\"\n\n    max_event_loop_delay: Annotated[\n        timedelta_ms,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_max_event_loop_delay_millis',\n                'crawlee_max_event_loop_delay_millis',\n            )\n        ),\n    ] = timedelta(milliseconds=50)\n    \"\"\"The maximum event loop delay. 
If the event loop delay exceeds this value, it is considered overloaded.\n    This option is used by the `Snapshotter`.\"\"\"\n\n    max_client_errors: Annotated[\n        int,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_max_client_errors',\n                'crawlee_max_client_errors',\n            )\n        ),\n    ] = 1\n    \"\"\"The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded.\n    This option is used by the `Snapshotter`.\"\"\"\n\n    memory_mbytes: Annotated[\n        int | None,\n        Field(\n            validation_alias=AliasChoices(\n                'actor_memory_mbytes',\n                'apify_memory_mbytes',\n                'crawlee_memory_mbytes',\n            )\n        ),\n    ] = None\n    \"\"\"The maximum used memory in megabytes. This option is utilized by the `Snapshotter`.\"\"\"\n\n    available_memory_ratio: Annotated[\n        float,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_available_memory_ratio',\n                'crawlee_available_memory_ratio',\n            ),\n            gt=0.0,\n            le=1.0,\n        ),\n    ] = 0.25\n    \"\"\"The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to\n    calculate the maximum memory. This option is utilized by the `Snapshotter` and supports the dynamic system memory\n    scaling.\"\"\"\n\n    storage_dir: Annotated[\n        str,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_local_storage_dir',\n                'crawlee_storage_dir',\n            ),\n        ),\n    ] = './storage'\n    \"\"\"The path to the storage directory. 
This option is utilized by the storage clients.\"\"\"\n\n    headless: Annotated[\n        bool,\n        Field(\n            validation_alias=AliasChoices(\n                'apify_headless',\n                'crawlee_headless',\n            )\n        ),\n    ] = True\n    \"\"\"Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option\n    is passed directly to Playwright's `browser_type.launch` method as `headless`. For more details,\n    refer to the Playwright documentation:\n    https://playwright.dev/docs/api/class-browsertype#browser-type-launch.\n    \"\"\"\n\n    @classmethod\n    def get_global_configuration(cls) -> Self:\n        \"\"\"Retrieve the global instance of the configuration.\n\n        Mostly for backwards compatibility. It is recommended to use the `service_locator.get_configuration()`\n        instead.\n        \"\"\"\n        # Import here to avoid circular imports.\n        from crawlee import service_locator  # noqa: PLC0415\n\n        config = service_locator.get_configuration()\n\n        if not isinstance(config, cls):\n            raise TypeError(f'Requested global configuration object of type {cls}, but {config.__class__} was found')\n\n        return config\n"
  },
  {
    "path": "src/crawlee/crawlers/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\nfrom ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext\nfrom ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline\nfrom ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult\n\n_install_import_hook(__name__)\n\n# The following imports use try_import to handle optional dependencies, as they may not always be available.\n\nwith _try_import(__name__, 'BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType'):\n    from ._beautifulsoup import BeautifulSoupCrawler, BeautifulSoupCrawlingContext, BeautifulSoupParserType\n\nwith _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'):\n    from ._parsel import ParselCrawler, ParselCrawlingContext\n\nwith _try_import(\n    __name__,\n    'PlaywrightCrawler',\n    'PlaywrightCrawlingContext',\n    'PlaywrightPostNavCrawlingContext',\n    'PlaywrightPreNavCrawlingContext',\n):\n    from ._playwright import (\n        PlaywrightCrawler,\n        PlaywrightCrawlingContext,\n        PlaywrightPostNavCrawlingContext,\n        PlaywrightPreNavCrawlingContext,\n    )\n\nwith _try_import(\n    __name__,\n    'AdaptivePlaywrightCrawler',\n    'AdaptivePlaywrightCrawlingContext',\n    'AdaptivePlaywrightPostNavCrawlingContext',\n    'AdaptivePlaywrightPreNavCrawlingContext',\n    'AdaptivePlaywrightCrawlerStatisticState',\n    'RenderingType',\n    'RenderingTypePrediction',\n    'RenderingTypePredictor',\n):\n    from ._adaptive_playwright import (\n        AdaptivePlaywrightCrawler,\n        AdaptivePlaywrightCrawlerStatisticState,\n        AdaptivePlaywrightCrawlingContext,\n        AdaptivePlaywrightPostNavCrawlingContext,\n        AdaptivePlaywrightPreNavCrawlingContext,\n        RenderingType,\n        RenderingTypePrediction,\n 
       RenderingTypePredictor,\n    )\n\n\n__all__ = [\n    'AbstractHttpCrawler',\n    'AbstractHttpParser',\n    'AdaptivePlaywrightCrawler',\n    'AdaptivePlaywrightCrawlerStatisticState',\n    'AdaptivePlaywrightCrawlingContext',\n    'AdaptivePlaywrightPostNavCrawlingContext',\n    'AdaptivePlaywrightPreNavCrawlingContext',\n    'BasicCrawler',\n    'BasicCrawlerOptions',\n    'BasicCrawlingContext',\n    'BeautifulSoupCrawler',\n    'BeautifulSoupCrawlingContext',\n    'BeautifulSoupParserType',\n    'ContextPipeline',\n    'HttpCrawler',\n    'HttpCrawlerOptions',\n    'HttpCrawlingContext',\n    'HttpCrawlingResult',\n    'ParsedHttpCrawlingContext',\n    'ParselCrawler',\n    'ParselCrawlingContext',\n    'PlaywrightCrawler',\n    'PlaywrightCrawlingContext',\n    'PlaywrightPostNavCrawlingContext',\n    'PlaywrightPreNavCrawlingContext',\n    'RenderingType',\n    'RenderingTypePrediction',\n    'RenderingTypePredictor',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_abstract_http/__init__.py",
    "content": "from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions\nfrom ._abstract_http_parser import AbstractHttpParser\nfrom ._http_crawling_context import ParsedHttpCrawlingContext\n\n__all__ = [\n    'AbstractHttpCrawler',\n    'AbstractHttpParser',\n    'HttpCrawlerOptions',\n    'ParsedHttpCrawlingContext',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom abc import ABC\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING, Any, Generic\n\nfrom more_itertools import partition\nfrom pydantic import ValidationError\nfrom typing_extensions import NotRequired, TypeVar\n\nfrom crawlee._request import Request, RequestOptions, RequestState\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.time import SharedTimeout\nfrom crawlee._utils.urls import to_absolute_url_iterator\nfrom crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline\nfrom crawlee.errors import SessionError\nfrom crawlee.statistics import StatisticsState\n\nfrom ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator\n\n    from typing_extensions import Unpack\n\n    from crawlee import RequestTransformAction\n    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction\n\n    from ._abstract_http_parser import AbstractHttpParser\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)\nTStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)\n\n\nclass HttpCrawlerOptions(\n    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],\n    Generic[TCrawlingContext, TStatisticsState],\n):\n    \"\"\"Arguments for the `AbstractHttpCrawler` constructor.\n\n    It is intended for typing forwarded `__init__` arguments in the subclasses.\n    \"\"\"\n\n    navigation_timeout: NotRequired[timedelta | None]\n    \"\"\"Timeout for the HTTP request.\"\"\"\n\n\n@docs_group('Crawlers')\nclass AbstractHttpCrawler(\n    BasicCrawler[TCrawlingContext, StatisticsState],\n    ABC,\n    Generic[TCrawlingContext, TParseResult, TSelectResult],\n):\n    \"\"\"A web crawler for performing HTTP 
requests.\n\n    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,\n    it implements HTTP communication using HTTP clients. The class allows integration with any HTTP client\n    that implements the `HttpClient` interface, provided as an input parameter to the constructor.\n\n    `AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses\n    and the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include\n    `BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.\n\n    HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that\n    require client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        parser: AbstractHttpParser[TParseResult, TSelectResult],\n        navigation_timeout: timedelta | None = None,\n        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],\n    ) -> None:\n        self._parser = parser\n        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)\n        self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []\n        self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = []\n        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}\n\n        if '_context_pipeline' not in kwargs:\n            raise ValueError(\n                'Please pass in a `_context_pipeline`. 
You should use the '\n                'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.'\n            )\n\n        kwargs.setdefault('_logger', logging.getLogger(self.__class__.__name__))\n        super().__init__(**kwargs)\n\n    @classmethod\n    def create_parsed_http_crawler_class(\n        cls,\n        static_parser: AbstractHttpParser[TParseResult, TSelectResult],\n    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]]:\n        \"\"\"Create a specific version of `AbstractHttpCrawler` class.\n\n        This is a convenience factory method for creating a specific `AbstractHttpCrawler` subclass.\n        While `AbstractHttpCrawler` allows its two generic parameters to be independent,\n        this method simplifies cases where `TParseResult` is used for both generic parameters.\n        \"\"\"\n\n        class _ParsedHttpCrawler(AbstractHttpCrawler):\n            def __init__(\n                self,\n                parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,  # ty: ignore[invalid-parameter-default]\n                **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],\n            ) -> None:\n                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()\n                super().__init__(\n                    parser=parser,\n                    **kwargs,\n                )\n\n        return _ParsedHttpCrawler\n\n    def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:\n        \"\"\"Create static content crawler context pipeline with expected pipeline steps.\"\"\"\n        return (\n            ContextPipeline()\n            .compose(self._execute_pre_navigation_hooks)\n            .compose(self._make_http_request)\n            .compose(self._execute_post_navigation_hooks)\n            .compose(self._handle_status_code_response)\n   
         .compose(self._parse_http_response)\n            .compose(self._handle_blocked_request_by_content)\n        )\n\n    async def _execute_pre_navigation_hooks(\n        self, context: BasicCrawlingContext\n    ) -> AsyncGenerator[BasicCrawlingContext, None]:\n        context_id = id(context)\n        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)\n\n        try:\n            for hook in self._pre_navigation_hooks:\n                async with self._shared_navigation_timeouts[context_id]:\n                    await hook(context)\n\n            yield context\n        finally:\n            self._shared_navigation_timeouts.pop(context_id, None)\n\n    async def _execute_post_navigation_hooks(\n        self, context: HttpCrawlingContext\n    ) -> AsyncGenerator[HttpCrawlingContext, None]:\n        for hook in self._post_navigation_hooks:\n            await hook(context)\n\n        yield context\n\n    async def _parse_http_response(\n        self, context: HttpCrawlingContext\n    ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:\n        \"\"\"Parse HTTP response and create context enhanced by the parsing result and enqueue links function.\n\n        Args:\n            context: The current crawling context, that includes HTTP response.\n\n        Yields:\n            The original crawling context enhanced by the parsing result and enqueue links function.\n        \"\"\"\n        parsed_content = await self._parser.parse(context.http_response)\n        extract_links = self._create_extract_links_function(context, parsed_content)\n        yield ParsedHttpCrawlingContext.from_http_crawling_context(\n            context=context,\n            parsed_content=parsed_content,\n            enqueue_links=self._create_enqueue_links_function(context, extract_links),\n            extract_links=extract_links,\n        )\n\n    def _create_extract_links_function(\n        self, context: HttpCrawlingContext, 
parsed_content: TParseResult\n    ) -> ExtractLinksFunction:\n        \"\"\"Create a callback function for extracting links from parsed content.\n\n        Args:\n            context: The current crawling context.\n            parsed_content: The parsed http response.\n\n        Returns:\n            Awaitable that is used for extracting links from parsed content.\n        \"\"\"\n\n        async def extract_links(\n            *,\n            selector: str = 'a',\n            attribute: str = 'href',\n            label: str | None = None,\n            user_data: dict[str, Any] | None = None,\n            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]\n            | None = None,\n            **kwargs: Unpack[EnqueueLinksKwargs],\n        ) -> list[Request]:\n            requests = list[Request]()\n\n            base_user_data = user_data or {}\n\n            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)\n\n            kwargs.setdefault('strategy', 'same-hostname')\n            strategy = kwargs.get('strategy', 'same-hostname')\n\n            links_iterator: Iterator[str] = iter(\n                self._parser.find_links(parsed_content, selector=selector, attribute=attribute)\n            )\n\n            # Get base URL from <base> tag if present\n            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href'))\n            base_url: str = (\n                str(extracted_base_urls[0])\n                if extracted_base_urls\n                else context.request.loaded_url or context.request.url\n            )\n            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)\n\n            if robots_txt_file:\n                skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)\n            else:\n                skipped = iter([])\n\n            for url in 
self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):\n                request_options = RequestOptions(\n                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy\n                )\n\n                if transform_request_function:\n                    transform_request_options = transform_request_function(request_options)\n                    if transform_request_options == 'skip':\n                        continue\n                    if transform_request_options != 'unchanged':\n                        request_options = transform_request_options\n\n                try:\n                    request = Request.from_url(**request_options)\n                except ValidationError as exc:\n                    context.log.debug(\n                        f'Skipping URL \"{url}\" due to invalid format: {exc}. '\n                        'This may be caused by a malformed URL or unsupported URL scheme. '\n                        'Please ensure the URL is correct and retry.'\n                    )\n                    continue\n\n                requests.append(request)\n\n            skipped_tasks = [\n                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped\n            ]\n            await asyncio.gather(*skipped_tasks)\n\n            return requests\n\n        return extract_links\n\n    async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:\n        \"\"\"Make http request and create context enhanced by HTTP response.\n\n        Args:\n            context: The current crawling context.\n\n        Yields:\n            The original crawling context enhanced by HTTP response.\n        \"\"\"\n        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:\n            result = await self._http_client.crawl(\n                request=context.request,\n                
session=context.session,\n                proxy_info=context.proxy_info,\n                statistics=self._statistics,\n                timeout=remaining_timeout,\n            )\n\n        context.request.state = RequestState.AFTER_NAV\n        yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)\n\n    async def _handle_status_code_response(\n        self, context: HttpCrawlingContext\n    ) -> AsyncGenerator[HttpCrawlingContext, None]:\n        \"\"\"Validate the HTTP status code and raise appropriate exceptions if needed.\n\n        Args:\n            context: The current crawling context containing the HTTP response.\n\n        Raises:\n            SessionError: If the status code indicates the session is blocked.\n            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.\n            HttpClientStatusCodeError: If the status code represents a client error.\n\n        Yields:\n            The original crawling context if no errors are detected.\n        \"\"\"\n        status_code = context.http_response.status_code\n        if self._retry_on_blocked:\n            self._raise_for_session_blocked_status_code(context.session, status_code)\n        self._raise_for_error_status_code(status_code)\n        yield context\n\n    async def _handle_blocked_request_by_content(\n        self, context: ParsedHttpCrawlingContext[TParseResult]\n    ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:\n        \"\"\"Try to detect if the request is blocked based on the parsed response content.\n\n        Args:\n            context: The current crawling context.\n\n        Raises:\n            SessionError: If the request is considered blocked.\n\n        Yields:\n            The original crawling context if no blocking is detected.\n        \"\"\"\n        if self._retry_on_blocked and (blocked_info := 
self._parser.is_blocked(context.parsed_content)):\n            raise SessionError(blocked_info.reason)\n        yield context\n\n    def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None:\n        \"\"\"Register a hook to be called before each navigation.\n\n        Args:\n            hook: A coroutine function to be called before each navigation.\n        \"\"\"\n        self._pre_navigation_hooks.append(hook)\n\n    def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None:\n        \"\"\"Register a hook to be called after each navigation.\n\n        Args:\n            hook: A coroutine function to be called after each navigation.\n        \"\"\"\n        self._post_navigation_hooks.append(hook)\n"
  },
  {
    "path": "src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Generic\n\nfrom crawlee._utils.blocked import RETRY_CSS_SELECTORS\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._types import BlockedInfo\n\nfrom ._http_crawling_context import TParseResult, TSelectResult\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Sequence\n\n    from crawlee.http_clients import HttpResponse\n\n\n@docs_group('HTTP parsers')\nclass AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):\n    \"\"\"Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking.\"\"\"\n\n    @abstractmethod\n    async def parse(self, response: HttpResponse) -> TParseResult:\n        \"\"\"Parse HTTP response.\n\n        Args:\n            response: HTTP response to be parsed.\n\n        Returns:\n            Parsed HTTP response.\n        \"\"\"\n\n    @abstractmethod\n    async def parse_text(self, text: str) -> TParseResult:\n        \"\"\"Parse text containing html.\n\n        Args:\n            text: String containing html.\n\n        Returns:\n            Parsed text.\n        \"\"\"\n\n    @abstractmethod\n    async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]:\n        \"\"\"Use css selector to select page element and return it.\n\n        Args:\n            parsed_content: Content where the page element will be located.\n            selector: Css selector used to locate desired html element.\n\n        Returns:\n            Selected element.\n        \"\"\"\n\n    def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:\n        \"\"\"Detect if blocked and return BlockedInfo with additional information.\n\n        Default implementation that expects `is_matching_selector` abstract method to be implemented.\n        Override this method if your parser has different way of blockage detection.\n\n       
 Args:\n            parsed_content: Parsed HTTP response. Result of `parse` method.\n\n        Returns:\n            `BlockedInfo` object that contains a non-empty string description of the reason if blockage was detected.\n            An empty string in `reason` signifies that no blockage was detected.\n        \"\"\"\n        reason = ''\n        if parsed_content is not None:\n            matched_selectors = [\n                selector for selector in RETRY_CSS_SELECTORS if self.is_matching_selector(parsed_content, selector)\n            ]\n\n            if matched_selectors:\n                reason = (\n                    f'Assuming the session is blocked - HTTP response matched the following selectors: '\n                    f'{\"; \".join(matched_selectors)}'\n                )\n\n        return BlockedInfo(reason=reason)\n\n    @abstractmethod\n    def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:\n        \"\"\"Check whether the selector has a match in the parsed content.\n\n        Args:\n            parsed_content: Parsed HTTP response. Result of `parse` method.\n            selector: String used to define matching pattern.\n\n        Returns:\n            True if the selector has a match in the parsed content.\n        \"\"\"\n\n    @abstractmethod\n    def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:\n        \"\"\"Find all links in result using selector.\n\n        Args:\n            parsed_content: Parsed HTTP response. Result of `parse` method.\n            selector: String used to define matching pattern for finding links.\n            attribute: Which node attribute to extract the links from.\n\n        Returns:\n            Iterable of strings that contain found links.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_abstract_http/_http_crawling_context.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass, fields\nfrom typing import Generic\n\nfrom typing_extensions import Self, TypeVar\n\nfrom crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction, PageSnapshot\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.http_clients import HttpCrawlingResult, HttpResponse\n\nTParseResult = TypeVar('TParseResult')\nTSelectResult = TypeVar('TSelectResult')\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):\n    \"\"\"The crawling context used by the `AbstractHttpCrawler`.\"\"\"\n\n    @classmethod\n    def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_response: HttpResponse) -> Self:\n        \"\"\"Initialize a new instance from an existing `BasicCrawlingContext`.\"\"\"\n        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}\n        return cls(http_response=http_response, **context_kwargs)\n\n    async def get_snapshot(self) -> PageSnapshot:\n        \"\"\"Get snapshot of crawled page.\"\"\"\n        return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):\n    \"\"\"The crawling context used by `AbstractHttpCrawler`.\n\n    It provides access to key objects as well as utility functions for handling crawling tasks.\n    \"\"\"\n\n    parsed_content: TParseResult\n    enqueue_links: EnqueueLinksFunction\n    extract_links: ExtractLinksFunction\n\n    @classmethod\n    def from_http_crawling_context(\n        cls,\n        context: HttpCrawlingContext,\n        parsed_content: TParseResult,\n        enqueue_links: EnqueueLinksFunction,\n        extract_links: ExtractLinksFunction,\n    ) -> Self:\n        \"\"\"Initialize a new instance from an 
existing `HttpCrawlingContext`.\"\"\"\n        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}\n        return cls(\n            parsed_content=parsed_content, enqueue_links=enqueue_links, extract_links=extract_links, **context_kwargs\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_abstract_http/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n# These imports have only mandatory dependencies, so they are imported directly.\nfrom ._adaptive_playwright_crawling_context import (\n    AdaptivePlaywrightCrawlingContext,\n    AdaptivePlaywrightPostNavCrawlingContext,\n    AdaptivePlaywrightPreNavCrawlingContext,\n)\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):\n    from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor\nwith _try_import(__name__, 'AdaptivePlaywrightCrawler'):\n    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler\nwith _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):\n    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState\n\n__all__ = [\n    'AdaptivePlaywrightCrawler',\n    'AdaptivePlaywrightCrawlerStatisticState',\n    'AdaptivePlaywrightCrawlingContext',\n    'AdaptivePlaywrightPostNavCrawlingContext',\n    'AdaptivePlaywrightPreNavCrawlingContext',\n    'RenderingType',\n    'RenderingTypePrediction',\n    'RenderingTypePredictor',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py",
    "content": "from __future__ import annotations\n\nimport logging\nfrom collections.abc import Awaitable, Callable, Coroutine\nfrom copy import deepcopy\nfrom dataclasses import dataclass\nfrom logging import getLogger\nfrom random import random\nfrom typing import TYPE_CHECKING, Any, Generic, get_args\n\nfrom bs4 import BeautifulSoup, Tag\nfrom parsel import Selector\nfrom typing_extensions import Self, TypeVar, override\n\nfrom crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.wait import wait_for\nfrom crawlee.crawlers import (\n    AbstractHttpCrawler,\n    AbstractHttpParser,\n    BasicCrawler,\n    BeautifulSoupParserType,\n    HttpCrawlingContext,\n    ParsedHttpCrawlingContext,\n    PlaywrightCrawler,\n    PlaywrightCrawlingContext,\n    PlaywrightPostNavCrawlingContext,\n    PlaywrightPreNavCrawlingContext,\n)\nfrom crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser\nfrom crawlee.crawlers._parsel._parsel_parser import ParselParser\nfrom crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions\nfrom crawlee.statistics import Statistics, StatisticsState\n\nfrom ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState\nfrom ._adaptive_playwright_crawling_context import (\n    AdaptivePlaywrightCrawlingContext,\n    AdaptivePlaywrightPostNavCrawlingContext,\n    AdaptivePlaywrightPreNavCrawlingContext,\n)\nfrom ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor\nfrom ._result_comparator import create_default_comparator\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from typing_extensions import Unpack\n\n    from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions\n\n\nTStaticParseResult = TypeVar('TStaticParseResult')\nTStaticSelectResult = 
TypeVar('TStaticSelectResult')\nTStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext)\n\n\nclass _NonPersistentStatistics(Statistics):\n    \"\"\"Statistics compliant object that is not supposed to do anything when entering/exiting context.\n\n    To be used in sub crawlers.\n    \"\"\"\n\n    def __init__(self) -> None:\n        super().__init__(state_model=StatisticsState)\n\n    async def __aenter__(self) -> Self:\n        self._active = True\n        await self._state.initialize()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        self._active = False\n\n\n@docs_group('Crawlers')\nclass AdaptivePlaywrightCrawler(\n    BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],\n    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],\n):\n    \"\"\"An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.\n\n    It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects\n    that it may bring a performance benefit.\n    It uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.\n\n    ### Usage\n    ```python\n    from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n        playwright_crawler_specific_kwargs={'browser_type': 'chromium'},\n    )\n\n    @crawler.router.default_handler\n    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:\n        # Do some processing using `parsed_content`\n        context.log.info(context.parsed_content.title)\n\n      
  # Locate element h2 within 5 seconds\n        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))\n        # Do stuff with element found by the selector\n        context.log.info(h2)\n\n        # Find more links and enqueue them.\n        await context.enqueue_links()\n        # Save some data.\n        await context.push_data({'Visited url': context.request.url})\n\n    await crawler.run(['https://crawlee.dev/'])\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],\n        rendering_type_predictor: RenderingTypePredictor | None = None,\n        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,\n        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,\n        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,\n        statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,\n        **kwargs: Unpack[_BasicCrawlerOptions],\n    ) -> None:\n        \"\"\"Initialize a new instance. Recommended way to create instance is to call factory methods.\n\n        Recommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.\n\n        Args:\n            rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which\n                rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used.\n            result_checker: Function that evaluates whether crawling result is valid or not.\n            result_comparator: Function that compares two crawling results and decides whether they are equivalent.\n            static_parser: Implementation of `AbstractHttpParser`. 
Parser that will be used for static crawling.\n            static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler.\n            playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler.\n            statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of\n                non-default configuration.\n            kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.\n        \"\"\"\n        # Adaptive crawling related.\n        self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()\n        self.result_checker = result_checker or (lambda _: True)\n        self.result_comparator = result_comparator or create_default_comparator(result_checker)\n\n        # Set default concurrency settings for browser crawlers if not provided\n        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:\n            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)\n\n        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)\n\n        super().__init__(statistics=adaptive_statistics, **kwargs)\n\n        # Sub crawlers related.\n        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()\n\n        # Each sub crawler will use custom logger .\n        static_logger = getLogger('Subcrawler_static')\n        static_logger.setLevel(logging.ERROR)\n        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}\n\n        pw_logger = getLogger('Subcrawler_playwright')\n        pw_logger.setLevel(logging.ERROR)\n        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}\n\n        # Initialize sub crawlers to create their pipelines.\n        
static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)\n\n        static_crawler = static_crawler_class(\n            parser=static_parser,\n            statistics=_NonPersistentStatistics(),\n            **basic_crawler_kwargs_for_static_crawler,\n        )\n        playwright_crawler = PlaywrightCrawler(\n            statistics=_NonPersistentStatistics(),\n            **playwright_crawler_specific_kwargs,\n            **basic_crawler_kwargs_for_pw_crawler,\n        )\n\n        # Register pre navigation hooks on sub crawlers\n        self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]()\n        self._pre_navigation_hooks_pw_only = list[\n            Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]\n        ]()\n\n        async def adaptive_pre_navigation_hook_static(context: BasicCrawlingContext) -> None:\n            for hook in self._pre_navigation_hooks:\n                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))\n\n        async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingContext) -> None:\n            for hook in self._pre_navigation_hooks + self._pre_navigation_hooks_pw_only:\n                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))\n\n        static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)\n        playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)\n\n        # Register post navigation hooks on sub crawlers\n        self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()\n        self._post_navigation_hooks_pw_only = list[\n            Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]\n        ]()\n\n        async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:\n           
 adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)\n            for hook in self._post_navigation_hooks:\n                await hook(adaptive_context)\n\n        async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:\n            adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)\n            for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:\n                await hook(adaptive_context)\n\n        static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)\n        playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)\n\n        self._additional_context_managers = [\n            *self._additional_context_managers,\n            self.rendering_type_predictor,\n            static_crawler.statistics,\n            playwright_crawler.statistics,\n            playwright_crawler._browser_pool,  # noqa: SLF001 # Intentional access to private member.\n        ]\n\n        # Sub crawler pipeline related\n        self._pw_context_pipeline = playwright_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.\n        self._static_context_pipeline = static_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.\n        self._static_parser = static_parser\n\n    @classmethod\n    def with_beautifulsoup_static_parser(\n        cls,\n        rendering_type_predictor: RenderingTypePredictor | None = None,\n        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,\n        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,\n        parser_type: BeautifulSoupParserType = 'lxml',\n        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,\n        statistics: Statistics[StatisticsState] | None = None,\n        **kwargs: 
Unpack[_BasicCrawlerOptions],\n    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]:\n        \"\"\"Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content.\"\"\"\n        if statistics is not None:\n            adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)\n        else:\n            adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)\n        return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag](\n            rendering_type_predictor=rendering_type_predictor,\n            result_checker=result_checker,\n            result_comparator=result_comparator,\n            static_parser=BeautifulSoupParser(parser=parser_type),\n            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,\n            statistics=adaptive_statistics,\n            **kwargs,\n        )\n\n    @classmethod\n    def with_parsel_static_parser(\n        cls,\n        rendering_type_predictor: RenderingTypePredictor | None = None,\n        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,\n        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,\n        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,\n        statistics: Statistics[StatisticsState] | None = None,\n        **kwargs: Unpack[_BasicCrawlerOptions],\n    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]:\n        \"\"\"Create `AdaptivePlaywrightCrawler` that uses `Parsel` for parsing static content.\"\"\"\n        if statistics is not None:\n            adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)\n        else:\n            adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)\n        
return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector](\n            rendering_type_predictor=rendering_type_predictor,\n            result_checker=result_checker,\n            result_comparator=result_comparator,\n            static_parser=ParselParser(),\n            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,\n            statistics=adaptive_statistics,\n            **kwargs,\n        )\n\n    async def _crawl_one(\n        self,\n        rendering_type: RenderingType,\n        context: BasicCrawlingContext,\n        state: dict[str, JsonSerializable] | None = None,\n    ) -> SubCrawlerRun:\n        \"\"\"Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`.\n\n        `SubCrawlerRun` contains either result of the crawl or the exception that was thrown during the crawl.\n        Sub crawler pipeline call is dynamically created based on the `rendering_type`.\n        New copy-like context is created from passed `context` and `state` and is passed to sub crawler pipeline.\n        \"\"\"\n        if state is not None:\n\n            async def get_input_state(\n                default_value: dict[str, JsonSerializable] | None = None,  # noqa:ARG001  # Intentionally unused arguments. Closure, that generates same output regardless of inputs.\n            ) -> dict[str, JsonSerializable]:\n                return state\n\n            use_state_function = get_input_state\n        else:\n            use_state_function = context.use_state\n\n        # New result is created and injected to newly created context. 
This is done to ensure isolation of sub crawlers.\n        result = RequestHandlerRunResult(\n            key_value_store_getter=self.get_key_value_store,\n            request=context.request,\n        )\n        context_linked_to_result = BasicCrawlingContext(\n            request=result.request,\n            session=context.session,\n            proxy_info=context.proxy_info,\n            send_request=context.send_request,\n            add_requests=result.add_requests,\n            push_data=result.push_data,\n            get_key_value_store=result.get_key_value_store,\n            use_state=use_state_function,\n            log=context.log,\n        )\n\n        try:\n            await wait_for(\n                lambda: self._pipeline_call_factory(\n                    rendering_type=rendering_type, context_linked_to_result=context_linked_to_result\n                ),\n                timeout=self._request_handler_timeout,\n                timeout_message=(\n                    f'{rendering_type=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds'\n                ),\n                logger=self._logger,\n            )\n            return SubCrawlerRun(result=result)\n        except Exception as e:\n            return SubCrawlerRun(exception=e)\n\n    def _pipeline_call_factory(\n        self, rendering_type: RenderingType, context_linked_to_result: BasicCrawlingContext\n    ) -> Coroutine[Any, Any, None]:\n        \"\"\"Create sub crawler pipeline call.\"\"\"\n        if rendering_type == 'static':\n\n            async def from_static_pipeline_to_top_router(\n                context: ParsedHttpCrawlingContext[TStaticParseResult],\n            ) -> None:\n                adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(\n                    context=context, parser=self._static_parser\n                )\n                await self.router(adaptive_crawling_context)\n\n            return 
self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]\n\n        if rendering_type == 'client only':\n\n            async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None:\n                adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context(\n                    context=context, parser=self._static_parser\n                )\n                await self.router(adaptive_crawling_context)\n\n            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]\n\n        raise RuntimeError(\n            f'Not a valid rendering type. Must be one of the following: {\", \".join(get_args(RenderingType))}'\n        )\n\n    @override\n    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:\n        \"\"\"Override BasicCrawler method that delegates request processing to sub crawlers.\n\n        To decide which sub crawler should process the request it runs `rendering_type_predictor`.\n        To check if results are valid it uses `result_checker`.\n        To compare results of both sub crawlers it uses `result_comparator`.\n\n        Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts\n        \"\"\"\n        rendering_type_prediction = self.rendering_type_predictor.predict(context.request)\n        should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation\n\n        if not should_detect_rendering_type:\n            self.log.debug(\n                f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}'\n            )\n            if rendering_type_prediction.rendering_type == 'static':\n                context.log.debug(f'Running static request for 
{context.request.url}')\n                self.track_http_only_request_handler_runs()\n\n                static_run = await self._crawl_one(rendering_type='static', context=context)\n                if static_run.result and self.result_checker(static_run.result):\n                    self._context_result_map[context] = static_run.result\n                    return\n                if static_run.exception:\n                    context.log.exception(\n                        msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception\n                    )\n                else:\n                    context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}')\n                    self.track_rendering_type_mispredictions()\n\n        context.log.debug(f'Running browser request handler for {context.request.url}')\n\n        old_state_copy = None\n\n        if should_detect_rendering_type:\n            # Save copy of global state from `use_state` before it can be mutated by browser crawl.\n            # This copy will be used in the static crawl to make sure they both run with same conditions and to\n            # avoid static crawl to modify the state.\n            # (This static crawl is performed only to evaluate rendering type detection.)\n            kvs = await context.get_key_value_store()\n            default_value = dict[str, JsonSerializable]()\n            old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value)\n            old_state_copy = deepcopy(old_state)\n\n        pw_run = await self._crawl_one('client only', context=context)\n        self.track_browser_request_handler_runs()\n\n        if pw_run.exception is not None:\n            raise pw_run.exception\n\n        if pw_run.result:\n            if should_detect_rendering_type:\n                detection_result: RenderingType\n                static_run = await self._crawl_one('static', 
context=context, state=old_state_copy)\n                if static_run.result and self.result_comparator(static_run.result, pw_run.result):\n                    detection_result = 'static'\n                else:\n                    detection_result = 'client only'\n\n                context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')\n                self.rendering_type_predictor.store_result(context.request, detection_result)\n\n            self._context_result_map[context] = pw_run.result\n\n    def pre_navigation_hook(\n        self,\n        hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,\n        *,\n        playwright_only: bool = False,\n    ) -> Callable[[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]], None]:\n        \"\"\"Pre navigation hooks for adaptive crawler are delegated to sub crawlers.\n\n        Optionally parametrized decorator.\n        Hooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`.\n        \"\"\"\n\n        def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:\n            if playwright_only:\n                self._pre_navigation_hooks_pw_only.append(hook)\n            else:\n                self._pre_navigation_hooks.append(hook)\n\n        # No parameter in decorator. 
Execute directly.\n        if hook:\n            register_hooks(hook)\n\n        # Return parametrized decorator that will be executed through decorator syntax if called with parameter.\n        return register_hooks\n\n    def post_navigation_hook(\n        self,\n        hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None,\n        *,\n        playwright_only: bool = False,\n    ) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]:\n        \"\"\"Post navigation hooks for adaptive crawler are delegated to sub crawlers.\n\n        Optionally parametrized decorator.\n        Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising\n        `AdaptiveContextError`.\n        \"\"\"\n\n        def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:\n            if playwright_only:\n                self._post_navigation_hooks_pw_only.append(hook)\n            else:\n                self._post_navigation_hooks.append(hook)\n\n        # No parameter in decorator. Execute directly.\n        if hook:\n            register_hooks(hook)\n\n        # Return parametrized decorator that will be executed through decorator syntax if called with parameter.\n        return register_hooks\n\n    def track_http_only_request_handler_runs(self) -> None:\n        self.statistics.state.http_only_request_handler_runs += 1\n\n    def track_browser_request_handler_runs(self) -> None:\n        self.statistics.state.browser_request_handler_runs += 1\n\n    def track_rendering_type_mispredictions(self) -> None:\n        self.statistics.state.rendering_type_mispredictions += 1\n\n\n@dataclass(frozen=True)\nclass SubCrawlerRun:\n    result: RequestHandlerRunResult | None = None\n    exception: Exception | None = None\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py",
    "content": "from __future__ import annotations\n\nfrom typing import Annotated\n\nfrom pydantic import ConfigDict, Field\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.statistics import StatisticsState\n\n\n@docs_group('Statistics')\nclass AdaptivePlaywrightCrawlerStatisticState(StatisticsState):\n    \"\"\"Statistic data about a crawler run with additional information related to adaptive crawling.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')\n\n    http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0\n    \"\"\"Number representing how many times static http based crawling was used.\"\"\"\n\n    browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0\n    \"\"\"Number representing how many times browser based crawling was used.\"\"\"\n\n    rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0\n    \"\"\"Number representing how many times the predictor gave incorrect prediction.\"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass, fields\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING, Generic, TypeVar\n\nfrom playwright.async_api import TimeoutError as PlaywrightTimeoutError\n\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext\nfrom crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext\nfrom crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext\nfrom crawlee.crawlers._playwright._types import PlaywrightHttpResponse\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable, Sequence\n\n    from playwright.async_api import Page, Response\n    from typing_extensions import Self\n\n    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions\n\n\nTStaticParseResult = TypeVar('TStaticParseResult')\nTStaticSelectResult = TypeVar('TStaticSelectResult')\n\n\nclass AdaptiveContextError(RuntimeError):\n    pass\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass AdaptivePlaywrightCrawlingContext(\n    ParsedHttpCrawlingContext[TStaticParseResult],\n    Generic[TStaticParseResult, TStaticSelectResult],\n):\n    _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]\n    \"\"\"The crawling context used by `AdaptivePlaywrightCrawler`.\n\n    It provides access to key objects as well as utility functions for handling crawling tasks.\n    \"\"\"\n\n    _response: Response | None = None\n    _infinite_scroll: Callable[[], Awaitable[None]] | None = None\n    _page: Page | None = None\n\n    @property\n    def page(self) -> Page:\n        \"\"\"The Playwright `Page` object for the current page.\n\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if not 
self._page:\n            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')\n        return self._page\n\n    @property\n    def infinite_scroll(self) -> Callable[[], Awaitable[None]]:\n        \"\"\"A function to perform infinite scrolling on the page.\n\n        This scrolls to the bottom, triggering the loading of additional content if present.\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if not self._infinite_scroll:\n            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')\n        return self._infinite_scroll\n\n    @property\n    def response(self) -> Response:\n        \"\"\"The Playwright `Response` object containing the response details for the current URL.\n\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if not self._response:\n            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')\n        return self._response\n\n    async def wait_for_selector(self, selector: str, timeout: timedelta = timedelta(seconds=5)) -> None:\n        \"\"\"Locate element by css selector and return `None` once it is found.\n\n        If element is not found within timeout, `TimeoutError` is raised.\n\n        Args:\n            selector: Css selector to be used to locate specific element on page.\n            timeout: Timeout that defines how long the function wait for the selector to appear.\n        \"\"\"\n        if await self._static_parser.select(await self.parse_with_static_parser(), selector):\n            return\n        await self.page.locator(selector).wait_for(timeout=timeout.total_seconds() * 1000)\n\n    async def query_selector_one(\n        self, selector: str, timeout: timedelta = timedelta(seconds=5)\n    ) -> TStaticSelectResult | None:\n        \"\"\"Locate element by css selector and return first element found.\n\n        If element is not found within timeout, 
`TimeoutError` is raised.\n\n        Args:\n            selector: Css selector to be used to locate specific element on page.\n            timeout: Timeout that defines how long the function wait for the selector to appear.\n\n        Returns:\n            Result of used static parser `select` method.\n        \"\"\"\n        if matches := await self.query_selector_all(selector=selector, timeout=timeout):\n            return matches[0]\n        return None\n\n    async def query_selector_all(\n        self, selector: str, timeout: timedelta = timedelta(seconds=5)\n    ) -> Sequence[TStaticSelectResult]:\n        \"\"\"Locate element by css selector and return all elements found.\n\n        If element is not found within timeout, `TimeoutError` is raised.\n\n        Args:\n            selector: Css selector to be used to locate specific element on page.\n            timeout: Timeout that defines how long the function wait for the selector to appear.\n\n        Returns:\n            List of results of used static parser `select` method.\n        \"\"\"\n        if static_content := await self._static_parser.select(await self.parse_with_static_parser(), selector):\n            # Selector found in static content.\n            return static_content\n\n        locator = self.page.locator(selector)\n        try:\n            await locator.wait_for(timeout=timeout.total_seconds() * 1000)\n        except PlaywrightTimeoutError:\n            # Selector not found at all.\n            return ()\n\n        parsed_selector = await self._static_parser.select(\n            await self._static_parser.parse_text(await locator.evaluate('el => el.outerHTML')), selector\n        )\n        if parsed_selector is not None:\n            # Selector found by browser after some wait time and selected by static parser.\n            return parsed_selector\n\n        # Selector found by browser after some wait time, but could not be selected by static parser.\n        raise 
AdaptiveContextError(\n            'Element exists on the page and Playwright was able to locate it, but the static content parser of selected'\n            'static crawler does support such selector.'\n        )\n\n    async def parse_with_static_parser(\n        self, selector: str | None = None, timeout: timedelta = timedelta(seconds=5)\n    ) -> TStaticParseResult:\n        \"\"\"Parse whole page with static parser. If `selector` argument is used, wait for selector first.\n\n        If element is not found within timeout, TimeoutError is raised.\n\n        Args:\n            selector: css selector to be used to locate specific element on page.\n            timeout: timeout that defines how long the function wait for the selector to appear.\n\n        Returns:\n            Result of used static parser `parse_text` method.\n        \"\"\"\n        if selector:\n            await self.wait_for_selector(selector, timeout)\n        if self._page:\n            return await self._static_parser.parse_text(await self.page.content())\n        return self.parsed_content\n\n    @classmethod\n    def from_parsed_http_crawling_context(\n        cls,\n        context: ParsedHttpCrawlingContext[TStaticParseResult],\n        parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],\n    ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]:\n        \"\"\"Initialize a new instance from an existing `ParsedHttpCrawlingContext`.\"\"\"\n        return cls(_static_parser=parser, **{field.name: getattr(context, field.name) for field in fields(context)})\n\n    @classmethod\n    async def from_playwright_crawling_context(\n        cls,\n        context: PlaywrightCrawlingContext,\n        parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],\n    ) -> AdaptivePlaywrightCrawlingContext[TStaticParseResult, TStaticSelectResult]:\n        \"\"\"Initialize a new instance from an existing `PlaywrightCrawlingContext`.\"\"\"\n        
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}\n        # Remove playwright specific attributes and pass them as private instead to be available as property.\n        context_kwargs['_response'] = context_kwargs.pop('response')\n        context_kwargs['_page'] = context_kwargs.pop('page')\n        context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')\n        # This might not be always available.\n        protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol')\n        http_response = await PlaywrightHttpResponse.from_playwright_response(\n            response=context.response, protocol=protocol_guess or ''\n        )\n        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.\n        context_kwargs.pop('block_requests')\n        context_kwargs.pop('goto_options')\n        return cls(\n            parsed_content=await parser.parse(http_response),\n            http_response=http_response,\n            _static_parser=parser,\n            **context_kwargs,\n        )\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):\n    \"\"\"A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\n    Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext.\n    \"\"\"\n\n    _page: Page | None = None\n    block_requests: BlockRequestsFunction | None = None\n    \"\"\"Blocks network requests matching specified URL patterns.\"\"\"\n\n    goto_options: GotoOptions | None = None\n    \"\"\"Additional options to pass to Playwright's `Page.goto()` method. 
The `timeout` option is not supported.\"\"\"\n\n    @property\n    def page(self) -> Page:\n        \"\"\"The Playwright `Page` object for the current page.\n\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if self._page is not None:\n            return self._page\n        raise AdaptiveContextError(\n            'Page was crawled with static sub crawler and not with crawled with PlaywrightCrawler. For Playwright only '\n            'hooks please use `playwright_only`=True when registering the hook. '\n            'For example: @crawler.pre_navigation_hook(playwright_only=True)'\n        )\n\n    @classmethod\n    def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self:\n        \"\"\"Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`.\"\"\"\n        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}\n        context_kwargs['_page'] = context_kwargs.pop('page', None)\n\n        # For static sub crawler replace block requests by function doing nothing.\n        async def dummy_block_requests(\n            url_patterns: list[str] | None = None,  # noqa:ARG001\n            extra_url_patterns: list[str] | None = None,  # noqa:ARG001\n        ) -> None:\n            return\n\n        context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)\n        return cls(**context_kwargs)\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext):\n    \"\"\"A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext.\n\n    Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext.\n    \"\"\"\n\n    _page: Page | None = None\n    _response: Response | None = None\n\n    @property\n    def page(self) -> Page:\n        \"\"\"The Playwright `Page` object for the 
current page.\n\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if not self._page:\n            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')\n        return self._page\n\n    @property\n    def response(self) -> Response:\n        \"\"\"The Playwright `Response` object containing the response details for the current URL.\n\n        Raises `AdaptiveContextError` if accessed during static crawling.\n        \"\"\"\n        if not self._response:\n            raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.')\n        return self._response\n\n    @classmethod\n    async def from_post_navigation_context(\n        cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext\n    ) -> Self:\n        \"\"\"Initialize a new instance from an existing post-navigation context.\"\"\"\n        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}\n\n        context_kwargs['_page'] = context_kwargs.pop('page', None)\n        context_kwargs['_response'] = context_kwargs.pop('response', None)\n\n        # block_requests and goto_options are useful only on pre-navigation contexts.\n        context_kwargs.pop('block_requests', None)\n        context_kwargs.pop('goto_options', None)\n\n        if isinstance(context, PlaywrightPostNavCrawlingContext):\n            protocol_guess = await context_kwargs['_page'].evaluate(\n                '() => performance.getEntries()[0]?.nextHopProtocol'\n            )\n            context_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response(\n                response=context.response, protocol=protocol_guess or ''\n            )\n        return cls(**context_kwargs)\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom itertools import zip_longest\nfrom logging import getLogger\nfrom statistics import mean\nfrom typing import TYPE_CHECKING, Annotated, Literal\nfrom urllib.parse import urlparse\n\nfrom jaro import jaro_winkler_metric\nfrom pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator\nfrom sklearn.linear_model import LogisticRegression\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recoverable_state import RecoverableState\n\nfrom ._utils import sklearn_model_serializer, sklearn_model_validator\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from crawlee import Request\n\nlogger = getLogger(__name__)\n\nUrlComponents = list[str]\nRenderingType = Literal['static', 'client only']\nFeatureVector = tuple[float, float]\n\n\nclass RenderingTypePredictorState(BaseModel):\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    model: Annotated[\n        LogisticRegression,\n        Field(LogisticRegression),\n        PlainValidator(sklearn_model_validator),\n        PlainSerializer(sklearn_model_serializer),\n    ]\n\n    labels_coefficients: Annotated[defaultdict[str, float], Field(alias='labelsCoefficients')]\n\n\n@docs_group('Other')\n@dataclass(frozen=True)\nclass RenderingTypePrediction:\n    \"\"\"Rendering type recommendation with detection probability recommendation.\"\"\"\n\n    rendering_type: RenderingType\n    \"\"\"Recommended rendering type.\"\"\"\n    detection_probability_recommendation: float\n    \"\"\"Recommended rendering detection probability. 
Expected values between 0-1.\n\n    Zero represents absolute confidence in `rendering_type` recommendation.\n    One represents no confidence in `rendering_type` recommendation.\"\"\"\n\n\n@docs_group('Other')\nclass RenderingTypePredictor(ABC):\n    \"\"\"Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"Initialize a new instance.\"\"\"\n        # Flag to indicate the state.\n        self._active = False\n\n    @abstractmethod\n    def predict(self, request: Request) -> RenderingTypePrediction:\n        \"\"\"Get `RenderingTypePrediction` based on the input request.\n\n        Args:\n            request: `Request` instance for which the prediction is made.\n        \"\"\"\n\n    @abstractmethod\n    def store_result(self, request: Request, rendering_type: RenderingType) -> None:\n        \"\"\"Store prediction results and retrain the model.\n\n        Args:\n            request: Used request.\n            rendering_type: Known suitable `RenderingType`.\n        \"\"\"\n\n    async def initialize(self) -> None:\n        \"\"\"Initialize additional resources required for the predictor operation.\"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n        self._active = True\n\n    async def clear(self) -> None:\n        \"\"\"Clear and release additional resources used by the predictor.\"\"\"\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n        self._active = False\n\n    async def __aenter__(self) -> RenderingTypePredictor:\n        \"\"\"Initialize the predictor upon entering the context manager.\"\"\"\n        await self.initialize()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | 
None,\n    ) -> None:\n        \"\"\"Clear the predictor upon exiting the context manager.\"\"\"\n        await self.clear()\n\n\n@docs_group('Other')\nclass DefaultRenderingTypePredictor(RenderingTypePredictor):\n    \"\"\"Stores rendering type for previously crawled URLs and predicts the rendering type for unvisited urls.\n\n    `RenderingTypePredictor` implementation based on logistic regression:\n    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n    \"\"\"\n\n    def __init__(\n        self,\n        detection_ratio: float = 0.1,\n        *,\n        persistence_enabled: bool = False,\n        persist_state_key: str = 'rendering-type-predictor-state',\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            detection_ratio: A number between 0 and 1 that determines the desired ratio of rendering type detections.\n            persist_state_key: Key in the key-value storage where the trained model parameters will be saved.\n            If None, defaults to 'rendering-type-predictor-state'.\n            persistence_enabled: Whether to enable persistence of the trained model parameters for reuse.\n\n        \"\"\"\n        super().__init__()\n\n        self._rendering_type_detection_results: dict[RenderingType, dict[str, list[UrlComponents]]] = {\n            'static': defaultdict(list),\n            'client only': defaultdict(list),\n        }\n        self._detection_ratio = max(0, min(1, detection_ratio))\n\n        # Used to increase detection probability recommendation for initial recommendations of each label.\n        # Reaches 1 (no additional increase) after n samples of specific label is already present in\n        # `self._rendering_type_detection_results`.\n        n = 3\n\n        self._state = RecoverableState(\n            default_state=RenderingTypePredictorState(\n                model=LogisticRegression(max_iter=1000), labels_coefficients=defaultdict(lambda: n + 2)\n   
         ),\n            persist_state_key=persist_state_key,\n            persistence_enabled=persistence_enabled,\n            logger=logger,\n        )\n\n    @override\n    async def initialize(self) -> None:\n        \"\"\"Get current state of the predictor.\"\"\"\n        await super().initialize()\n\n        if not self._state.is_initialized:\n            await self._state.initialize()\n\n    @override\n    async def clear(self) -> None:\n        \"\"\"Clear the predictor state.\"\"\"\n        await super().clear()\n\n        if self._state.is_initialized:\n            await self._state.teardown()\n\n    @override\n    def predict(self, request: Request) -> RenderingTypePrediction:\n        \"\"\"Get `RenderingTypePrediction` based on the input request.\n\n        Args:\n            request: `Request` instance for which the prediction is made.\n        \"\"\"\n        similarity_threshold = 0.1  #  Prediction probability difference threshold to consider prediction unreliable.\n        label = request.label or ''\n\n        # Check that the model has already been fitted.\n        if hasattr(self._state.current_value.model, 'coef_'):\n            url_feature = self._calculate_feature_vector(get_url_components(request.url), label)\n            # Are both calls expensive?\n            prediction = self._state.current_value.model.predict([url_feature])[0]\n            probability = self._state.current_value.model.predict_proba([url_feature])[0]\n\n            if abs(probability[0] - probability[1]) < similarity_threshold:\n                # Prediction not reliable.\n                detection_probability_recommendation = 1.0\n            else:\n                detection_probability_recommendation = self._detection_ratio\n                # Increase recommendation for uncommon labels.\n                detection_probability_recommendation *= self._state.current_value.labels_coefficients[label]\n\n            return RenderingTypePrediction(\n                
rendering_type=('client only', 'static')[int(prediction)],\n                detection_probability_recommendation=detection_probability_recommendation,\n            )\n        # No data available yet.\n        return RenderingTypePrediction(rendering_type='client only', detection_probability_recommendation=1)\n\n    @override\n    def store_result(self, request: Request, rendering_type: RenderingType) -> None:\n        \"\"\"Store prediction results and retrain the model.\n\n        Args:\n            request: Used `Request` instance.\n            rendering_type: Known suitable `RenderingType` for the used `Request` instance.\n        \"\"\"\n        label = request.label or ''\n        self._rendering_type_detection_results[rendering_type][label].append(get_url_components(request.url))\n        if self._state.current_value.labels_coefficients[label] > 1:\n            self._state.current_value.labels_coefficients[label] -= 1\n        self._retrain()\n\n    def _retrain(self) -> None:\n        x: list[FeatureVector] = [(0, 1), (1, 0)]\n        y: list[float] = [0, 1]\n\n        for rendering_type, urls_by_label in self._rendering_type_detection_results.items():\n            encoded_rendering_type = 1 if rendering_type == 'static' else 0\n            for label, urls in urls_by_label.items():\n                for url_components in urls:\n                    x.append(self._calculate_feature_vector(url_components, label))\n                    y.append(encoded_rendering_type)\n\n        self._state.current_value.model.fit(x, y)\n\n    def _calculate_mean_similarity(self, url: UrlComponents, label: str, rendering_type: RenderingType) -> float:\n        if not self._rendering_type_detection_results[rendering_type][label]:\n            return 0\n        return mean(\n            calculate_url_similarity(url, known_url_components)\n            for known_url_components in self._rendering_type_detection_results[rendering_type][label]\n        )\n\n    def 
_calculate_feature_vector(self, url: UrlComponents, label: str) -> tuple[float, float]:\n        return (\n            self._calculate_mean_similarity(url, label, 'static'),\n            self._calculate_mean_similarity(url, label, 'client only'),\n        )\n\n\ndef get_url_components(url: str) -> UrlComponents:\n    \"\"\"Get list of url components where first component is host name.\"\"\"\n    parsed_url = urlparse(url)\n    if parsed_url.path:\n        return [parsed_url.netloc, *parsed_url.path.strip('/').split('/')]\n    return [parsed_url.netloc]\n\n\ndef calculate_url_similarity(url_1: UrlComponents, url_2: UrlComponents) -> float:\n    \"\"\"Calculate url similarity based on host name and path components similarity.\n\n    Return 0 if different host names.\n    Compare path components using the Jaro-Winkler method and assign 1 or 0 value based on similarity_cutoff for each\n    path component. Return their mean.\n    \"\"\"\n    # Anything with jaro_winkler_metric less than this value is considered completely different,\n    # otherwise considered the same.\n    similarity_cutoff = 0.8\n\n    if (url_1[0] != url_2[0]) or not url_1 or not url_2:\n        return 0\n    if url_1 == url_2:\n        return 1\n\n    # Each additional path component from longer path is compared to empty string.\n    return mean(\n        1 if jaro_winkler_metric(path_1, path_2) > similarity_cutoff else 0\n        for path_1, path_2 in zip_longest(url_1[1:], url_2[1:], fillvalue='')\n    )\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n\n    from crawlee._types import RequestHandlerRunResult\n\n\ndef create_default_comparator(\n    result_checker: Callable[[RequestHandlerRunResult], bool] | None,\n) -> Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool]:\n    \"\"\"Create a default comparator function for evaluating request handler results.\"\"\"\n    if result_checker:\n        # Fallback comparator if only user-specific checker is defined.\n        return lambda result_1, result_2: result_checker(result_1) and result_checker(result_2)\n    # Fallback default comparator.\n    return push_data_only_comparator\n\n\ndef full_result_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:\n    \"\"\"Compare results by comparing all their parts.\n\n    Comparison of `add_requests_calls` will consider same url requests with different parameters as different\n    For example following two request will be considered as different requests:\n    https://sdk.apify.com/docs/guides/getting-started\n    https://sdk.apify.com/docs/guides/getting-started?__hsfp=1136113150&__hssc=7591405.1.173549427712\n    \"\"\"\n    return (\n        (result_1.push_data_calls == result_2.push_data_calls)\n        and (result_1.add_requests_calls == result_2.add_requests_calls)\n        and (result_1.key_value_store_changes == result_2.key_value_store_changes)\n    )\n\n\ndef push_data_only_comparator(result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult) -> bool:\n    \"\"\"Compare results by comparing their push data calls. Ignore other parts of results in comparison.\"\"\"\n    return result_1.push_data_calls == result_2.push_data_calls\n"
  },
  {
    "path": "src/crawlee/crawlers/_adaptive_playwright/_utils.py",
    "content": "from typing import Any\n\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\n\n\ndef sklearn_model_validator(v: LogisticRegression | dict[str, Any]) -> LogisticRegression:\n    if isinstance(v, LogisticRegression):\n        return v\n\n    model = LogisticRegression(max_iter=1000)\n    if v.get('is_fitted', False):\n        model.coef_ = np.array(v['coef'])\n        model.intercept_ = np.array(v['intercept'])\n        model.classes_ = np.array(v['classes'])\n        model.n_iter_ = np.array(v.get('n_iter', [1000]))\n\n    return model\n\n\ndef sklearn_model_serializer(model: LogisticRegression) -> dict[str, Any]:\n    if hasattr(model, 'coef_'):\n        return {\n            'coef': np.asarray(model.coef_).tolist(),\n            'intercept': model.intercept_.tolist(),\n            'classes': model.classes_.tolist(),\n            'n_iter': model.n_iter_.tolist() if hasattr(model, 'n_iter_') else [1000],\n            'is_fitted': True,\n            'max_iter': model.max_iter,\n            'solver': model.solver,\n        }\n    return {'is_fitted': False, 'max_iter': model.max_iter, 'solver': model.solver}\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/__init__.py",
    "content": "from ._basic_crawler import BasicCrawler, BasicCrawlerOptions\nfrom ._basic_crawling_context import BasicCrawlingContext\nfrom ._context_pipeline import ContextPipeline\n\n__all__ = [\n    'BasicCrawler',\n    'BasicCrawlerOptions',\n    'BasicCrawlingContext',\n    'ContextPipeline',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/_basic_crawler.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/basic-crawler/src/internals/basic-crawler.ts\nfrom __future__ import annotations\n\nimport asyncio\nimport functools\nimport logging\nimport signal\nimport sys\nimport tempfile\nimport threading\nimport traceback\nfrom asyncio import CancelledError\nfrom collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence\nfrom contextlib import AsyncExitStack, suppress\nfrom datetime import timedelta\nfrom functools import partial\nfrom io import StringIO\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast\nfrom urllib.parse import ParseResult, urlparse\nfrom weakref import WeakKeyDictionary\n\nfrom cachetools import LRUCache\nfrom tldextract import TLDExtract\nfrom typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never\nfrom yarl import URL\n\nfrom crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator\nfrom crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus\nfrom crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level\nfrom crawlee._request import Request, RequestOptions, RequestState\nfrom crawlee._service_locator import ServiceLocator\nfrom crawlee._types import (\n    BasicCrawlingContext,\n    EnqueueLinksKwargs,\n    ExportDataCsvKwargs,\n    ExportDataJsonKwargs,\n    GetKeyValueStoreFromRequestHandlerFunction,\n    HttpHeaders,\n    HttpPayload,\n    LogLevel,\n    RequestHandlerRunResult,\n    SendRequestFunction,\n    SkippedReason,\n)\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee._utils.robots import RobotsTxtFile\nfrom crawlee._utils.urls import convert_to_absolute_url, is_url_absolute\nfrom crawlee._utils.wait import wait_for\nfrom crawlee._utils.web 
import is_status_code_client_error, is_status_code_server_error\nfrom crawlee.errors import (\n    ContextPipelineInitializationError,\n    ContextPipelineInterruptedError,\n    HttpClientStatusCodeError,\n    HttpStatusCodeError,\n    RequestCollisionError,\n    RequestHandlerError,\n    SessionError,\n    UserDefinedErrorHandlerError,\n    UserHandlerTimeoutError,\n)\nfrom crawlee.events._types import Event, EventCrawlerStatusData\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.router import Router\nfrom crawlee.sessions import SessionPool\nfrom crawlee.statistics import Statistics, StatisticsState\nfrom crawlee.storages import Dataset, KeyValueStore, RequestQueue\n\nfrom ._context_pipeline import ContextPipeline\nfrom ._context_utils import swapped_context\nfrom ._logging_utils import (\n    get_one_line_error_summary_if_possible,\n    reduce_asyncio_timeout_error_to_relevant_traceback_parts,\n)\n\nif TYPE_CHECKING:\n    import re\n    from collections.abc import Iterator\n    from contextlib import AbstractAsyncContextManager\n\n    from crawlee._types import (\n        ConcurrencySettings,\n        EnqueueLinksFunction,\n        ExtractLinksFunction,\n        GetDataKwargs,\n        HttpMethod,\n        JsonSerializable,\n        PushDataKwargs,\n    )\n    from crawlee.configuration import Configuration\n    from crawlee.events import EventManager\n    from crawlee.http_clients import HttpClient, HttpResponse\n    from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo\n    from crawlee.request_loaders import RequestManager\n    from crawlee.sessions import Session\n    from crawlee.statistics import FinalStatistics\n    from crawlee.storage_clients import StorageClient\n    from crawlee.storage_clients.models import DatasetItemsListPage\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)\nTStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, 
default=StatisticsState)\nTRequestIterator = TypeVar('TRequestIterator', str, Request)\nTParams = ParamSpec('TParams')\nT = TypeVar('T')\n\nErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]\nFailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]\nSkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]\n\n\nclass _BasicCrawlerOptions(TypedDict):\n    \"\"\"Non-generic options the `BasicCrawler` constructor.\"\"\"\n\n    configuration: NotRequired[Configuration]\n    \"\"\"The `Configuration` instance. Some of its properties are used as defaults for the crawler.\"\"\"\n\n    event_manager: NotRequired[EventManager]\n    \"\"\"The event manager for managing events for the crawler and all its components.\"\"\"\n\n    storage_client: NotRequired[StorageClient]\n    \"\"\"The storage client for managing storages for the crawler and all its components.\"\"\"\n\n    request_manager: NotRequired[RequestManager]\n    \"\"\"Manager of requests that should be processed by the crawler.\"\"\"\n\n    session_pool: NotRequired[SessionPool]\n    \"\"\"A custom `SessionPool` instance, allowing the use of non-default configuration.\"\"\"\n\n    proxy_configuration: NotRequired[ProxyConfiguration]\n    \"\"\"HTTP proxy configuration used when making requests.\"\"\"\n\n    http_client: NotRequired[HttpClient]\n    \"\"\"HTTP client used by `BasicCrawlingContext.send_request` method.\"\"\"\n\n    max_request_retries: NotRequired[int]\n    \"\"\"Specifies the maximum number of retries allowed for a request if its processing fails.\n    This includes retries due to navigation errors or errors thrown from user-supplied functions\n    (`request_handler`, `pre_navigation_hooks` etc.).\n\n    This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).\"\"\"\n\n    max_requests_per_crawl: NotRequired[int | None]\n    \"\"\"Maximum number of pages to open during a crawl. 
The crawl stops upon reaching this limit.\n    Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.\n    Due to concurrency settings, the actual number of pages visited may slightly exceed this value.\"\"\"\n\n    max_session_rotations: NotRequired[int]\n    \"\"\"Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs\n    or if the website blocks the request.\n\n    The session rotations are not counted towards the `max_request_retries` limit.\n    \"\"\"\n\n    max_crawl_depth: NotRequired[int | None]\n    \"\"\"Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.\n    The crawl depth starts at 0 for initial requests and increases with each subsequent level of links.\n    Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests.\n    If not set, crawling continues without depth restrictions.\n    \"\"\"\n\n    use_session_pool: NotRequired[bool]\n    \"\"\"Enable the use of a session pool for managing sessions during crawling.\"\"\"\n\n    retry_on_blocked: NotRequired[bool]\n    \"\"\"If True, the crawler attempts to bypass bot protections automatically.\"\"\"\n\n    concurrency_settings: NotRequired[ConcurrencySettings]\n    \"\"\"Settings to fine-tune concurrency levels.\"\"\"\n\n    request_handler_timeout: NotRequired[timedelta]\n    \"\"\"Maximum duration allowed for a single request handler to run.\"\"\"\n\n    abort_on_error: NotRequired[bool]\n    \"\"\"If True, the crawler stops immediately when any request handler error occurs.\"\"\"\n\n    configure_logging: NotRequired[bool]\n    \"\"\"If True, the crawler will set up logging infrastructure automatically.\"\"\"\n\n    statistics_log_format: NotRequired[Literal['table', 'inline']]\n    \"\"\"If 'table', displays crawler statistics as formatted tables in logs. 
If 'inline', outputs statistics as plain\n    text log messages.\n    \"\"\"\n\n    keep_alive: NotRequired[bool]\n    \"\"\"Flag that can keep crawler running even when there are no requests in queue.\"\"\"\n\n    additional_http_error_status_codes: NotRequired[Iterable[int]]\n    \"\"\"Additional HTTP status codes to treat as errors, triggering automatic retries when encountered.\"\"\"\n\n    ignore_http_error_status_codes: NotRequired[Iterable[int]]\n    \"\"\"HTTP status codes that are typically considered errors but should be treated as successful responses.\"\"\"\n\n    _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]\n    \"\"\"Additional context managers used throughout the crawler lifecycle. Intended for use by\n    subclasses rather than direct instantiation of `BasicCrawler`.\"\"\"\n\n    _logger: NotRequired[logging.Logger]\n    \"\"\"A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by\n    subclasses rather than direct instantiation of `BasicCrawler`.\"\"\"\n\n    respect_robots_txt_file: NotRequired[bool]\n    \"\"\"If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,\n    and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.\"\"\"\n\n    status_message_logging_interval: NotRequired[timedelta]\n    \"\"\"Interval for logging the crawler status messages.\"\"\"\n\n    status_message_callback: NotRequired[\n        Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]\n    ]\n    \"\"\"Allows overriding the default status message. The default status message is provided in the parameters.\n    Returning `None` suppresses the status message.\"\"\"\n\n    id: NotRequired[int]\n    \"\"\"Identifier used for crawler state tracking. 
Use the same id across multiple crawlers to share state between\n    them.\"\"\"\n\n\nclass _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):\n    \"\"\"Generic options the `BasicCrawler` constructor.\"\"\"\n\n    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]\n    \"\"\"A callable responsible for handling requests.\"\"\"\n\n    _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]\n    \"\"\"Enables extending the request lifecycle and modifying the crawling context. Intended for use by\n    subclasses rather than direct instantiation of `BasicCrawler`.\"\"\"\n\n    statistics: NotRequired[Statistics[TStatisticsState]]\n    \"\"\"A custom `Statistics` instance, allowing the use of non-default configuration.\"\"\"\n\n\nclass BasicCrawlerOptions(\n    _BasicCrawlerOptions,\n    _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],\n    Generic[TCrawlingContext, TStatisticsState],\n):\n    \"\"\"Arguments for the `BasicCrawler` constructor.\n\n    It is intended for typing forwarded `__init__` arguments in the subclasses.\n    \"\"\"\n\n\n@docs_group('Crawlers')\nclass BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):\n    \"\"\"A basic web crawler providing a framework for crawling websites.\n\n    The `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their\n    own page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific\n    purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,\n    `BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full\n    control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic\n    yourself.\n\n    The crawling process begins with URLs provided by a `RequestProvider` instance. 
Each request is then\n    handled by a user-defined `request_handler` function, which processes the page and extracts the data.\n\n    The `BasicCrawler` includes several common features for crawling, such as:\n        - automatic scaling based on the system resources,\n        - retries for failed requests,\n        - session management,\n        - statistics tracking,\n        - request routing via labels,\n        - proxy rotation,\n        - direct storage interaction helpers,\n        - and more.\n    \"\"\"\n\n    _CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'\n    _request_handler_timeout_text = 'Request handler timed out after'\n    __next_id = 0\n\n    def __init__(\n        self,\n        *,\n        configuration: Configuration | None = None,\n        event_manager: EventManager | None = None,\n        storage_client: StorageClient | None = None,\n        request_manager: RequestManager | None = None,\n        session_pool: SessionPool | None = None,\n        proxy_configuration: ProxyConfiguration | None = None,\n        http_client: HttpClient | None = None,\n        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,\n        max_request_retries: int = 3,\n        max_requests_per_crawl: int | None = None,\n        max_session_rotations: int = 10,\n        max_crawl_depth: int | None = None,\n        use_session_pool: bool = True,\n        retry_on_blocked: bool = True,\n        additional_http_error_status_codes: Iterable[int] | None = None,\n        ignore_http_error_status_codes: Iterable[int] | None = None,\n        concurrency_settings: ConcurrencySettings | None = None,\n        request_handler_timeout: timedelta = timedelta(minutes=1),\n        statistics: Statistics[TStatisticsState] | None = None,\n        abort_on_error: bool = False,\n        keep_alive: bool = False,\n        configure_logging: bool = True,\n        statistics_log_format: Literal['table', 'inline'] = 'table',\n        respect_robots_txt_file: bool = 
False,\n        status_message_logging_interval: timedelta = timedelta(seconds=10),\n        status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]\n        | None = None,\n        id: int | None = None,\n        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,\n        _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,\n        _logger: logging.Logger | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            configuration: The `Configuration` instance. Some of its properties are used as defaults for the crawler.\n            event_manager: The event manager for managing events for the crawler and all its components.\n            storage_client: The storage client for managing storages for the crawler and all its components.\n            request_manager: Manager of requests that should be processed by the crawler.\n            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.\n            proxy_configuration: HTTP proxy configuration used when making requests.\n            http_client: HTTP client used by `BasicCrawlingContext.send_request` method.\n            request_handler: A callable responsible for handling requests.\n            max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.\n                This includes retries due to navigation errors or errors thrown from user-supplied functions\n                (`request_handler`, `pre_navigation_hooks` etc.).\n                This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).\n            max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching\n                this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. 
`None` means\n                no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed\n                this value. If used together with `keep_alive`, then the crawler will be kept alive only until\n                `max_requests_per_crawl` is achieved.\n            max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session\n                if a proxy error occurs or if the website blocks the request.\n                The session rotations are not counted towards the `max_request_retries` limit.\n            max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond\n                this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level\n                of links. Requests at the maximum depth will still be processed, but no new links will be enqueued\n                from those requests. If not set, crawling continues without depth restrictions.\n            use_session_pool: Enable the use of a session pool for managing sessions during crawling.\n            retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.\n            additional_http_error_status_codes: Additional HTTP status codes to treat as errors,\n                triggering automatic retries when encountered.\n            ignore_http_error_status_codes: HTTP status codes that are typically considered errors but should be treated\n                as successful responses.\n            concurrency_settings: Settings to fine-tune concurrency levels.\n            request_handler_timeout: Maximum duration allowed for a single request handler to run.\n            statistics: A custom `Statistics` instance, allowing the use of non-default configuration.\n            abort_on_error: If True, the crawler stops immediately when any request handler error occurs.\n            keep_alive: If True, it will keep 
crawler alive even if there are no requests in queue.\n                Use `crawler.stop()` to exit the crawler.\n            configure_logging: If True, the crawler will set up logging infrastructure automatically.\n            statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',\n                outputs statistics as plain text log messages.\n            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file\n                for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added\n                via `EnqueueLinksFunction`\n            status_message_logging_interval: Interval for logging the crawler status messages.\n            status_message_callback: Allows overriding the default status message. The default status message is\n                provided in the parameters. Returning `None` suppresses the status message.\n            id: Identifier used for crawler state tracking. 
Use the same id across multiple crawlers to share state\n                between them.\n            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.\n                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.\n            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.\n                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.\n            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.\n                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.\n        \"\"\"\n        if id is None:\n            self._id = BasicCrawler.__next_id\n            BasicCrawler.__next_id += 1\n        else:\n            self._id = id\n\n        implicit_event_manager_with_explicit_config = False\n        if not configuration:\n            configuration = service_locator.get_configuration()\n        elif not event_manager:\n            implicit_event_manager_with_explicit_config = True\n\n        if not storage_client:\n            storage_client = service_locator.get_storage_client()\n\n        if not event_manager:\n            event_manager = service_locator.get_event_manager()\n\n        self._service_locator = ServiceLocator(\n            configuration=configuration, storage_client=storage_client, event_manager=event_manager\n        )\n\n        config = self._service_locator.get_configuration()\n\n        # Core components\n        self._request_manager = request_manager\n        self._session_pool = session_pool or SessionPool()\n        self._proxy_configuration = proxy_configuration\n\n        self._additional_http_error_status_codes = (\n            set(additional_http_error_status_codes) if additional_http_error_status_codes else set()\n        )\n        self._ignore_http_error_status_codes = (\n            
set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()\n        )\n\n        self._http_client = http_client or ImpitHttpClient()\n\n        # Request router setup\n        self._router: Router[TCrawlingContext] | None = None\n        if isinstance(cast('Router', request_handler), Router):\n            self._router = cast('Router[TCrawlingContext]', request_handler)\n        elif request_handler is not None:\n            self._router = None\n            self.router.default_handler(request_handler)\n\n        # Error, failed & skipped request handlers\n        self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None\n        self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None\n        self._on_skipped_request: SkippedRequestCallback | None = None\n        self._abort_on_error = abort_on_error\n\n        # Crawler callbacks\n        self._status_message_callback = status_message_callback\n\n        # Context of each request with matching result of request handler.\n        # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`.\n        self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()\n\n        # Context pipeline\n        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]\n\n        # Crawl settings\n        self._max_request_retries = max_request_retries\n        self._max_requests_per_crawl = max_requests_per_crawl\n        self._max_session_rotations = max_session_rotations\n        self._max_crawl_depth = max_crawl_depth\n        self._respect_robots_txt_file = respect_robots_txt_file\n\n        # Timeouts\n        self._request_handler_timeout = request_handler_timeout\n        self._internal_timeout = (\n            config.internal_timeout\n            if 
config.internal_timeout is not None\n            else max(2 * request_handler_timeout, timedelta(minutes=5))\n        )\n\n        # Retry and session settings\n        self._use_session_pool = use_session_pool\n        self._retry_on_blocked = retry_on_blocked\n\n        # Logging setup\n        if configure_logging:\n            root_logger = logging.getLogger()\n            configure_logger(root_logger, remove_old_handlers=True)\n            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger\n            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)\n        self._logger = _logger or logging.getLogger(__name__)\n        if implicit_event_manager_with_explicit_config:\n            self._logger.warning(\n                'No event manager set, implicitly using event manager from global service_locator.'\n                'It is advised to explicitly set the event manager if explicit configuration is used as well.'\n            )\n        self._statistics_log_format = statistics_log_format\n\n        # Statistics\n        if statistics:\n            self._statistics = statistics\n        else:\n\n            async def persist_state_factory() -> KeyValueStore:\n                return await self.get_key_value_store()\n\n            self._statistics = cast(\n                'Statistics[TStatisticsState]',\n                Statistics.with_default_state(\n                    persistence_enabled=True,\n                    periodic_message_logger=self._logger,\n                    statistics_log_format=self._statistics_log_format,\n                    log_message='Current request statistics:',\n                    persist_state_kvs_factory=persist_state_factory,\n                ),\n            )\n\n        # Additional context managers to enter and exit\n        self._additional_context_managers = _additional_context_managers or []\n\n        # Internal, not explicitly configurable components\n       
 self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)\n        self._robots_txt_lock = asyncio.Lock()\n        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)\n        self._snapshotter = Snapshotter.from_config(config)\n        self._autoscaled_pool = AutoscaledPool(\n            system_status=SystemStatus(self._snapshotter),\n            concurrency_settings=concurrency_settings,\n            is_finished_function=self.__is_finished_function,\n            is_task_ready_function=self.__is_task_ready_function,\n            run_task_function=self.__run_task_function,\n        )\n        self._crawler_state_rec_task = RecurringTask(\n            func=self._crawler_state_task, delay=status_message_logging_interval\n        )\n        self._previous_crawler_state: TStatisticsState | None = None\n\n        # State flags\n        self._keep_alive = keep_alive\n        self._running = False\n        self._has_finished_before = False\n\n        self._failed = False\n\n        self._unexpected_stop = False\n\n    @property\n    def log(self) -> logging.Logger:\n        \"\"\"The logger used by the crawler.\"\"\"\n        return self._logger\n\n    @property\n    def router(self) -> Router[TCrawlingContext]:\n        \"\"\"The `Router` used to handle each individual crawling request.\"\"\"\n        if self._router is None:\n            self._router = Router[TCrawlingContext]()\n\n        return self._router\n\n    @router.setter\n    def router(self, router: Router[TCrawlingContext]) -> None:\n        if self._router is not None:\n            raise RuntimeError('A router is already set')\n\n        self._router = router\n\n    @property\n    def statistics(self) -> Statistics[TStatisticsState]:\n        \"\"\"Statistics about the current (or last) crawler run.\"\"\"\n        return self._statistics\n\n    def stop(self, reason: str = 'Stop was called externally.') -> None:\n        \"\"\"Set flag to stop 
crawler.\n\n        This stops current crawler run regardless of whether all requests were finished.\n\n        Args:\n            reason: Reason for stopping that will be used in logs.\n        \"\"\"\n        self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')\n        self._unexpected_stop = True\n\n    def _wrap_handler_with_error_context(\n        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]\n    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:\n        \"\"\"Decorate error handlers to make their context helpers usable.\"\"\"\n\n        @functools.wraps(handler)\n        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:\n            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request\n            # failed. Modified context provides context helpers with direct access to the storages.\n            error_context = context.create_modified_copy(\n                push_data=self._push_data,\n                get_key_value_store=self.get_key_value_store,\n                add_requests=functools.partial(self._add_requests, context),\n            )\n            return await handler(error_context, exception)\n\n        return wrapped_handler\n\n    def _stop_if_max_requests_count_exceeded(self) -> None:\n        \"\"\"Call `stop` when the maximum number of requests to crawl has been reached.\"\"\"\n        if self._max_requests_per_crawl is None:\n            return\n\n        if self._statistics.state.requests_total >= self._max_requests_per_crawl:\n            self.stop(\n                reason=f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. 
'\n            )\n\n    async def _get_session(self) -> Session | None:\n        \"\"\"If session pool is being used, try to take a session from it.\"\"\"\n        if not self._use_session_pool:\n            return None\n\n        return await wait_for(\n            self._session_pool.get_session,\n            timeout=self._internal_timeout,\n            timeout_message='Fetching a session from the pool timed out after '\n            f'{self._internal_timeout.total_seconds()} seconds',\n            max_retries=3,\n            logger=self._logger,\n        )\n\n    async def _get_session_by_id(self, session_id: str | None) -> Session | None:\n        \"\"\"If session pool is being used, try to take a session by id from it.\"\"\"\n        if not self._use_session_pool or not session_id:\n            return None\n\n        return await wait_for(\n            partial(self._session_pool.get_session_by_id, session_id),\n            timeout=self._internal_timeout,\n            timeout_message='Fetching a session from the pool timed out after '\n            f'{self._internal_timeout.total_seconds()} seconds',\n            max_retries=3,\n            logger=self._logger,\n        )\n\n    async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:\n        \"\"\"Retrieve a new ProxyInfo object based on crawler configuration and the current request and session.\"\"\"\n        if not self._proxy_configuration:\n            return None\n\n        return await self._proxy_configuration.new_proxy_info(\n            session_id=session.id if session else None,\n            request=request,\n            proxy_tier=None,\n        )\n\n    async def get_request_manager(self) -> RequestManager:\n        \"\"\"Return the configured request manager. 
If none is configured, open and return the default request queue.\"\"\"\n        if not self._request_manager:\n            self._request_manager = await RequestQueue.open(\n                storage_client=self._service_locator.get_storage_client(),\n                configuration=self._service_locator.get_configuration(),\n            )\n\n        return self._request_manager\n\n    async def get_dataset(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n    ) -> Dataset:\n        \"\"\"Return the `Dataset` with the given ID or name. If none is provided, return the default one.\"\"\"\n        return await Dataset.open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=self._service_locator.get_storage_client(),\n            configuration=self._service_locator.get_configuration(),\n        )\n\n    async def get_key_value_store(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n    ) -> KeyValueStore:\n        \"\"\"Return the `KeyValueStore` with the given ID or name. 
If none is provided, return the default KVS.\"\"\"\n        return await KeyValueStore.open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=self._service_locator.get_storage_client(),\n            configuration=self._service_locator.get_configuration(),\n        )\n\n    def error_handler(\n        self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]\n    ) -> ErrorHandler[TCrawlingContext]:\n        \"\"\"Register a function to handle errors occurring in request handlers.\n\n        The error handler is invoked after a request handler error occurs and before a retry attempt.\n        \"\"\"\n        self._error_handler = self._wrap_handler_with_error_context(handler)\n        return handler\n\n    def failed_request_handler(\n        self, handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]\n    ) -> FailedRequestHandler[TCrawlingContext]:\n        \"\"\"Register a function to handle requests that exceed the maximum retry limit.\n\n        The failed request handler is invoked when a request has failed all retry attempts.\n        \"\"\"\n        self._failed_request_handler = self._wrap_handler_with_error_context(handler)\n        return handler\n\n    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:\n        \"\"\"Register a function to handle skipped requests.\n\n        The skipped request handler is invoked when a request is skipped due to a collision or other reasons.\n        \"\"\"\n        self._on_skipped_request = callback\n        return callback\n\n    async def run(\n        self,\n        requests: Sequence[str | Request] | None = None,\n        *,\n        purge_request_queue: bool = True,\n    ) -> FinalStatistics:\n        \"\"\"Run the crawler until all requests are processed.\n\n        Args:\n            requests: The requests to be enqueued before the crawler starts.\n            purge_request_queue: If this is `True` 
and the crawler is not being run for the first time, the default\n                request queue will be purged.\n        \"\"\"\n        if self._running:\n            raise RuntimeError(\n                'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`'\n            )\n\n        self._running = True\n\n        if self._has_finished_before:\n            await self._statistics.reset()\n\n            if self._use_session_pool:\n                await self._session_pool.reset_store()\n\n            request_manager = await self.get_request_manager()\n            if purge_request_queue and isinstance(request_manager, RequestQueue):\n                await request_manager.drop()\n                self._request_manager = await RequestQueue.open(\n                    storage_client=self._service_locator.get_storage_client(),\n                    configuration=self._service_locator.get_configuration(),\n                )\n\n        if requests is not None:\n            await self.add_requests(requests)\n\n        interrupted = False\n\n        def sigint_handler() -> None:\n            nonlocal interrupted\n\n            if not interrupted:\n                interrupted = True\n                self._logger.info('Pausing... 
Press CTRL+C again to force exit.')\n\n            run_task.cancel()\n\n        run_task = asyncio.create_task(self._run_crawler(), name='run_crawler_task')\n\n        if threading.current_thread() is threading.main_thread():  # `add_signal_handler` works only in the main thread\n            with suppress(NotImplementedError):  # event loop signal handlers are not supported on Windows\n                asyncio.get_running_loop().add_signal_handler(signal.SIGINT, sigint_handler)\n\n        try:\n            await run_task\n        except CancelledError:\n            pass\n        finally:\n            if threading.current_thread() is threading.main_thread():\n                with suppress(NotImplementedError):\n                    asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)\n\n        if self._statistics.error_tracker.total > 0:\n            self._logger.info(\n                'Error analysis:'\n                f' total_errors={self._statistics.error_tracker.total}'\n                f' unique_errors={self._statistics.error_tracker.unique_error_count}'\n            )\n\n        if interrupted:\n            self._logger.info(\n                f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python {sys.argv[0]}'\n            )\n\n        self._running = False\n        self._has_finished_before = True\n\n        await self._save_crawler_state()\n\n        final_statistics = self._statistics.calculate()\n        if self._statistics_log_format == 'table':\n            self._logger.info(f'Final request statistics:\\n{final_statistics.to_table()}')\n        else:\n            self._logger.info('Final request statistics:', extra=final_statistics.to_dict())\n        return final_statistics\n\n    async def _run_crawler(self) -> None:\n        event_manager = self._service_locator.get_event_manager()\n\n        # Collect the context managers to be entered. 
Context managers that are already active are excluded,\n        # as they were likely entered by the caller, who will also be responsible for exiting them.\n        contexts_to_enter = [\n            cm\n            for cm in (\n                event_manager,\n                self._snapshotter,\n                self._statistics,\n                self._session_pool if self._use_session_pool else None,\n                self._http_client,\n                self._crawler_state_rec_task,\n                *self._additional_context_managers,\n            )\n            if cm and getattr(cm, 'active', False) is False\n        ]\n\n        async with AsyncExitStack() as exit_stack:\n            for context in contexts_to_enter:\n                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]\n\n            await self._autoscaled_pool.run()\n\n    async def add_requests(\n        self,\n        requests: Sequence[str | Request],\n        *,\n        forefront: bool = False,\n        batch_size: int = 1000,\n        wait_time_between_batches: timedelta = timedelta(0),\n        wait_for_all_requests_to_be_added: bool = False,\n        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,\n    ) -> None:\n        \"\"\"Add requests to the underlying request manager in batches.\n\n        Args:\n            requests: A list of requests to add to the queue.\n            forefront: If True, add requests to the forefront of the queue.\n            batch_size: The number of requests to add in one batch.\n            wait_time_between_batches: Time to wait between adding batches.\n            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.\n            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.\n        \"\"\"\n        allowed_requests = []\n        skipped = []\n\n        for request in requests:\n            check_url = request.url if 
isinstance(request, Request) else request\n            if await self._is_allowed_based_on_robots_txt_file(check_url):\n                allowed_requests.append(request)\n            else:\n                skipped.append(request)\n\n        if skipped:\n            skipped_tasks = [\n                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped\n            ]\n            await asyncio.gather(*skipped_tasks)\n            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')\n\n        request_manager = await self.get_request_manager()\n\n        await request_manager.add_requests(\n            requests=allowed_requests,\n            forefront=forefront,\n            batch_size=batch_size,\n            wait_time_between_batches=wait_time_between_batches,\n            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,\n            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,\n        )\n\n    async def use_state(\n        self,\n        default_value: dict[str, JsonSerializable] | None = None,\n    ) -> dict[str, JsonSerializable]:\n        kvs = await self.get_key_value_store()\n        return await kvs.get_auto_saved_value(f'{self._CRAWLEE_STATE_KEY}_{self._id}', default_value)\n\n    async def _save_crawler_state(self) -> None:\n        store = await self.get_key_value_store()\n        await store.persist_autosaved_values()\n\n    async def get_data(\n        self,\n        dataset_id: str | None = None,\n        dataset_name: str | None = None,\n        dataset_alias: str | None = None,\n        **kwargs: Unpack[GetDataKwargs],\n    ) -> DatasetItemsListPage:\n        \"\"\"Retrieve data from a `Dataset`.\n\n        This helper method simplifies the process of retrieving data from a `Dataset`. 
It opens the specified\n        one and then retrieves the data based on the provided parameters.\n\n        Args:\n            dataset_id: The ID of the `Dataset`.\n            dataset_name: The name of the `Dataset` (global scope, named storage).\n            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).\n            kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.\n\n        Returns:\n            The retrieved data.\n        \"\"\"\n        dataset = await Dataset.open(\n            id=dataset_id,\n            name=dataset_name,\n            alias=dataset_alias,\n            storage_client=self._service_locator.get_storage_client(),\n            configuration=self._service_locator.get_configuration(),\n        )\n        return await dataset.get_data(**kwargs)\n\n    async def export_data(\n        self,\n        path: str | Path,\n        dataset_id: str | None = None,\n        dataset_name: str | None = None,\n        dataset_alias: str | None = None,\n        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],\n    ) -> None:\n        \"\"\"Export all items from a Dataset to a JSON or CSV file.\n\n        This method simplifies the process of exporting data collected during crawling. It automatically\n        determines the export format based on the file extension (`.json` or `.csv`) and handles\n        the conversion of `Dataset` items to the appropriate format.\n\n        Args:\n            path: The destination file path. 
Must end with '.json' or '.csv'.\n            dataset_id: The ID of the Dataset to export from.\n            dataset_name: The name of the Dataset to export from (global scope, named storage).\n            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).\n            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.\n        \"\"\"\n        dataset = await Dataset.open(\n            id=dataset_id,\n            name=dataset_name,\n            alias=dataset_alias,\n            storage_client=self._service_locator.get_storage_client(),\n            configuration=self._service_locator.get_configuration(),\n        )\n\n        path = Path(path)\n\n        if path.suffix == '.csv':\n            dst = StringIO()\n            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)\n            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)\n            await atomic_write(path, dst.getvalue())\n        elif path.suffix == '.json':\n            dst = StringIO()\n            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)\n            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)\n            await atomic_write(path, dst.getvalue())\n        else:\n            raise ValueError(f'Unsupported file extension: {path.suffix}')\n\n    async def _push_data(\n        self,\n        data: list[dict[str, Any]] | dict[str, Any],\n        dataset_id: str | None = None,\n        dataset_name: str | None = None,\n        dataset_alias: str | None = None,\n        **kwargs: Unpack[PushDataKwargs],\n    ) -> None:\n        \"\"\"Push data to a `Dataset`.\n\n        This helper method simplifies the process of pushing data to a `Dataset`. 
It opens the specified\n        one and then pushes the provided data to it.\n\n        Args:\n            data: The data to push to the `Dataset`.\n            dataset_id: The ID of the `Dataset`.\n            dataset_name: The name of the `Dataset` (global scope, named storage).\n            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).\n            kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.\n        \"\"\"\n        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)\n        await dataset.push_data(data, **kwargs)\n\n    def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:\n        if context.request.no_retry:\n            return False\n\n        # Do not retry on client errors.\n        if isinstance(error, HttpClientStatusCodeError):\n            return False\n\n        if isinstance(error, SessionError):\n            return ((context.request.session_rotation_count or 0) + 1) < self._max_session_rotations\n\n        max_request_retries = context.request.max_retries\n        if max_request_retries is None:\n            max_request_retries = self._max_request_retries\n\n        return context.request.retry_count < max_request_retries\n\n    async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:\n        \"\"\"Ensure that the `loaded_url` still matches the enqueue strategy after redirects.\n\n        Filter out links that redirect outside of the crawled domain.\n        \"\"\"\n        if context.request.loaded_url is not None and not self._check_enqueue_strategy(\n            context.request.enqueue_strategy,\n            origin_url=urlparse(context.request.url),\n            target_url=urlparse(context.request.loaded_url),\n        ):\n            raise ContextPipelineInterruptedError(\n                f'Skipping URL {context.request.loaded_url} (redirected from 
{context.request.url})'\n            )\n\n        yield context\n\n    def _create_enqueue_links_function(\n        self, context: BasicCrawlingContext, extract_links: ExtractLinksFunction\n    ) -> EnqueueLinksFunction:\n        \"\"\"Create a callback function for extracting links from parsed content and enqueuing them to the crawl.\n\n        Args:\n            context: The current crawling context.\n            extract_links: Function used to extract links from the page.\n\n        Returns:\n            Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.\n        \"\"\"\n\n        async def enqueue_links(\n            *,\n            selector: str | None = None,\n            attribute: str | None = None,\n            label: str | None = None,\n            user_data: dict[str, Any] | None = None,\n            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]\n            | None = None,\n            requests: Sequence[str | Request] | None = None,\n            rq_id: str | None = None,\n            rq_name: str | None = None,\n            rq_alias: str | None = None,\n            **kwargs: Unpack[EnqueueLinksKwargs],\n        ) -> None:\n            kwargs.setdefault('strategy', 'same-hostname')\n\n            if requests:\n                if any((selector, attribute, label, user_data, transform_request_function)):\n                    raise ValueError(\n                        'You cannot provide `selector`, `attribute`, `label`, `user_data` or '\n                        '`transform_request_function` arguments when `requests` is provided.'\n                    )\n                # Add directly passed requests.\n                await context.add_requests(\n                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs\n                )\n            else:\n                # Add requests from extracted links.\n                
await context.add_requests(\n                    await extract_links(\n                        selector=selector or 'a',\n                        attribute=attribute or 'href',\n                        label=label,\n                        user_data=user_data,\n                        transform_request_function=transform_request_function,\n                        **kwargs,\n                    ),\n                    rq_id=rq_id,\n                    rq_name=rq_name,\n                    rq_alias=rq_alias,\n                    **kwargs,\n                )\n\n        return enqueue_links\n\n    def _enqueue_links_filter_iterator(\n        self, request_iterator: Iterator[TRequestIterator], origin_url: str, **kwargs: Unpack[EnqueueLinksKwargs]\n    ) -> Iterator[TRequestIterator]:\n        \"\"\"Filter requests based on the enqueue strategy and URL patterns.\"\"\"\n        limit = kwargs.get('limit')\n        parsed_origin_url = urlparse(origin_url)\n        strategy = kwargs.get('strategy', 'all')\n\n        if strategy == 'all' and not parsed_origin_url.hostname:\n            self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')\n            return\n\n        # Emit a `warning` message to the log, only once per call\n        warning_flag = True\n\n        for request in request_iterator:\n            if isinstance(request, Request):\n                if request.enqueue_strategy != strategy:\n                    request.enqueue_strategy = strategy\n                target_url = request.url\n            else:\n                target_url = request\n            parsed_target_url = urlparse(target_url)\n\n            if warning_flag and strategy != 'all' and not parsed_target_url.hostname:\n                self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')\n                warning_flag = False\n\n            if self._check_enqueue_strategy(\n                strategy, target_url=parsed_target_url, 
origin_url=parsed_origin_url\n            ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):\n                yield request\n\n                if limit is not None:\n                    limit -= 1\n                    if limit <= 0:\n                        break\n\n    def _check_enqueue_strategy(\n        self,\n        strategy: EnqueueStrategy,\n        *,\n        target_url: ParseResult,\n        origin_url: ParseResult,\n    ) -> bool:\n        \"\"\"Check if a URL matches the enqueue_strategy.\"\"\"\n        if strategy == 'all':\n            return True\n\n        if origin_url.hostname is None or target_url.hostname is None:\n            self.log.debug(\n                f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '\n                f'target_url = {target_url.geturl()}'\n            )\n            return False\n\n        if strategy == 'same-hostname':\n            return target_url.hostname == origin_url.hostname\n\n        if strategy == 'same-domain':\n            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix\n            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix\n            return origin_domain == target_domain\n\n        if strategy == 'same-origin':\n            return (\n                target_url.hostname == origin_url.hostname\n                and target_url.scheme == origin_url.scheme\n                and target_url.port == origin_url.port\n            )\n\n        assert_never(strategy)\n\n    def _check_url_patterns(\n        self,\n        target_url: str,\n        include: Sequence[re.Pattern[Any] | Glob] | None,\n        exclude: Sequence[re.Pattern[Any] | Glob] | None,\n    ) -> bool:\n        \"\"\"Check if a URL matches configured include/exclude patterns.\"\"\"\n        # If the URL matches any `exclude` pattern, reject it\n        for pattern in exclude 
or ():\n            if isinstance(pattern, Glob):\n                pattern = pattern.regexp  # noqa: PLW2901\n\n            if pattern.match(target_url) is not None:\n                return False\n\n        # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL\n        if include is None:\n            return True\n\n        # If the URL matches any `include` pattern, accept it\n        for pattern in include:\n            if isinstance(pattern, Glob):\n                pattern = pattern.regexp  # noqa: PLW2901\n\n            if pattern.match(target_url) is not None:\n                return True\n\n        # The URL does not match any `include` pattern - reject it\n        return False\n\n    async def _handle_request_retries(\n        self,\n        context: TCrawlingContext | BasicCrawlingContext,\n        error: Exception,\n    ) -> None:\n        request_manager = await self.get_request_manager()\n        request = context.request\n\n        if self._abort_on_error:\n            self._logger.exception('Aborting crawler run due to error (abort_on_error=True)', exc_info=error)\n            self._failed = True\n\n        if self._should_retry_request(context, error):\n            request.retry_count += 1\n            reduced_error = str(error).split('\\n')[0]\n            self.log.warning(\n                f'Retrying request to {context.request.url} due to: {reduced_error}. 
'\n                f'{get_one_line_error_summary_if_possible(error)}'\n            )\n            await self._statistics.error_tracker.add(error=error, context=context)\n\n            if self._error_handler:\n                try:\n                    new_request = await self._error_handler(context, error)\n                except Exception as e:\n                    raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e\n                else:\n                    if new_request is not None and new_request != request:\n                        await request_manager.add_request(new_request)\n                        await self._mark_request_as_handled(request)\n                        return\n\n            await request_manager.reclaim_request(request)\n        else:\n            request.state = RequestState.ERROR\n            await self._mark_request_as_handled(request)\n            await self._handle_failed_request(context, error)\n            self._statistics.record_request_processing_failure(request.unique_key)\n\n    async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:\n        try:\n            context.request.state = RequestState.ERROR_HANDLER\n\n            await wait_for(\n                partial(self._handle_request_retries, context, error),\n                timeout=self._internal_timeout,\n                timeout_message='Handling request failure timed out after '\n                f'{self._internal_timeout.total_seconds()} seconds',\n                logger=self._logger,\n            )\n        except UserDefinedErrorHandlerError:\n            context.request.state = RequestState.ERROR\n            raise\n        except Exception as secondary_error:\n            self._logger.exception(\n                'An exception occurred during handling of failed request. 
This places the crawler '\n                'and its underlying storages into an unknown state and crawling will be terminated.',\n                exc_info=secondary_error,\n            )\n            context.request.state = RequestState.ERROR\n            raise\n\n        if context.session:\n            context.session.mark_bad()\n\n    async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:\n        self._logger.error(\n            f'Request to {context.request.url} failed and reached maximum retries\\n '\n            f'{self._get_message_from_error(error)}'\n        )\n        await self._statistics.error_tracker.add(error=error, context=context)\n\n        if self._failed_request_handler:\n            try:\n                await self._failed_request_handler(context, error)\n            except Exception as e:\n                raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e\n\n    async def _handle_skipped_request(\n        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False\n    ) -> None:\n        if need_mark and isinstance(request, Request):\n            request.state = RequestState.SKIPPED\n            await self._mark_request_as_handled(request)\n\n        url = request.url if isinstance(request, Request) else request\n\n        if self._on_skipped_request:\n            try:\n                await self._on_skipped_request(url, reason)\n            except Exception as e:\n                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e\n\n    def _get_message_from_error(self, error: Exception) -> str:\n        \"\"\"Get error message summary from exception.\n\n        Custom processing to reduce the irrelevant traceback clutter in some cases.\n        \"\"\"\n        traceback_parts = traceback.format_exception(type(error), value=error, tb=error.__traceback__, 
chain=True)\n        used_traceback_parts = traceback_parts\n\n        if (\n            isinstance(error, asyncio.exceptions.TimeoutError)\n            and traceback_parts\n            and self._request_handler_timeout_text in traceback_parts[-1]\n        ) or isinstance(error, UserHandlerTimeoutError):\n            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)\n            used_traceback_parts.extend(traceback_parts[-1:])\n\n        return ''.join(used_traceback_parts).strip('\\n')\n\n    def _get_only_inner_most_exception(self, error: BaseException) -> BaseException:\n        \"\"\"Get innermost exception by following __cause__ and __context__ attributes of exception.\"\"\"\n        if error.__cause__:\n            return self._get_only_inner_most_exception(error.__cause__)\n        if error.__context__:\n            return self._get_only_inner_most_exception(error.__context__)\n        # No __cause__ and no __context__, this is as deep as it can get.\n        return error\n\n    def _prepare_send_request_function(\n        self,\n        session: Session | None,\n        proxy_info: ProxyInfo | None,\n    ) -> SendRequestFunction:\n        async def send_request(\n            url: str,\n            *,\n            method: HttpMethod = 'GET',\n            payload: HttpPayload | None = None,\n            headers: HttpHeaders | dict[str, str] | None = None,\n        ) -> HttpResponse:\n            return await self._http_client.send_request(\n                url=url,\n                method=method,\n                payload=payload,\n                headers=headers,\n                session=session,\n                proxy_info=proxy_info,\n            )\n\n        return send_request\n\n    def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_url: str) -> Iterator[Request]:\n        \"\"\"Convert a sequence of URLs or Request objects to an iterator of Request objects.\"\"\"\n        for url in 
urls:\n            # If the request is a Request object, keep it as it is\n            if isinstance(url, Request):\n                yield url\n            # If the request is a string, convert it to Request object with absolute_url.\n            elif isinstance(url, str) and not is_url_absolute(url):\n                absolute_url = convert_to_absolute_url(base_url, url)\n                yield Request.from_url(absolute_url)\n            else:\n                yield Request.from_url(url)\n\n    async def _add_requests(\n        self,\n        context: BasicCrawlingContext,\n        requests: Sequence[str | Request],\n        rq_id: str | None = None,\n        rq_name: str | None = None,\n        rq_alias: str | None = None,\n        **kwargs: Unpack[EnqueueLinksKwargs],\n    ) -> None:\n        \"\"\"Add requests method aware of the crawling context.\"\"\"\n        if rq_id or rq_name or rq_alias:\n            request_manager: RequestManager = await RequestQueue.open(\n                id=rq_id,\n                name=rq_name,\n                alias=rq_alias,\n                storage_client=self._service_locator.get_storage_client(),\n                configuration=self._service_locator.get_configuration(),\n            )\n        else:\n            request_manager = await self.get_request_manager()\n\n        context_aware_requests = list[Request]()\n        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url\n        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)\n        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)\n        for dst_request in filter_requests_iterator:\n            # Update the crawl depth of the request.\n            dst_request.crawl_depth = context.request.crawl_depth + 1\n\n            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:\n                
context_aware_requests.append(dst_request)\n\n        return await request_manager.add_requests(context_aware_requests)\n\n    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:\n        \"\"\"Commit request handler result for the input `context`. Result is taken from `_context_result_map`.\"\"\"\n        result = self._context_result_map[context]\n\n        for add_requests_call in result.add_requests_calls:\n            await self._add_requests(context, **add_requests_call)\n\n        for push_data_call in result.push_data_calls:\n            await self._push_data(**push_data_call)\n\n        await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)\n\n        result.apply_request_changes(target=context.request)\n\n    @staticmethod\n    async def _commit_key_value_store_changes(\n        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction\n    ) -> None:\n        \"\"\"Store key value store changes recorded in result.\"\"\"\n        for (id, name, alias), changes in result.key_value_store_changes.items():\n            store = await get_kvs(id=id, name=name, alias=alias)\n            for key, value in changes.updates.items():\n                await store.set_value(key, value.content, value.content_type)\n\n    async def __is_finished_function(self) -> bool:\n        self._stop_if_max_requests_count_exceeded()\n        if self._unexpected_stop:\n            self._logger.info('The crawler will finish any remaining ongoing requests and shut down.')\n            return True\n\n        if self._abort_on_error and self._failed:\n            self._failed = False\n            return True\n\n        if self._keep_alive:\n            return False\n\n        request_manager = await self.get_request_manager()\n        return await request_manager.is_finished()\n\n    async def __is_task_ready_function(self) -> bool:\n        self._stop_if_max_requests_count_exceeded()\n        if 
self._unexpected_stop:\n            self._logger.info(\n                'No new requests are allowed because crawler `stop` method was called. '\n                'Ongoing requests will be allowed to complete.'\n            )\n            return False\n\n        request_manager = await self.get_request_manager()\n        return not await request_manager.is_empty()\n\n    async def __run_task_function(self) -> None:\n        request_manager = await self.get_request_manager()\n\n        request = await wait_for(\n            request_manager.fetch_next_request,\n            timeout=self._internal_timeout,\n            timeout_message=f'Fetching next request failed after {self._internal_timeout.total_seconds()} seconds',\n            logger=self._logger,\n            max_retries=3,\n        )\n\n        if request is None:\n            return\n\n        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):\n            self._logger.warning(\n                f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'\n            )\n\n            await self._handle_skipped_request(request, 'robots_txt', need_mark=True)\n            return\n\n        if request.session_id:\n            session = await self._get_session_by_id(request.session_id)\n        else:\n            session = await self._get_session()\n        proxy_info = await self._get_proxy_info(request, session)\n        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)\n\n        context = BasicCrawlingContext(\n            request=result.request,\n            session=session,\n            proxy_info=proxy_info,\n            send_request=self._prepare_send_request_function(session, proxy_info),\n            add_requests=result.add_requests,\n            push_data=result.push_data,\n            get_key_value_store=result.get_key_value_store,\n            use_state=self.use_state,\n            
log=self._logger,\n        )\n        self._context_result_map[context] = result\n\n        self._statistics.record_request_processing_start(request.unique_key)\n\n        try:\n            request.state = RequestState.REQUEST_HANDLER\n\n            try:\n                with swapped_context(context, request):\n                    self._check_request_collision(request, session)\n                    await self._run_request_handler(context=context)\n            except asyncio.TimeoutError as e:\n                raise RequestHandlerError(e, context) from e\n\n            await self._commit_request_handler_result(context)\n\n            request.state = RequestState.DONE\n\n            await self._mark_request_as_handled(request)\n\n            if session and session.is_usable:\n                session.mark_good()\n\n            self._statistics.record_request_processing_finish(request.unique_key)\n\n        except RequestCollisionError as request_error:\n            request.no_retry = True\n            await self._handle_request_error(context, request_error)\n\n        except RequestHandlerError as primary_error:\n            primary_error = cast(\n                'RequestHandlerError[TCrawlingContext]', primary_error\n            )  # valid thanks to ContextPipeline\n\n            self._logger.debug(\n                'An exception occurred in the user-defined request handler',\n                exc_info=primary_error.wrapped_exception,\n            )\n            await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)\n\n        except SessionError as session_error:\n            if not session:\n                raise RuntimeError('SessionError raised in a crawling context without a session') from session_error\n\n            if self._error_handler:\n                await self._error_handler(context, session_error)\n\n            if self._should_retry_request(context, session_error):\n                exc_only = 
''.join(traceback.format_exception_only(session_error)).strip()\n                self._logger.warning('Encountered \"%s\", rotating session and retrying...', exc_only)\n\n                if session:\n                    session.retire()\n\n                # Increment session rotation count.\n                request.session_rotation_count = (request.session_rotation_count or 0) + 1\n\n                await request_manager.reclaim_request(request)\n                await self._statistics.error_tracker_retry.add(error=session_error, context=context)\n            else:\n                await self._mark_request_as_handled(request)\n\n                await self._handle_failed_request(context, session_error)\n                self._statistics.record_request_processing_failure(request.unique_key)\n\n        except ContextPipelineInterruptedError as interrupted_error:\n            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)\n\n            await self._mark_request_as_handled(request)\n\n        except ContextPipelineInitializationError as initialization_error:\n            self._logger.debug(\n                'An exception occurred during the initialization of crawling context',\n                exc_info=initialization_error,\n            )\n            await self._handle_request_error(context, initialization_error.wrapped_exception)\n\n        except Exception as internal_error:\n            self._logger.exception(\n                'An exception occurred during handling of a request. 
This places the crawler '\n                'and its underlying storages into an unknown state and crawling will be terminated.',\n                exc_info=internal_error,\n            )\n            raise\n\n    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:\n        context.request.state = RequestState.BEFORE_NAV\n        await self._context_pipeline(\n            context,\n            lambda final_context: wait_for(\n                lambda: self.router(final_context),\n                timeout=self._request_handler_timeout,\n                timeout_message=f'{self._request_handler_timeout_text}'\n                f' {self._request_handler_timeout.total_seconds()} seconds',\n                logger=self._logger,\n            ),\n        )\n\n    def _raise_for_error_status_code(self, status_code: int) -> None:\n        \"\"\"Raise an exception if the given status code is considered an error.\n\n        Args:\n            status_code: The HTTP status code to check.\n\n        Raises:\n            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.\n            HttpClientStatusCodeError: If the status code represents a client error.\n        \"\"\"\n        is_ignored_status = status_code in self._ignore_http_error_status_codes\n        is_explicit_error = status_code in self._additional_http_error_status_codes\n\n        if is_explicit_error:\n            raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)\n\n        if is_status_code_client_error(status_code) and not is_ignored_status:\n            raise HttpClientStatusCodeError('Client error status code returned', status_code)\n\n        if is_status_code_server_error(status_code) and not is_ignored_status:\n            raise HttpStatusCodeError('Error status code returned', status_code)\n\n    def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None:\n  
      \"\"\"Raise an exception if the given status code indicates the session is blocked.\n\n        Args:\n            session: The session used for the request. If None, no check is performed.\n            status_code: The HTTP status code to check.\n\n        Raises:\n            SessionError: If the status code indicates the session is blocked.\n        \"\"\"\n        if session is not None and session.is_blocked_status_code(\n            status_code=status_code,\n            ignore_http_error_status_codes=self._ignore_http_error_status_codes,\n        ):\n            raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')\n\n    def _check_request_collision(self, request: Request, session: Session | None) -> None:\n        \"\"\"Raise an exception if a request cannot access required resources.\n\n        Args:\n            request: The `Request` that might require specific resources (like a session).\n            session: The `Session` that was retrieved for the request, or `None` if not available.\n\n        Raises:\n            RequestCollisionError: If the `Session` referenced by the `Request` is not available.\n        \"\"\"\n        if self._use_session_pool and request.session_id and not session:\n            raise RequestCollisionError(\n                f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'\n            )\n\n    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:\n        \"\"\"Check if the URL is allowed based on the robots.txt file.\n\n        Args:\n            url: The URL to verify against robots.txt rules. 
Returns True if crawling this URL is permitted.\n        \"\"\"\n        if not self._respect_robots_txt_file:\n            return True\n        robots_txt_file = await self._get_robots_txt_file_for_url(url)\n        return not robots_txt_file or robots_txt_file.is_allowed(url)\n\n    async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:\n        \"\"\"Get the RobotsTxtFile for a given URL.\n\n        Args:\n            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.\n        \"\"\"\n        if not self._respect_robots_txt_file:\n            return None\n        origin_url = str(URL(url).origin())\n        robots_txt_file = self._robots_txt_file_cache.get(origin_url)\n        if robots_txt_file:\n            return robots_txt_file\n\n        async with self._robots_txt_lock:\n            # Check again if the robots.txt file is already cached after acquiring the lock\n            robots_txt_file = self._robots_txt_file_cache.get(origin_url)\n            if robots_txt_file:\n                return robots_txt_file\n\n            # If not cached, fetch the robots.txt file\n            robots_txt_file = await self._find_txt_file_for_url(url)\n            self._robots_txt_file_cache[origin_url] = robots_txt_file\n            return robots_txt_file\n\n    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:\n        \"\"\"Find the robots.txt file for a given URL.\n\n        Args:\n            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.\n        \"\"\"\n        return await RobotsTxtFile.find(url, self._http_client)\n\n    def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None:\n        \"\"\"Log a status message for the crawler.\n\n        Args:\n            message: The status message to log.\n            level: The logging level for the message.\n        \"\"\"\n        log_level = string_to_log_level(level)\n    
    self.log.log(log_level, message)\n\n    async def _crawler_state_task(self) -> None:\n        \"\"\"Compose the crawler status message and emit it as a crawler status event.\"\"\"\n        event_manager = self._service_locator.get_event_manager()\n\n        current_state = self.statistics.state\n\n        if (\n            failed_requests := (\n                current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed\n            )\n            > 0\n        ):\n            message = f'Experiencing problems, {failed_requests} failed requests since last status update.'\n        else:\n            request_manager = await self.get_request_manager()\n            total_count = await request_manager.get_total_count()\n            if total_count is not None and total_count > 0:\n                pages_info = f'{self._statistics.state.requests_finished}/{total_count}'\n            else:\n                pages_info = str(self._statistics.state.requests_finished)\n\n            message = (\n                f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, '\n                f'desired concurrency {self._autoscaled_pool.desired_concurrency}.'\n            )\n\n        if self._status_message_callback:\n            new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message)\n            if new_message:\n                message = new_message\n                self._log_status_message(message, level='INFO')\n        else:\n            self._log_status_message(message, level='INFO')\n\n        event_manager.emit(\n            event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self))\n        )\n\n        self._previous_crawler_state = current_state\n\n    async def _mark_request_as_handled(self, request: Request) -> None:\n        request_manager = await self.get_request_manager()\n        await wait_for(\n            lambda: 
request_manager.mark_request_as_handled(request),\n            timeout=self._internal_timeout,\n            timeout_message='Marking request as handled timed out after '\n            f'{self._internal_timeout.total_seconds()} seconds',\n            logger=self._logger,\n            max_retries=3,\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/_basic_crawling_context.py",
    "content": "from __future__ import annotations\n\n# Do just the re-export because of the circular imports.\nfrom crawlee._types import BasicCrawlingContext  # noqa: F401\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/_context_pipeline.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, Generic, cast\n\nfrom typing_extensions import TypeVar\n\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.errors import (\n    ContextPipelineFinalizationError,\n    ContextPipelineInitializationError,\n    ContextPipelineInterruptedError,\n    RequestHandlerError,\n    SessionError,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, Awaitable, Callable, Generator\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)\nTMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)\n\n\nclass _Middleware(Generic[TMiddlewareCrawlingContext, TCrawlingContext]):\n    \"\"\"Helper wrapper class to make the middleware easily observable by open telemetry instrumentation.\"\"\"\n\n    def __init__(\n        self,\n        middleware: Callable[\n            [TCrawlingContext],\n            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],\n        ],\n        input_context: TCrawlingContext,\n    ) -> None:\n        self.generator = middleware(input_context)\n        self.input_context = input_context\n        self.output_context: TMiddlewareCrawlingContext | None = None\n\n    async def action(self) -> TMiddlewareCrawlingContext:\n        self.output_context = await self.generator.__anext__()\n        return self.output_context\n\n    async def cleanup(self, final_consumer_exception: Exception | None) -> None:\n        try:\n            await self.generator.asend(final_consumer_exception)\n        except StopAsyncIteration:\n            pass\n        except ContextPipelineInterruptedError as e:\n            raise RuntimeError('Invalid state - pipeline interrupted in the finalization step') from e\n        except Exception as e:\n            raise ContextPipelineFinalizationError(e, self.output_context or 
self.input_context) from e\n        else:\n            raise RuntimeError('The middleware yielded more than once')\n\n\n@docs_group('Other')\nclass ContextPipeline(Generic[TCrawlingContext]):\n    \"\"\"Encapsulates the logic of gradually enhancing the crawling context with additional information and utilities.\n\n    The enhancement is done by a chain of middlewares that are added to the pipeline after its creation.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        _middleware: Callable[\n            [TCrawlingContext],\n            AsyncGenerator[TMiddlewareCrawlingContext, Exception | None],\n        ]\n        | None = None,\n        _parent: ContextPipeline[BasicCrawlingContext] | None = None,\n    ) -> None:\n        self._middleware = _middleware\n        self._parent = _parent\n\n    def _middleware_chain(self) -> Generator[ContextPipeline[Any], None, None]:\n        yield self\n\n        if self._parent is not None:\n            yield from self._parent._middleware_chain()  # noqa: SLF001\n\n    async def __call__(\n        self,\n        crawling_context: BasicCrawlingContext,\n        final_context_consumer: Callable[[TCrawlingContext], Awaitable[None]],\n    ) -> None:\n        \"\"\"Run a crawling context through the middleware chain and pipe it into a consumer function.\n\n        Exceptions from the consumer function are wrapped together with the final crawling context.\n        \"\"\"\n        chain = list(self._middleware_chain())\n        cleanup_stack: list[_Middleware[Any]] = []\n        final_consumer_exception: Exception | None = None\n\n        try:\n            for member in reversed(chain):\n                if member._middleware:  # noqa: SLF001\n                    middleware_instance = _Middleware(middleware=member._middleware, input_context=crawling_context)  # noqa: SLF001\n                    try:\n                        result = await middleware_instance.action()\n                    except SessionError:  # Session 
errors get special treatment\n                        raise\n                    except StopAsyncIteration as e:\n                        raise RuntimeError('The middleware did not yield') from e\n                    except ContextPipelineInterruptedError:\n                        raise\n                    except Exception as e:\n                        raise ContextPipelineInitializationError(e, crawling_context) from e\n\n                    crawling_context = result\n                    cleanup_stack.append(middleware_instance)\n\n            try:\n                await final_context_consumer(cast('TCrawlingContext', crawling_context))\n            except SessionError as e:  # Session errors get special treatment\n                final_consumer_exception = e\n                raise\n            except Exception as e:\n                final_consumer_exception = e\n                raise RequestHandlerError(e, crawling_context) from e\n        finally:\n            for middleware_instance in reversed(cleanup_stack):\n                await middleware_instance.cleanup(final_consumer_exception)\n\n    def compose(\n        self,\n        middleware: Callable[\n            [TCrawlingContext],\n            AsyncGenerator[TMiddlewareCrawlingContext, None],\n        ],\n    ) -> ContextPipeline[TMiddlewareCrawlingContext]:\n        \"\"\"Add a middleware to the pipeline.\n\n        The middleware should yield exactly once, and it should yield an (optionally) extended crawling context object.\n        The part before the yield can be used for initialization and the part after it for cleanup.\n\n        Returns:\n            The extended pipeline instance, providing a fluent interface\n        \"\"\"\n        return ContextPipeline[TMiddlewareCrawlingContext](\n            _middleware=cast(\n                'Callable[[BasicCrawlingContext], AsyncGenerator[TMiddlewareCrawlingContext, Exception | None]]',\n                middleware,\n            ),\n            
_parent=cast('ContextPipeline[BasicCrawlingContext]', self),\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/_context_utils.py",
    "content": "from __future__ import annotations\n\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n\n    from crawlee._request import Request\n\n    from ._basic_crawling_context import BasicCrawlingContext\n\n\n@contextmanager\ndef swapped_context(\n    context: BasicCrawlingContext,\n    request: Request,\n) -> Iterator[None]:\n    \"\"\"Replace context's isolated copies with originals after handler execution.\"\"\"\n    try:\n        yield\n    finally:\n        # Restore original context state to avoid side effects between different handlers.\n        object.__setattr__(context, 'request', request)\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/_logging_utils.py",
    "content": "import asyncio\nimport re\nimport traceback\n\nimport crawlee.errors\n\n\ndef _get_only_innermost_exception(error: BaseException) -> BaseException:\n    \"\"\"Get innermost exception by following __cause__ and __context__ attributes of exception.\n\n    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.\n    \"\"\"\n    if type(error) is crawlee.errors.UserHandlerTimeoutError:\n        if error.__cause__:\n            return error.__cause__\n        if error.__context__:\n            return error.__context__\n        return error\n\n    if error.__cause__:\n        return _get_only_innermost_exception(error.__cause__)\n    if error.__context__:\n        return _get_only_innermost_exception(error.__context__)\n    # No __cause__ and no __context__, this is as deep as it can get.\n    return error\n\n\ndef _get_filtered_traceback_parts_for_asyncio_timeout_error(traceback_parts: list[str]) -> list[str]:\n    \"\"\"Extract only the most relevant traceback parts from stack trace.\"\"\"\n    ignore_pattern = (\n        r'([\\\\/]{1}asyncio[\\\\/]{1})|'  # internal asyncio parts\n        r'(Traceback \\(most recent call last\\))|'  # common part of the stack trace formatting\n        r'(asyncio\\.exceptions\\.CancelledError)'  # internal asyncio exception\n    )\n    return [\n        _strip_pep657_highlighting(traceback_part)\n        for traceback_part in traceback_parts\n        if not re.findall(ignore_pattern, traceback_part)\n    ]\n\n\ndef _strip_pep657_highlighting(traceback_part: str) -> str:\n    \"\"\"Remove PEP 657 highlighting from the traceback.\"\"\"\n    highlight_pattern = r'(\\n\\s*~*\\^+~*\\n)$'\n    return re.sub(highlight_pattern, '\\n', traceback_part)\n\n\ndef reduce_asyncio_timeout_error_to_relevant_traceback_parts(\n    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,\n) -> list[str]:\n    innermost_error_traceback_parts = 
_get_traceback_parts_for_innermost_exception(timeout_error)\n    return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)\n\n\ndef _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:\n    innermost_error = _get_only_innermost_exception(error)\n    return traceback.format_exception(\n        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False\n    )\n\n\ndef get_one_line_error_summary_if_possible(error: Exception) -> str:\n    if isinstance(error, asyncio.exceptions.TimeoutError):\n        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)\n        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''\n    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):\n        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee\n        # code and third line the topmost user error\n        traceback_parts = _get_traceback_parts_for_innermost_exception(error)\n        relevant_index_from_start = 3\n        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''\n    elif 'playwright._impl._errors.Error' in str(error.__class__):\n        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway\n        # point to deep internals.\n        return ''\n    else:\n        traceback_parts = _get_traceback_parts_for_innermost_exception(error)\n        # Commonly last traceback part is type of the error, and the second last part is the relevant file.\n        # If there are not enough traceback parts, then we are not sure how to summarize the error.\n        relevant_traceback_part_index_from_end = 2\n        most_relevant_part = _strip_pep657_highlighting(\n            _get_traceback_parts_for_innermost_exception(error)[-relevant_traceback_part_index_from_end]\n   
         if len(traceback_parts) >= relevant_traceback_part_index_from_end\n            else ''\n        )\n\n    return most_relevant_part.strip('\\n ').replace('\\n', ', ')\n"
  },
  {
    "path": "src/crawlee/crawlers/_basic/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'BeautifulSoupCrawler'):\n    from ._beautifulsoup_crawler import BeautifulSoupCrawler\nwith _try_import(__name__, 'BeautifulSoupCrawlingContext'):\n    from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext\nwith _try_import(__name__, 'BeautifulSoupParserType'):\n    from ._beautifulsoup_parser import BeautifulSoupParserType\n\n__all__ = [\n    'BeautifulSoupCrawler',\n    'BeautifulSoupCrawlingContext',\n    'BeautifulSoupParserType',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom bs4 import BeautifulSoup, Tag\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions\n\nfrom ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext\nfrom ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from typing_extensions import Unpack\n\n    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext\n\n\n@docs_group('Crawlers')\nclass BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]):\n    \"\"\"A web crawler for performing HTTP requests and parsing HTML/XML content.\n\n    The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\n    It specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.\n    `BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/\n\n    The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\n    if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext\n\n    crawler = BeautifulSoupCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.soup.title.string if context.soup.title else None,\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    await crawler.run(['https://crawlee.dev/'])\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        parser: BeautifulSoupParserType = 'lxml',\n        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            parser: The type of parser that should be used by `BeautifulSoup`.\n            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.\n        \"\"\"\n\n        async def final_step(\n            context: ParsedHttpCrawlingContext[BeautifulSoup],\n        ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:\n            \"\"\"Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property.\"\"\"\n            yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context)\n\n        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)\n\n        super().__init__(\n            parser=BeautifulSoupParser(parser=parser),\n            **kwargs,\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py",
    "content": "from dataclasses import dataclass, fields\n\nfrom bs4 import BeautifulSoup\nfrom typing_extensions import Self\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers import ParsedHttpCrawlingContext\n\nfrom ._utils import html_to_text\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]):\n    \"\"\"The crawling context used by the `BeautifulSoupCrawler`.\n\n    It provides access to key objects as well as utility functions for handling crawling tasks.\n    \"\"\"\n\n    @property\n    def soup(self) -> BeautifulSoup:\n        \"\"\"Convenience alias.\"\"\"\n        return self.parsed_content\n\n    @classmethod\n    def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:\n        \"\"\"Initialize a new instance from an existing `ParsedHttpCrawlingContext`.\"\"\"\n        return cls(**{field.name: getattr(context, field.name) for field in fields(context)})\n\n    def html_to_text(self) -> str:\n        \"\"\"Convert the parsed HTML content to newline-separated plain text without tags.\"\"\"\n        return html_to_text(self.parsed_content)\n"
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Literal\n\nfrom bs4 import BeautifulSoup, Tag\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._abstract_http import AbstractHttpParser\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Sequence\n\n    from crawlee.http_clients import HttpResponse\n\n\n@docs_group('HTTP parsers')\nclass BeautifulSoupParser(AbstractHttpParser[BeautifulSoup, Tag]):\n    \"\"\"Parser for parsing HTTP response using `BeautifulSoup`.\"\"\"\n\n    def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:\n        self._parser = parser\n\n    @override\n    async def parse(self, response: HttpResponse) -> BeautifulSoup:\n        return BeautifulSoup(await response.read(), features=self._parser)\n\n    @override\n    async def parse_text(self, text: str) -> BeautifulSoup:\n        return BeautifulSoup(text, features=self._parser)\n\n    @override\n    def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool:\n        return parsed_content.select_one(selector) is not None\n\n    @override\n    async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:\n        return tuple(match for match in parsed_content.select(selector))\n\n    @override\n    def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]:\n        link: Tag\n        urls: list[str] = []\n        for link in parsed_content.select(selector):\n            url = link.attrs.get(attribute)\n            if url:\n                urls.append(url.strip())\n        return urls\n\n\nBeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib']\n"
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/_utils.py",
    "content": "from __future__ import annotations\n\nimport re\nfrom typing import TYPE_CHECKING\n\nfrom bs4 import BeautifulSoup, NavigableString, PageElement, Tag\n\nfrom crawlee._utils.html_to_text import (\n    _ANY_CONSECUTIVE_WHITE_SPACES,\n    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,\n    _EMPTY_OR_ENDS_WITH_NEW_LINE,\n    BLOCK_TAGS,\n    SKIP_TAGS,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable\n\n\ndef html_to_text(source: str | Tag) -> str:\n    \"\"\"Convert markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.\n\n    Args:\n        source: Input markup string or `BeautifulSoup` object.\n\n    Returns:\n        Newline separated plain text without tags.\n    \"\"\"\n    if isinstance(source, str):\n        soup = BeautifulSoup(source, features='lxml')\n    elif isinstance(source, BeautifulSoup):\n        soup = source\n    else:\n        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')\n\n    text = ''\n\n    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:\n        \"\"\"Extract and process text content from a collection of HTML elements.\n\n        Convert page elements into plain text while preserving structure. 
Handle whitespace compression,\n        skip unwanted elements, and format block elements correctly.\n        \"\"\"\n        nonlocal text\n        for page_element in page_elements:\n            if isinstance(page_element, (Tag, NavigableString)):\n                if isinstance(page_element, NavigableString):\n                    compr: str\n                    if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre':\n                        compr = page_element.get_text()\n                    else:\n                        # Compress white spaces outside of pre block\n                        compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text())\n                    # If text is empty or ends with a whitespace, don't add the leading whitespace or new line\n                    if (compr.startswith((' ', '\\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):\n                        compr = compr[1:]\n                    text += compr\n                elif page_element.name.lower() in SKIP_TAGS:\n                    # Skip comments and special elements\n                    pass\n                elif page_element.name.lower() == 'br':\n                    text += '\\n'\n                elif page_element.name.lower() == 'td':\n                    _page_element_to_text(page_element.children)\n                    text += '\\t'\n                else:\n                    # Block elements must be surrounded by newlines(unless beginning of text)\n                    is_block_tag = page_element.name.lower() in BLOCK_TAGS\n                    if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):\n                        text += '\\n'\n                    _page_element_to_text(page_element.children)\n                    if is_block_tag and not text.endswith('\\n'):\n                        text += '\\n'\n\n    _page_element_to_text(soup.children)\n\n    return text.strip()\n"
  },
  {
    "path": "src/crawlee/crawlers/_beautifulsoup/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/crawlers/_http/__init__.py",
    "content": "from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext\nfrom crawlee.http_clients import HttpCrawlingResult\n\nfrom ._http_crawler import HttpCrawler\n\n__all__ = [\n    'HttpCrawler',\n    'HttpCrawlingContext',\n    'HttpCrawlingResult',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_http/_http_crawler.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._abstract_http import AbstractHttpCrawler, ParsedHttpCrawlingContext\n\nfrom ._http_parser import NoParser\n\nif TYPE_CHECKING:\n    from typing_extensions import Unpack\n\n    from crawlee.crawlers import BasicCrawlerOptions\n\n\n@docs_group('Crawlers')\nclass HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]):\n    \"\"\"Specific version of generic `AbstractHttpCrawler`.\n\n    It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are\n    doing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using\n    `BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n\n    crawler = HttpCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'response': (await context.http_response.read()).decode()[:100],\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    await crawler.run(['https://crawlee.dev/'])\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        **kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.\n        \"\"\"\n        kwargs['_context_pipeline'] = 
self._create_static_content_crawler_pipeline()\n        super().__init__(\n            parser=NoParser(),\n            **kwargs,\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_http/_http_parser.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._abstract_http import AbstractHttpParser\nfrom crawlee.crawlers._types import BlockedInfo\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Sequence\n\n    from crawlee.http_clients import HttpResponse\n\n\n@docs_group('HTTP parsers')\nclass NoParser(AbstractHttpParser[bytes, bytes]):\n    \"\"\"A no-op parser that returns raw response content without any processing.\n\n    This is useful when you only need the raw response data and don't require HTML\n    parsing, link extraction, or content selection functionality.\n    \"\"\"\n\n    @override\n    async def parse(self, response: HttpResponse) -> bytes:\n        return await response.read()\n\n    @override\n    async def parse_text(self, text: str) -> bytes:\n        raise NotImplementedError\n\n    @override\n    async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]:\n        raise NotImplementedError\n\n    @override\n    def is_blocked(self, parsed_content: bytes) -> BlockedInfo:  # Intentional unused argument.\n        return BlockedInfo(reason='')\n\n    @override\n    def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool:  # Intentional unused argument.\n        return False\n\n    @override\n    def find_links(\n        self, parsed_content: bytes, selector: str, attribute: str\n    ) -> Iterable[str]:  # Intentional unused argument.\n        return []\n"
  },
  {
    "path": "src/crawlee/crawlers/_parsel/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'ParselCrawler'):\n    from ._parsel_crawler import ParselCrawler\nwith _try_import(__name__, 'ParselCrawlingContext'):\n    from ._parsel_crawling_context import ParselCrawlingContext\n\n__all__ = [\n    'ParselCrawler',\n    'ParselCrawlingContext',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_parsel/_parsel_crawler.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom parsel import Selector\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions\n\nfrom ._parsel_crawling_context import ParselCrawlingContext\nfrom ._parsel_parser import ParselParser\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from typing_extensions import Unpack\n\n    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext\n\n\n@docs_group('Crawlers')\nclass ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]):\n    \"\"\"A web crawler for performing HTTP requests and parsing HTML/XML content.\n\n    The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.\n    It specifies its own parser `ParselParser` which is used to parse `HttpResponse`.\n    `ParselParser` uses following library for parsing: https://pypi.org/project/parsel/\n\n    The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. 
However,\n    if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext\n\n    crawler = ParselCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': context.selector.css('title').get(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    await crawler.run(['https://crawlee.dev/'])\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.\n        \"\"\"\n\n        async def final_step(\n            context: ParsedHttpCrawlingContext[Selector],\n        ) -> AsyncGenerator[ParselCrawlingContext, None]:\n            \"\"\"Enhance `ParsedHttpCrawlingContext[Selector]` with a `selector` property.\"\"\"\n            yield ParselCrawlingContext.from_parsed_http_crawling_context(context)\n\n        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)\n        super().__init__(\n            parser=ParselParser(),\n            **kwargs,\n        )\n"
  },
  {
    "path": "src/crawlee/crawlers/_parsel/_parsel_crawling_context.py",
    "content": "from dataclasses import dataclass, fields\n\nfrom parsel import Selector\nfrom typing_extensions import Self\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext\n\nfrom ._utils import html_to_text\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]):\n    \"\"\"The crawling context used by the `ParselCrawler`.\n\n    It provides access to key objects as well as utility functions for handling crawling tasks.\n    \"\"\"\n\n    @property\n    def selector(self) -> Selector:\n        \"\"\"Convenience alias.\"\"\"\n        return self.parsed_content\n\n    @classmethod\n    def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:\n        \"\"\"Create a new context from an existing `ParsedHttpCrawlingContext[Selector]`.\"\"\"\n        return cls(**{field.name: getattr(context, field.name) for field in fields(context)})\n\n    def html_to_text(self) -> str:\n        \"\"\"Convert the parsed HTML content to newline-separated plain text without tags.\"\"\"\n        return html_to_text(self.parsed_content)\n"
  },
  {
    "path": "src/crawlee/crawlers/_parsel/_parsel_parser.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nfrom parsel import Selector\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers._abstract_http import AbstractHttpParser\n\nif TYPE_CHECKING:\n    from collections.abc import Iterable, Sequence\n\n    from crawlee.http_clients import HttpResponse\n\n\n@docs_group('HTTP parsers')\nclass ParselParser(AbstractHttpParser[Selector, Selector]):\n    \"\"\"Parser for parsing HTTP response using Parsel.\"\"\"\n\n    @override\n    async def parse(self, response: HttpResponse) -> Selector:\n        response_body = await response.read()\n        return await asyncio.to_thread(Selector, body=response_body)\n\n    @override\n    async def parse_text(self, text: str) -> Selector:\n        return Selector(text=text)\n\n    @override\n    async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]:\n        return tuple(match for match in parsed_content.css(selector))\n\n    @override\n    def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:\n        return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None\n\n    @override\n    def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]:\n        link: Selector\n        urls: list[str] = []\n        for link in parsed_content.css(selector):\n            url = link.xpath(f'@{attribute}').get()\n            if url:\n                urls.append(url.strip())\n        return urls\n"
  },
  {
    "path": "src/crawlee/crawlers/_parsel/_utils.py",
    "content": "from __future__ import annotations\n\nimport re\n\nfrom parsel import Selector\n\nfrom crawlee._utils.html_to_text import (\n    _ANY_CONSECUTIVE_WHITE_SPACES,\n    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,\n    _EMPTY_OR_ENDS_WITH_NEW_LINE,\n    BLOCK_TAGS,\n    SKIP_TAGS,\n)\n\n\ndef html_to_text(source: str | Selector) -> str:\n    \"\"\"Convert markup string or `Selector` to newline-separated plain text without tags using Parsel.\n\n    Args:\n        source: Input markup string or `Selector` object.\n\n    Returns:\n        Newline separated plain text without tags.\n    \"\"\"\n    if isinstance(source, str):\n        selector = Selector(text=source)\n    elif isinstance(source, Selector):\n        selector = source\n    else:\n        raise TypeError('Source must be either a string or a `Selector` object.')\n\n    text = ''\n\n    def _extract_text(elements: list[Selector], *, compress: bool = True) -> None:\n        \"\"\"Extract text content from HTML elements while preserving formatting.\n\n        Perform custom HTML parsing to match the behavior of the JavaScript version of Crawlee. 
Handles whitespace\n        compression and block-level tag formatting.\n\n        Args:\n            elements: A list of selectors representing the HTML elements.\n            compress: Whether to compress consecutive whitespace outside of `<pre>` blocks.\n        \"\"\"\n        nonlocal text\n        for element in elements:\n            tag = element.root.tag if hasattr(element.root, 'tag') else None\n\n            if tag is None:\n                # Compress white spaces outside of pre block\n                compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', element.root) if compress else element.root\n                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line\n                if (compr.startswith((' ', '\\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):\n                    compr = compr[1:]\n                text += compr\n\n            if tag in SKIP_TAGS or not isinstance(tag, str):\n                continue\n\n            if tag == 'br':\n                text += '\\n'\n            elif tag == 'td':\n                _extract_text(element.xpath('./node()'))\n                text += '\\t'\n            else:\n                is_block_tag = tag in BLOCK_TAGS if tag else False\n\n                if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):\n                    text += '\\n'\n\n                _extract_text(element.xpath('./node()'), compress=tag != 'pre')\n\n                if is_block_tag and not text.endswith('\\n'):\n                    text += '\\n'\n\n    # Start processing the root elements\n    _extract_text(selector.xpath('/*'))\n\n    return text.strip()\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'PlaywrightCrawler'):\n    from ._playwright_crawler import PlaywrightCrawler\nwith _try_import(__name__, 'PlaywrightCrawlingContext'):\n    from ._playwright_crawling_context import PlaywrightCrawlingContext\nwith _try_import(__name__, 'PlaywrightPreNavCrawlingContext'):\n    from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext\nwith _try_import(__name__, 'PlaywrightPostNavCrawlingContext'):\n    from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext\n\n__all__ = [\n    'PlaywrightCrawler',\n    'PlaywrightCrawlingContext',\n    'PlaywrightPostNavCrawlingContext',\n    'PlaywrightPreNavCrawlingContext',\n]\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_playwright_crawler.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nimport warnings\nfrom datetime import timedelta\nfrom functools import partial\nfrom typing import TYPE_CHECKING, Any, Generic, Literal\n\nimport playwright.async_api\nfrom more_itertools import partition\nfrom pydantic import ValidationError\nfrom typing_extensions import NotRequired, TypedDict, TypeVar\n\nfrom crawlee._request import Request, RequestOptions, RequestState\nfrom crawlee._types import BasicCrawlingContext, ConcurrencySettings\nfrom crawlee._utils.blocked import RETRY_CSS_SELECTORS\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.robots import RobotsTxtFile\nfrom crawlee._utils.time import SharedTimeout\nfrom crawlee._utils.urls import to_absolute_url_iterator\nfrom crawlee.browsers import BrowserPool\nfrom crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline\nfrom crawlee.errors import SessionError\nfrom crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions\nfrom crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.sessions._cookies import PlaywrightCookieParam\nfrom crawlee.statistics import StatisticsState\n\nfrom ._playwright_crawling_context import PlaywrightCrawlingContext\nfrom ._playwright_http_client import PlaywrightHttpClient, browser_page_context\nfrom ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext\nfrom ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext\nfrom ._types import GotoOptions\nfrom ._utils import block_requests, infinite_scroll\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)\nTStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, 
Awaitable, Callable, Iterator, Mapping\n    from pathlib import Path\n\n    from playwright.async_api import Page, Route\n    from playwright.async_api import Request as PlaywrightRequest\n    from typing_extensions import Unpack\n\n    from crawlee import RequestTransformAction\n    from crawlee._types import (\n        EnqueueLinksKwargs,\n        ExtractLinksFunction,\n        HttpHeaders,\n        HttpMethod,\n        HttpPayload,\n    )\n    from crawlee.browsers._types import BrowserType\n\n\n@docs_group('Crawlers')\nclass PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):\n    \"\"\"A web crawler that leverages the `Playwright` browser automation library.\n\n    The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.\n    On top of that it provides a high level web crawling interface on top of the `Playwright` library. To be more\n    specific, it uses the Crawlee's `BrowserPool` to manage the Playwright's browser instances and the pages they\n    open. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let\n    the crawler create a new instance with the default settings.\n\n    This crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers\n    to download web pages and extract data. For websites that do not require JavaScript, consider using one of the\n    HTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. 
They use\n    raw HTTP requests, which means they are much faster.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n    crawler = PlaywrightCrawler()\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n            'response': (await context.response.text())[:100],\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n    await crawler.run(['https://crawlee.dev/'])\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        browser_pool: BrowserPool | None = None,\n        browser_type: BrowserType | None = None,\n        user_data_dir: str | Path | None = None,\n        browser_launch_options: Mapping[str, Any] | None = None,\n        browser_new_context_options: Mapping[str, Any] | None = None,\n        goto_options: GotoOptions | None = None,\n        fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',\n        headless: bool | None = None,\n        use_incognito_pages: bool | None = None,\n        navigation_timeout: timedelta | None = None,\n        **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.\n            user_data_dir: Path to a user data directory, which stores browser session data like cookies\n                and local storage.\n            browser_type: The type of browser to launch:\n                - 
'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on\n                    the system.\n                This option should not be used if `browser_pool` is provided.\n            browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided\n                directly to Playwright's `browser_type.launch` method. For more details, refer to the\n                [Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch).\n                This option should not be used if `browser_pool` is provided.\n            browser_new_context_options: Keyword arguments to pass to the browser new context method. These options\n                are provided directly to Playwright's `browser.new_context` method. For more details, refer to the\n                [Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).\n                This option should not be used if `browser_pool` is provided.\n            fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used\n                to generate browser fingerprints together with consistent headers.\n            headless: Whether to run the browser in headless mode.\n                This option should not be used if `browser_pool` is provided.\n            use_incognito_pages: By default pages share the same browser context. 
If set to True each page uses its\n                own context that is destroyed once the page is closed or crashes.\n                This option should not be used if `browser_pool` is provided.\n            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling\n                the request handler)\n            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is\n                not supported, use `navigation_timeout` instead.\n            kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.\n        \"\"\"\n        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}\n\n        if browser_pool:\n            # Raise an exception if browser_pool is provided together with other browser-related arguments.\n            if any(\n                param not in [None, 'default']\n                for param in (\n                    user_data_dir,\n                    use_incognito_pages,\n                    headless,\n                    browser_type,\n                    browser_launch_options,\n                    browser_new_context_options,\n                    fingerprint_generator,\n                )\n            ):\n                raise ValueError(\n                    'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '\n                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '\n                    '`fingerprint_generator` arguments when `browser_pool` is provided.'\n                )\n\n        # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.\n        else:\n            if fingerprint_generator == 'default':\n                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (\n                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else 
None\n                )\n\n                fingerprint_generator = DefaultFingerprintGenerator(\n                    header_options=HeaderGeneratorOptions(browsers=generator_browser_type)\n                )\n\n            browser_pool = BrowserPool.with_default_plugin(\n                headless=headless,\n                browser_type=browser_type,\n                user_data_dir=user_data_dir,\n                browser_launch_options=browser_launch_options,\n                browser_new_context_options=browser_new_context_options,\n                use_incognito_pages=use_incognito_pages,\n                fingerprint_generator=fingerprint_generator,\n            )\n\n        self._browser_pool = browser_pool\n\n        # Compose the context pipeline with the Playwright-specific context enhancer.\n        kwargs['_context_pipeline'] = (\n            ContextPipeline()\n            .compose(self._open_page)\n            .compose(self._navigate)\n            .compose(self._execute_post_navigation_hooks)\n            .compose(self._handle_status_code_response)\n            .compose(self._handle_blocked_request_by_content)\n            .compose(self._create_crawling_context)\n        )\n        kwargs['_additional_context_managers'] = [self._browser_pool]\n        kwargs.setdefault('_logger', logging.getLogger(__name__))\n        self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []\n        self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = []\n\n        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']\n\n        # Set default concurrency settings for browser crawlers if not provided\n        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:\n            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)\n\n        self._navigation_timeout = navigation_timeout or 
timedelta(minutes=1)\n        self._goto_options = goto_options or GotoOptions()\n\n        super().__init__(**kwargs)\n\n    async def _open_page(\n        self,\n        context: BasicCrawlingContext,\n    ) -> AsyncGenerator[PlaywrightPreNavCrawlingContext, None]:\n        if self._browser_pool is None:\n            raise ValueError('Browser pool is not initialized.')\n\n        # Create a new browser page\n        crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)\n\n        pre_navigation_context = PlaywrightPreNavCrawlingContext(\n            request=context.request,\n            session=context.session,\n            add_requests=context.add_requests,\n            send_request=context.send_request,\n            push_data=context.push_data,\n            use_state=context.use_state,\n            proxy_info=context.proxy_info,\n            get_key_value_store=context.get_key_value_store,\n            log=context.log,\n            page=crawlee_page.page,\n            block_requests=partial(block_requests, page=crawlee_page.page),\n            goto_options=GotoOptions(**self._goto_options),\n        )\n\n        context_id = id(pre_navigation_context)\n        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)\n\n        try:\n            # Only use the page context manager here — it sets the current page in a context variable,\n            # making it accessible to PlaywrightHttpClient in subsequent pipeline steps.\n            async with browser_page_context(crawlee_page.page):\n                for hook in self._pre_navigation_hooks:\n                    async with self._shared_navigation_timeouts[context_id]:\n                        await hook(pre_navigation_context)\n\n                # Yield should be inside the browser_page_context.\n                yield pre_navigation_context\n        finally:\n            self._shared_navigation_timeouts.pop(context_id, None)\n\n    def 
_prepare_request_interceptor(\n        self,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n    ) -> Callable:\n        \"\"\"Create a request interceptor for Playwright to support non-GET methods with custom parameters.\n\n        The interceptor modifies requests by adding custom headers and payload before they are sent.\n\n        Args:\n            method: HTTP method to use for the request.\n            headers: Custom HTTP headers to send with the request.\n            payload: Request body data for POST/PUT requests.\n        \"\"\"\n\n        async def route_handler(route: Route, _: PlaywrightRequest) -> None:\n            await route.continue_(method=method, headers=dict(headers) if headers else None, post_data=payload)\n\n        return route_handler\n\n    async def _navigate(\n        self,\n        context: PlaywrightPreNavCrawlingContext,\n    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]:\n        \"\"\"Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.\n\n        Args:\n            context: The basic crawling context to be enhanced.\n\n        Raises:\n            ValueError: If the browser pool is not initialized.\n            SessionError: If the URL cannot be loaded by the browser.\n            TimeoutError: If navigation does not succeed within the navigation timeout.\n\n        Yields:\n            The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,\n                infinite_scroll and block_requests).\n        \"\"\"\n        async with context.page:\n            if context.session:\n                session_cookies = context.session.cookies.get_cookies_as_playwright_format()\n                await self._update_cookies(context.page, session_cookies)\n\n            if context.request.headers:\n                await 
context.page.set_extra_http_headers(context.request.headers.model_dump())\n            # Navigate to the URL and get response.\n            if context.request.method != 'GET':\n                # Call the notification only once\n                warnings.warn(\n                    'Using other request methods than GET or adding payloads has a high impact on performance'\n                    ' in recent versions of Playwright. Use only when necessary.',\n                    category=UserWarning,\n                    stacklevel=2,\n                )\n\n                route_handler = self._prepare_request_interceptor(\n                    method=context.request.method,\n                    headers=context.request.headers,\n                    payload=context.request.payload,\n                )\n\n                # Set route_handler only for current request\n                await context.page.route(context.request.url, route_handler)\n\n            try:\n                async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:\n                    response = await context.page.goto(\n                        context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options\n                    )\n                context.request.state = RequestState.AFTER_NAV\n            except playwright.async_api.TimeoutError as exc:\n                raise asyncio.TimeoutError from exc\n\n            if response is None:\n                raise SessionError(f'Failed to load the URL: {context.request.url}')\n\n            # Set the loaded URL to the actual URL after redirection.\n            context.request.loaded_url = context.page.url\n\n            yield PlaywrightPostNavCrawlingContext(\n                request=context.request,\n                session=context.session,\n                add_requests=context.add_requests,\n                send_request=context.send_request,\n                push_data=context.push_data,\n                
use_state=context.use_state,\n                proxy_info=context.proxy_info,\n                get_key_value_store=context.get_key_value_store,\n                log=context.log,\n                page=context.page,\n                block_requests=context.block_requests,\n                goto_options=context.goto_options,\n                response=response,\n            )\n\n    def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:\n        \"\"\"Create a callback function for extracting links from context.\n\n        Args:\n            context: The current crawling context.\n\n        Returns:\n            Awaitable that is used for extracting links from context.\n        \"\"\"\n\n        async def extract_links(\n            *,\n            selector: str = 'a',\n            attribute: str = 'href',\n            label: str | None = None,\n            user_data: dict | None = None,\n            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]\n            | None = None,\n            **kwargs: Unpack[EnqueueLinksKwargs],\n        ) -> list[Request]:\n            \"\"\"Extract links from the current page.\n\n            The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.\n            \"\"\"\n            requests = list[Request]()\n\n            base_user_data = user_data or {}\n\n            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)\n\n            kwargs.setdefault('strategy', 'same-hostname')\n            strategy = kwargs.get('strategy', 'same-hostname')\n\n            elements = await context.page.query_selector_all(selector)\n            links_iterator: Iterator[str] = iter(\n                [url for element in elements if (url := await element.get_attribute(attribute)) is not None]\n            )\n\n            # Get base URL from <base> tag if present\n            extracted_base_url = await 
context.page.evaluate('document.baseURI')\n            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url\n\n            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)\n\n            if robots_txt_file:\n                skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)\n            else:\n                skipped = iter([])\n\n            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):\n                request_options = RequestOptions(\n                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy\n                )\n\n                if transform_request_function:\n                    transform_request_options = transform_request_function(request_options)\n                    if transform_request_options == 'skip':\n                        continue\n                    if transform_request_options != 'unchanged':\n                        request_options = transform_request_options\n\n                try:\n                    request = Request.from_url(**request_options)\n                except ValidationError as exc:\n                    context.log.debug(\n                        f'Skipping URL \"{url}\" due to invalid format: {exc}. '\n                        'This may be caused by a malformed URL or unsupported URL scheme. 
'\n                        'Please ensure the URL is correct and retry.'\n                    )\n                    continue\n\n                requests.append(request)\n\n            skipped_tasks = [\n                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped\n            ]\n            await asyncio.gather(*skipped_tasks)\n\n            return requests\n\n        return extract_links\n\n    async def _handle_status_code_response(\n        self, context: PlaywrightPostNavCrawlingContext\n    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:\n        \"\"\"Validate the HTTP status code and raise appropriate exceptions if needed.\n\n        Args:\n            context: The current crawling context containing the response.\n\n        Raises:\n            SessionError: If the status code indicates the session is blocked.\n            HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.\n            HttpClientStatusCodeError: If the status code represents a client error.\n\n        Yields:\n            The original crawling context if no errors are detected.\n        \"\"\"\n        status_code = context.response.status\n        if self._retry_on_blocked:\n            self._raise_for_session_blocked_status_code(context.session, status_code)\n        self._raise_for_error_status_code(status_code)\n        yield context\n\n    async def _handle_blocked_request_by_content(\n        self,\n        context: PlaywrightPostNavCrawlingContext,\n    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:\n        \"\"\"Try to detect if the request is blocked based on the response content.\n\n        Args:\n            context: The current crawling context.\n\n        Raises:\n            SessionError: If the request is considered blocked.\n\n        Yields:\n            The original crawling context if no errors are detected.\n        \"\"\"\n        if 
self._retry_on_blocked:\n            matched_selectors = [\n                selector for selector in RETRY_CSS_SELECTORS if (await context.page.query_selector(selector))\n            ]\n\n            # Check if the session is blocked based on the response content\n            if matched_selectors:\n                raise SessionError(\n                    'Assuming the session is blocked - '\n                    f'HTTP response matched the following selectors: {\"; \".join(matched_selectors)}'\n                )\n\n        yield context\n\n    async def _execute_post_navigation_hooks(\n        self, context: PlaywrightPostNavCrawlingContext\n    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:\n        for hook in self._post_navigation_hooks:\n            await hook(context)\n        yield context\n\n    async def _create_crawling_context(\n        self, context: PlaywrightPostNavCrawlingContext\n    ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:\n        extract_links = self._create_extract_links_function(context)\n\n        error = yield PlaywrightCrawlingContext(\n            request=context.request,\n            session=context.session,\n            add_requests=context.add_requests,\n            send_request=context.send_request,\n            push_data=context.push_data,\n            use_state=context.use_state,\n            proxy_info=context.proxy_info,\n            get_key_value_store=context.get_key_value_store,\n            log=context.log,\n            page=context.page,\n            goto_options=context.goto_options,\n            response=context.response,\n            infinite_scroll=lambda: infinite_scroll(context.page),\n            extract_links=extract_links,\n            enqueue_links=self._create_enqueue_links_function(context, extract_links),\n            block_requests=partial(block_requests, page=context.page),\n        )\n\n        if context.session:\n            pw_cookies = await 
self._get_cookies(context.page)\n            context.session.cookies.set_cookies_from_playwright_format(pw_cookies)\n\n        # Collect data in case of errors, before the page object is closed.\n        if error:\n            await self.statistics.error_tracker.add(error=error, context=context, early=True)\n\n    def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:\n        \"\"\"Register a hook to be called before each navigation.\n\n        Args:\n            hook: A coroutine function to be called before each navigation.\n        \"\"\"\n        self._pre_navigation_hooks.append(hook)\n\n    def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:\n        \"\"\"Register a hook to be called after each navigation.\n\n        Args:\n            hook: A coroutine function to be called after each navigation.\n        \"\"\"\n        self._post_navigation_hooks.append(hook)\n\n    async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]:\n        \"\"\"Get the cookies from the page.\"\"\"\n        cookies = await page.context.cookies()\n        return [PlaywrightCookieParam(**cookie) for cookie in cookies]\n\n    async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:\n        \"\"\"Update the cookies in the page context.\"\"\"\n        await page.context.add_cookies([{**cookie} for cookie in cookies])\n\n    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:\n        \"\"\"Find the robots.txt file for a given URL.\n\n        Args:\n            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.\n        \"\"\"\n        http_client = ImpitHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client\n\n        return await RobotsTxtFile.find(url, http_client=http_client)\n\n\nclass 
_PlaywrightCrawlerAdditionalOptions(TypedDict):\n    \"\"\"Additional arguments for the `PlaywrightCrawler` constructor.\n\n    It is intended for typing forwarded `__init__` arguments in the subclasses.\n    All arguments are `BasicCrawlerOptions` + `_PlaywrightCrawlerAdditionalOptions`\n    \"\"\"\n\n    browser_pool: NotRequired[BrowserPool]\n    \"\"\"A `BrowserPool` instance to be used for launching the browsers and getting pages.\"\"\"\n\n    browser_type: NotRequired[BrowserType]\n    \"\"\"The type of browser to launch:\n    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers\n    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.\n    This option should not be used if `browser_pool` is provided.\"\"\"\n\n    browser_launch_options: NotRequired[Mapping[str, Any]]\n    \"\"\"Keyword arguments to pass to the browser launch method. These options are provided\n    directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright\n    documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.\n    This option should not be used if `browser_pool` is provided.\"\"\"\n\n    browser_new_context_options: NotRequired[Mapping[str, Any]]\n    \"\"\"Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's\n    `browser.new_context` method. For more details, refer to the Playwright documentation:\n    https://playwright.dev/python/docs/api/class-browser#browser-new-context. This option should not be used if\n    `browser_pool` is provided.\"\"\"\n\n    headless: NotRequired[bool]\n    \"\"\"Whether to run the browser in headless mode. 
This option should not be used if `browser_pool` is provided.\"\"\"\n\n\nclass PlaywrightCrawlerOptions(\n    _PlaywrightCrawlerAdditionalOptions,\n    BasicCrawlerOptions[TCrawlingContext, StatisticsState],\n    Generic[TCrawlingContext, TStatisticsState],\n):\n    \"\"\"Arguments for the `PlaywrightCrawler` constructor.\n\n    It is intended for typing forwarded `__init__` arguments in the subclasses.\n    \"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_playwright_crawling_context.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\n\nfrom ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable\n\n    from crawlee._types import EnqueueLinksFunction, ExtractLinksFunction\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass PlaywrightCrawlingContext(PlaywrightPostNavCrawlingContext):\n    \"\"\"The crawling context used by the `PlaywrightCrawler`.\n\n    It provides access to key objects as well as utility functions for handling crawling tasks.\n    \"\"\"\n\n    enqueue_links: EnqueueLinksFunction\n    \"\"\"The Playwright `EnqueueLinksFunction` implementation.\"\"\"\n\n    extract_links: ExtractLinksFunction\n    \"\"\"The Playwright `ExtractLinksFunction` implementation.\"\"\"\n\n    infinite_scroll: Callable[[], Awaitable[None]]\n    \"\"\"A function to perform infinite scrolling on the page. This scrolls to the bottom, triggering\n    the loading of additional content if present.\"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_playwright_http_client.py",
    "content": "from __future__ import annotations\n\nimport contextvars\nfrom contextlib import AbstractAsyncContextManager, asynccontextmanager\nfrom typing import TYPE_CHECKING\n\nfrom typing_extensions import override\n\nfrom crawlee._types import HttpHeaders\nfrom crawlee.crawlers._playwright._types import PlaywrightHttpResponse\nfrom crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from datetime import timedelta\n\n    from playwright.async_api import Page\n\n    from crawlee import Request\n    from crawlee._types import HttpMethod, HttpPayload\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.statistics import Statistics\n\n\n_browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)\n\n\n@asynccontextmanager\nasync def browser_page_context(page: Page) -> AsyncGenerator[None, None]:\n    \"\"\"Asynchronous context manager for setting the current Playwright page in the context variable.\"\"\"\n    token = _browser_page_context_var.set(page)\n    try:\n        yield\n    finally:\n        _browser_page_context_var.reset(token)\n\n\nclass PlaywrightHttpClient(HttpClient):\n    \"\"\"HTTP client based on the Playwright library.\n\n    This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\n    and to manage sessions, proxies, and error handling.\n\n    See the `HttpClient` class for more common information about HTTP clients.\n\n    Note: This class is pre-designated for use in `PlaywrightCrawler` only\n    \"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"Initialize a new instance.\"\"\"\n        self._active = False\n\n    @override\n    async def crawl(\n        self,\n        request: Request,\n        *,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | 
None = None,\n        statistics: Statistics | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpCrawlingResult:\n        raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')\n\n    @override\n    async def send_request(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpResponse:\n        # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`\n        # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved\n        # https://github.com/apify/crawlee-python/issues/1055\n\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        browser_context = _browser_page_context_var.get()\n\n        if browser_context is None:\n            raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')\n\n        # Proxies appropriate to the browser context are used\n        response = await browser_context.request.fetch(\n            url_or_request=url,\n            method=method.lower(),\n            headers=dict(headers) if headers else None,\n            data=payload,\n            timeout=timeout.total_seconds() if timeout else None,\n        )\n\n        return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')\n\n    @override\n    def stream(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        
timeout: timedelta | None = None,\n    ) -> AbstractAsyncContextManager[HttpResponse]:\n        raise NotImplementedError('The `stream` method should not be used for `PlaywrightHttpClient`')\n\n    async def cleanup(self) -> None:\n        # The `browser_page_context` is responsible for resource cleanup\n        return\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\n\nfrom ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext\n\nif TYPE_CHECKING:\n    from playwright.async_api import Response\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass PlaywrightPostNavCrawlingContext(PlaywrightPreNavCrawlingContext):\n    \"\"\"The post navigation crawling context used by the `PlaywrightCrawler`.\n\n    It provides access to the `Page` and `Response` objects, after the navigation to the URL is performed.\n    \"\"\"\n\n    response: Response\n    \"\"\"The Playwright `Response` object containing the response details for the current URL.\"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._types import BasicCrawlingContext, PageSnapshot\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from playwright.async_api import Page\n\n    from ._types import BlockRequestsFunction, GotoOptions\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass PlaywrightPreNavCrawlingContext(BasicCrawlingContext):\n    \"\"\"The pre navigation crawling context used by the `PlaywrightCrawler`.\n\n    It provides access to the `Page` object, before the navigation to the URL is performed.\n    \"\"\"\n\n    page: Page\n    \"\"\"The Playwright `Page` object for the current page.\"\"\"\n\n    block_requests: BlockRequestsFunction\n    \"\"\"Blocks network requests matching specified URL patterns.\"\"\"\n\n    goto_options: GotoOptions\n    \"\"\"Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported.\"\"\"\n\n    async def get_snapshot(self) -> PageSnapshot:\n        \"\"\"Get snapshot of crawled page.\"\"\"\n        html = None\n        screenshot = None\n\n        try:\n            html = await self.page.content()\n        except Exception:\n            self.log.exception(f'Failed to get html snapshot for {self.request.url}.')\n\n        try:\n            screenshot = await self.page.screenshot(full_page=True, type='jpeg')\n        except Exception:\n            self.log.exception(f'Failed to get page screenshot for {self.request.url}.')\n\n        return PageSnapshot(html=html, screenshot=screenshot)\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_types.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Literal, Protocol, TypedDict\n\nfrom playwright.async_api import APIResponse\n\nfrom crawlee import HttpHeaders\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from playwright.async_api import Response\n    from typing_extensions import NotRequired, Self\n\n\n@docs_group('Functions')\nclass BlockRequestsFunction(Protocol):\n    \"\"\"A function for blocking unwanted HTTP requests during page loads in PlaywrightCrawler.\n\n    It simplifies the process of blocking specific HTTP requests during page navigation.\n    The function allows blocking both default resource types (like images, fonts, stylesheets) and custom URL patterns.\n    \"\"\"\n\n    async def __call__(\n        self, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None\n    ) -> None:\n        \"\"\"Call dunder method.\n\n        Args:\n            url_patterns: List of URL patterns to block. 
If None, uses default patterns.\n            extra_url_patterns: Additional URL patterns to append to the main patterns list.\n        \"\"\"\n\n\n@dataclass(frozen=True)\nclass PlaywrightHttpResponse:\n    \"\"\"Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol.\"\"\"\n\n    http_version: str\n    status_code: int\n    headers: HttpHeaders\n    _content: bytes\n\n    async def read(self) -> bytes:\n        return self._content\n\n    async def read_stream(self) -> AsyncGenerator[bytes, None]:\n        # Playwright does not support `streaming` responses.\n        # This is a workaround to make it compatible with `HttpResponse` protocol.\n        yield self._content\n\n    @classmethod\n    async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self:\n        headers = HttpHeaders(response.headers)\n        status_code = response.status\n        # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument.\n        http_version = protocol\n        _content = await response.body()\n        # If not called then the body will stay in memory until the context closes.\n        if isinstance(response, APIResponse):\n            await response.dispose()\n\n        return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)\n\n\nclass GotoOptions(TypedDict):\n    \"\"\"Keyword arguments for Playwright's `Page.goto()` method.\"\"\"\n\n    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]\n    \"\"\"When to consider operation succeeded, defaults to 'load' event.\"\"\"\n\n    referer: NotRequired[str]\n    \"\"\"Referer header value.\"\"\"\n"
  },
  {
    "path": "src/crawlee/crawlers/_playwright/_utils.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import suppress\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from playwright.async_api import Page\n    from playwright.async_api import Request as PlaywrightRequest\n\n_DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [\n    '.css',\n    '.webp',\n    '.jpg',\n    '.jpeg',\n    '.png',\n    '.svg',\n    '.gif',\n    '.woff',\n    '.pdf',\n    '.zip',\n]\n\n\nasync def infinite_scroll(page: Page) -> None:\n    \"\"\"Scroll to the bottom of a page, handling loading of additional items.\"\"\"\n    scrolled_distance = 0\n    finished = False\n\n    match_count = 0\n    match_count_threshold = 4\n\n    old_request_count = 0\n    new_request_count = 0\n\n    def track_request(request: PlaywrightRequest) -> None:\n        if request.resource_type in ['xhr', 'fetch', 'websocket', 'other']:\n            nonlocal new_request_count\n            new_request_count += 1\n\n    page.on('request', track_request)\n\n    async def scroll() -> None:\n        body_scroll_height = await page.evaluate('() => document.body.scrollHeight')\n\n        delta = body_scroll_height or 10000\n        await page.mouse.wheel(delta_x=0, delta_y=delta)\n\n        nonlocal scrolled_distance\n        scrolled_distance += delta\n\n    async def check_finished() -> None:\n        nonlocal old_request_count, new_request_count, match_count, finished\n\n        while True:\n            if old_request_count == new_request_count:\n                match_count += 1\n\n                if match_count >= match_count_threshold:\n                    finished = True\n                    return\n            else:\n                match_count = 0\n                old_request_count = new_request_count\n\n            await asyncio.sleep(1)\n\n    check_task = asyncio.create_task(check_finished(), name='infinite_scroll_check_finished_task')\n\n    try:\n        while not finished:\n            await scroll()\n            await 
page.wait_for_timeout(250)\n    finally:\n        if not check_task.done():\n            check_task.cancel()\n        with suppress(asyncio.CancelledError):\n            await check_task\n\n\nasync def block_requests(\n    page: Page, url_patterns: list[str] | None = None, extra_url_patterns: list[str] | None = None\n) -> None:\n    \"\"\"Blocks network requests matching specified URL patterns.\n\n    Args:\n        page: Playwright Page object to block requests on.\n        url_patterns: List of URL patterns to block. If None, uses default patterns.\n        extra_url_patterns: Additional URL patterns to append to the main patterns list.\n    \"\"\"\n    url_patterns = list(url_patterns or _DEFAULT_BLOCK_REQUEST_URL_PATTERNS)\n    url_patterns.extend(extra_url_patterns or [])\n\n    browser_type = page.context.browser.browser_type.name if page.context.browser else 'undefined'\n\n    if browser_type == 'chromium':\n        client = await page.context.new_cdp_session(page)\n\n        await client.send('Network.enable')\n        await client.send('Network.setBlockedURLs', {'urls': url_patterns})\n    else:\n        extensions = [pattern.strip('*.') for pattern in url_patterns if pattern.startswith(('*.', '.'))]\n        specific_files = [pattern for pattern in url_patterns if not pattern.startswith(('*.', '.'))]\n\n        if extensions:\n            await page.route(f'**/*.{{{\",\".join(extensions)}}}*', lambda route, _: route.abort())\n\n        if specific_files:\n            await page.route(f'**/{{{\",\".join(specific_files)}}}*', lambda route, _: route.abort())\n"
  },
  {
    "path": "src/crawlee/crawlers/_types.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\n\n\n@dataclass(frozen=True)\nclass BlockedInfo:\n    \"\"\"Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked.\"\"\"\n\n    reason: str\n\n    def __bool__(self) -> bool:\n        \"\"\"No reason means no blocking.\"\"\"\n        return bool(self.reason)\n"
  },
  {
    "path": "src/crawlee/crawlers/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/errors.py",
    "content": "from __future__ import annotations\n\nfrom typing import Generic\n\nfrom typing_extensions import TypeVar\n\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee._utils.docs import docs_group\n\n__all__ = [\n    'ContextPipelineFinalizationError',\n    'ContextPipelineInitializationError',\n    'ContextPipelineInterruptedError',\n    'HttpClientStatusCodeError',\n    'HttpStatusCodeError',\n    'ProxyError',\n    'RequestCollisionError',\n    'RequestHandlerError',\n    'ServiceConflictError',\n    'SessionError',\n    'UserDefinedErrorHandlerError',\n]\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)\n\n\n@docs_group('Errors')\nclass UserDefinedErrorHandlerError(Exception):\n    \"\"\"Wraps an exception thrown from an user-defined error handler.\"\"\"\n\n\nclass UserHandlerTimeoutError(UserDefinedErrorHandlerError):\n    \"\"\"Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out.\"\"\"\n\n\n@docs_group('Errors')\nclass SessionError(Exception):\n    \"\"\"Errors of `SessionError` type will trigger a session rotation.\n\n    This error doesn't respect the `max_request_retries` option and has a separate limit of `max_session_rotations`.\n    \"\"\"\n\n\n@docs_group('Errors')\nclass ServiceConflictError(Exception):\n    \"\"\"Raised when attempting to reassign a service in service container that is already in use.\"\"\"\n\n    def __init__(self, service: type, new_value: object, existing_value: object) -> None:\n        super().__init__(\n            f'Service {service.__name__} is already in use. 
Existing value: {existing_value}, '\n            f'attempted new value: {new_value}.'\n        )\n\n\n@docs_group('Errors')\nclass ProxyError(SessionError):\n    \"\"\"Raised when a proxy is being blocked or malfunctions.\"\"\"\n\n\n@docs_group('Errors')\nclass HttpStatusCodeError(Exception):\n    \"\"\"Raised when the response status code indicates an error.\"\"\"\n\n    def __init__(self, message: str, status_code: int) -> None:\n        super().__init__(f'{message} (status code: {status_code}).')\n        self.status_code = status_code\n        self.message = message\n\n\n@docs_group('Errors')\nclass HttpClientStatusCodeError(HttpStatusCodeError):\n    \"\"\"Raised when the response status code indicates an client error.\"\"\"\n\n\n@docs_group('Errors')\nclass RequestHandlerError(Exception, Generic[TCrawlingContext]):\n    \"\"\"Wraps an exception thrown from a request handler (router) and extends it with crawling context.\"\"\"\n\n    def __init__(self, wrapped_exception: Exception, crawling_context: TCrawlingContext) -> None:\n        super().__init__()\n        self.wrapped_exception = wrapped_exception\n        self.crawling_context = crawling_context\n\n\n@docs_group('Errors')\nclass ContextPipelineInitializationError(Exception):\n    \"\"\"Wraps an exception thrown in the initialization step of a context pipeline middleware.\n\n    We may not have the complete context at this point, so only `BasicCrawlingContext` is provided.\n    \"\"\"\n\n    def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:\n        super().__init__()\n        self.wrapped_exception = wrapped_exception\n        self.crawling_context = crawling_context\n\n\n@docs_group('Errors')\nclass ContextPipelineFinalizationError(Exception):\n    \"\"\"Wraps an exception thrown in the finalization step of a context pipeline middleware.\n\n    We may not have the complete context at this point, so only `BasicCrawlingContext` is provided.\n    \"\"\"\n\n 
   def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawlingContext) -> None:\n        super().__init__()\n        self.wrapped_exception = wrapped_exception\n        self.crawling_context = crawling_context\n\n\n@docs_group('Errors')\nclass ContextPipelineInterruptedError(Exception):\n    \"\"\"May be thrown in the initialization phase of a middleware to signal that the request should not be processed.\"\"\"\n\n\n@docs_group('Errors')\nclass RequestCollisionError(Exception):\n    \"\"\"Raised when a request cannot be processed due to a conflict with required resources.\"\"\"\n"
  },
  {
    "path": "src/crawlee/events/__init__.py",
    "content": "from ._event_manager import EventManager\nfrom ._local_event_manager import LocalEventManager\nfrom ._types import (\n    Event,\n    EventAbortingData,\n    EventCrawlerStatusData,\n    EventData,\n    EventExitData,\n    EventListener,\n    EventMigratingData,\n    EventPersistStateData,\n    EventSystemInfoData,\n)\n\n__all__ = [\n    'Event',\n    'EventAbortingData',\n    'EventCrawlerStatusData',\n    'EventData',\n    'EventExitData',\n    'EventListener',\n    'EventManager',\n    'EventMigratingData',\n    'EventPersistStateData',\n    'EventSystemInfoData',\n    'LocalEventManager',\n]\n"
  },
  {
    "path": "src/crawlee/events/_event_manager.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/events/event_manager.ts\n\nfrom __future__ import annotations\n\nimport asyncio\nimport inspect\nfrom collections import defaultdict\nfrom datetime import timedelta\nfrom functools import wraps\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, Literal, TypedDict, cast, overload\n\nfrom pyee.asyncio import AsyncIOEventEmitter\n\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee._utils.wait import wait_for_all_tasks_for_finish\nfrom crawlee.events._types import (\n    Event,\n    EventAbortingData,\n    EventCrawlerStatusData,\n    EventExitData,\n    EventListener,\n    EventMigratingData,\n    EventPersistStateData,\n    EventSystemInfoData,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable\n    from types import TracebackType\n\n    from typing_extensions import NotRequired\n\n    from crawlee.events._types import EventData, WrappedListener\n\nlogger = getLogger(__name__)\n\n\nclass EventManagerOptions(TypedDict):\n    \"\"\"Arguments for the `EventManager` constructor.\n\n    It is intended for typing forwarded `__init__` arguments in the subclasses.\n    \"\"\"\n\n    persist_state_interval: NotRequired[timedelta]\n    \"\"\"Interval between emitted `PersistState` events to maintain state persistence.\"\"\"\n\n    close_timeout: NotRequired[timedelta | None]\n    \"\"\"Optional timeout for canceling pending event listeners if they exceed this duration.\"\"\"\n\n\n@docs_group('Event managers')\nclass EventManager:\n    \"\"\"Manage events and their listeners, enabling registration, emission, and execution control.\n\n    It allows for registering event listeners, emitting events, and ensuring all listeners complete their execution.\n    Built on top of `pyee.asyncio.AsyncIOEventEmitter`. 
It implements additional features such as waiting for all\n    listeners to complete and emitting `PersistState` events at regular intervals.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        persist_state_interval: timedelta = timedelta(minutes=1),\n        close_timeout: timedelta | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            persist_state_interval: Interval between emitted `PersistState` events to maintain state persistence.\n            close_timeout: Optional timeout for canceling pending event listeners if they exceed this duration.\n        \"\"\"\n        self._persist_state_interval = persist_state_interval\n        self._close_timeout = close_timeout\n\n        # Asynchronous event emitter for handle events and invoke the event listeners.\n        self._event_emitter = AsyncIOEventEmitter()\n\n        # Listeners are wrapped inside asyncio.Task. Store their references here so that we can wait for them to finish.\n        self._listener_tasks: set[asyncio.Task] = set()\n\n        # Store the mapping between events, listeners and their wrappers in the following way:\n        #   event -> listener -> [wrapped_listener_1, wrapped_listener_2, ...]\n        self._listeners_to_wrappers: dict[Event, dict[EventListener[Any], list[WrappedListener]]] = defaultdict(\n            lambda: defaultdict(list),\n        )\n\n        # Recurring task for emitting persist state events.\n        self._emit_persist_state_event_rec_task = RecurringTask(\n            func=self._emit_persist_state_event,\n            delay=self._persist_state_interval,\n        )\n\n        # Flag to indicate the context state.\n        self._active = False\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    async def __aenter__(self) -> EventManager:\n        \"\"\"Initialize the event manager upon entering the async context.\n\n 
       Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n        self._emit_persist_state_event_rec_task.start()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Close the local event manager upon exiting the async context.\n\n        This will stop listening for the events, and it will wait for all the event listeners to finish.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.\n        await self._emit_persist_state_event_rec_task.stop()\n        await self._emit_persist_state_event()\n        await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)\n        self._event_emitter.remove_all_listeners()\n        self._listener_tasks.clear()\n        self._listeners_to_wrappers.clear()\n        self._active = False\n\n    @overload\n    def on(self, *, event: Literal[Event.PERSIST_STATE], listener: EventListener[EventPersistStateData]) -> None: ...\n    @overload\n    def on(self, *, event: Literal[Event.SYSTEM_INFO], listener: EventListener[EventSystemInfoData]) -> None: ...\n    @overload\n    def on(self, *, event: Literal[Event.MIGRATING], listener: EventListener[EventMigratingData]) -> None: ...\n    @overload\n    def on(self, *, event: Literal[Event.ABORTING], listener: EventListener[EventAbortingData]) -> None: ...\n    @overload\n    def on(self, *, event: Literal[Event.EXIT], listener: 
EventListener[EventExitData]) -> None: ...\n    @overload\n    def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventListener[EventCrawlerStatusData]) -> None: ...\n    @overload\n    def on(self, *, event: Event, listener: EventListener[None]) -> None: ...\n\n    def on(self, *, event: Event, listener: EventListener[Any]) -> None:\n        \"\"\"Register an event listener for a specific event.\n\n        Args:\n            event: The event for which to listen to.\n            listener: The function (sync or async) which is to be called when the event is emitted.\n        \"\"\"\n        signature = inspect.signature(listener)\n\n        @wraps(cast('Callable[..., None | Awaitable[None]]', listener))\n        async def listener_wrapper(event_data: EventData) -> None:\n            try:\n                bound_args = signature.bind(event_data)\n            except TypeError:  # Parameterless listener\n                bound_args = signature.bind()\n\n            # If the listener is a coroutine function, just call it, otherwise, run it in a separate thread\n            # to avoid blocking the event loop\n            coro = (\n                listener(*bound_args.args, **bound_args.kwargs)\n                if inspect.iscoroutinefunction(listener)\n                else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)\n            )\n\n            listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__\n            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')\n            self._listener_tasks.add(listener_task)\n\n            try:\n                logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')\n                await listener_task\n                logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')\n            except Exception:\n                # We need to swallow the 
exception and just log it here, otherwise it could break the event emitter\n                logger.exception(\n                    'Exception in the event listener',\n                    extra={\n                        'event_name': event.value,\n                        'listener_name': listener.__name__\n                        if hasattr(listener, '__name__')\n                        else listener.__class__.__name__,\n                    },\n                )\n            finally:\n                logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')\n                self._listener_tasks.remove(listener_task)\n\n        self._listeners_to_wrappers[event][listener].append(listener_wrapper)\n        self._event_emitter.add_listener(event.value, listener_wrapper)\n\n    def off(self, *, event: Event, listener: EventListener[Any] | None = None) -> None:\n        \"\"\"Remove a specific listener or all listeners for an event.\n\n        Args:\n            event: The Actor event for which to remove listeners.\n            listener: The listener which is supposed to be removed. 
If not passed, all listeners of this event\n                are removed.\n        \"\"\"\n        if listener:\n            for listener_wrapper in self._listeners_to_wrappers[event][listener]:\n                self._event_emitter.remove_listener(event.value, listener_wrapper)\n            self._listeners_to_wrappers[event][listener] = []\n        else:\n            self._listeners_to_wrappers[event] = defaultdict(list)\n            self._event_emitter.remove_all_listeners(event.value)\n\n    @overload\n    def emit(self, *, event: Literal[Event.PERSIST_STATE], event_data: EventPersistStateData) -> None: ...\n    @overload\n    def emit(self, *, event: Literal[Event.SYSTEM_INFO], event_data: EventSystemInfoData) -> None: ...\n    @overload\n    def emit(self, *, event: Literal[Event.MIGRATING], event_data: EventMigratingData) -> None: ...\n    @overload\n    def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbortingData) -> None: ...\n    @overload\n    def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitData) -> None: ...\n    @overload\n    def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: EventCrawlerStatusData) -> None: ...\n    @overload\n    def emit(self, *, event: Event, event_data: Any) -> None: ...\n\n    @ensure_context\n    def emit(self, *, event: Event, event_data: EventData) -> None:\n        \"\"\"Emit an event with the associated data to all registered listeners.\n\n        Args:\n            event: The event which will be emitted.\n            event_data: The data which will be passed to the event listeners.\n        \"\"\"\n        self._event_emitter.emit(event.value, event_data)\n\n    @ensure_context\n    async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:\n        \"\"\"Wait for all currently executing event listeners to complete.\n\n        Args:\n            timeout: The maximum time to wait for the event listeners to finish. 
If they do not complete within\n                the specified timeout, they will be canceled.\n        \"\"\"\n\n        async def wait_for_listeners() -> None:\n            \"\"\"Gathers all listener tasks and awaits their completion, logging any exceptions encountered.\"\"\"\n            results = await asyncio.gather(*self._listener_tasks, return_exceptions=True)\n            for result in results:\n                if isinstance(result, Exception):\n                    logger.exception('Event listener raised an exception.', exc_info=result)\n\n        tasks = [asyncio.create_task(wait_for_listeners(), name=f'Task-{wait_for_listeners.__name__}')]\n\n        await wait_for_all_tasks_for_finish(tasks=tasks, logger=logger, timeout=timeout)\n\n    async def _emit_persist_state_event(self) -> None:\n        \"\"\"Emit a persist state event with the given migration status.\"\"\"\n        self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))\n"
  },
  {
    "path": "src/crawlee/events/_local_event_manager.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.7.3/packages/core/src/events/local_event_manager.ts\n\nfrom __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee._utils.system import get_cpu_info, get_memory_info\nfrom crawlee.configuration import Configuration\nfrom crawlee.events._event_manager import EventManager, EventManagerOptions\nfrom crawlee.events._types import Event, EventSystemInfoData\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from typing_extensions import Unpack\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Event managers')\nclass LocalEventManager(EventManager):\n    \"\"\"Event manager for local environments.\n\n    It extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager`\n    is intended to be used in local environments, where the system metrics are required managing the `Snapshotter`\n    and `AutoscaledPool`.\n    \"\"\"\n\n    def __init__(\n        self,\n        system_info_interval: timedelta = timedelta(seconds=1),\n        **event_manager_options: Unpack[EventManagerOptions],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        In most cases, you should use the `from_config` constructor to create a new instance based on\n        the provided configuration.\n\n        Args:\n            system_info_interval: Interval at which `SystemInfo` events are emitted.\n            event_manager_options: Additional options for the parent class.\n        \"\"\"\n        self._system_info_interval = system_info_interval\n\n        # Recurring task for emitting system info events.\n        self._emit_system_info_event_rec_task = RecurringTask(\n            func=self._emit_system_info_event,\n            delay=self._system_info_interval,\n   
     )\n\n        super().__init__(**event_manager_options)\n\n    @classmethod\n    def from_config(cls, config: Configuration | None = None) -> LocalEventManager:\n        \"\"\"Initialize a new instance based on the provided `Configuration`.\n\n        Args:\n            config: The `Configuration` instance. Uses the global (default) one if not provided.\n        \"\"\"\n        config = config or Configuration.get_global_configuration()\n\n        return cls(\n            system_info_interval=config.system_info_interval,\n            persist_state_interval=config.persist_state_interval,\n        )\n\n    async def __aenter__(self) -> LocalEventManager:\n        \"\"\"Initialize the local event manager upon entering the async context.\n\n        It starts emitting system info events at regular intervals.\n        \"\"\"\n        await super().__aenter__()\n        self._emit_system_info_event_rec_task.start()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Close the local event manager upon exiting the async context.\n\n        It stops emitting system info events and closes the event manager.\n        \"\"\"\n        await self._emit_system_info_event_rec_task.stop()\n        await super().__aexit__(exc_type, exc_value, exc_traceback)\n\n    async def _emit_system_info_event(self) -> None:\n        \"\"\"Emit a system info event with the current CPU and memory usage.\"\"\"\n        cpu_info = await asyncio.to_thread(get_cpu_info)\n        memory_info = await asyncio.to_thread(get_memory_info)\n\n        event_data = EventSystemInfoData(cpu_info=cpu_info, memory_info=memory_info)\n        self.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n"
  },
  {
    "path": "src/crawlee/events/_types.py",
    "content": "from __future__ import annotations\n\nfrom collections.abc import Callable, Coroutine\nfrom enum import Enum\nfrom typing import Annotated, Any, TypeVar\n\nfrom pydantic import BaseModel, ConfigDict, Field\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.models import timedelta_secs\nfrom crawlee._utils.system import CpuInfo, MemoryUsageInfo\n\n\n@docs_group('Event data')\nclass Event(str, Enum):\n    \"\"\"Names of all possible events that can be emitted using an `EventManager`.\"\"\"\n\n    # Core events\n    PERSIST_STATE = 'persistState'\n    SYSTEM_INFO = 'systemInfo'\n    MIGRATING = 'migrating'\n    ABORTING = 'aborting'\n    EXIT = 'exit'\n\n    # Session pool events\n    SESSION_RETIRED = 'sessionRetired'\n\n    # Browser pool events\n    BROWSER_LAUNCHED = 'browserLaunched'\n    BROWSER_RETIRED = 'browserRetired'\n    BROWSER_CLOSED = 'browserClosed'\n    PAGE_CREATED = 'pageCreated'\n    PAGE_CLOSED = 'pageClosed'\n\n    # State events\n    CRAWLER_STATUS = 'crawlerStatus'\n\n\n@docs_group('Event data')\nclass EventPersistStateData(BaseModel):\n    \"\"\"Data for the persist state event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    is_migrating: Annotated[bool, Field(alias='isMigrating')]\n\n\n@docs_group('Event data')\nclass EventSystemInfoData(BaseModel):\n    \"\"\"Data for the system info event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]\n    memory_info: Annotated[\n        MemoryUsageInfo,\n        Field(alias='memoryInfo'),\n    ]\n\n\n@docs_group('Event data')\nclass EventMigratingData(BaseModel):\n    \"\"\"Data for the migrating event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    # The remaining time in seconds before the migration is forced and the process is killed\n    # Optional because it's not present when the 
event handler is called manually\n    time_remaining: Annotated[timedelta_secs | None, Field(alias='timeRemainingSecs')] = None\n\n\n@docs_group('Event data')\nclass EventAbortingData(BaseModel):\n    \"\"\"Data for the aborting event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n\n@docs_group('Event data')\nclass EventExitData(BaseModel):\n    \"\"\"Data for the exit event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n\n@docs_group('Event data')\nclass EventCrawlerStatusData(BaseModel):\n    \"\"\"Data for the crawler status event.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    message: str\n    \"\"\"A message describing the current status of the crawler.\"\"\"\n\n    crawler_id: int\n    \"\"\"The ID of the crawler that emitted the event.\"\"\"\n\n\nEventData = (\n    EventPersistStateData\n    | EventSystemInfoData\n    | EventMigratingData\n    | EventAbortingData\n    | EventExitData\n    | EventCrawlerStatusData\n)\n\"\"\"A helper type for all possible event payloads\"\"\"\n\nWrappedListener = Callable[..., Coroutine[Any, Any, None]]\n\nTEvent = TypeVar('TEvent')\nEventListener = (\n    Callable[\n        [TEvent],\n        None | Coroutine[Any, Any, None],\n    ]\n    | Callable[\n        [],\n        None | Coroutine[Any, Any, None],\n    ]\n)\n\"\"\"An event listener function - it can be both sync and async and may accept zero or one argument.\"\"\"\n"
  },
  {
    "path": "src/crawlee/events/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/fingerprint_suite/__init__.py",
    "content": "from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator\nfrom ._fingerprint_generator import FingerprintGenerator\nfrom ._header_generator import HeaderGenerator\nfrom ._types import HeaderGeneratorOptions, ScreenOptions\n\n__all__ = [\n    'DefaultFingerprintGenerator',\n    'FingerprintGenerator',\n    'HeaderGenerator',\n    'HeaderGeneratorOptions',\n    'ScreenOptions',\n]\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/_browserforge_adapter.py",
    "content": "from __future__ import annotations\n\nimport random\nfrom collections.abc import Iterable\nfrom copy import deepcopy\nfrom functools import reduce\nfrom operator import or_\nfrom typing import TYPE_CHECKING, Any, Literal\n\nimport apify_fingerprint_datapoints\nfrom browserforge.bayesian_network import extract_json\nfrom browserforge.fingerprints import Fingerprint as bf_Fingerprint\nfrom browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator\nfrom browserforge.fingerprints import Screen\nfrom browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator\nfrom browserforge.headers.generator import ListOrString\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\n\nfrom ._consts import BROWSER_TYPE_HEADER_KEYWORD\nfrom ._fingerprint_generator import FingerprintGenerator\n\nif TYPE_CHECKING:\n    from browserforge.headers import Browser\n\n    from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType\n\n\nclass PatchedHeaderGenerator(bf_HeaderGenerator):\n    \"\"\"Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator.\"\"\"\n\n    def _get_accept_language_header(self, locales: tuple[str, ...] | list[str] | str) -> str:\n        \"\"\"Generate the Accept-Language header based on the given locales.\n\n        Patched version due to PR of upstream repo not being merged: https://github.com/daijro/browserforge/pull/24\n\n        Args:\n            locales: Locale(s).\n\n        Returns:\n            Accept-Language header string.\n        \"\"\"\n        # Convert to tuple if needed for consistent handling.\n        if isinstance(locales, str):\n            locales_tuple: tuple[str, ...] 
= (locales,)\n        elif isinstance(locales, list):\n            locales_tuple = tuple(locales)\n        else:\n            locales_tuple = locales\n\n        # First locale does not include quality factor, q=1 is considered as implicit.\n        additional_locales = [f'{locale};q={0.9 - index * 0.1:.1f}' for index, locale in enumerate(locales_tuple[1:])]\n        return ','.join((locales_tuple[0], *additional_locales))\n\n    def generate(\n        self,\n        *,\n        browser: Iterable[str | Browser] | None = None,\n        os: ListOrString | None = None,\n        device: ListOrString | None = None,\n        locale: ListOrString | None = None,\n        http_version: Literal[1, 2] | None = None,\n        user_agent: ListOrString | None = None,\n        strict: bool | None = None,\n        request_dependent_headers: dict[str, str] | None = None,\n    ) -> dict[str, str]:\n        \"\"\"Generate HTTP headers based on the specified parameters.\n\n        For detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`\n        This patched version of the method adds additional quality checks on the output of the original method. It tries\n        to generate headers several times until they match the requirements.\n\n        Returns:\n            A generated headers.\n        \"\"\"\n        # browserforge header generation can be flaky. Enforce basic QA on generated headers\n        max_attempts = 10\n\n        single_browser = self._get_single_browser_type(browser)\n\n        if single_browser == 'chrome':\n            # `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also\n            # other `chrome` based browser headers. 
This adapter desires only specific subset of `chrome` headers\n            # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.\n            # Increase max attempts as from `BrowserForge` header generator perspective even `chromium`\n            # headers without `sec-...` headers are valid.\n            max_attempts += 50\n\n        # Use browserforge to generate headers until it satisfies our additional requirements.\n        for _attempt in range(max_attempts):\n            generated_header: dict[str, str] = super().generate(\n                browser=single_browser,\n                os=os,\n                device=device,\n                locale=locale,\n                http_version=http_version,\n                user_agent=user_agent,\n                strict=strict,\n                request_dependent_headers=request_dependent_headers,\n            )\n\n            if ('headless' in generated_header.get('User-Agent', '').lower()) or (\n                'headless' in generated_header.get('sec-ch-ua', '').lower()\n            ):\n                # It can be a valid header, but we never want to leak \"headless\". 
Get a different one.\n                continue\n\n            if any(\n                keyword in generated_header['User-Agent']\n                for keyword in self._get_expected_browser_keywords(single_browser)\n            ):\n                if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header):\n                    # Accept chromium header only with all sec headers.\n                    continue\n\n                return generated_header\n        raise RuntimeError('Failed to generate header.')\n\n    def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:\n        return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'))\n\n    def _get_expected_browser_keywords(self, browser: str | None) -> set[str]:\n        if not browser:\n            # Allow all possible keywords when there is no preference for specific browser type.\n            return reduce(or_, BROWSER_TYPE_HEADER_KEYWORD.values())\n\n        return BROWSER_TYPE_HEADER_KEYWORD[browser]\n\n    def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> str | None:\n        \"\"\"Get single browser type.\n\n        Browserforge header generator accepts wider range of possible types.\n        Narrow it to single optional string as that is how we use it.\n        Handling the original multitype would be pointlessly complex.\n        \"\"\"\n        # In our case we never pass more than one browser type. 
In general case more browsers are just bigger pool to\n        # select from, so narrowing it to any of them is still a valid action as we are going to pick just one anyway.\n        if isinstance(browser, str):\n            return browser\n        if isinstance(browser, Iterable):\n            choice = random.choice(\n                [\n                    single_browser if isinstance(single_browser, str) else single_browser.name\n                    for single_browser in browser\n                ]\n            )\n            if choice in {'chrome', 'firefox', 'safari', 'edge'}:\n                return choice\n            raise ValueError('Invalid browser type.')\n        return None\n\n\nclass PatchedFingerprintGenerator(bf_FingerprintGenerator):\n    \"\"\"Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        screen: Screen | None = None,\n        strict: bool = False,\n        mock_webrtc: bool = False,\n        slim: bool = False,\n        **header_kwargs,  # noqa:ANN003 # Upstream repo types missing.\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            screen: Screen constraints for the generated fingerprint.\n            strict: Whether to raise an exception if the constraints are too strict.\n            mock_webrtc: Whether to mock WebRTC when injecting the fingerprint.\n            slim: Disables performance-heavy evasions when injecting the fingerprint.\n            **header_kwargs: Header generation options for `HeaderGenerator`.\n        \"\"\"\n        super().__init__(screen=screen, strict=strict, mock_webrtc=mock_webrtc, slim=slim)\n        # Replace `self.header_generator` To make sure that we consistently use `PatchedHeaderGenerator`\n        self.header_generator = PatchedHeaderGenerator(**header_kwargs)\n\n\n@docs_group('Other')\nclass BrowserforgeFingerprintGenerator(FingerprintGenerator):\n    
\"\"\"`FingerprintGenerator` adapter for fingerprint generator from `browserforge`.\n\n    `browserforge` is a browser header and fingerprint generator: https://github.com/daijro/browserforge\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        header_options: HeaderGeneratorOptions | None = None,\n        screen_options: ScreenOptions | None = None,\n        mock_web_rtc: bool | None = None,\n        slim: bool | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        All generator options are optional. If any value is not specified, then `None` is set in the options.\n        Default values for options set to `None` are implementation detail of used fingerprint generator.\n        Specific default values should not be relied upon. Use explicit values if it matters for your use case.\n\n        Args:\n            header_options: Collection of header related attributes that can be used by the fingerprint generator.\n            screen_options: Defines the screen constrains for the fingerprint generator.\n            mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint.\n            slim: Disables performance-heavy evasions when injecting the fingerprint.\n        \"\"\"\n        bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim}\n\n        if header_options is None:\n            bf_header_options = {}\n        else:\n            bf_header_options = deepcopy(header_options.model_dump())\n            bf_header_options['browser'] = bf_header_options.pop('browsers', None)\n            bf_header_options['os'] = bf_header_options.pop('operating_systems', None)\n            bf_header_options['device'] = bf_header_options.pop('devices', None)\n            bf_header_options['locale'] = bf_header_options.pop('locales', None)\n\n        if screen_options is None:\n            bf_options['screen'] = Screen()\n        else:\n            bf_options['screen'] = Screen(**screen_options.model_dump())\n\n 
       self._options = {**bf_options, **bf_header_options}\n        self._generator = PatchedFingerprintGenerator()\n\n    @override\n    def generate(self) -> bf_Fingerprint:\n        # browserforge fingerprint generation can be flaky\n        # https://github.com/daijro/browserforge/issues/22\n        # During test runs around 10 % flakiness was detected.\n        # Max attempt set to 10 as (0.1)^10 is considered sufficiently low probability.\n        max_attempts = 10\n        for attempt in range(max_attempts):\n            try:\n                return self._generator.generate(**self._options)\n            except ValueError:  # noqa:PERF203\n                if attempt == max_attempts:\n                    raise\n        raise RuntimeError('Failed to generate fingerprint.')\n\n\nclass BrowserforgeHeaderGenerator:\n    \"\"\"`HeaderGenerator` adapter for fingerprint generator from `browserforge`.\"\"\"\n\n    def __init__(self) -> None:\n        self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])\n\n    def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]:\n        \"\"\"Generate headers.\"\"\"\n        return self._generator.generate(browser=[browser_type])\n\n\ndef get_available_header_network() -> dict:\n    \"\"\"Get header network that contains possible header values.\"\"\"\n    return extract_json(apify_fingerprint_datapoints.get_header_network())\n\n\ndef get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]:\n    \"\"\"Get set of possible header values from available header network.\"\"\"\n    node_names = {node_name} if isinstance(node_name, str) else node_name\n    for node in header_network['nodes']:\n        if node['name'] in node_names:\n            return set(node['possibleValues'])\n    return set()\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/_consts.py",
    "content": "from __future__ import annotations\n\nCOMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'\n\nBROWSER_TYPE_HEADER_KEYWORD = {\n    'chrome': {'Chrome', 'CriOS'},\n    'firefox': {'Firefox', 'FxiOS'},\n    'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'},\n    'safari': {'Safari'},\n}\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/_fingerprint_generator.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from browserforge.fingerprints import Fingerprint\n\n\n@docs_group('Other')\nclass FingerprintGenerator(ABC):\n    \"\"\"A class for creating browser fingerprints that mimic browser fingerprints of real users.\"\"\"\n\n    @abstractmethod\n    def generate(self) -> Fingerprint:\n        \"\"\"Generate browser fingerprints.\n\n        This is an experimental feature.\n        Return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and most likely\n        it will change to a custom `Fingerprint` class defined in this repo later.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/_header_generator.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Literal\n\nfrom crawlee._types import HttpHeaders\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.fingerprint_suite._browserforge_adapter import BrowserforgeHeaderGenerator\n\nif TYPE_CHECKING:\n    from crawlee.fingerprint_suite._types import SupportedBrowserType\n\n\ndef fingerprint_browser_type_from_playwright_browser_type(\n    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],\n) -> SupportedBrowserType:\n    if playwright_browser_type in {'chromium', 'chrome'}:\n        return 'chrome'\n    if playwright_browser_type == 'firefox':\n        return 'firefox'\n    if playwright_browser_type == 'webkit':\n        return 'safari'\n    raise ValueError(f'Unsupported browser type: {playwright_browser_type}')\n\n\n@docs_group('Other')\nclass HeaderGenerator:\n    \"\"\"Generate realistic looking or browser-like HTTP headers.\"\"\"\n\n    def __init__(self) -> None:\n        self._generator = BrowserforgeHeaderGenerator()\n\n    def _select_specific_headers(self, all_headers: dict[str, str], header_names: set[str]) -> HttpHeaders:\n        return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names})\n\n    def get_specific_headers(\n        self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome'\n    ) -> HttpHeaders:\n        \"\"\"Return subset of headers based on the selected `header_names`.\n\n        If no `header_names` are specified, full unfiltered headers are returned.\n        \"\"\"\n        all_headers = self._generator.generate(browser_type=browser_type)\n\n        if not header_names:\n            return HttpHeaders(all_headers)\n        return self._select_specific_headers(all_headers, header_names)\n\n    def get_common_headers(self) -> HttpHeaders:\n        \"\"\"Get common HTTP headers (\"Accept\", \"Accept-Language\").\n\n        We do not modify the 
\"Accept-Encoding\", \"Connection\" and other headers. They should be included and handled\n        by the HTTP client or browser.\n        \"\"\"\n        all_headers = self._generator.generate()\n        return self._select_specific_headers(all_headers, header_names={'Accept', 'Accept-Language'})\n\n    def get_random_user_agent_header(self) -> HttpHeaders:\n        \"\"\"Get a random User-Agent header.\"\"\"\n        all_headers = self._generator.generate()\n        return self._select_specific_headers(all_headers, header_names={'User-Agent'})\n\n    def get_user_agent_header(\n        self,\n        *,\n        browser_type: SupportedBrowserType = 'chrome',\n    ) -> HttpHeaders:\n        \"\"\"Get the User-Agent header based on the browser type.\"\"\"\n        if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:\n            raise ValueError(f'Unsupported browser type: {browser_type}')\n        all_headers = self._generator.generate(browser_type=browser_type)\n        return self._select_specific_headers(all_headers, header_names={'User-Agent'})\n\n    def get_sec_ch_ua_headers(\n        self,\n        *,\n        browser_type: SupportedBrowserType = 'chrome',\n    ) -> HttpHeaders:\n        \"\"\"Get the sec-ch-ua headers based on the browser type.\"\"\"\n        if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:\n            raise ValueError(f'Unsupported browser type: {browser_type}')\n        all_headers = self._generator.generate(browser_type=browser_type)\n        return self._select_specific_headers(\n            all_headers, header_names={'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'}\n        )\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/_types.py",
    "content": "from __future__ import annotations\n\nfrom typing import Annotated, Literal\n\nfrom pydantic import BaseModel, ConfigDict, Field\n\nSupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']\nSupportedDevices = Literal['desktop', 'mobile']\nSupportedHttpVersion = Literal['1', '2']\nSupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']\n\n\nclass ScreenOptions(BaseModel):\n    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)\n\n    \"\"\"Defines the screen constraints for the fingerprint generator.\"\"\"\n\n    min_width: Annotated[float | None, Field(alias='minWidth')] = None\n    \"\"\"Minimal screen width constraint for the fingerprint generator.\"\"\"\n\n    max_width: Annotated[float | None, Field(alias='maxWidth')] = None\n    \"\"\"Maximal screen width constraint for the fingerprint generator.\"\"\"\n\n    min_height: Annotated[float | None, Field(alias='minHeight')] = None\n    \"\"\"Minimal screen height constraint for the fingerprint generator.\"\"\"\n\n    max_height: Annotated[float | None, Field(alias='maxHeight')] = None\n    \"\"\"Maximal screen height constraint for the fingerprint generator.\"\"\"\n\n\nclass HeaderGeneratorOptions(BaseModel):\n    \"\"\"Collection of header related attributes that can be used by the fingerprint generator.\"\"\"\n\n    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)\n\n    browsers: list[SupportedBrowserType] | None = None\n    \"\"\"List of BrowserSpecifications to generate the headers for.\"\"\"\n\n    operating_systems: Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')] = None\n    \"\"\"List of operating systems to generate the headers for.\"\"\"\n\n    devices: list[SupportedDevices] | None = None\n    \"\"\"List of devices to generate the headers for.\"\"\"\n\n    locales: list[str] | None = None\n    \"\"\"List of at most 10 languages to include 
in the [Accept-Language]\n    (https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header\n    in the language format accepted by that header, for example `en`, `en-US` or `de`.\"\"\"\n\n    http_version: Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')] = None\n    \"\"\"HTTP version to be used for header generation (the headers differ depending on the version).\"\"\"\n\n    strict: bool | None = None\n    \"\"\"If true, the generator will throw an error if it cannot generate headers based on the input.\"\"\"\n"
  },
  {
    "path": "src/crawlee/fingerprint_suite/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/http_clients/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n# These imports have only mandatory dependencies, so they are imported directly.\nfrom ._base import HttpClient, HttpCrawlingResult, HttpResponse\nfrom ._impit import ImpitHttpClient\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'CurlImpersonateHttpClient'):\n    from ._curl_impersonate import CurlImpersonateHttpClient\n\nwith _try_import(__name__, 'HttpxHttpClient'):\n    from ._httpx import HttpxHttpClient\n\n\n__all__ = [\n    'CurlImpersonateHttpClient',\n    'HttpClient',\n    'HttpCrawlingResult',\n    'HttpResponse',\n    'HttpxHttpClient',\n    'ImpitHttpClient',\n]\n"
  },
  {
    "path": "src/crawlee/http_clients/_base.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Protocol\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n    from contextlib import AbstractAsyncContextManager\n    from datetime import timedelta\n    from types import TracebackType\n\n    from crawlee import Request\n    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.statistics import Statistics\n\n\n@docs_group('Other')\nclass HttpResponse(Protocol):\n    \"\"\"Define the interface that any HTTP response object must implement.\"\"\"\n\n    @property\n    def http_version(self) -> str:\n        \"\"\"The HTTP version used in the response.\"\"\"\n\n    @property\n    def status_code(self) -> int:\n        \"\"\"The HTTP status code received from the server.\"\"\"\n\n    @property\n    def headers(self) -> HttpHeaders:\n        \"\"\"The HTTP headers received in the response.\"\"\"\n\n    async def read(self) -> bytes:\n        \"\"\"Read the entire content of the response body.\n\n        This method loads the complete response body into memory at once. It should be used\n        for responses received from regular HTTP requests (via `send_request` or `crawl` methods).\n\n        Raises:\n            RuntimeError: If called on a response received from the `stream` method.\n        \"\"\"\n\n    def read_stream(self) -> AsyncIterator[bytes]:\n        \"\"\"Iterate over the content of the response body in chunks.\n\n        This method should be used for responses received from the `stream` method to process\n        large response bodies without loading them entirely into memory. 
It allows for efficient\n        processing of potentially large data by yielding chunks sequentially.\n\n        Raises:\n            RuntimeError: If the stream has already been consumed or if the response was not obtained from the `stream`\n                method.\n        \"\"\"\n\n\n@dataclass(frozen=True)\n@docs_group('Crawling contexts')\nclass HttpCrawlingResult:\n    \"\"\"Result of an HTTP-only crawl.\n\n    Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,\n    `ParselCrawlingContext`, ...).\n    \"\"\"\n\n    http_response: HttpResponse\n    \"\"\"The HTTP response received from the server.\"\"\"\n\n\n@docs_group('HTTP clients')\nclass HttpClient(ABC):\n    \"\"\"An abstract base class for HTTP clients used in crawlers (`BasicCrawler` subclasses).\"\"\"\n\n    @abstractmethod\n    def __init__(\n        self,\n        *,\n        persist_cookies_per_session: bool = True,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            persist_cookies_per_session: Whether to persist cookies per HTTP session.\n        \"\"\"\n        self._persist_cookies_per_session = persist_cookies_per_session\n\n        # Flag to indicate the context state.\n        self._active = False\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    @abstractmethod\n    async def crawl(\n        self,\n        request: Request,\n        *,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        statistics: Statistics | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpCrawlingResult:\n        \"\"\"Perform the crawling for a given request.\n\n        This method is called from `crawler.run()`.\n\n        Args:\n            request: The request to be crawled.\n            session: The session associated with the request.\n            proxy_info: The 
information about the proxy to be used.\n            statistics: The statistics object to register status codes.\n            timeout: Maximum time allowed to process the request.\n\n        Raises:\n            ProxyError: Raised if a proxy-related error occurs.\n\n        Returns:\n            The result of the crawling.\n        \"\"\"\n\n    @abstractmethod\n    async def send_request(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpResponse:\n        \"\"\"Send an HTTP request via the client.\n\n        This method is called from `context.send_request()` helper.\n\n        Args:\n            url: The URL to send the request to.\n            method: The HTTP method to use.\n            headers: The headers to include in the request.\n            payload: The data to be sent as the request body.\n            session: The session associated with the request.\n            proxy_info: The information about the proxy to be used.\n            timeout: Maximum time allowed to process the request.\n\n        Raises:\n            ProxyError: Raised if a proxy-related error occurs.\n\n        Returns:\n            The HTTP response received from the server.\n        \"\"\"\n\n    @abstractmethod\n    def stream(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> AbstractAsyncContextManager[HttpResponse]:\n        \"\"\"Stream an HTTP request via the client.\n\n        This method should be used for downloading 
potentially large data where you need to process\n        the response body in chunks rather than loading it entirely into memory.\n\n        Args:\n            url: The URL to send the request to.\n            method: The HTTP method to use.\n            headers: The headers to include in the request.\n            payload: The data to be sent as the request body.\n            session: The session associated with the request.\n            proxy_info: The information about the proxy to be used.\n            timeout: The maximum time to wait for establishing the connection.\n\n        Raises:\n            ProxyError: Raised if a proxy-related error occurs.\n\n        Returns:\n            An async context manager yielding the HTTP response with streaming capabilities.\n        \"\"\"\n\n    @abstractmethod\n    async def cleanup(self) -> None:\n        \"\"\"Clean up resources used by the client.\n\n        This method is called when the client is no longer needed and should be overridden\n        in subclasses to perform any necessary cleanup such as closing connections,\n        releasing file handles, or other resource deallocation.\n        \"\"\"\n\n    async def __aenter__(self) -> HttpClient:\n        \"\"\"Initialize the client when entering the context manager.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n        return self\n\n    async def __aexit__(\n        self, exc_type: BaseException | None, exc_value: BaseException | None, traceback: TracebackType | None\n    ) -> None:\n        \"\"\"Deinitialize the client and clean up resources when exiting the context manager.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not self._active:\n            raise RuntimeError(f'The 
{self.__class__.__name__} is not active.')\n\n        await self.cleanup()\n        self._active = False\n"
  },
  {
    "path": "src/crawlee/http_clients/_curl_impersonate.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import asynccontextmanager\nfrom http.cookiejar import Cookie\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom curl_cffi import CurlInfo\nfrom curl_cffi.const import CurlHttpVersion\nfrom curl_cffi.requests import AsyncSession\nfrom curl_cffi.requests.cookies import Cookies as CurlCookies\nfrom curl_cffi.requests.cookies import CurlMorsel\nfrom curl_cffi.requests.exceptions import ProxyError as CurlProxyError\nfrom curl_cffi.requests.exceptions import RequestException as CurlRequestError\nfrom curl_cffi.requests.exceptions import Timeout\nfrom curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME\nfrom typing_extensions import override\n\nfrom crawlee._types import HttpHeaders, HttpMethod, HttpPayload\nfrom crawlee._utils.blocked import ROTATE_PROXY_ERRORS\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.errors import ProxyError\nfrom crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from datetime import timedelta\n\n    from curl_cffi import Curl\n    from curl_cffi.requests import Request as CurlRequest\n    from curl_cffi.requests import Response\n    from curl_cffi.requests.session import HttpMethod as CurlHttpMethod\n\n    from crawlee import Request\n    from crawlee._types import HttpMethod\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.statistics import Statistics\n\n\nclass _EmptyCookies(CurlCookies):\n    @override\n    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:\n        return []\n\n    @override\n    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:\n        return None\n\n\nclass _AsyncSession(AsyncSession):\n    @override\n    def __init__(self, *args: Any, **kwargs: Any) -> None:\n        super().__init__(*args, 
**kwargs)\n        self._cookies = _EmptyCookies()\n\n\nclass _CurlImpersonateResponse:\n    \"\"\"Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol.\"\"\"\n\n    def __init__(self, response: Response) -> None:\n        self._response = response\n\n    @property\n    def http_version(self) -> str:\n        if self._response.http_version == CurlHttpVersion.NONE:\n            return 'NONE'\n        if self._response.http_version == CurlHttpVersion.V1_0:\n            return 'HTTP/1.0'\n        if self._response.http_version == CurlHttpVersion.V1_1:\n            return 'HTTP/1.1'\n        if self._response.http_version in {\n            CurlHttpVersion.V2_0,\n            CurlHttpVersion.V2TLS,\n            CurlHttpVersion.V2_PRIOR_KNOWLEDGE,\n        }:\n            return 'HTTP/2'\n        if self._response.http_version == CurlHttpVersion.V3:\n            return 'HTTP/3'\n\n        raise ValueError(f'Unknown HTTP version: {self._response.http_version}')\n\n    @property\n    def status_code(self) -> int:\n        return self._response.status_code\n\n    @property\n    def headers(self) -> HttpHeaders:\n        return HttpHeaders({key: value for key, value in self._response.headers.items() if value})\n\n    async def read(self) -> bytes:\n        if self._response.astream_task:\n            raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')\n\n        return self._response.content\n\n    async def read_stream(self) -> AsyncGenerator[bytes, None]:\n        if not self._response.astream_task:\n            raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')\n\n        if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():\n            raise RuntimeError('Cannot read stream, it was already consumed.')\n\n        async for chunk in self._response.aiter_content():\n            yield 
chunk\n\n\n@docs_group('HTTP clients')\nclass CurlImpersonateHttpClient(HttpClient):\n    \"\"\"HTTP client based on the `curl-cffi` library.\n\n    This client uses the `curl-cffi` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\n    and to manage sessions, proxies, and error handling.\n\n    See the `HttpClient` class for more common information about HTTP clients.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler\n    from crawlee.http_clients import CurlImpersonateHttpClient\n\n    http_client = CurlImpersonateHttpClient()\n    crawler = HttpCrawler(http_client=http_client)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        persist_cookies_per_session: bool = True,\n        **async_session_kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            persist_cookies_per_session: Whether to persist cookies per HTTP session.\n            async_session_kwargs: Additional keyword arguments for `curl_cffi.requests.AsyncSession`.\n        \"\"\"\n        super().__init__(\n            persist_cookies_per_session=persist_cookies_per_session,\n        )\n        self._async_session_kwargs = async_session_kwargs\n\n        self._client_by_proxy_url = dict[str | None, AsyncSession]()\n\n    @override\n    async def crawl(\n        self,\n        request: Request,\n        *,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        statistics: Statistics | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpCrawlingResult:\n        client = self._get_client(proxy_info.url if proxy_info else None)\n\n        try:\n            response = await client.request(\n                url=request.url,\n                method=self._convert_method(request.method),\n                headers=request.headers,\n                data=request.payload,\n                
cookies=session.cookies.jar if session else None,\n                timeout=timeout.total_seconds() if timeout else None,\n            )\n        except Timeout as exc:\n            raise asyncio.TimeoutError from exc\n        except CurlRequestError as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        if statistics:\n            statistics.register_status_code(response.status_code)\n\n        if self._persist_cookies_per_session and session and response.curl:\n            response_cookies = self._get_cookies(response.curl)\n            session.cookies.store_cookies(response_cookies)\n\n        request.loaded_url = response.url\n\n        return HttpCrawlingResult(\n            http_response=_CurlImpersonateResponse(response),\n        )\n\n    @override\n    async def send_request(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpResponse:\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        proxy_url = proxy_info.url if proxy_info else None\n        client = self._get_client(proxy_url)\n\n        try:\n            response = await client.request(\n                url=url,\n                method=self._convert_method(method),\n                headers=dict(headers) if headers else None,\n                data=payload,\n                cookies=session.cookies.jar if session else None,\n                timeout=timeout.total_seconds() if timeout else None,\n            )\n        except Timeout as exc:\n            raise asyncio.TimeoutError from exc\n        except CurlRequestError as exc:\n            if self._is_proxy_error(exc):\n                raise 
ProxyError from exc\n            raise\n\n        if self._persist_cookies_per_session and session and response.curl:\n            response_cookies = self._get_cookies(response.curl)\n            session.cookies.store_cookies(response_cookies)\n\n        return _CurlImpersonateResponse(response)\n\n    @asynccontextmanager\n    @override\n    async def stream(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> AsyncGenerator[HttpResponse]:\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        proxy_url = proxy_info.url if proxy_info else None\n        client = self._get_client(proxy_url)\n\n        try:\n            response = await client.request(\n                url=url,\n                method=self._convert_method(method),\n                headers=dict(headers) if headers else None,\n                data=payload,\n                cookies=session.cookies.jar if session else None,\n                stream=True,\n                timeout=timeout.total_seconds() if timeout else None,\n            )\n        except Timeout as exc:\n            raise asyncio.TimeoutError from exc\n        except CurlRequestError as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        if self._persist_cookies_per_session and session and response.curl:\n            response_cookies = self._get_cookies(response.curl)\n            session.cookies.store_cookies(response_cookies)\n\n        try:\n            yield _CurlImpersonateResponse(response)\n        finally:\n            await response.aclose()\n\n    def _get_client(self, proxy_url: str | None) -> AsyncSession:\n        
\"\"\"Retrieve or create an asynchronous HTTP session for the given proxy URL.\n\n        Check if an `AsyncSession` already exists for the specified proxy URL. If no session is found,\n        create a new one with the provided proxy settings and additional session options.\n        Store the new session for future use.\n        \"\"\"\n        # Check if a session for the given proxy URL has already been created.\n        if proxy_url not in self._client_by_proxy_url:\n            # Prepare a default kwargs for the new session. A provided proxy URL and a chrome for impersonation\n            # are set as default options.\n            kwargs: dict[str, Any] = {\n                'proxy': proxy_url,\n                'impersonate': CURL_DEFAULT_CHROME,\n            }\n\n            # Update the default kwargs with any additional user-provided kwargs.\n            kwargs.update(self._async_session_kwargs)\n\n            # Create and store the new session with the specified kwargs.\n            self._client_by_proxy_url[proxy_url] = _AsyncSession(**kwargs)\n\n        return self._client_by_proxy_url[proxy_url]\n\n    def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:\n        \"\"\"Convert from Crawlee HTTP method to curl-cffi HTTP method.\n\n        Args:\n            method: Crawlee HTTP method.\n\n        Returns:\n            Corresponding curl-cffi HTTP method.\n\n        Raises:\n            ValueError: If the provided HTTP method is not supported.\n        \"\"\"\n        method_upper = method.upper()  # curl-cffi requires uppercase methods\n\n        match method_upper:\n            case 'GET':\n                return 'GET'\n            case 'POST':\n                return 'POST'\n            case 'PUT':\n                return 'PUT'\n            case 'DELETE':\n                return 'DELETE'\n            case 'OPTIONS':\n                return 'OPTIONS'\n            case 'HEAD':\n                return 'HEAD'\n            case 'TRACE':\n         
       return 'TRACE'\n            case 'PATCH':\n                return 'PATCH'\n            case _:\n                raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')\n\n    @staticmethod\n    def _is_proxy_error(error: CurlRequestError) -> bool:\n        \"\"\"Determine whether the given error is related to a proxy issue.\n\n        Check if the error message contains known proxy-related error keywords or if it is an instance\n        of `CurlProxyError`.\n        \"\"\"\n        if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):\n            return True\n\n        if isinstance(error, CurlProxyError):  # noqa: SIM103\n            return True\n\n        return False\n\n    @staticmethod\n    def _get_cookies(curl: Curl) -> list[Cookie]:\n        cookies = list[Cookie]()\n\n        # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.\n        cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))\n\n        for curl_cookie in cookie_list:\n            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)\n            cookie = curl_morsel.to_cookiejar_cookie()\n            cookies.append(cookie)\n\n        return cookies\n\n    async def cleanup(self) -> None:\n        for client in self._client_by_proxy_url.values():\n            await client.close()\n        self._client_by_proxy_url.clear()\n"
  },
  {
    "path": "src/crawlee/http_clients/_httpx.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import asynccontextmanager\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport httpx\nfrom typing_extensions import override\n\nfrom crawlee._types import HttpHeaders\nfrom crawlee._utils.blocked import ROTATE_PROXY_ERRORS\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.errors import ProxyError\nfrom crawlee.fingerprint_suite import HeaderGenerator\nfrom crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, AsyncIterator\n    from datetime import timedelta\n    from ssl import SSLContext\n\n    from crawlee import Request\n    from crawlee._types import HttpMethod, HttpPayload\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.statistics import Statistics\n\nlogger = getLogger(__name__)\n\n\nclass _HttpxResponse:\n    \"\"\"Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol.\"\"\"\n\n    def __init__(self, response: httpx.Response) -> None:\n        self._response = response\n\n    @property\n    def http_version(self) -> str:\n        return self._response.http_version\n\n    @property\n    def status_code(self) -> int:\n        return self._response.status_code\n\n    @property\n    def headers(self) -> HttpHeaders:\n        return HttpHeaders(dict(self._response.headers))\n\n    async def read(self) -> bytes:\n        if not self._response.is_closed:\n            raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')\n        return await self._response.aread()\n\n    async def read_stream(self) -> AsyncIterator[bytes]:\n        if self._response.is_stream_consumed:\n            raise RuntimeError('Stream is already consumed.')\n        else:\n            async for chunk in self._response.aiter_bytes():\n    
            yield chunk\n\n\nclass _HttpxTransport(httpx.AsyncHTTPTransport):\n    \"\"\"HTTP transport adapter that stores response cookies in a `Session`.\n\n    This transport adapter modifies the handling of HTTP requests to update the session cookies\n    based on the response cookies, ensuring that the cookies are stored in the session object\n    rather than the `HTTPX` client itself.\n    \"\"\"\n\n    @override\n    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:\n        response = await super().handle_async_request(request)\n        response.request = request\n\n        if session := cast('Session', request.extensions.get('crawlee_session')):\n            session.cookies.store_cookies(list(response.cookies.jar))\n\n        if 'Set-Cookie' in response.headers:\n            del response.headers['Set-Cookie']\n\n        return response\n\n\n@docs_group('HTTP clients')\nclass HttpxHttpClient(HttpClient):\n    \"\"\"HTTP client based on the `HTTPX` library.\n\n    This client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\n    and to manage sessions, proxies, and error handling.\n\n    See the `HttpClient` class for more common information about HTTP clients.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler\n    from crawlee.http_clients import HttpxHttpClient\n\n    http_client = HttpxHttpClient()\n    crawler = HttpCrawler(http_client=http_client)\n    ```\n    \"\"\"\n\n    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()\n\n    def __init__(\n        self,\n        *,\n        persist_cookies_per_session: bool = True,\n        http1: bool = True,\n        http2: bool = True,\n        verify: str | bool | SSLContext = True,\n        header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,\n        **async_client_kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n  
          persist_cookies_per_session: Whether to persist cookies per HTTP session.\n            http1: Whether to enable HTTP/1.1 support.\n            http2: Whether to enable HTTP/2 support.\n            verify: SSL certificates used to verify the identity of requested hosts.\n            header_generator: Header generator instance to use for generating common headers.\n            async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.\n        \"\"\"\n        super().__init__(\n            persist_cookies_per_session=persist_cookies_per_session,\n        )\n        self._http1 = http1\n        self._http2 = http2\n\n        self._async_client_kwargs = async_client_kwargs\n        self._header_generator = header_generator\n\n        self._ssl_context = httpx.create_ssl_context(verify=verify)\n\n        self._transport: _HttpxTransport | None = None\n\n        self._client_by_proxy_url = dict[str | None, httpx.AsyncClient]()\n\n    @override\n    async def crawl(\n        self,\n        request: Request,\n        *,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        statistics: Statistics | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpCrawlingResult:\n        client = self._get_client(proxy_info.url if proxy_info else None)\n        headers = self._combine_headers(request.headers)\n\n        http_request = client.build_request(\n            url=request.url,\n            method=request.method,\n            headers=headers,\n            content=request.payload,\n            cookies=session.cookies.jar if session else None,\n            extensions={'crawlee_session': session if self._persist_cookies_per_session else None},\n            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,\n        )\n\n        try:\n            response = await client.send(http_request)\n        except httpx.TimeoutException as exc:\n            raise 
asyncio.TimeoutError from exc\n        except httpx.TransportError as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        if statistics:\n            statistics.register_status_code(response.status_code)\n\n        request.loaded_url = str(response.url)\n\n        return HttpCrawlingResult(\n            http_response=_HttpxResponse(response),\n        )\n\n    @override\n    async def send_request(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpResponse:\n        client = self._get_client(proxy_info.url if proxy_info else None)\n\n        http_request = self._build_request(\n            client=client,\n            url=url,\n            method=method,\n            headers=headers,\n            payload=payload,\n            session=session,\n            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,\n        )\n\n        try:\n            response = await client.send(http_request)\n        except httpx.TimeoutException as exc:\n            raise asyncio.TimeoutError from exc\n        except httpx.TransportError as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        return _HttpxResponse(response)\n\n    @asynccontextmanager\n    @override\n    async def stream(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> AsyncGenerator[HttpResponse]:\n  
      client = self._get_client(proxy_info.url if proxy_info else None)\n\n        http_request = self._build_request(\n            client=client,\n            url=url,\n            method=method,\n            headers=headers,\n            payload=payload,\n            session=session,\n            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,\n        )\n\n        try:\n            response = await client.send(http_request, stream=True)\n        except httpx.TimeoutException as exc:\n            raise asyncio.TimeoutError from exc\n\n        try:\n            yield _HttpxResponse(response)\n        finally:\n            await response.aclose()\n\n    def _build_request(\n        self,\n        client: httpx.AsyncClient,\n        url: str,\n        method: HttpMethod,\n        headers: HttpHeaders | dict[str, str] | None,\n        payload: HttpPayload | None,\n        session: Session | None = None,\n        timeout: httpx.Timeout | None = None,\n    ) -> httpx.Request:\n        \"\"\"Build an `httpx.Request` using the provided parameters.\"\"\"\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        headers = self._combine_headers(headers)\n\n        return client.build_request(\n            url=url,\n            method=method,\n            headers=dict(headers) if headers else None,\n            content=payload,\n            extensions={'crawlee_session': session if self._persist_cookies_per_session else None},\n            timeout=timeout or httpx.USE_CLIENT_DEFAULT,\n        )\n\n    def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:\n        \"\"\"Retrieve or create an HTTP client for the given proxy URL.\n\n        If a client for the specified proxy URL does not exist, create and store a new one.\n        \"\"\"\n        if not self._transport:\n            # Configure connection pool limits and keep-alive connections for transport\n            
limits = self._async_client_kwargs.get(\n                'limits', httpx.Limits(max_connections=1000, max_keepalive_connections=200)\n            )\n\n            self._transport = _HttpxTransport(\n                http1=self._http1,\n                http2=self._http2,\n                verify=self._ssl_context,\n                limits=limits,\n            )\n\n        if proxy_url not in self._client_by_proxy_url:\n            # Prepare a default kwargs for the new client.\n            kwargs: dict[str, Any] = {\n                'proxy': proxy_url,\n                'http1': self._http1,\n                'http2': self._http2,\n                'follow_redirects': True,\n            }\n\n            # Update the default kwargs with any additional user-provided kwargs.\n            kwargs.update(self._async_client_kwargs)\n\n            kwargs.update(\n                {\n                    'transport': self._transport,\n                    'verify': self._ssl_context,\n                }\n            )\n\n            client = httpx.AsyncClient(**kwargs)\n            self._client_by_proxy_url[proxy_url] = client\n\n        return self._client_by_proxy_url[proxy_url]\n\n    def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None:\n        \"\"\"Merge default headers with explicit headers for an HTTP request.\n\n        Generate a final set of request headers by combining default headers, a random User-Agent header,\n        and any explicitly provided headers.\n        \"\"\"\n        common_headers = self._header_generator.get_common_headers() if self._header_generator else HttpHeaders()\n        user_agent_header = (\n            self._header_generator.get_random_user_agent_header() if self._header_generator else HttpHeaders()\n        )\n        explicit_headers = explicit_headers or HttpHeaders()\n        headers = common_headers | user_agent_header | explicit_headers\n        return headers or None\n\n    @staticmethod\n    def 
_is_proxy_error(error: httpx.TransportError) -> bool:\n        \"\"\"Determine whether the given error is related to a proxy issue.\n\n        Check if the error is an instance of `httpx.ProxyError` or if its message contains known proxy-related\n        error keywords.\n        \"\"\"\n        if isinstance(error, httpx.ProxyError):\n            return True\n\n        if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):  # noqa: SIM103\n            return True\n\n        return False\n\n    async def cleanup(self) -> None:\n        for client in self._client_by_proxy_url.values():\n            await client.aclose()\n        self._client_by_proxy_url.clear()\n        if self._transport:\n            await self._transport.aclose()\n            self._transport = None\n"
  },
  {
    "path": "src/crawlee/http_clients/_impit.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import asynccontextmanager\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, TypedDict\n\nfrom cachetools import LRUCache\nfrom impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError\nfrom impit import ProxyError as ImpitProxyError\nfrom typing_extensions import override\n\nfrom crawlee._types import HttpHeaders\nfrom crawlee._utils.blocked import ROTATE_PROXY_ERRORS\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.errors import ProxyError\nfrom crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, AsyncIterator\n    from datetime import timedelta\n    from http.cookiejar import CookieJar\n\n    from crawlee import Request\n    from crawlee._types import HttpMethod, HttpPayload\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.sessions import Session\n    from crawlee.statistics import Statistics\n\nlogger = getLogger(__name__)\n\n\nclass _ClientCacheEntry(TypedDict):\n    \"\"\"Type definition for client cache entries.\"\"\"\n\n    client: AsyncClient\n    cookie_jar: CookieJar | None\n\n\nclass _ImpitResponse:\n    \"\"\"Adapter class for `impit.Response` to conform to the `HttpResponse` protocol.\"\"\"\n\n    def __init__(self, response: Response) -> None:\n        self._response = response\n\n    @property\n    def http_version(self) -> str:\n        return str(self._response.http_version)\n\n    @property\n    def status_code(self) -> int:\n        return int(self._response.status_code)\n\n    @property\n    def headers(self) -> HttpHeaders:\n        return HttpHeaders(dict(self._response.headers))\n\n    async def read(self) -> bytes:\n        if not self._response.is_closed:\n            raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')\n        
return self._response.content\n\n    async def read_stream(self) -> AsyncIterator[bytes]:\n        if self._response.is_stream_consumed:\n            raise RuntimeError('Stream is already consumed.')\n        else:\n            async for chunk in self._response.aiter_bytes():\n                yield chunk\n\n\n@docs_group('HTTP clients')\nclass ImpitHttpClient(HttpClient):\n    \"\"\"HTTP client based on the `impit` library.\n\n    This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)\n    and to manage sessions, proxies, and error handling.\n\n    See the `HttpClient` class for more common information about HTTP clients.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler\n    from crawlee.http_clients import ImpitHttpClient\n\n    http_client = ImpitHttpClient()\n    crawler = HttpCrawler(http_client=http_client)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        persist_cookies_per_session: bool = True,\n        http3: bool = False,\n        verify: bool = True,\n        browser: Browser | None = 'firefox',\n        **async_client_kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            persist_cookies_per_session: Whether to persist cookies per HTTP session.\n            http3: Whether to enable HTTP/3 support.\n            verify: SSL certificates used to verify the identity of requested hosts.\n            browser: Browser to impersonate.\n            async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.\n        \"\"\"\n        super().__init__(\n            persist_cookies_per_session=persist_cookies_per_session,\n        )\n        self._http3 = http3\n        self._verify = verify\n        self._browser = browser\n\n        self._async_client_kwargs = async_client_kwargs\n\n        self._client_by_proxy_url = LRUCache[str | None, 
_ClientCacheEntry](maxsize=10)\n\n    @override\n    async def crawl(\n        self,\n        request: Request,\n        *,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        statistics: Statistics | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpCrawlingResult:\n        client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)\n\n        try:\n            response = await client.request(\n                url=request.url,\n                method=request.method,\n                content=request.payload,\n                headers=dict(request.headers) if request.headers else None,\n                timeout=timeout.total_seconds() if timeout else None,\n            )\n        except TimeoutException as exc:\n            raise asyncio.TimeoutError from exc\n        except (TransportError, HTTPError) as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        if statistics:\n            statistics.register_status_code(response.status_code)\n\n        request.loaded_url = str(response.url)\n\n        return HttpCrawlingResult(http_response=_ImpitResponse(response))\n\n    @override\n    async def send_request(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> HttpResponse:\n        if isinstance(headers, dict) or headers is None:\n            headers = HttpHeaders(headers or {})\n\n        client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)\n\n        try:\n            response = await client.request(\n                method=method,\n                url=url,\n      
          content=payload,\n                headers=dict(headers) if headers else None,\n                timeout=timeout.total_seconds() if timeout else None,\n            )\n        except TimeoutException as exc:\n            raise asyncio.TimeoutError from exc\n        except (TransportError, HTTPError) as exc:\n            if self._is_proxy_error(exc):\n                raise ProxyError from exc\n            raise\n\n        return _ImpitResponse(response)\n\n    @asynccontextmanager\n    @override\n    async def stream(\n        self,\n        url: str,\n        *,\n        method: HttpMethod = 'GET',\n        headers: HttpHeaders | dict[str, str] | None = None,\n        payload: HttpPayload | None = None,\n        session: Session | None = None,\n        proxy_info: ProxyInfo | None = None,\n        timeout: timedelta | None = None,\n    ) -> AsyncGenerator[HttpResponse]:\n        client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)\n\n        try:\n            response = await client.request(\n                method=method,\n                url=url,\n                content=payload,\n                headers=dict(headers) if headers else None,\n                timeout=timeout.total_seconds() if timeout else None,\n                stream=True,\n            )\n        except TimeoutException as exc:\n            raise asyncio.TimeoutError from exc\n\n        try:\n            yield _ImpitResponse(response)\n        finally:\n            # TODO: https://github.com/apify/impit/issues/242\n            # Quickly closing Response while reading the response body causes an error in the Rust generator in `impit`.\n            # With a short sleep and sync closing, the error does not occur.\n            # Replace with `response.aclose` when this is resolved in impit.\n            await asyncio.sleep(0.01)\n            response.close()\n\n    def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) 
-> AsyncClient:\n        \"\"\"Retrieve or create an HTTP client for the given proxy URL.\n\n        If a client for the specified proxy URL does not exist, create and store a new one.\n        \"\"\"\n        cached_data = self._client_by_proxy_url.get(proxy_url)\n        if cached_data:\n            client = cached_data['client']\n            client_cookie_jar = cached_data['cookie_jar']\n            if client_cookie_jar is cookie_jar:\n                # If the cookie jar matches, return the existing client.\n                return client\n\n        # Prepare a default kwargs for the new client.\n        kwargs: dict[str, Any] = {\n            'proxy': proxy_url,\n            'http3': self._http3,\n            'verify': self._verify,\n            'follow_redirects': True,\n            'browser': self._browser,\n        }\n\n        # Update the default kwargs with any additional user-provided kwargs.\n        kwargs.update(self._async_client_kwargs)\n\n        client = AsyncClient(**kwargs, cookie_jar=cookie_jar)\n\n        self._client_by_proxy_url[proxy_url] = _ClientCacheEntry(client=client, cookie_jar=cookie_jar)\n\n        return client\n\n    @staticmethod\n    def _is_proxy_error(error: HTTPError) -> bool:\n        \"\"\"Determine whether the given error is related to a proxy issue.\n\n        Check if the error message contains known proxy-related error keywords.\n        \"\"\"\n        if isinstance(error, ImpitProxyError):\n            return True\n\n        if any(needle in str(error) for needle in ROTATE_PROXY_ERRORS):  # noqa: SIM103\n            return True\n\n        return False\n\n    @override\n    async def cleanup(self) -> None:\n        \"\"\"Clean up resources used by the HTTP client.\"\"\"\n        self._client_by_proxy_url.clear()\n"
  },
  {
    "path": "src/crawlee/otel/__init__.py",
    "content": "from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor\n\n__all__ = [\n    'CrawlerInstrumentor',\n]\n"
  },
  {
    "path": "src/crawlee/otel/crawler_instrumentor.py",
    "content": "from __future__ import annotations\n\nimport inspect\nfrom typing import TYPE_CHECKING, Any\n\nfrom opentelemetry.instrumentation.instrumentor import BaseInstrumentor\nfrom opentelemetry.instrumentation.utils import unwrap\nfrom opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME\nfrom opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD\nfrom opentelemetry.semconv.attributes.url_attributes import URL_FULL\nfrom opentelemetry.trace import get_tracer\nfrom wrapt import wrap_function_wrapper\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.crawlers import BasicCrawler, ContextPipeline\nfrom crawlee.crawlers._basic._context_pipeline import _Middleware\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n\n    from crawlee.crawlers import BasicCrawlingContext\n\n\n@docs_group('Other')\nclass CrawlerInstrumentor(BaseInstrumentor):\n    \"\"\"Helper class for instrumenting crawlers with OpenTelemetry.\"\"\"\n\n    def __init__(\n        self, *, instrument_classes: list[type] | None = None, request_handling_instrumentation: bool = True\n    ) -> None:\n        \"\"\"Initialize the instrumentor.\n\n        Args:\n            instrument_classes: List of classes to be instrumented - all their public methods and coroutines will be\n                wrapped by generic instrumentation wrapper that will create spans for them.\n            request_handling_instrumentation: When `True`, the most relevant methods in the request handling pipeline\n                will be instrumented. 
When `False`, no request handling instrumentation will be done.\n        \"\"\"\n        self._tracer = get_tracer(__name__)\n\n        async def _simple_async_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:\n            with self._tracer.start_as_current_span(\n                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}\n            ):\n                return await wrapped(*args, **kwargs)\n\n        def _simple_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> Any:\n            with self._tracer.start_as_current_span(\n                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}\n            ):\n                return wrapped(*args, **kwargs)\n\n        def _init_wrapper(wrapped: Any, _: Any, args: Any, kwargs: Any) -> None:\n            with self._tracer.start_as_current_span(\n                name=wrapped.__name__, attributes={CODE_FUNCTION_NAME: wrapped.__qualname__}\n            ):\n                wrapped(*args, **kwargs)\n\n        self._instrumented: list[tuple[Any, str, Callable]] = []\n        self._simple_wrapper = _simple_wrapper\n        self._simple_async_wrapper = _simple_async_wrapper\n        self._init_wrapper = _init_wrapper\n\n        if instrument_classes:\n            for _class in instrument_classes:\n                self._instrument_all_public_methods(on_class=_class)\n\n        if request_handling_instrumentation:\n\n            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:\n                with self._tracer.start_as_current_span(\n                    name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context\n                    attributes={\n                        URL_FULL: instance.input_context.request.url,\n                        CODE_FUNCTION_NAME: instance.generator.__qualname__,  # type:ignore[attr-defined]  # valid in our context\n           
         },\n                ):\n                    return await wrapped(*args, **kwargs)\n\n            async def context_pipeline_wrapper(\n                wrapped: Any, _: ContextPipeline[BasicCrawlingContext], args: Any, kwargs: Any\n            ) -> Any:\n                context = args[0]\n                final_context_consumer = args[1]\n\n                async def wrapped_final_consumer(*args: Any, **kwargs: Any) -> Any:\n                    with self._tracer.start_as_current_span(\n                        name='request_handler',\n                        attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},\n                    ):\n                        return await final_context_consumer(*args, **kwargs)\n\n                with self._tracer.start_as_current_span(\n                    name='ContextPipeline',\n                    attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},\n                ):\n                    return await wrapped(context, wrapped_final_consumer, **kwargs)\n\n            async def _commit_request_handler_result_wrapper(\n                wrapped: Callable[[Any], Any], _: BasicCrawler, args: Any, kwargs: Any\n            ) -> Any:\n                context = args[0]\n                with self._tracer.start_as_current_span(\n                    name='Commit results',\n                    attributes={URL_FULL: context.request.url, HTTP_REQUEST_METHOD: context.request.method},\n                ):\n                    return await wrapped(*args, **kwargs)\n\n            # Handpicked interesting methods to instrument\n            self._instrumented.extend(\n                [\n                    (_Middleware, 'action', middleware_wrapper),\n                    (_Middleware, 'cleanup', middleware_wrapper),\n                    (ContextPipeline, '__call__', context_pipeline_wrapper),\n                    (BasicCrawler, '_BasicCrawler__run_task_function', 
self._simple_async_wrapper),\n                    (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),\n                ]\n            )\n\n    def instrumentation_dependencies(self) -> list[str]:\n        \"\"\"Return a list of python packages with versions that will be instrumented.\"\"\"\n        return ['crawlee']\n\n    def _instrument_all_public_methods(self, on_class: type) -> None:\n        public_coroutines = {\n            name\n            for name, member in inspect.getmembers(on_class, predicate=inspect.iscoroutinefunction)\n            if not name.startswith('_')\n        }\n        public_methods = {\n            name\n            for name, member in inspect.getmembers(on_class, predicate=inspect.isfunction)\n            if not name.startswith('_')\n        } - public_coroutines\n\n        for coroutine in public_coroutines:\n            self._instrumented.append((on_class, coroutine, self._simple_async_wrapper))\n\n        for method in public_methods:\n            self._instrumented.append((on_class, method, self._simple_wrapper))\n\n        self._instrumented.append((on_class, '__init__', self._init_wrapper))\n\n    def _instrument(self, **_: Any) -> None:\n        for _class, method, wrapper in self._instrumented:\n            wrap_function_wrapper(_class, method, wrapper)\n\n    def _uninstrument(self, **_: Any) -> None:\n        for _class, method, __ in self._instrumented:\n            unwrap(_class, method)\n"
  },
  {
    "path": "src/crawlee/project_template/cookiecutter.json",
    "content": "{\n    \"project_name\": \"crawlee-python-project\",\n    \"__package_name\": \"{{ cookiecutter.project_name|lower|replace('-', '_') }}\",\n    \"crawler_type\": [\"beautifulsoup\", \"parsel\", \"playwright\", \"playwright-camoufox\", \"playwright-chrome\", \"playwright-firefox\", \"playwright-webkit\"],\n    \"__crawler_type\": \"{{ cookiecutter.crawler_type|lower|replace('-', '_') }}\",\n    \"http_client\": [\"impit\", \"httpx\", \"curl-impersonate\"],\n    \"package_manager\": [\"poetry\", \"pip\", \"uv\"],\n    \"enable_apify_integration\": false,\n    \"install_project\": true,\n    \"start_url\": \"https://crawlee.dev\",\n    \"_jinja2_env_vars\": {\n        \"line_statement_prefix\": \"# %\"\n    },\n    \"_extensions\": [\"jinja2.ext.do\"]\n}\n"
  },
  {
    "path": "src/crawlee/project_template/hooks/post_gen_project.py",
    "content": "import platform\nimport subprocess\nfrom pathlib import Path\n\n# % if cookiecutter.package_manager in ['poetry', 'uv']\nPath('requirements.txt').unlink()\n\n# % if cookiecutter.install_project == True\n# % if cookiecutter.package_manager == 'poetry'\nsubprocess.check_call(['poetry', 'install'])\n# % elif cookiecutter.package_manager == 'uv'\nsubprocess.check_call(['uv', 'sync'])\n# % endif\n\n# % if cookiecutter.crawler_type == 'playwright'\nmanager = \"{{ cookiecutter.package_manager }}\"\nsubprocess.check_call([manager, 'run', 'playwright', 'install'])\n# % endif\n# % endif\n\n\n# % elif cookiecutter.package_manager == 'pip'\nimport venv  # noqa: E402\n\n# Create a virtual environment\nvenv_root = Path('.venv')\nvenv.main([str(venv_root)])\n\n# % if cookiecutter.install_project == True\nif platform.system() == 'Windows':  # noqa: SIM108\n    path = venv_root / 'Scripts'\nelse:\n    path = venv_root / 'bin'\n\n# Install requirements and generate requirements.txt as an impromptu lockfile\nsubprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])\nPath('requirements.txt').write_text(\n    subprocess.check_output([str(path / 'pip'), 'freeze']).decode()\n)\n\n# % if cookiecutter.crawler_type == 'playwright'\nsubprocess.check_call([str(path / 'playwright'), 'install'])\n# % endif\n# % endif\n# % endif\n"
  },
  {
    "path": "src/crawlee/project_template/hooks/pre_gen_project.py",
    "content": "# % if cookiecutter.package_manager in ['poetry', 'uv']\nimport subprocess\nimport shutil\nimport re\nimport sys\n\nmanager = \"{{cookiecutter.package_manager}}\"\nmanager_text = manager.title()\n# % if cookiecutter.package_manager == 'poetry'\nversion_regex = r'Poetry \\(version 2\\..*\\)'\nr_version = '2.x'\n# % elif cookiecutter.package_manager == 'uv'\nversion_regex = r'uv (0\\..*)'\nr_version = '0.x'\n# % endif\n\n# Check if package manager is available in PATH\nif not shutil.which(manager):\n    sys.stderr.write(f'\\nError: You selected {manager_text} as your package manager, but it is not installed. Please install it and try again.\\n')\n    sys.exit(1)\n\n# Check if the package manager is executable\ntry:\n    version = subprocess.check_output([manager, '--version']).decode().strip()\nexcept OSError:\n    sys.stderr.write(f'\\nError: Your selected package manager {manager_text} was found but failed to execute.\\n')\n    sys.exit(1)\n\n# Check if the version matches the required regex\nif not re.match(version_regex, version):\n    sys.stderr.write(f'\\nError: Your selected package manager {manager_text} requires version {r_version}, but {version} is installed.\\n')\n    sys.exit(1)\n# % endif\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main.py",
    "content": "# % if cookiecutter.enable_apify_integration\nfrom apify import Actor\n# % endif\n# % block import required\n# % endblock\n# % if cookiecutter.http_client == 'curl-impersonate'\nfrom crawlee.http_clients import CurlImpersonateHttpClient\n# % elif cookiecutter.http_client == 'httpx'\nfrom crawlee.http_clients import HttpxHttpClient\n# % elif cookiecutter.http_client == 'impit'\nfrom crawlee.http_clients import ImpitHttpClient\n# % endif\n\nfrom .routes import router\n\n# % filter truncate(0, end='')\n# % block http_client_instantiation\n# % if cookiecutter.http_client == 'curl-impersonate'\nhttp_client=CurlImpersonateHttpClient(),\n# % elif cookiecutter.http_client == 'httpx'\nhttp_client=HttpxHttpClient(),\n# % elif cookiecutter.http_client == 'impit'\nhttp_client=ImpitHttpClient(),\n# % endif\n# % endblock\n# % endfilter\n# % if self.pre_main is defined\n\n{{self.pre_main()}}\n\n# % endif\nasync def main() -> None:\n    \"\"\"The crawler entry point.\"\"\"\n    # % filter truncate(0, end='')\n    # % block instantiation required\n    # % endblock\n    # % endfilter\n\n    # % if cookiecutter.enable_apify_integration\n    async with Actor:\n    # % set indent_width = 8\n    # % else\n    # % set indent_width = 4\n    # % endif\n# % filter indent(width=indent_width, first=True)\n{{self.instantiation()}}\n\nawait crawler.run(\n    [\n        '{{ cookiecutter.start_url }}',\n    ]\n)\n# % endfilter\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_beautifulsoup.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import BeautifulSoupCrawler\n# % endblock\n\n# % block instantiation\ncrawler = BeautifulSoupCrawler(\n    request_handler=router,\n    max_requests_per_crawl=10,\n    {{ self.http_client_instantiation() }})\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_parsel.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import ParselCrawler\n# % endblock\n\n# % block instantiation\ncrawler = ParselCrawler(\n    request_handler=router,\n    max_requests_per_crawl=10,\n    {{ self.http_client_instantiation() }})\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_playwright.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import PlaywrightCrawler\n# % endblock\n\n# % block instantiation\ncrawler = PlaywrightCrawler(\n    request_handler=router,\n    headless=True,\n    max_requests_per_crawl=10,\n    {{ self.http_client_instantiation() }})\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_playwright_camoufox.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom camoufox import AsyncNewBrowser\nfrom typing_extensions import override\n\nfrom crawlee._utils.context import ensure_context\nfrom crawlee.browsers import PlaywrightBrowserPlugin, PlaywrightBrowserController, BrowserPool\nfrom crawlee.crawlers import PlaywrightCrawler\n# % endblock\n\n# % block pre_main\nclass CamoufoxPlugin(PlaywrightBrowserPlugin):\n    \"\"\"Example browser plugin that uses Camoufox Browser, but otherwise keeps the functionality of\n    PlaywrightBrowserPlugin.\"\"\"\n\n    @ensure_context\n    @override\n    async def new_browser(self) -> PlaywrightBrowserController:\n        if not self._playwright:\n            raise RuntimeError('Playwright browser plugin is not initialized.')\n\n        return PlaywrightBrowserController(\n            browser=await AsyncNewBrowser(self._playwright, headless=True),\n            max_open_pages_per_browser=1,  # Increase if Camoufox can handle it in your use case.\n            header_generator=None,  # This turns off Crawlee's header generation; Camoufox has its own.\n        )\n# % endblock\n\n# % block instantiation\ncrawler = PlaywrightCrawler(\n    max_requests_per_crawl=10,\n    request_handler=router,\n    browser_pool=BrowserPool(plugins=[CamoufoxPlugin()])\n)\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_playwright_chrome.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import PlaywrightCrawler\n# % endblock\n\n# % block instantiation\ncrawler = PlaywrightCrawler(\n    request_handler=router,\n    headless=True,\n    max_requests_per_crawl=10,\n    browser_type=\"chrome\",\n    {{ self.http_client_instantiation() }}\n)\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_playwright_firefox.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import PlaywrightCrawler\n# % endblock\n\n# % block instantiation\ncrawler = PlaywrightCrawler(\n    request_handler=router,\n    headless=True,\n    max_requests_per_crawl=10,\n    browser_type=\"firefox\",\n    {{ self.http_client_instantiation() }}\n)\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/main_playwright_webkit.py",
    "content": "# % extends 'main.py'\n\n# % block import\nfrom crawlee.crawlers import PlaywrightCrawler\n# % endblock\n\n# % block instantiation\ncrawler = PlaywrightCrawler(\n    request_handler=router,\n    headless=True,\n    max_requests_per_crawl=10,\n    browser_type=\"webkit\",\n    {{ self.http_client_instantiation() }}\n)\n# % endblock\n"
  },
  {
    "path": "src/crawlee/project_template/templates/routes_beautifulsoup.py",
    "content": "from crawlee.crawlers import BeautifulSoupCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[BeautifulSoupCrawlingContext]()\n\n\n@router.default_handler\nasync def default_handler(context: BeautifulSoupCrawlingContext) -> None:\n    \"\"\"Default request handler.\"\"\"\n    context.log.info(f'Processing {context.request.url} ...')\n    title = context.soup.find('title')\n    await context.push_data(\n        {\n            'url': context.request.loaded_url,\n            'title': title.text if title else None,\n        }\n    )\n\n    await context.enqueue_links()\n"
  },
  {
    "path": "src/crawlee/project_template/templates/routes_parsel.py",
    "content": "from crawlee.crawlers import ParselCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[ParselCrawlingContext]()\n\n\n@router.default_handler\nasync def default_handler(context: ParselCrawlingContext) -> None:\n    \"\"\"Default request handler.\"\"\"\n    context.log.info(f'Processing {context.request.url} ...')\n    title = context.selector.xpath('//title/text()').get()\n    await context.push_data(\n        {\n            'url': context.request.loaded_url,\n            'title': title,\n        }\n    )\n\n    await context.enqueue_links()\n"
  },
  {
    "path": "src/crawlee/project_template/templates/routes_playwright.py",
    "content": "from crawlee.crawlers import PlaywrightCrawlingContext\nfrom crawlee.router import Router\n\nrouter = Router[PlaywrightCrawlingContext]()\n\n\n@router.default_handler\nasync def default_handler(context: PlaywrightCrawlingContext) -> None:\n    \"\"\"Default request handler.\"\"\"\n    context.log.info(f'Processing {context.request.url} ...')\n    title = await context.page.query_selector('title')\n    await context.push_data(\n        {\n            'url': context.request.loaded_url,\n            'title': await title.inner_text() if title else None,\n        }\n    )\n\n    await context.enqueue_links()\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore",
    "content": ".venv\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile",
    "content": "# First, specify the base Docker image.\n# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.\n# You can also use any other image from Docker Hub.\n# % if cookiecutter.crawler_type == 'playwright'\nFROM apify/actor-python-playwright:3.13\n# % elif cookiecutter.crawler_type == 'playwright-camoufox'\nFROM apify/actor-python-playwright-camoufox:3.13\n# % elif cookiecutter.crawler_type == 'playwright-chrome'\nFROM apify/actor-python-playwright-chrome:3.13\n# % elif cookiecutter.crawler_type == 'playwright-firefox'\nFROM apify/actor-python-playwright-firefox:3.13\n# % elif cookiecutter.crawler_type == 'playwright-webkit'\nFROM apify/actor-python-playwright-webkit:3.13\n# % else\nFROM apify/actor-python:3.13\n# % endif\n\nRUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*\n\n# % if cookiecutter.package_manager == 'poetry'\nRUN pip install -U pip setuptools \\\n    && pip install 'poetry<3' \\\n    && poetry self add 'poetry-plugin-export'\n\n# Second, copy just poetry.lock and pyproject.toml into the Actor image,\n# since those should be the only files that affects the dependency install in the next step,\n# in order to speed up the build\nCOPY pyproject.toml poetry.lock ./\n\n# Install the dependencies\nRUN echo \"Python version:\" \\\n && python --version \\\n && echo \"Installing dependencies:\" \\\n # Export packages from poetry.lock\n && poetry export -f requirements.txt --without-hashes | \\\n # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed)\n    sed \"s/^playwright==\\(.*\\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\\1')/\" | \\\n # Install everything using pip (ignore dependency checks - the lockfile is correct, period)\n    pip install -r /dev/stdin --no-dependencies \\\n && echo \"All installed Python packages:\" \\\n && pip freeze\n# % elif 
cookiecutter.package_manager == 'uv'\nRUN pip install -U pip setuptools \\\n    && pip install 'uv<1'\n\nENV UV_PROJECT_ENVIRONMENT=\"/usr/local\"\n\nCOPY pyproject.toml uv.lock ./\n\nRUN echo \"Python version:\" \\\n    && python --version \\\n    && echo \"Installing dependencies:\" \\\n    # Check if playwright is already installed\n    && PLAYWRIGHT_INSTALLED=$(pip freeze | grep -q playwright && echo \"true\" || echo \"false\") \\\n    && if [ \"$PLAYWRIGHT_INSTALLED\" = \"true\" ]; then \\\n        echo \"Playwright already installed, excluding from uv sync\" \\\n        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact --no-install-package playwright; \\\n    else \\\n        echo \"Playwright not found, installing all dependencies\" \\\n        && uv sync --frozen --no-install-project --no-editable -q --no-dev --inexact; \\\n    fi \\\n    && echo \"All installed Python packages:\" \\\n    && pip freeze\n# % elif cookiecutter.package_manager == 'pip'\nRUN pip install -U pip setuptools\n\n# Second, copy just requirements.txt into the Actor image,\n# since it should be the only file that affects the dependency install in the next step,\n# in order to speed up the build\nCOPY requirements.txt ./\n\n# Install the dependencies\nRUN echo \"Python version:\" \\\n && python --version \\\n && echo \"Installing dependencies:\" \\\n # Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image\n && cat requirements.txt | \\\n # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed)\n    sed \"s/^playwright==\\(.*\\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\\1')/\" | \\\n # Install everything using pip\n    pip install -r /dev/stdin \\\n && echo \"All installed Python packages:\" \\\n && pip freeze\n# % elif cookiecutter.package_manager == 'manual'\n# TODO install 
dependencies\n# % endif\n\n# Next, copy the remaining files and directories with the source code.\n# Since we do this after installing the dependencies, quick build will be really fast\n# for most source file changes.\nCOPY . ./\n\n# Use compileall to ensure the runnability of the Actor Python code.\nRUN python -m compileall -q .\n\n# % if cookiecutter.crawler_type == 'playwright-camoufox'\n# Fetch camoufox files that are always needed when using camoufox.\nRUN python -m camoufox fetch\n# % endif\n\n# Specify how to launch the source code of your Actor.\nCMD [\"python\", \"-m\", \"{{ cookiecutter.__package_name }}\"]\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/README.md",
    "content": "# {{cookiecutter.project_name}}\n\nProject skeleton generated by Crawlee ({{ cookiecutter.crawler_type | capitalize }} template).\n\n## Usage\n\n{% if cookiecutter.package_manager == 'poetry' -%}\nTo get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:\n\n```sh\npipx install poetry\n```\n\nNext, install the project dependencies:\n\n```sh\npoetry install\n```\n\nFinally, launch the crawler with:\n\n```sh\npoetry run python -m {{cookiecutter.__package_name}}\n```\n{% elif cookiecutter.package_manager == 'pip' -%}\nTo install dependencies, you can run the following command:\n\n```sh\npython -m pip install .\n```\n\nWhen the dependencies are installed, you may launch the crawler with:\n\n```sh\npython -m {{cookiecutter.__package_name}}\n```\n\n{% elif cookiecutter.package_manager == 'uv' -%}\nTo get started, ensure you have [UV](https://docs.astral.sh/uv/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:\n\n```sh\npipx install uv\n```\n\nNext, install the project dependencies:\n\n```sh\nuv sync\n```\n\nFinally, launch the crawler with:\n\n```sh\nuv run python -m {{cookiecutter.__package_name}}\n```\n{% elif cookiecutter.package_manager == 'pip' -%}\nTo install dependencies, you can run the following command:\n\n```sh\npython -m pip install .\n```\n\nWhen the dependencies are installed, you may launch the crawler with:\n\n```sh\npython -m {{cookiecutter.__package_name}}\n```\n{% elif cookiecutter.package_manager == 'manual' -%}\nYou selected the manual dependency installation method, so you're on your own. There is a simple `requirements.txt` file to get you started.\n{% endif %}\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml",
    "content": "# % if cookiecutter.crawler_type.startswith('playwright')\n# % set extras = ['playwright']\n# % else\n# % set extras = [cookiecutter.crawler_type]\n# % endif\n# % if cookiecutter.http_client == 'curl-impersonate'\n# % do extras.append('curl-impersonate')\n# % elif cookiecutter.http_client == 'httpx'\n# % do extras.append('httpx')\n# % endif\n\n[project]\nname = \"{{cookiecutter.project_name}}\"\nversion = \"0.0.1\"\ndescription = \"\"\nauthors = [\n    {name = \"Your Name\",email = \"you@example.com\"}\n]\nreadme = \"README.md\"\nrequires-python = \">=3.10,<4.0\"\ndependencies = [\n    \"crawlee[{{ extras|join(',') }}]\",\n    # % if cookiecutter.crawler_type == 'playwright-camoufox'\n    \"camoufox[geoip]~=0.4.5\",\n    # % endif\n    # % if cookiecutter.enable_apify_integration\n    \"apify\",\n    # % endif\n]\n\n# % if cookiecutter.package_manager == 'poetry'\n[tool.poetry]\npackage-mode = false\n\n[build-system]\nrequires = [\"poetry-core>=2.0.0,<3.0.0\"]\nbuild-backend = \"poetry.core.masonry.api\"\n# % endif\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt",
    "content": "# % if cookiecutter.crawler_type == 'playwright-camoufox'\ncamoufox[geoip]~=0.4.5\n# % endif\n# % if cookiecutter.crawler_type.startswith('playwright')\n# % set extras = ['playwright']\n# % else\n# % set extras = [cookiecutter.crawler_type]\n# % endif\n# % if cookiecutter.enable_apify_integration\napify\n# % endif\n# % if cookiecutter.http_client == 'curl-impersonate'\n# % do extras.append('curl-impersonate')\n# % endif\n# % if cookiecutter.http_client == 'httpx'\n# % do extras.append('httpx')\n# % endif\ncrawlee[{{ extras | join(',') }}]\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py",
    "content": ""
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py",
    "content": "import asyncio\n# % if cookiecutter.http_client == 'curl-impersonate'\nimport platform\n# % if 'playwright' in cookiecutter.crawler_type\nimport warnings\n# % endif\n# % endif\n{{ '' }}\nfrom .main import main\n\nif __name__ == '__main__':\n    # % if cookiecutter.http_client == 'curl-impersonate'\n    if platform.system() == 'Windows':\n        # This mitigates a warning raised by curl-cffi.\n        # % if 'playwright' in cookiecutter.crawler_type\n        warnings.warn(\n            message=('curl-cffi suggests using WindowsSelectorEventLoopPolicy, but this conflicts with Playwright. '\n                     'Ignore the curl-cffi warning.'),\n            category=UserWarning,\n            stacklevel=2,\n        )\n        # % else\n        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())\n        # % endif\n    # % endif\n{{ '' }}\n    asyncio.run(main())\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py",
    "content": "# % include 'main_%s.py' % cookiecutter.__crawler_type\n"
  },
  {
    "path": "src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py",
    "content": "# % if cookiecutter.crawler_type.startswith('playwright')\n# % include 'routes_playwright.py'\n# % else\n# % include 'routes_%s.py' % cookiecutter.__crawler_type\n# % endif\n"
  },
  {
    "path": "src/crawlee/proxy_configuration.py",
    "content": "from __future__ import annotations\n\nimport inspect\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING\n\nfrom more_itertools import flatten\nfrom pydantic import AnyHttpUrl, TypeAdapter\nfrom typing_extensions import Protocol\nfrom yarl import URL\n\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Sequence\n\n    from crawlee import Request\n\n__all__ = ['ProxyConfiguration', 'ProxyInfo']\n\n\n@dataclass\n@docs_group('Other')\nclass ProxyInfo:\n    \"\"\"Provides information about a proxy connection that is used for requests.\"\"\"\n\n    url: str\n    \"\"\"The URL of the proxy.\"\"\"\n\n    scheme: str\n    \"\"\"The scheme of the proxy.\"\"\"\n\n    hostname: str\n    \"\"\"The hostname of the proxy.\"\"\"\n\n    port: int\n    \"\"\"The proxy port.\"\"\"\n\n    username: str = ''\n    \"\"\"The username for the proxy.\"\"\"\n\n    password: str = ''\n    \"\"\"The password for the proxy.\"\"\"\n\n    session_id: str | None = None\n    \"\"\"The identifier of the used proxy session, if used.\n    Using the same session ID guarantees getting the same proxy URL.\"\"\"\n\n    proxy_tier: int | None = None\n    \"\"\"The tier of the proxy.\"\"\"\n\n\n@docs_group('Configuration')\nclass ProxyConfiguration:\n    \"\"\"Configures connection to a proxy server with the provided options.\n\n    Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or\n    blacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies\n    for all connections. You can get information about the currently used proxy by inspecting the {@apilink ProxyInfo}\n    property in your crawler's page function. 
There, you can inspect the proxy's URL and other attributes.\n\n    If you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of\n    proxy URLs will be rotated by the configuration if this option is provided.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        proxy_urls: list[str | None] | None = None,\n        new_url_function: _NewUrlFunction | None = None,\n        tiered_proxy_urls: list[list[str | None]] | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Exactly one of `proxy_urls`, `tiered_proxy_urls` or `new_url_function` must be specified.\n\n        Args:\n            proxy_urls: A list of URLs of proxies that will be rotated in a round-robin fashion\n            tiered_proxy_urls: A list of URL tiers (where a tier is a list of proxy URLs). Crawlers will automatically\n                try to use the lowest tier (smallest index) where blocking does not happen. The proxy URLs in\n                the selected tier will be rotated in a round-robin fashion.\n            new_url_function: A function that returns a proxy URL for a given Request. 
This provides full control over\n                the proxy selection mechanism.\n        \"\"\"\n        self._next_custom_url_index = 0\n        self._used_proxy_urls = dict[str, URL | None]()\n        self._url_validator = TypeAdapter(AnyHttpUrl)\n\n        # Validation\n        if sum(map(bool, (proxy_urls, new_url_function, list(flatten(tiered_proxy_urls or []))))) != 1:\n            raise ValueError(\n                'Exactly one of `proxy_urls`, `tiered_proxy_urls` and `new_url_function` '\n                'must be specified (and non-empty).'\n            )\n\n        self._proxy_urls = [self._create_url(url) for url in proxy_urls] if proxy_urls else []\n        self._proxy_tier_tracker = (\n            _ProxyTierTracker([[self._create_url(url) for url in tier] for tier in tiered_proxy_urls])\n            if tiered_proxy_urls\n            else None\n        )\n        self._new_url_function = new_url_function\n\n    def _create_url(self, url: str | None) -> URL | None:\n        \"\"\"Create URL from input string. None means that intentionally no proxy should be used.\"\"\"\n        if url is None:\n            return None\n\n        self._url_validator.validate_python(url)\n        return URL(url)\n\n    async def new_proxy_info(\n        self, session_id: str | None, request: Request | None, proxy_tier: int | None\n    ) -> ProxyInfo | None:\n        \"\"\"Return a new ProxyInfo object based on the configured proxy rotation strategy.\n\n        Args:\n            session_id: Session identifier. If provided, same proxy URL will be returned for\n                subsequent calls with this ID. Will be auto-generated for tiered proxies if\n                not provided.\n            request: Request object used for proxy rotation and tier selection. Required for\n                tiered proxies to track retries and adjust tier accordingly.\n            proxy_tier: Specific proxy tier to use. 
If not provided, will be automatically\n                selected based on configuration.\n        \"\"\"\n        if self._proxy_tier_tracker is not None and session_id is None:\n            session_id = crypto_random_object_id(6)\n\n        url, proxy_tier = await self._pick_url(session_id, request, proxy_tier)\n\n        if url is None:\n            return None\n\n        if url.port is None:\n            raise ValueError(f'Port is None for URL: {url}')\n\n        if url.host is None:\n            raise ValueError(f'Host is None for URL: {url}')\n\n        info = ProxyInfo(\n            url=str(url),\n            scheme=url.scheme,\n            hostname=url.host,\n            port=url.port,\n            username=url.user or '',\n            password=url.password or '',\n        )\n\n        if session_id is not None:\n            info.session_id = session_id\n\n        if proxy_tier is not None:\n            info.proxy_tier = proxy_tier\n\n        return info\n\n    async def new_url(\n        self, session_id: str | None = None, request: Request | None = None, proxy_tier: int | None = None\n    ) -> str | None:\n        \"\"\"Return a proxy URL string based on the configured proxy rotation strategy.\n\n        Args:\n            session_id: Session identifier. If provided, same proxy URL will be returned for\n                subsequent calls with this ID. Will be auto-generated for tiered proxies if\n                not provided.\n            request: Request object used for proxy rotation and tier selection. Required for\n                tiered proxies to track retries and adjust tier accordingly.\n            proxy_tier: Specific proxy tier to use. 
If not provided, will be automatically\n                selected based on configuration.\n        \"\"\"\n        proxy_info = await self.new_proxy_info(session_id, request, proxy_tier)\n        return proxy_info.url if proxy_info else None\n\n    async def _pick_url(\n        self, session_id: str | None, request: Request | None, proxy_tier: int | None\n    ) -> tuple[URL | None, int | None]:\n        if self._new_url_function:\n            try:\n                result = self._new_url_function(session_id, request)\n                if inspect.isawaitable(result):\n                    result = await result\n\n                return URL(str(result)) if result is not None else None, None\n            except Exception as e:\n                raise ValueError('The provided \"new_url_function\" did not return a valid URL') from e\n\n        if self._proxy_tier_tracker:\n            if request is not None and proxy_tier is None:\n                hostname = URL(request.url).host\n                if hostname is None:\n                    raise ValueError('The request URL does not have a hostname')\n\n                if request.last_proxy_tier is not None:\n                    self._proxy_tier_tracker.add_error(hostname, request.last_proxy_tier)\n\n                proxy_tier = self._proxy_tier_tracker.predict_tier(hostname)\n\n                request.last_proxy_tier = proxy_tier\n                request.forefront = True\n\n            if proxy_tier is not None:\n                urls = self._proxy_tier_tracker.get_tier_urls(proxy_tier)\n            else:\n                urls = self._proxy_tier_tracker.all_urls\n        elif self._proxy_urls:\n            urls = self._proxy_urls\n        else:\n            raise RuntimeError('Invalid state')\n\n        if session_id is None:\n            url = urls[self._next_custom_url_index % len(urls)]\n            self._next_custom_url_index += 1\n            return url, proxy_tier\n\n        if session_id not in self._used_proxy_urls:\n   
         self._used_proxy_urls[session_id] = urls[self._next_custom_url_index % len(urls)]\n            self._next_custom_url_index += 1\n\n        return self._used_proxy_urls[session_id], proxy_tier\n\n\nclass _ProxyTierTracker:\n    \"\"\"Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains.\"\"\"\n\n    def __init__(self, tiered_proxy_urls: list[list[URL | None]]) -> None:\n        self._tiered_proxy_urls = tiered_proxy_urls\n        self._histogram_by_domain = defaultdict[str, list[int]](lambda: [0 for _tier in tiered_proxy_urls])\n        self._current_tier_by_domain = defaultdict[str, int](lambda: 0)\n\n    @property\n    def all_urls(self) -> Sequence[URL | None]:\n        return list(flatten(self._tiered_proxy_urls))\n\n    def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:\n        return self._tiered_proxy_urls[tier_number]\n\n    def add_error(self, domain: str, tier: int) -> None:\n        self._histogram_by_domain[domain][tier] += 10\n\n    def predict_tier(self, domain: str) -> int:\n        histogram = self._histogram_by_domain[domain]\n        current_tier = self._current_tier_by_domain[domain]\n\n        for index, value in enumerate(histogram):\n            if index == current_tier:\n                continue\n            if value > 0:\n                histogram[index] -= 1\n\n        left = histogram[current_tier - 1] if current_tier > 0 else float('inf')\n        right = histogram[current_tier + 1] if current_tier < len(histogram) - 1 else float('inf')\n\n        if histogram[current_tier] > min(left, right):\n            self._current_tier_by_domain[domain] = current_tier - 1 if left <= right else current_tier + 1\n        elif histogram[current_tier] == left:\n            self._current_tier_by_domain[domain] -= 1\n\n        return self._current_tier_by_domain[domain]\n\n\nclass _NewUrlFunction(Protocol):\n    def __call__(\n        self,\n        session_id: str | None = 
None,\n        request: Request | None = None,\n    ) -> str | None | Awaitable[str | None]: ...\n"
  },
  {
    "path": "src/crawlee/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/request_loaders/__init__.py",
    "content": "from ._request_list import RequestList\nfrom ._request_loader import RequestLoader\nfrom ._request_manager import RequestManager\nfrom ._request_manager_tandem import RequestManagerTandem\nfrom ._sitemap_request_loader import SitemapRequestLoader\n\n__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader']\n"
  },
  {
    "path": "src/crawlee/request_loaders/_request_list.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport contextlib\nfrom collections.abc import AsyncGenerator, AsyncIterable, AsyncIterator, Iterable\nfrom logging import getLogger\nfrom typing import Annotated\n\nfrom pydantic import BaseModel, ConfigDict, Field\nfrom typing_extensions import override\n\nfrom crawlee._request import Request\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.request_loaders._request_loader import RequestLoader\n\nlogger = getLogger(__name__)\n\n\nclass RequestListState(BaseModel):\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    next_index: Annotated[int, Field(alias='nextIndex')] = 0\n    next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None\n    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()\n\n\nclass RequestListData(BaseModel):\n    requests: Annotated[list[Request], Field()]\n\n\n@docs_group('Request loaders')\nclass RequestList(RequestLoader):\n    \"\"\"Represents a (potentially very large) list of URLs to crawl.\"\"\"\n\n    def __init__(\n        self,\n        requests: Iterable[str | Request] | AsyncIterable[str | Request] | None = None,\n        name: str | None = None,\n        persist_state_key: str | None = None,\n        persist_requests_key: str | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            requests: The request objects (or their string representations) to be added to the provider.\n            name: A name of the request list.\n            persist_state_key: A key for persisting the progress information of the RequestList.\n                If you do not pass a key but pass a `name`, a key will be derived using the name.\n                Otherwise, state will not be persisted.\n            persist_requests_key: A key for persisting the request data loaded from the `requests` iterator.\n                If specified, the request data will be stored in the 
KeyValueStore to make sure that they don't change\n                over time. This is useful if the `requests` iterator pulls the data dynamically.\n        \"\"\"\n        from crawlee._utils.recoverable_state import RecoverableState  # noqa: PLC0415\n\n        self._name = name\n        self._handled_count = 0\n        self._assumed_total_count = 0\n\n        self._next: tuple[Request | None, Request | None] = (None, None)\n\n        if persist_state_key is None and name is not None:\n            persist_state_key = f'SDK_REQUEST_LIST_STATE-{name}'\n\n        self._state = RecoverableState(\n            default_state=RequestListState(),\n            persistence_enabled=bool(persist_state_key),\n            persist_state_key=persist_state_key or '',\n            logger=logger,\n        )\n\n        self._persist_request_data = bool(persist_requests_key)\n\n        self._requests_data = RecoverableState(\n            default_state=RequestListData(requests=[]),\n            # With request data persistence enabled, a snapshot of the requests will be done on initialization\n            persistence_enabled='explicit_only' if self._persist_request_data else False,\n            persist_state_key=persist_requests_key or '',\n            logger=logger,\n        )\n\n        self._requests: AsyncIterator[str | Request]\n        if isinstance(requests, AsyncIterable):\n            self._requests = requests.__aiter__()  # ty: ignore[invalid-assignment]\n        elif requests is None:\n            self._requests = self._iterate_in_threadpool([])\n        else:\n            self._requests = self._iterate_in_threadpool(requests)\n\n        self._requests_lock: asyncio.Lock | None = None\n\n    async def _get_state(self) -> RequestListState:\n        # If state is already initialized, we are done\n        if self._state.is_initialized:\n            return self._state.current_value\n\n        # Initialize recoverable state\n        await self._state.initialize()\n        await 
self._requests_data.initialize()\n\n        # Initialize lock if necessary\n        if self._requests_lock is None:\n            self._requests_lock = asyncio.Lock()\n\n        # If the RequestList is configured to persist request data, ensure that a copy of request data is used\n        if self._persist_request_data:\n            async with self._requests_lock:\n                if not await self._requests_data.has_persisted_state():\n                    self._requests_data.current_value.requests = [\n                        request if isinstance(request, Request) else Request.from_url(request)\n                        async for request in self._requests\n                    ]\n                    await self._requests_data.persist_state()\n\n                self._requests = self._iterate_in_threadpool(\n                    self._requests_data.current_value.requests[self._state.current_value.next_index :]\n                )\n        # If not using persistent request data, advance the request iterator\n        else:\n            async with self._requests_lock:\n                for _ in range(self._state.current_value.next_index):\n                    with contextlib.suppress(StopAsyncIteration):\n                        await self._requests.__anext__()\n\n        # Check consistency of the stored state and the request iterator\n        if (unique_key_to_check := self._state.current_value.next_unique_key) is not None:\n            await self._ensure_next_request()\n\n            next_unique_key = self._next[0].unique_key if self._next[0] is not None else None\n            if next_unique_key != unique_key_to_check:\n                raise RuntimeError(\n                    f\"\"\"Mismatch at index {\n                        self._state.current_value.next_index\n                    } in persisted requests - Expected unique key `{unique_key_to_check}`, got `{next_unique_key}`\"\"\"\n                )\n\n        return self._state.current_value\n\n    @property\n    def 
name(self) -> str | None:\n        return self._name\n\n    @override\n    async def get_handled_count(self) -> int:\n        return self._handled_count\n\n    @override\n    async def get_total_count(self) -> int:\n        return self._assumed_total_count\n\n    @override\n    async def is_empty(self) -> bool:\n        await self._ensure_next_request()\n        return self._next[0] is None\n\n    @override\n    async def is_finished(self) -> bool:\n        state = await self._get_state()\n        return len(state.in_progress) == 0 and await self.is_empty()\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        await self._get_state()\n        await self._ensure_next_request()\n\n        if self._next[0] is None:\n            return None\n\n        state = await self._get_state()\n        state.in_progress.add(self._next[0].unique_key)\n        self._assumed_total_count += 1\n\n        next_request = self._next[0]\n        if next_request is not None:\n            state.next_index += 1\n            state.next_unique_key = self._next[1].unique_key if self._next[1] is not None else None\n\n        self._next = (self._next[1], None)\n        await self._ensure_next_request()\n\n        return next_request\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> None:\n        self._handled_count += 1\n        state = await self._get_state()\n        state.in_progress.remove(request.unique_key)\n\n    async def _ensure_next_request(self) -> None:\n        await self._get_state()\n\n        if self._requests_lock is None:\n            self._requests_lock = asyncio.Lock()\n\n        async with self._requests_lock:\n            if None in self._next:\n                if self._next[0] is None:\n                    to_enqueue = [item async for item in self._dequeue_requests(2)]\n                    self._next = (to_enqueue[0], to_enqueue[1])\n                else:\n                    to_enqueue = [item async for item 
in self._dequeue_requests(1)]\n                    self._next = (self._next[0], to_enqueue[0])\n\n    async def _dequeue_requests(self, count: int) -> AsyncGenerator[Request | None]:\n        for _ in range(count):\n            try:\n                yield self._transform_request(await self._requests.__anext__())\n            except StopAsyncIteration:  # noqa: PERF203\n                yield None\n\n    async def _iterate_in_threadpool(self, iterable: Iterable[str | Request]) -> AsyncIterator[str | Request]:\n        \"\"\"Inspired by a function of the same name from encode/starlette.\"\"\"\n        iterator = iter(iterable)\n\n        class _StopIteration(Exception):  # noqa: N818\n            pass\n\n        def _next() -> str | Request:\n            # We can't raise `StopIteration` from within the threadpool iterator\n            # and catch it outside that context, so we coerce them into a different\n            # exception type.\n            try:\n                return next(iterator)\n            except StopIteration:\n                raise _StopIteration  # noqa: B904\n\n        try:\n            while True:\n                yield await asyncio.to_thread(_next)\n        except _StopIteration:\n            return\n"
  },
  {
    "path": "src/crawlee/request_loaders/_request_loader.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nfrom crawlee import Request\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee.request_loaders import RequestManager, RequestManagerTandem\n    from crawlee.storage_clients.models import ProcessedRequest\n\n\n@docs_group('Request loaders')\nclass RequestLoader(ABC):\n    \"\"\"An abstract class defining the interface for classes that provide access to a read-only stream of requests.\n\n    Request loaders are used to manage and provide access to a storage of crawling requests.\n\n    Key responsibilities:\n        - Fetching the next request to be processed.\n        - Marking requests as successfully handled after processing.\n        - Managing state information such as the total and handled request counts.\n    \"\"\"\n\n    @abstractmethod\n    async def get_handled_count(self) -> int:\n        \"\"\"Get the number of requests in the loader that have been handled.\"\"\"\n\n    @abstractmethod\n    async def get_total_count(self) -> int:\n        \"\"\"Get an offline approximation of the total number of requests in the loader (i.e. pending + handled).\"\"\"\n\n    @abstractmethod\n    async def is_empty(self) -> bool:\n        \"\"\"Return True if there are no more requests in the loader (there might still be unfinished requests).\"\"\"\n\n    @abstractmethod\n    async def is_finished(self) -> bool:\n        \"\"\"Return True if all requests have been handled.\"\"\"\n\n    @abstractmethod\n    async def fetch_next_request(self) -> Request | None:\n        \"\"\"Return the next request to be processed, or `None` if there are no more pending requests.\n\n        The method should return `None` if and only if `is_finished` would return `True`. 
In other cases, the method\n        should wait until a request appears.\n        \"\"\"\n\n    @abstractmethod\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        \"\"\"Mark a request as handled after a successful processing (or after giving up retrying).\"\"\"\n\n    async def to_tandem(self, request_manager: RequestManager | None = None) -> RequestManagerTandem:\n        \"\"\"Combine the loader with a request manager to support adding and reclaiming requests.\n\n        Args:\n            request_manager: Request manager to combine the loader with.\n                If None is given, the default request queue is used.\n        \"\"\"\n        # Import here to avoid circular imports.\n        from crawlee.request_loaders import RequestManagerTandem  # noqa: PLC0415\n        from crawlee.storages import RequestQueue  # noqa: PLC0415\n\n        if request_manager is None:\n            request_manager = await RequestQueue.open()\n\n        return RequestManagerTandem(self, request_manager)\n\n    def _transform_request(self, request: str | Request) -> Request:\n        \"\"\"Transform a request-like object into a Request object.\"\"\"\n        if isinstance(request, Request):\n            return request\n\n        if isinstance(request, str):\n            return Request.from_url(request)\n\n        raise ValueError(f'Invalid request type: {type(request)}')\n\n    def _transform_requests(self, requests: Sequence[str | Request]) -> list[Request]:\n        \"\"\"Transform a list of request-like objects into a list of `Request` objects.\"\"\"\n        processed_requests = dict[str, Request]()\n\n        for request in requests:\n            processed_request = self._transform_request(request)\n            processed_requests.setdefault(processed_request.unique_key, processed_request)\n\n        return list(processed_requests.values())\n"
  },
  {
    "path": "src/crawlee/request_loaders/_request_manager.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.request_loaders._request_loader import RequestLoader\nfrom crawlee.storage_clients.models import ProcessedRequest\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee._request import Request\n\n\n@docs_group('Request loaders')\nclass RequestManager(RequestLoader, ABC):\n    \"\"\"Base class that extends `RequestLoader` with the capability to enqueue new requests and reclaim failed ones.\"\"\"\n\n    @abstractmethod\n    async def drop(self) -> None:\n        \"\"\"Remove persistent state either from the Apify Cloud storage or from the local database.\"\"\"\n\n    @abstractmethod\n    async def add_request(\n        self,\n        request: str | Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        \"\"\"Add a single request to the manager and store it in underlying resource client.\n\n        Args:\n            request: The request object (or its string representation) to be added to the manager.\n            forefront: Determines whether the request should be added to the beginning (if True) or the end (if False)\n                of the manager.\n\n        Returns:\n            Information about the request addition to the manager or None if the request was not added.\n        \"\"\"\n\n    async def add_requests(\n        self,\n        requests: Sequence[str | Request],\n        *,\n        forefront: bool = False,\n        batch_size: int = 1000,  # noqa: ARG002\n        wait_time_between_batches: timedelta = timedelta(seconds=1),  # noqa: ARG002\n        wait_for_all_requests_to_be_added: bool = False,  # noqa: ARG002\n        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,  # noqa: ARG002\n    ) -> None:\n        \"\"\"Add requests to the manager in 
batches.\n\n        Args:\n            requests: Requests to enqueue.\n            forefront: If True, add requests to the beginning of the queue.\n            batch_size: The number of requests to add in one batch.\n            wait_time_between_batches: Time to wait between adding batches.\n            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.\n            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.\n        \"\"\"\n        # Default and dumb implementation.\n        processed_requests = list[ProcessedRequest]()\n        for request in requests:\n            processed_request = await self.add_request(request, forefront=forefront)\n            if processed_request:\n                processed_requests.append(processed_request)\n\n    @abstractmethod\n    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None:\n        \"\"\"Reclaims a failed request back to the source, so that it can be returned for processing later again.\n\n        It is possible to modify the request data by supplying an updated request as a parameter.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/request_loaders/_request_manager_tandem.py",
    "content": "from __future__ import annotations\n\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.request_loaders import RequestManager\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee import Request\n    from crawlee.request_loaders import RequestLoader\n    from crawlee.storage_clients.models import ProcessedRequest\n\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Request loaders')\nclass RequestManagerTandem(RequestManager):\n    \"\"\"Implements a tandem behaviour for a pair of `RequestLoader` and `RequestManager`.\n\n    In this scenario, the contents of the \"loader\" get transferred into the \"manager\", allowing processing the requests\n    from both sources and also enqueueing new requests (not possible with plain `RequestLoader`).\n    \"\"\"\n\n    def __init__(self, request_loader: RequestLoader, request_manager: RequestManager) -> None:\n        self._read_only_loader = request_loader\n        self._read_write_manager = request_manager\n\n    @override\n    async def get_handled_count(self) -> int:\n        return await self._read_write_manager.get_handled_count()\n\n    @override\n    async def get_total_count(self) -> int:\n        return (await self._read_only_loader.get_total_count()) + (await self._read_write_manager.get_total_count())\n\n    @override\n    async def is_empty(self) -> bool:\n        return (await self._read_only_loader.is_empty()) and (await self._read_write_manager.is_empty())\n\n    @override\n    async def is_finished(self) -> bool:\n        return (await self._read_only_loader.is_finished()) and (await self._read_write_manager.is_finished())\n\n    @override\n    async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest | None:\n        return await self._read_write_manager.add_request(request, 
forefront=forefront)\n\n    @override\n    async def add_requests(\n        self,\n        requests: Sequence[str | Request],\n        *,\n        forefront: bool = False,\n        batch_size: int = 1000,\n        wait_time_between_batches: timedelta = timedelta(seconds=1),\n        wait_for_all_requests_to_be_added: bool = False,\n        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,\n    ) -> None:\n        return await self._read_write_manager.add_requests(\n            requests,\n            forefront=forefront,\n            batch_size=batch_size,\n            wait_time_between_batches=wait_time_between_batches,\n            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,\n            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,\n        )\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        if await self._read_only_loader.is_finished():\n            return await self._read_write_manager.fetch_next_request()\n\n        request = await self._read_only_loader.fetch_next_request()\n\n        if not request:\n            return await self._read_write_manager.fetch_next_request()\n\n        try:\n            await self._read_write_manager.add_request(request, forefront=True)\n        except Exception:\n            logger.exception(\n                'Adding request from the RequestLoader to the RequestManager failed, the request has been dropped',\n                extra={'url': request.url, 'unique_key': request.unique_key},\n            )\n            return None\n\n        await self._read_only_loader.mark_request_as_handled(request)\n\n        return await self._read_write_manager.fetch_next_request()\n\n    @override\n    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> None:\n        await self._read_write_manager.reclaim_request(request, forefront=forefront)\n\n    @override\n    async def 
mark_request_as_handled(self, request: Request) -> None:\n        await self._read_write_manager.mark_request_as_handled(request)\n\n    @override\n    async def drop(self) -> None:\n        await self._read_write_manager.drop()\n"
  },
  {
    "path": "src/crawlee/request_loaders/_sitemap_request_loader.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom collections import deque\nfrom contextlib import suppress\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Annotated, Any\n\nfrom pydantic import BaseModel, ConfigDict, Field\nfrom typing_extensions import override\n\nfrom crawlee import Request, RequestOptions\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.globs import Glob\nfrom crawlee._utils.recoverable_state import RecoverableState\nfrom crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap\nfrom crawlee.request_loaders._request_loader import RequestLoader\n\nif TYPE_CHECKING:\n    import re\n    from collections.abc import Callable, Sequence\n    from types import TracebackType\n\n    from crawlee import RequestTransformAction\n    from crawlee.http_clients import HttpClient\n    from crawlee.proxy_configuration import ProxyInfo\n    from crawlee.storage_clients.models import ProcessedRequest\n\n\nlogger = getLogger(__name__)\n\n\nclass SitemapRequestLoaderState(BaseModel):\n    \"\"\"State model for persisting sitemap request loader data.\n\n    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.\n    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved\n    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to\n    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a\n    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,\n    the loader was restarted from a saved state and the URL is skipped.\n\n    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is\n    incremented by 1. 
When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`\n    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is\n    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in\n    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.\n\n    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.\n    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and\n    `handled_count` is incremented by 1.\n\n    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both\n    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a\n    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is\n    restarting from a persisted state. 
If `in_progress` contains any URLs, they are moved back to `url_queue` and\n    `in_progress` is cleared.\n    \"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    url_queue: Annotated[deque[str], Field(alias='urlQueue')]\n    \"\"\"Queue of URLs extracted from sitemaps and ready for processing.\"\"\"\n\n    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()\n    \"\"\"Set of request URLs currently being processed.\"\"\"\n\n    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]\n    \"\"\"Queue of sitemap URLs that need to be fetched and processed.\"\"\"\n\n    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None\n    \"\"\"The sitemap URL currently being processed.\"\"\"\n\n    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()\n    \"\"\"URLs from the current sitemap that have been added to the queue.\"\"\"\n\n    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()\n    \"\"\"Set of processed sitemap URLs.\"\"\"\n\n    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False\n    \"\"\"Whether all sitemaps have been fully processed.\"\"\"\n\n    total_count: Annotated[int, Field(alias='totalCount')] = 0\n    \"\"\"Total number of URLs found and added to the queue from all processed sitemaps.\"\"\"\n\n    handled_count: Annotated[int, Field(alias='handledCount')] = 0\n    \"\"\"Number of URLs that have been successfully handled.\"\"\"\n\n\n@docs_group('Request loaders')\nclass SitemapRequestLoader(RequestLoader):\n    \"\"\"A request loader that reads URLs from sitemap(s).\n\n    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol\n    (https://www.sitemaps.org/protocol.html). 
It supports both XML and plain text sitemap formats.\n    Note that HTML pages containing links are not supported - those should be handled by regular crawlers\n    and the `enqueue_links` functionality.\n\n    The loader fetches and parses sitemaps in the background, allowing crawling to start\n    before all URLs are loaded. It supports filtering URLs using glob and regex patterns.\n\n    The loader supports state persistence, allowing it to resume from where it left off\n    after interruption when a `persist_state_key` is provided during initialization.\n    \"\"\"\n\n    def __init__(\n        self,\n        sitemap_urls: list[str],\n        http_client: HttpClient,\n        *,\n        proxy_info: ProxyInfo | None = None,\n        include: list[re.Pattern[Any] | Glob] | None = None,\n        exclude: list[re.Pattern[Any] | Glob] | None = None,\n        max_buffer_size: int = 200,\n        persist_state_key: str | None = None,\n        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,\n    ) -> None:\n        \"\"\"Initialize the sitemap request loader.\n\n        Args:\n            sitemap_urls: URLs of the sitemaps to load requests from.\n            proxy_info: Optional proxy to use for fetching sitemaps.\n            include: List of glob or regex patterns to include URLs.\n            exclude: List of glob or regex patterns to exclude URLs.\n            max_buffer_size: Maximum number of URLs to buffer in memory.\n            http_client: The instance of `HttpClient` to use for fetching sitemaps.\n            persist_state_key: A key for persisting the loader's state in the KeyValueStore.\n                When provided, allows resuming from where it left off after interruption.\n                If None, no state persistence occurs.\n            transform_request_function: An optional function to transform requests\n                generated by the loader. 
It receives `RequestOptions` with `url` and should return either\n                modified `RequestOptions` or a `RequestTransformAction`.\n        \"\"\"\n        self._http_client = http_client\n        self._sitemap_urls = sitemap_urls\n        self._include = include\n        self._exclude = exclude\n        self._proxy_info = proxy_info\n        self._max_buffer_size = max_buffer_size\n        self._transform_request_function = transform_request_function\n\n        # Synchronization for queue operations\n        self._queue_has_capacity = asyncio.Event()\n        self._queue_has_capacity.set()\n        self._queue_lock = asyncio.Lock()\n\n        # Initialize recoverable state\n        self._state = RecoverableState(\n            default_state=SitemapRequestLoaderState(\n                url_queue=deque(),\n                pending_sitemap_urls=deque(),\n            ),\n            persistence_enabled=bool(persist_state_key),\n            persist_state_key=persist_state_key or '',\n            logger=logger,\n        )\n\n        # Start background loading\n        self._loading_task = asyncio.create_task(self._load_sitemaps())\n\n    async def _get_state(self) -> SitemapRequestLoaderState:\n        \"\"\"Initialize and return the current state.\"\"\"\n        async with self._queue_lock:\n            if self._state.is_initialized:\n                return self._state.current_value\n\n            await self._state.initialize()\n\n            # Initialize pending sitemaps on first run\n            has_sitemap_for_processing = (\n                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url\n            )\n            if not has_sitemap_for_processing and not self._state.current_value.completed:\n                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)\n\n            if self._state.current_value.in_progress:\n                
self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)\n                self._state.current_value.in_progress.clear()\n\n            if (\n                self._state.current_value.url_queue\n                and len(self._state.current_value.url_queue) >= self._max_buffer_size\n            ):\n                # Notify that the queue is full\n                self._queue_has_capacity.clear()\n\n            return self._state.current_value\n\n    def _check_url_patterns(\n        self,\n        target_url: str,\n        include: Sequence[re.Pattern[Any] | Glob] | None,\n        exclude: Sequence[re.Pattern[Any] | Glob] | None,\n    ) -> bool:\n        \"\"\"Check if a URL matches configured include/exclude patterns.\"\"\"\n        # If the URL matches any `exclude` pattern, reject it\n        for pattern in exclude or ():\n            if isinstance(pattern, Glob):\n                pattern = pattern.regexp  # noqa: PLW2901\n\n            if pattern.match(target_url) is not None:\n                return False\n\n        # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL\n        if include is None:\n            return True\n\n        # If the URL matches any `include` pattern, accept it\n        for pattern in include:\n            if isinstance(pattern, Glob):\n                pattern = pattern.regexp  # noqa: PLW2901\n\n            if pattern.match(target_url) is not None:\n                return True\n\n        # The URL does not match any `include` pattern - reject it\n        return False\n\n    async def _load_sitemaps(self) -> None:\n        \"\"\"Load URLs from sitemaps in the background.\"\"\"\n        try:\n            # Get actual state\n            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):\n                # Get sitemap URL for parsing\n                sitemap_url = state.in_progress_sitemap_url\n                if not 
sitemap_url:\n                    sitemap_url = state.pending_sitemap_urls.popleft()\n                    # Skip processed urls\n                    if sitemap_url in state.processed_sitemap_urls:\n                        continue\n                    state.in_progress_sitemap_url = sitemap_url\n\n                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)\n\n                async for item in parse_sitemap(\n                    [SitemapSource(type='url', url=sitemap_url)],\n                    self._http_client,\n                    proxy_info=self._proxy_info,\n                    options=parse_options,\n                ):\n                    if isinstance(item, NestedSitemap):\n                        # Add nested sitemap to queue\n                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:\n                            state.pending_sitemap_urls.append(item.loc)\n                        continue\n\n                    if isinstance(item, SitemapUrl):\n                        url = item.loc\n\n                        state = await self._get_state()\n\n                        # Skip if already processed\n                        if url in state.current_sitemap_processed_urls:\n                            continue\n\n                        # Check if URL should be included\n                        if not self._check_url_patterns(url, self._include, self._exclude):\n                            continue\n\n                        # Check if we have capacity in the queue\n                        await self._queue_has_capacity.wait()\n\n                        state = await self._get_state()\n                        async with self._queue_lock:\n                            state.url_queue.append(url)\n                            state.current_sitemap_processed_urls.add(url)\n                            state.total_count += 1\n                            if 
len(state.url_queue) >= self._max_buffer_size:\n                                # Notify that the queue is full\n                                self._queue_has_capacity.clear()\n\n                # Clear current sitemap after processing\n                state = await self._get_state()\n                current_sitemap_url = state.in_progress_sitemap_url\n                state.in_progress_sitemap_url = None\n                if current_sitemap_url:\n                    state.processed_sitemap_urls.add(current_sitemap_url)\n                state.current_sitemap_processed_urls.clear()\n\n            # Mark as completed after processing all sitemap urls\n            state.completed = True\n\n        except Exception:\n            logger.exception('Error loading sitemaps')\n            raise\n\n    @override\n    async def get_total_count(self) -> int:\n        \"\"\"Return the total number of URLs found so far.\"\"\"\n        state = await self._get_state()\n        return state.total_count\n\n    @override\n    async def get_handled_count(self) -> int:\n        \"\"\"Return the number of URLs that have been handled.\"\"\"\n        state = await self._get_state()\n        return state.handled_count\n\n    @override\n    async def is_empty(self) -> bool:\n        \"\"\"Check if there are no more URLs to process.\"\"\"\n        state = await self._get_state()\n        return not state.url_queue\n\n    @override\n    async def is_finished(self) -> bool:\n        \"\"\"Check if all URLs have been processed.\"\"\"\n        state = await self._get_state()\n        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        \"\"\"Fetch the next request to process.\"\"\"\n        while not (await self.is_finished()):\n            state = await self._get_state()\n            if not state.url_queue:\n                await asyncio.sleep(0.1)\n                
continue\n\n            async with self._queue_lock:\n                url = state.url_queue.popleft()\n                request_option = RequestOptions(url=url)\n                if self._transform_request_function:\n                    transform_request_option = self._transform_request_function(request_option)\n                    if transform_request_option == 'skip':\n                        state.total_count -= 1\n                        continue\n                    if transform_request_option != 'unchanged':\n                        request_option = transform_request_option\n                request = Request.from_url(**request_option)\n                state.in_progress.add(request.url)\n                if len(state.url_queue) < self._max_buffer_size:\n                    self._queue_has_capacity.set()\n\n            return request\n\n        return None\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        \"\"\"Mark a request as successfully handled.\"\"\"\n        state = await self._get_state()\n        if request.url in state.in_progress:\n            state.in_progress.remove(request.url)\n            state.handled_count += 1\n        return None\n\n    async def abort_loading(self) -> None:\n        \"\"\"Abort the sitemap loading process.\"\"\"\n        if self._loading_task and not self._loading_task.done():\n            self._loading_task.cancel()\n            with suppress(asyncio.CancelledError):\n                await self._loading_task\n\n    async def start(self) -> None:\n        \"\"\"Start the sitemap loading process.\"\"\"\n        if self._loading_task and not self._loading_task.done():\n            return\n        self._loading_task = asyncio.create_task(self._load_sitemaps())\n\n    async def close(self) -> None:\n        \"\"\"Close the request loader.\"\"\"\n        await self.abort_loading()\n        await self._state.teardown()\n\n    async def __aenter__(self) -> 
SitemapRequestLoader:\n        \"\"\"Enter the context manager.\"\"\"\n        await self.start()\n        return self\n\n    async def __aexit__(\n        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None\n    ) -> None:\n        \"\"\"Exit the context manager.\"\"\"\n        await self.close()\n"
  },
  {
    "path": "src/crawlee/router.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import Awaitable, Callable\nfrom typing import Generic, TypeVar\n\nfrom crawlee._request import RequestState\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee._utils.docs import docs_group\n\n__all__ = ['Router']\n\nfrom crawlee.errors import UserHandlerTimeoutError\n\nTCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)\nRequestHandler = Callable[[TCrawlingContext], Awaitable[None]]\n\n\n@docs_group('Other')\nclass Router(Generic[TCrawlingContext]):\n    \"\"\"A request dispatching system that routes requests to registered handlers based on their labels.\n\n    The `Router` allows you to define and register request handlers for specific labels. When a request is received,\n    the router invokes the corresponding `request_handler` based on the request's `label`. If no matching handler\n    is found, the default handler is used.\n\n    ### Usage\n\n    ```python\n    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext\n    from crawlee.router import Router\n\n    router = Router[HttpCrawlingContext]()\n\n\n    # Handler for requests without a matching label handler\n    @router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Request without label {context.request.url} ...')\n\n\n    # Handler for category requests\n    @router.handler(label='category')\n    async def category_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Category request {context.request.url} ...')\n\n\n    # Handler for product requests\n    @router.handler(label='product')\n    async def product_handler(context: HttpCrawlingContext) -> None:\n        context.log.info(f'Product {context.request.url} ...')\n\n\n    async def main() -> None:\n        crawler = HttpCrawler(request_handler=router)\n        await crawler.run()\n    \"\"\"\n\n    def __init__(self) -> None:\n  
      self._default_handler: RequestHandler[TCrawlingContext] | None = None\n        self._handlers_by_label = dict[str, RequestHandler[TCrawlingContext]]()\n\n    def default_handler(self: Router, handler: RequestHandler[TCrawlingContext]) -> RequestHandler[TCrawlingContext]:\n        \"\"\"Register a default request handler.\n\n        The default request handler is invoked for requests that have either no label or a label for which we have\n        no matching handler.\n        \"\"\"\n        if self._default_handler is not None:\n            raise RuntimeError('A default handler is already configured')\n\n        self._default_handler = handler\n\n        return handler\n\n    def handler(\n        self,\n        label: str,\n    ) -> Callable[[RequestHandler[TCrawlingContext]], Callable[[TCrawlingContext], Awaitable]]:\n        \"\"\"Register a request handler based on a label.\n\n        This decorator registers a request handler for a specific label. The handler will be invoked only for requests\n        that have the exact same label.\n        \"\"\"\n        if label in self._handlers_by_label:\n            raise RuntimeError(f'A handler for label `{label}` is already registered')\n\n        def wrapper(handler: Callable[[TCrawlingContext], Awaitable]) -> Callable[[TCrawlingContext], Awaitable]:\n            self._handlers_by_label[label] = handler\n            return handler\n\n        return wrapper\n\n    async def __call__(self, context: TCrawlingContext) -> None:\n        \"\"\"Invoke a request handler that matches the request label (or the default).\"\"\"\n        context.request.state = RequestState.REQUEST_HANDLER\n        if context.request.label is None or context.request.label not in self._handlers_by_label:\n            if self._default_handler is None:\n                raise RuntimeError(\n                    f'No handler matches label `{context.request.label}` and no default handler is configured'\n                )\n\n            
user_defined_handler = self._default_handler\n        else:\n            user_defined_handler = self._handlers_by_label[context.request.label]\n\n        try:\n            return await user_defined_handler(context)\n        except asyncio.TimeoutError as e:\n            # Timeout in handler, but not timeout of handler.\n            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e\n"
  },
  {
    "path": "src/crawlee/sessions/__init__.py",
    "content": "from ._cookies import CookieParam, SessionCookies\nfrom ._session import Session\nfrom ._session_pool import SessionPool\n\n__all__ = ['CookieParam', 'Session', 'SessionCookies', 'SessionPool']\n"
  },
  {
    "path": "src/crawlee/sessions/_cookies.py",
    "content": "from __future__ import annotations\n\nfrom copy import deepcopy\nfrom http.cookiejar import Cookie, CookieJar\nfrom typing import TYPE_CHECKING, Any, Literal\n\nfrom typing_extensions import NotRequired, Required, TypedDict\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Iterator\n    from typing import TypeGuard\n\n\n@docs_group('Session management')\nclass CookieParam(TypedDict, total=False):\n    \"\"\"Dictionary representation of cookies for `SessionCookies.set` method.\"\"\"\n\n    name: Required[str]\n    \"\"\"Cookie name.\"\"\"\n\n    value: Required[str]\n    \"\"\"Cookie value.\"\"\"\n\n    domain: NotRequired[str]\n    \"\"\"Domain for which the cookie is set.\"\"\"\n\n    path: NotRequired[str]\n    \"\"\"Path on the specified domain for which the cookie is set.\"\"\"\n\n    secure: NotRequired[bool]\n    \"\"\"Set the `Secure` flag for the cookie.\"\"\"\n\n    http_only: NotRequired[bool]\n    \"\"\"Set the `HttpOnly` flag for the cookie.\"\"\"\n\n    expires: NotRequired[int]\n    \"\"\"Expiration date for the cookie, None for a session cookie.\"\"\"\n\n    same_site: NotRequired[Literal['Lax', 'None', 'Strict']]\n    \"\"\"Set the `SameSite` attribute for the cookie.\"\"\"\n\n\nclass PlaywrightCookieParam(TypedDict, total=False):\n    \"\"\"Cookie parameters in Playwright format with camelCase naming.\"\"\"\n\n    name: NotRequired[str]\n    value: NotRequired[str]\n    domain: NotRequired[str]\n    path: NotRequired[str]\n    secure: NotRequired[bool]\n    httpOnly: NotRequired[bool]\n    expires: NotRequired[float]\n    sameSite: NotRequired[Literal['Lax', 'None', 'Strict']]\n    partitionKey: NotRequired[str | None]\n\n\n@docs_group('Session management')\nclass SessionCookies:\n    \"\"\"Storage cookies for session with browser-compatible serialization and deserialization.\"\"\"\n\n    def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | 
None = None) -> None:\n        if isinstance(cookies, CookieJar):\n            self._jar = cookies\n            return\n\n        self._jar = CookieJar()\n\n        if isinstance(cookies, list):\n            for item in cookies:\n                self.set(**item)\n\n        elif isinstance(cookies, SessionCookies):\n            for cookie in cookies.jar:\n                self._jar.set_cookie(cookie)\n\n        elif isinstance(cookies, dict):\n            cookies_dict: dict[str, str] = cookies\n            for key, value in cookies_dict.items():\n                self.set(key, value)\n\n    @property\n    def jar(self) -> CookieJar:\n        \"\"\"The cookie jar instance.\"\"\"\n        return self._jar\n\n    def set(\n        self,\n        name: str,\n        value: str,\n        *,\n        domain: str = '',\n        path: str = '/',\n        expires: int | None = None,\n        http_only: bool = False,\n        secure: bool = False,\n        same_site: Literal['Lax', 'None', 'Strict'] | None = None,\n        **_kwargs: Any,  # Unknown parameters will be ignored.\n    ) -> None:\n        \"\"\"Create and store a cookie with modern browser attributes.\n\n        Args:\n            name: Cookie name.\n            value: Cookie value.\n            domain: Cookie domain.\n            path: Cookie path.\n            expires: Cookie expiration timestamp.\n            http_only: Whether cookie is HTTP-only.\n            secure: Whether cookie requires secure context.\n            same_site: SameSite cookie attribute value.\n        \"\"\"\n        cookie = Cookie(\n            version=0,\n            name=name,\n            value=value,\n            port=None,\n            port_specified=False,\n            domain=domain,\n            domain_specified=bool(domain),\n            domain_initial_dot=domain.startswith('.'),\n            path=path,\n            path_specified=bool(path),\n            secure=secure,\n            expires=expires,\n            discard=True,\n    
        comment=None,\n            comment_url=None,\n            rest={'HttpOnly': ''} if http_only else {},\n            rfc2109=False,\n        )\n\n        if same_site:\n            cookie.set_nonstandard_attr('SameSite', same_site)\n\n        self.jar.set_cookie(cookie)\n\n    def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam:\n        \"\"\"Convert `http.cookiejar.Cookie` to dictionary format.\n\n        Args:\n            cookie: Cookie object to convert.\n        \"\"\"\n        cookie_dict = CookieParam(\n            name=cookie.name,\n            value=cookie.value or '',\n            domain=cookie.domain,\n            path=cookie.path,\n            secure=cookie.secure,\n            http_only=cookie.has_nonstandard_attr('HttpOnly'),\n        )\n\n        if cookie.expires:\n            cookie_dict['expires'] = cookie.expires\n\n        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):\n            cookie_dict['same_site'] = same_site\n\n        return cookie_dict\n\n    def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookieParam:\n        \"\"\"Convert internal cookie to Playwright format.\"\"\"\n        result: dict = dict(cookie_dict)\n\n        if 'http_only' in result:\n            result['httpOnly'] = result.pop('http_only')\n        if 'same_site' in result:\n            result['sameSite'] = result.pop('same_site')\n        if 'expires' in result:\n            result['expires'] = float(result['expires'])\n\n        return PlaywrightCookieParam(**result)\n\n    def _from_playwright(self, cookie_dict: PlaywrightCookieParam) -> CookieParam:\n        \"\"\"Convert Playwright cookie to internal format.\"\"\"\n        result: dict = dict(cookie_dict)\n\n        if 'httpOnly' in result:\n            result['http_only'] = result.pop('httpOnly')\n        if 'sameSite' in result:\n            result['same_site'] = result.pop('sameSite')\n        if 'expires' in result:\n         
   expires = int(result['expires'])\n            result['expires'] = None if expires == -1 else expires\n\n        return CookieParam(name=result.pop('name', ''), value=result.pop('value', ''), **result)\n\n    def get_cookies_as_dicts(self) -> list[CookieParam]:\n        \"\"\"Convert cookies to a list with `CookieParam` dicts.\"\"\"\n        return [self._convert_cookie_to_dict(cookie) for cookie in self.jar]\n\n    def store_cookie(self, cookie: Cookie) -> None:\n        \"\"\"Store a Cookie object in the session cookie jar.\n\n        Args:\n            cookie: The Cookie object to store in the jar.\n        \"\"\"\n        self.jar.set_cookie(cookie)\n\n    def store_cookies(self, cookies: list[Cookie]) -> None:\n        \"\"\"Store multiple cookie objects in the session cookie jar.\n\n        Args:\n            cookies: A list of cookie objects to store in the jar.\n        \"\"\"\n        for cookie in cookies:\n            self.store_cookie(cookie)\n        self._jar.clear_expired_cookies()\n\n    def set_cookies(self, cookie_dicts: list[CookieParam]) -> None:\n        \"\"\"Create and store cookies from their dictionary representations.\n\n        Args:\n            cookie_dicts: List of dictionaries where each dict represents cookie parameters.\n        \"\"\"\n        for cookie_dict in cookie_dicts:\n            self.set(**cookie_dict)\n        self._jar.clear_expired_cookies()\n\n    def get_cookies_as_playwright_format(self) -> list[PlaywrightCookieParam]:\n        \"\"\"Get cookies in playwright format.\"\"\"\n        return [self._to_playwright(cookie) for cookie in self.get_cookies_as_dicts()]\n\n    def set_cookies_from_playwright_format(self, pw_cookies: list[PlaywrightCookieParam]) -> None:\n        \"\"\"Set cookies from playwright format.\"\"\"\n        for pw_cookie in pw_cookies:\n            cookie_param = self._from_playwright(pw_cookie)\n            self.set(**cookie_param)\n        self._jar.clear_expired_cookies()\n\n    def 
__deepcopy__(self, memo: dict[int, Any] | None) -> SessionCookies:\n        # This is necessary because `CookieJar` uses `RLock`, which prevents `deepcopy`.\n        cookie_dicts = self.get_cookies_as_dicts()\n        return self.__class__(deepcopy(cookie_dicts, memo))\n\n    def __len__(self) -> int:\n        return len(self._jar)\n\n    def __setitem__(self, name: str, value: str) -> None:\n        self.set(name, value)\n\n    def __getitem__(self, name: str) -> str | None:\n        for cookie in self._jar:\n            if cookie.name == name:\n                return cookie.value\n        raise KeyError(f\"Cookie '{name}' not found\")\n\n    def __iter__(self) -> Iterator[CookieParam]:\n        return (self._convert_cookie_to_dict(cookie) for cookie in self._jar)\n\n    def __repr__(self) -> str:\n        cookies_str: str = ', '.join(\n            [f'<Cookie {cookie.name}={cookie.value} for {cookie.domain}{cookie.path}>' for cookie in self._jar]\n        )\n        return f'<SessionCookies[{cookies_str}]>'\n\n    def __bool__(self) -> bool:\n        for _ in self._jar:\n            return True\n        return False\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, SessionCookies):\n            return NotImplemented\n\n        if len(self) != len(other):\n            return False\n\n        self_keys = {(cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar}\n        other_keys = {(cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in other.jar}\n\n        return self_keys == other_keys\n\n    def __hash__(self) -> int:\n        \"\"\"Return hash based on the cookies key attributes.\"\"\"\n        cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)\n        return hash(cookie_tuples)\n\n    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:\n        return value in {'Lax', 'None', 'Strict'}\n"
  },
  {
    "path": "src/crawlee/sessions/_models.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timedelta\nfrom typing import Annotated, Any\n\nfrom pydantic import (\n    BaseModel,\n    BeforeValidator,\n    ConfigDict,\n    Field,\n    GetPydanticSchema,\n    PlainSerializer,\n    computed_field,\n)\n\nfrom ._cookies import CookieParam\nfrom ._session import Session\n\n\nclass SessionModel(BaseModel):\n    \"\"\"Model for a Session object.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    id: Annotated[str, Field(alias='id')]\n    max_age: Annotated[timedelta, Field(alias='maxAge')]\n    user_data: Annotated[dict, Field(alias='userData')]\n    max_error_score: Annotated[float, Field(alias='maxErrorScore')]\n    error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]\n    created_at: Annotated[datetime, Field(alias='createdAt')]\n    usage_count: Annotated[int, Field(alias='usageCount')]\n    max_usage_count: Annotated[int, Field(alias='maxUsageCount')]\n    error_score: Annotated[float, Field(alias='errorScore')]\n    cookies: Annotated[list[CookieParam], Field(alias='cookies')]\n    blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')]\n\n\nclass SessionPoolModel(BaseModel):\n    \"\"\"Model for a SessionPool object.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)\n\n    max_pool_size: Annotated[int, Field(alias='maxPoolSize')]\n\n    sessions: Annotated[\n        dict[\n            str,\n            Annotated[\n                Session, GetPydanticSchema(lambda _, handler: handler(Any))\n            ],  # handler(Any) is fine - we validate manually in the BeforeValidator\n        ],\n        Field(alias='sessions'),\n        PlainSerializer(\n            lambda value: [session.get_state().model_dump(by_alias=True) for session in value.values()],\n            return_type=list,\n        ),\n        BeforeValidator(\n            lambda value: {\n       
         session.id: session\n                for item in value\n                if (session := Session.from_model(SessionModel.model_validate(item, by_alias=True)))\n            }\n        ),\n    ]\n\n    @computed_field(alias='sessionCount')\n    @property\n    def session_count(self) -> int:\n        \"\"\"Get the total number of sessions currently maintained in the pool.\"\"\"\n        return len(self.sessions)\n\n    @computed_field(alias='usableSessionCount')\n    @property\n    def usable_session_count(self) -> int:\n        \"\"\"Get the number of sessions that are currently usable.\"\"\"\n        return len([session for _, session in self.sessions.items() if session.is_usable])\n\n    @computed_field(alias='retiredSessionCount')\n    @property\n    def retired_session_count(self) -> int:\n        \"\"\"Get the number of sessions that are no longer usable.\"\"\"\n        return self.session_count - self.usable_session_count\n"
  },
  {
    "path": "src/crawlee/sessions/_session.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.9.0/packages/core/src/session_pool/session.ts\n\nfrom __future__ import annotations\n\nfrom datetime import datetime, timedelta, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, ClassVar, Literal, overload\n\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.sessions._cookies import CookieParam, SessionCookies\n\nif TYPE_CHECKING:\n    from http.cookiejar import CookieJar\n\n    from crawlee.sessions._models import SessionModel\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Session management')\nclass Session:\n    \"\"\"Represent a single user session, managing cookies, error states, and usage limits.\n\n    A `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially\n    a unique browser fingerprint. It maintains its internal state, which can include custom user data\n    (e.g., authorization tokens or headers) and tracks its usability through metrics such as error score,\n    usage count, and expiration.\n    \"\"\"\n\n    _DEFAULT_BLOCKED_STATUS_CODES: ClassVar = [401, 403, 429]\n    \"\"\"Default status codes that indicate a session is blocked.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        id: str | None = None,\n        max_age: timedelta = timedelta(minutes=50),\n        user_data: dict | None = None,\n        max_error_score: float = 3.0,\n        error_score_decrement: float = 0.5,\n        created_at: datetime | None = None,\n        usage_count: int = 0,\n        max_usage_count: int = 50,\n        error_score: float = 0.0,\n        cookies: SessionCookies | CookieJar | dict[str, str] | list[CookieParam] | None = None,\n        blocked_status_codes: list | None = None,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            id: Unique identifier for the session, autogenerated if not provided.\n          
  max_age: Time duration after which the session expires.\n            user_data: Custom user data associated with the session.\n            max_error_score: Threshold score beyond which the session is considered blocked.\n            error_score_decrement: Value by which the error score is decremented on successful operations.\n            created_at: Timestamp when the session was created, defaults to current UTC time if not provided.\n            usage_count: Number of times the session has been used.\n            max_usage_count: Maximum allowable uses of the session before it is considered expired.\n            error_score: Current error score of the session.\n            cookies: Cookies associated with the session.\n            blocked_status_codes: HTTP status codes that indicate a session should be blocked.\n        \"\"\"\n        self._id = id or crypto_random_object_id(length=10)\n        self._max_age = max_age\n        self._user_data = user_data or {}\n        self._max_error_score = max_error_score\n        self._error_score_decrement = error_score_decrement\n        self._created_at = created_at or datetime.now(timezone.utc)\n        self._usage_count = usage_count\n        self._max_usage_count = max_usage_count\n        self._error_score = error_score\n        self._cookies = SessionCookies(cookies) or SessionCookies()\n        self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES)\n\n    @classmethod\n    def from_model(cls, model: SessionModel) -> Session:\n        \"\"\"Initialize a new instance from a `SessionModel`.\"\"\"\n        cookies = SessionCookies(model.cookies)\n        return cls(**model.model_dump(exclude={'cookies'}), cookies=cookies)\n\n    def __repr__(self) -> str:\n        \"\"\"Get a string representation.\"\"\"\n        return f'<{self.__class__.__name__} {self.get_state(as_dict=False)}>'\n\n    def __eq__(self, other: object) -> bool:\n        \"\"\"Compare two sessions for 
equality.\"\"\"\n        if not isinstance(other, Session):\n            return NotImplemented\n        return self.get_state(as_dict=True) == other.get_state(as_dict=True)\n\n    def __hash__(self) -> int:\n        \"\"\"Return hash based on the session state.\"\"\"\n        state = self.get_state(as_dict=True)\n        hashable_items = list[tuple[str, int]]()\n\n        # Convert dict to tuple of sorted items for consistent hashing. Exclude non-hashable values like cookies\n        # and convert them to their string representation.\n        for key, value in sorted(state.items()):\n            if key == 'cookies':\n                # Use hash of the cookies object if it has __hash__ method.\n                hashable_items.append((key, hash(self._cookies)))\n            elif isinstance(value, (list, dict)):\n                # Convert collections to tuples for hashing.\n                if isinstance(value, list):\n                    hashable_items.append((key, hash(tuple(value))))\n                else:\n                    hashable_items.append((key, hash(tuple(sorted(value.items())))))\n            else:\n                hashable_items.append((key, hash(value)))\n\n        return hash(tuple(hashable_items))\n\n    @property\n    def id(self) -> str:\n        \"\"\"Get the session ID.\"\"\"\n        return self._id\n\n    @property\n    def user_data(self) -> dict:\n        \"\"\"Get the user data.\"\"\"\n        return self._user_data\n\n    @property\n    def cookies(self) -> SessionCookies:\n        \"\"\"Get the cookies.\"\"\"\n        return self._cookies\n\n    @property\n    def error_score(self) -> float:\n        \"\"\"Get the current error score.\"\"\"\n        return self._error_score\n\n    @property\n    def usage_count(self) -> float:\n        \"\"\"Get the current usage count.\"\"\"\n        return self._usage_count\n\n    @property\n    def expires_at(self) -> datetime:\n        \"\"\"Get the expiration datetime of the session.\"\"\"\n        
return self._created_at + self._max_age\n\n    @property\n    def is_blocked(self) -> bool:\n        \"\"\"Indicate whether the session is blocked based on the error score.\"\"\"\n        return self._error_score >= self._max_error_score\n\n    @property\n    def is_expired(self) -> bool:\n        \"\"\"Indicate whether the session is expired based on the current time.\"\"\"\n        return datetime.now(timezone.utc) >= self.expires_at\n\n    @property\n    def is_max_usage_count_reached(self) -> bool:\n        \"\"\"Indicate whether the session has reached its maximum usage limit.\"\"\"\n        return self._usage_count >= self._max_usage_count\n\n    @property\n    def is_usable(self) -> bool:\n        \"\"\"Determine if the session is usable for next requests.\"\"\"\n        return not (self.is_blocked or self.is_expired or self.is_max_usage_count_reached)\n\n    @overload\n    def get_state(self, *, as_dict: Literal[True]) -> dict: ...\n\n    @overload\n    def get_state(self, *, as_dict: Literal[False]) -> SessionModel: ...\n\n    def get_state(self, *, as_dict: bool = False) -> SessionModel | dict:\n        \"\"\"Retrieve the current state of the session either as a model or as a dictionary.\"\"\"\n        from ._models import SessionModel  # noqa: PLC0415\n\n        model = SessionModel(\n            id=self._id,\n            max_age=self._max_age,\n            user_data=self._user_data,\n            max_error_score=self._max_error_score,\n            error_score_decrement=self._error_score_decrement,\n            created_at=self._created_at,\n            usage_count=self._usage_count,\n            max_usage_count=self._max_usage_count,\n            error_score=self._error_score,\n            cookies=self._cookies.get_cookies_as_dicts(),\n            blocked_status_codes=list(self._blocked_status_codes),\n        )\n        if as_dict:\n            return model.model_dump()\n        return model\n\n    def mark_good(self) -> None:\n        \"\"\"Mark the 
session as good. Should be called after a successful session usage.\"\"\"\n        self._usage_count += 1\n\n        if self._error_score > 0:\n            self._error_score = max(0, self._error_score - self._error_score_decrement)\n\n        # Retire the session if it is not usable anymore\n        if not self.is_usable:\n            self.retire()\n\n    def mark_bad(self) -> None:\n        \"\"\"Mark the session as bad after an unsuccessful session usage.\"\"\"\n        self._error_score += 1\n        self._usage_count += 1\n\n        # Retire the session if it is not usable anymore\n        if not self.is_usable:\n            self.retire()\n\n    def retire(self) -> None:\n        \"\"\"Retire the session by setting the error score to the maximum value.\n\n        This method should be used if the session usage was unsuccessful and you are sure that it is because of\n        the session configuration and not any external matters, for example when the server returns a 403 status\n        code. If the session does not work due to external factors, such as a 5XX server error, you probably\n        want to use the `mark_bad` method instead.\n        \"\"\"\n        self._error_score += self._max_error_score\n        self._usage_count += 1\n        # Note: We emit an event here because of the Puppeteer in TS implementation.\n\n    def is_blocked_status_code(\n        self,\n        *,\n        status_code: int,\n        ignore_http_error_status_codes: set[int] | None = None,\n    ) -> bool:\n        \"\"\"Evaluate whether a session should be retired based on the received HTTP status code.\n\n        Args:\n            status_code: The HTTP status code received from a server response.\n            ignore_http_error_status_codes: Optional status codes to allow suppression of\n            codes from `blocked_status_codes`.\n\n        Returns:\n            True if the session should be retired, False otherwise.\n        \"\"\"\n        return status_code in (self._blocked_status_codes 
- (ignore_http_error_status_codes or set()))\n"
  },
  {
    "path": "src/crawlee/sessions/_session_pool.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.9.0/packages/core/src/session_pool/session_pool.ts\n\nfrom __future__ import annotations\n\nimport random\nfrom collections.abc import Callable\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Literal, overload\n\nfrom crawlee import service_locator\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recoverable_state import RecoverableState\nfrom crawlee.sessions import Session\nfrom crawlee.sessions._models import SessionPoolModel\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from crawlee.events import EventManager\n\nlogger = getLogger(__name__)\n\nCreateSessionFunctionType = Callable[[], Session]\n\n\n@docs_group('Session management')\nclass SessionPool:\n    \"\"\"A pool of sessions that are managed, rotated, and persisted based on usage and age.\n\n    It ensures effective session management by maintaining a pool of sessions and rotating them based on\n    usage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their\n    lifecycle, and optionally persist the state to enable recovery.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        max_pool_size: int = 1000,\n        create_session_settings: dict | None = None,\n        create_session_function: CreateSessionFunctionType | None = None,\n        event_manager: EventManager | None = None,\n        persistence_enabled: bool = False,\n        persist_state_kvs_name: str | None = None,\n        persist_state_key: str = 'CRAWLEE_SESSION_POOL_STATE',\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Args:\n            max_pool_size: Maximum number of sessions to maintain in the pool. You can add more sessions to the pool\n                by using the `add_session` method.\n            create_session_settings: Settings for creating new session instances. 
If None, default settings will\n                be used. Do not set it if you are providing a `create_session_function`.\n            create_session_function: A callable to create new session instances. If None, default session settings\n                will be used. Do not set it if you are providing `create_session_settings`.\n            event_manager: The event manager to handle events like persist state.\n            persistence_enabled: Flag to enable or disable state persistence of the pool.\n            persist_state_kvs_name: The name of the `KeyValueStore` used for state persistence.\n            persist_state_key: The key under which the session pool's state is stored in the `KeyValueStore`.\n        \"\"\"\n        if event_manager:\n            service_locator.set_event_manager(event_manager)\n\n        self._state = RecoverableState(\n            default_state=SessionPoolModel(\n                max_pool_size=max_pool_size,\n                sessions={},\n            ),\n            logger=logger,\n            persistence_enabled=persistence_enabled,\n            persist_state_kvs_name=persist_state_kvs_name,\n            persist_state_key=persist_state_key or 'CRAWLEE_SESSION_POOL_STATE',\n        )\n\n        self._max_pool_size = max_pool_size\n        self._session_settings = create_session_settings or {}\n        self._create_session_function = create_session_function\n        self._persistence_enabled = persistence_enabled\n\n        if self._create_session_function and self._session_settings:\n            raise ValueError('Both `create_session_settings` and `create_session_function` cannot be provided.')\n\n        # Flag to indicate the context state.\n        self._active = False\n\n    def __repr__(self) -> str:\n        \"\"\"Get a string representation.\"\"\"\n        return f'<{self.__class__.__name__} {self.get_state(as_dict=False)}>'\n\n    @property\n    def session_count(self) -> int:\n        \"\"\"Get the total number of sessions 
currently maintained in the pool.\"\"\"\n        return len(self._state.current_value.sessions)\n\n    @property\n    def usable_session_count(self) -> int:\n        \"\"\"Get the number of sessions that are currently usable.\"\"\"\n        return self._state.current_value.usable_session_count\n\n    @property\n    def retired_session_count(self) -> int:\n        \"\"\"Get the number of sessions that are no longer usable.\"\"\"\n        return self._state.current_value.retired_session_count\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    async def __aenter__(self) -> SessionPool:\n        \"\"\"Initialize the pool upon entering the context manager.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        self._active = True\n\n        state = await self._state.initialize()\n        state.max_pool_size = self._max_pool_size\n        self._remove_retired_sessions()\n\n        if not state.sessions:\n            await self._fill_sessions_to_max()\n\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Deinitialize the pool upon exiting the context manager.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        await self._state.teardown()\n\n        self._active = False\n\n    @overload\n    def get_state(self, *, as_dict: Literal[True]) -> dict: ...\n\n    @overload\n    def get_state(self, *, as_dict: Literal[False]) -> SessionPoolModel: ...\n\n    @ensure_context\n    
def get_state(self, *, as_dict: bool = False) -> SessionPoolModel | dict:\n        \"\"\"Retrieve the current state of the pool either as a model or as a dictionary.\"\"\"\n        model = self._state.current_value.model_copy(deep=True)\n        if as_dict:\n            return model.model_dump()\n        return model\n\n    @ensure_context\n    def add_session(self, session: Session) -> None:\n        \"\"\"Add an externally created session to the pool.\n\n        This is intended only for the cases when you want to add a session that was created outside of the pool.\n        Otherwise, the pool will create new sessions automatically.\n\n        Args:\n            session: The session to add to the pool.\n        \"\"\"\n        state = self._state.current_value\n\n        if session.id in state.sessions:\n            logger.warning(f'Session with ID {session.id} already exists in the pool.')\n            return\n        state.sessions[session.id] = session\n\n    @ensure_context\n    async def get_session(self) -> Session:\n        \"\"\"Retrieve a random session from the pool.\n\n        This method first ensures the session pool is at its maximum capacity. If the random session is not usable,\n        retired sessions are removed and a new session is created and returned.\n\n        Returns:\n            The session object.\n        \"\"\"\n        await self._fill_sessions_to_max()\n        session = self._get_random_session()\n\n        if session.is_usable:\n            return session\n\n        # If the random session is not usable, clean up and create a new session\n        self._remove_retired_sessions()\n        return await self._create_new_session()\n\n    @ensure_context\n    async def get_session_by_id(self, session_id: str) -> Session | None:\n        \"\"\"Retrieve a session by ID from the pool.\n\n        This method first ensures the session pool is at its maximum capacity. It then tries to retrieve a specific\n        session by ID. 
If the session is not found or not usable, `None` is returned.\n\n        Args:\n            session_id: The ID of the session to retrieve.\n\n        Returns:\n            The session object if found and usable, otherwise `None`.\n        \"\"\"\n        await self._fill_sessions_to_max()\n        session = self._state.current_value.sessions.get(session_id)\n\n        if not session:\n            logger.warning(f'Session with ID {session_id} not found.')\n            return None\n\n        if not session.is_usable:\n            logger.warning(f'Session with ID {session_id} is not usable.')\n            return None\n\n        return session\n\n    async def reset_store(self) -> None:\n        \"\"\"Reset the KVS where the pool state is persisted.\"\"\"\n        await self._state.reset()\n\n    async def _create_new_session(self) -> Session:\n        \"\"\"Create a new session, add it to the pool and return it.\"\"\"\n        if self._create_session_function:\n            new_session = self._create_session_function()\n        else:\n            new_session = Session(**self._session_settings)\n        self._state.current_value.sessions[new_session.id] = new_session\n        return new_session\n\n    async def _fill_sessions_to_max(self) -> None:\n        \"\"\"Fill the pool with sessions to the maximum size.\"\"\"\n        for _ in range(self._max_pool_size - self.session_count):\n            await self._create_new_session()\n\n    def _get_random_session(self) -> Session:\n        \"\"\"Get a random session from the pool.\"\"\"\n        state = self._state.current_value\n        if not state.sessions:\n            raise ValueError('No sessions available in the pool.')\n        return random.choice(list(state.sessions.values()))\n\n    def _remove_retired_sessions(self) -> None:\n        \"\"\"Remove all sessions from the pool that are no longer usable.\"\"\"\n        state = self._state.current_value\n        state.sessions = {session.id: session for session in 
state.sessions.values() if session.is_usable}\n"
  },
  {
    "path": "src/crawlee/sessions/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/statistics/__init__.py",
    "content": "from ._models import FinalStatistics, StatisticsState\nfrom ._statistics import Statistics\n\n__all__ = ['FinalStatistics', 'Statistics', 'StatisticsState']\n"
  },
  {
    "path": "src/crawlee/statistics/_error_snapshotter.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport hashlib\nimport re\nimport string\nfrom typing import TYPE_CHECKING\n\nfrom crawlee.storages import KeyValueStore\n\nif TYPE_CHECKING:\n    from crawlee._types import BasicCrawlingContext\n\n\nclass ErrorSnapshotter:\n    MAX_ERROR_CHARACTERS = 30\n    MAX_HASH_LENGTH = 30\n    MAX_FILENAME_LENGTH = 250\n    BASE_MESSAGE = 'An error occurred'\n    SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT'\n    ALLOWED_CHARACTERS = string.ascii_letters + string.digits + '!-_.'\n\n    def __init__(self, *, snapshot_kvs_name: str | None = None) -> None:\n        self._kvs_name = snapshot_kvs_name\n\n    async def capture_snapshot(\n        self,\n        error_message: str,\n        file_and_line: str,\n        context: BasicCrawlingContext,\n    ) -> None:\n        \"\"\"Capture error snapshot and save it to key value store.\n\n        It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because\n        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`\n        returned without an exception. 
ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with\n        an exception.\n\n        Args:\n            error_message: Used in filename of the snapshot.\n            file_and_line: Used in filename of the snapshot.\n            context: Context that is used to get the snapshot.\n        \"\"\"\n        if snapshot := await context.get_snapshot():\n            kvs = await KeyValueStore.open(name=self._kvs_name)\n            snapshot_base_name = self._get_snapshot_base_name(error_message, file_and_line)\n            snapshot_save_tasks = list[asyncio.Task]()\n\n            if snapshot.html:\n                snapshot_save_tasks.append(\n                    asyncio.create_task(self._save_html(kvs, snapshot.html, base_name=snapshot_base_name))\n                )\n\n            if snapshot.screenshot:\n                snapshot_save_tasks.append(\n                    asyncio.create_task(self._save_screenshot(kvs, snapshot.screenshot, base_name=snapshot_base_name))\n                )\n\n            await asyncio.gather(*snapshot_save_tasks)\n\n    async def _save_html(self, kvs: KeyValueStore, html: str, base_name: str) -> None:\n        file_name = f'{base_name}.html'\n        await kvs.set_value(file_name, html, content_type='text/html')\n\n    async def _save_screenshot(self, kvs: KeyValueStore, screenshot: bytes, base_name: str) -> None:\n        file_name = f'{base_name}.jpg'\n        await kvs.set_value(file_name, screenshot, content_type='image/jpeg')\n\n    def _sanitize_filename(self, filename: str) -> str:\n        return re.sub(f'[^{re.escape(self.ALLOWED_CHARACTERS)}]', '', filename[: self.MAX_FILENAME_LENGTH])\n\n    def _get_snapshot_base_name(self, error_message: str, file_and_line: str) -> str:\n        sha1_hash = hashlib.sha1()  # noqa:S324 # Collisions related attacks are of no concern here.\n        sha1_hash.update(file_and_line.encode('utf-8'))\n        hashed_file_and_text = sha1_hash.hexdigest()[: 
self.MAX_HASH_LENGTH]\n        error_message_start = (error_message or self.BASE_MESSAGE)[: self.MAX_ERROR_CHARACTERS]\n        return self._sanitize_filename(f'{self.SNAPSHOT_PREFIX}_{hashed_file_and_text}_{error_message_start}')\n"
  },
  {
    "path": "src/crawlee/statistics/_error_tracker.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/error_tracker.ts\n\nfrom __future__ import annotations\n\nimport traceback\nfrom collections import Counter, defaultdict\nfrom itertools import zip_longest\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom crawlee.statistics._error_snapshotter import ErrorSnapshotter\n\nif TYPE_CHECKING:\n    from crawlee._types import BasicCrawlingContext\n\nGroupName = str | None\nErrorFilenameGroups = dict[GroupName, dict[GroupName, Counter[GroupName]]]\n\n\nlogger = getLogger(__name__)\n\n\nclass ErrorTracker:\n    \"\"\"Track errors and aggregates their counts by similarity.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        snapshot_kvs_name: str | None = None,\n        show_error_name: bool = True,\n        show_file_and_line_number: bool = True,\n        show_error_message: bool = True,\n        show_full_message: bool = False,\n        save_error_snapshots: bool = False,\n    ) -> None:\n        self.error_snapshotter = ErrorSnapshotter(snapshot_kvs_name=snapshot_kvs_name) if save_error_snapshots else None\n        self.show_error_name = show_error_name\n        self.show_file_and_line_number = show_file_and_line_number\n        self.show_error_message = show_error_message\n        if show_full_message and not show_error_message:\n            raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')\n        self.show_full_message = show_full_message\n        self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))\n        self._early_reported_errors = set[int]()\n\n    async def add(\n        self,\n        error: Exception,\n        *,\n        context: BasicCrawlingContext | None = None,\n        early: bool = False,\n    ) -> None:\n        \"\"\"Add an error in the statistics.\n\n        Args:\n            error: Error to be added to statistics.\n            context: 
Context used to collect error snapshot.\n            early: Flag indicating that the error is added earlier than usual to have access to resources that will be\n             closed before normal error collection. This prevents double reporting during normal error collection.\n        \"\"\"\n        if id(error) in self._early_reported_errors:\n            # Error had to be collected earlier before relevant resources are closed.\n            self._early_reported_errors.remove(id(error))\n            return\n\n        if early:\n            self._early_reported_errors.add(id(error))\n\n        error_group_name = error.__class__.__name__ if self.show_error_name else None\n        error_group_message = self._get_error_message(error)\n        new_error_group_message = ''  # In case of wildcard similarity match\n        error_group_file_and_line = self._get_file_and_line(error)\n\n        # First two levels are grouped only in case of exact match.\n        specific_groups = self._errors[error_group_file_and_line][error_group_name]\n\n        # Lowest level group is matched by similarity.\n        if error_group_message in specific_groups:\n            # Exact match.\n            specific_groups.update([error_group_message])\n        else:\n            for existing_error_group_message in specific_groups:\n                # Add to first group with similar text. Modify text with wildcard characters if necessary.\n                if new_error_group_message := self._create_generic_message(\n                    existing_error_group_message, error_group_message\n                ):\n                    # Replace old name.\n                    specific_groups[new_error_group_message] = specific_groups.pop(existing_error_group_message)\n                    # Increment.\n                    specific_groups.update([new_error_group_message])\n                    break\n            else:\n                # No similar message found. 
Create new group.\n                self._errors[error_group_file_and_line][error_group_name].update([error_group_message])\n\n        if (\n            self._errors[error_group_file_and_line][error_group_name][new_error_group_message or error_group_message]\n            == 1\n            and context is not None\n        ):\n            # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well.\n            await self._capture_error_snapshot(\n                error_message=new_error_group_message or error_group_message,\n                file_and_line=error_group_file_and_line,\n                context=context,\n            )\n\n    async def _capture_error_snapshot(\n        self, error_message: str, file_and_line: str, context: BasicCrawlingContext\n    ) -> None:\n        if self.error_snapshotter:\n            try:\n                await self.error_snapshotter.capture_snapshot(\n                    error_message=error_message, file_and_line=file_and_line, context=context\n                )\n            except Exception:\n                logger.exception(f'Error when trying to collect error snapshot for exception: {error_message}')\n\n    def _get_file_and_line(self, error: Exception) -> str:\n        if self.show_file_and_line_number:\n            error_traceback = traceback.extract_tb(error.__traceback__)\n            # Show only the most specific frame.\n            return f'{error_traceback[-1].filename.split(\"/\")[-1]}:{error_traceback[-1].lineno}'\n        return ''\n\n    def _get_error_message(self, error: Exception) -> str:\n        if self.show_error_message:\n            error_content = error.args[0] if error.args else error.__context__\n            error_content = str(error_content) if error_content else error.__class__.__name__\n            if self.show_full_message:\n                return error_content\n            return error_content.split('\\n')[0]\n        return ''\n\n    @property\n    def 
unique_error_count(self) -> int:\n        \"\"\"Number of distinct kinds of errors.\"\"\"\n        unique_error_count = 0\n        for file_and_line_group in self._errors.values():\n            for name_group in file_and_line_group.values():\n                unique_error_count += len(name_group)\n        return unique_error_count\n\n    @property\n    def total(self) -> int:\n        \"\"\"Total number of errors.\"\"\"\n        error_count = 0\n        for file_and_line_group in self._errors.values():\n            for name_group in file_and_line_group.values():\n                error_count += sum(name_group.values())\n        return error_count\n\n    def get_most_common_errors(self, n: int = 3) -> list[tuple[str | None, int]]:\n        \"\"\"Return n most common errors.\"\"\"\n        all_errors: Counter[GroupName] = Counter()\n        for file_and_line_group_name, file_and_line_group in self._errors.items():\n            for name_group_name, name_group in file_and_line_group.items():\n                for message_group_name, count in name_group.items():\n                    all_errors[self._get_error_repr(file_and_line_group_name, name_group_name, message_group_name)] = (\n                        count\n                    )\n        return all_errors.most_common(n)\n\n    def _get_error_repr(self, file_and_line: str | None, name: str | None, message: str | None) -> str:\n        \"\"\"Get the most specific error representation.\"\"\"\n        file_and_line_part = f'{file_and_line}:' if file_and_line else ''\n        name_part = f'{name}:' if name else ''\n        message_part = f'{message}' if message else ''\n        return f'{file_and_line_part}{name_part}{message_part}'\n\n    @staticmethod\n    def _create_generic_message(message_1: str | None, message_2: str | None) -> str:\n        \"\"\"Create a generic error message from two messages, if they are similar enough.\n\n        Different parts of similar messages are replaced by `***`.\n        \"\"\"\n        
if message_1 is None or message_2 is None:\n            return ''\n\n        replacement_string = '***'\n        replacement_count = 0\n\n        generic_message_parts = []\n        message_1_parts = message_1.split(' ')\n        message_2_parts = message_2.split(' ')\n        parts_count = min(len(message_1_parts), len(message_2_parts))\n\n        for message_1_part, message_2_part in zip_longest(message_1_parts, message_2_parts, fillvalue=''):\n            if message_1_part != message_2_part:\n                generic_message_parts.append(replacement_string)\n                replacement_count += 1\n                if replacement_count >= parts_count / 2:\n                    # Messages are too different.\n                    return ''\n            else:\n                generic_message_parts.append(message_1_part)\n        return ' '.join(generic_message_parts)\n"
  },
  {
    "path": "src/crawlee/statistics/_models.py",
    "content": "from __future__ import annotations\n\nimport json\nimport warnings\nfrom dataclasses import asdict, dataclass\nfrom datetime import datetime, timedelta, timezone\nfrom typing import TYPE_CHECKING, Annotated, Any\n\nfrom pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field\nfrom typing_extensions import override\n\nfrom crawlee._utils.console import make_table\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.models import timedelta_ms\nfrom crawlee._utils.time import format_duration\n\n_STATISTICS_TABLE_WIDTH = 100\n\n\n@dataclass(frozen=True)\n@docs_group('Statistics')\nclass FinalStatistics:\n    \"\"\"Statistics about a crawler run.\"\"\"\n\n    requests_finished: int\n    requests_failed: int\n    retry_histogram: list[int]\n    request_avg_failed_duration: timedelta | None\n    request_avg_finished_duration: timedelta | None\n    requests_finished_per_minute: float\n    requests_failed_per_minute: float\n    request_total_duration: timedelta\n    requests_total: int\n    crawler_runtime: timedelta\n\n    def to_table(self) -> str:\n        \"\"\"Print out the Final Statistics data as a table.\"\"\"\n        formatted_dict = {}\n        for k, v in asdict(self).items():\n            if isinstance(v, timedelta):\n                formatted_dict[k] = format_duration(v)\n            else:\n                formatted_dict[k] = v\n\n        return make_table([(str(k), str(v)) for k, v in formatted_dict.items()], width=_STATISTICS_TABLE_WIDTH)\n\n    def to_dict(self) -> dict[str, float | int | list[int]]:\n        return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}\n\n    @override\n    def __str__(self) -> str:\n        return json.dumps(\n            {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()},\n        )\n\n\n@docs_group('Statistics')\nclass StatisticsState(BaseModel):\n    \"\"\"Statistic data about a 
crawler run.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')\n    stats_id: Annotated[int | None, Field(alias='statsId')] = None\n\n    requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0\n    requests_failed: Annotated[int, Field(alias='requestsFailed')] = 0\n    requests_retries: Annotated[int, Field(alias='requestsRetries')] = 0\n    requests_failed_per_minute: Annotated[float, Field(alias='requestsFailedPerMinute')] = 0\n    requests_finished_per_minute: Annotated[float, Field(alias='requestsFinishedPerMinute')] = 0\n    request_min_duration: Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')] = None\n    request_max_duration: Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')] = None\n    request_total_failed_duration: Annotated[timedelta_ms, Field(alias='requestTotalFailedDurationMillis')] = (\n        timedelta()\n    )\n    request_total_finished_duration: Annotated[timedelta_ms, Field(alias='requestTotalFinishedDurationMillis')] = (\n        timedelta()\n    )\n    crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None\n    crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None\n    crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        errors: dict[str, Any] = {}\n        retry_errors: dict[str, Any] = {}\n        requests_with_status_code: dict[str, int] = {}\n    else:\n        errors: Annotated[dict[str, Any], Field(default_factory=dict)]\n        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]\n        requests_with_status_code: Annotated[\n            dict[str, int],\n            Field(alias='requestsWithStatusCode', default_factory=dict),\n        
]\n\n    stats_persisted_at: Annotated[\n        datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))\n    ] = None\n    request_retry_histogram: Annotated[\n        dict[int, int],\n        Field(alias='requestRetryHistogram'),\n        PlainValidator(lambda value: dict(enumerate(value)), json_schema_input_type=list[int]),\n        PlainSerializer(\n            lambda value: [value.get(i, 0) for i in range(max(value.keys(), default=0) + 1)],\n            return_type=list[int],\n        ),\n    ] = {}\n\n    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.\n    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()\n\n    def model_post_init(self, /, __context: Any) -> None:\n        self._runtime_offset = self.crawler_runtime or self._runtime_offset\n\n    @property\n    def crawler_runtime(self) -> timedelta:\n        if self.crawler_last_started_at:\n            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)\n            return self._runtime_offset + finished_at - self.crawler_last_started_at\n        return self._runtime_offset\n\n    @crawler_runtime.setter\n    def crawler_runtime(self, value: timedelta) -> None:\n        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.\n        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567\n        warnings.warn(\n            f\"Setting 'crawler_runtime' is deprecated and will be removed in a future version.\"\n            f' Value {value} will not be used.',\n            DeprecationWarning,\n            stacklevel=2,\n        )\n\n    @computed_field(alias='crawlerRuntimeMillis')\n    def crawler_runtime_for_serialization(self) -> timedelta:\n        if self.crawler_last_started_at:\n            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)\n            
return self._runtime_offset + finished_at - self.crawler_last_started_at\n        return self._runtime_offset\n\n    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)\n    @property\n    def request_total_duration(self) -> timedelta:\n        return self.request_total_finished_duration + self.request_total_failed_duration\n\n    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)\n    @property\n    def request_avg_failed_duration(self) -> timedelta | None:\n        return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None\n\n    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)\n    @property\n    def request_avg_finished_duration(self) -> timedelta | None:\n        return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None\n\n    @computed_field(alias='requestsTotal')\n    @property\n    def requests_total(self) -> int:\n        return self.requests_failed + self.requests_finished\n"
  },
  {
    "path": "src/crawlee/statistics/_statistics.py",
    "content": "# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts\nfrom __future__ import annotations\n\nimport asyncio\nimport math\nimport time\nfrom datetime import datetime, timedelta, timezone\nfrom logging import Logger, getLogger\nfrom typing import TYPE_CHECKING, Generic, Literal\n\nfrom typing_extensions import Self, TypeVar\n\nfrom crawlee._utils.context import ensure_context\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recoverable_state import RecoverableState\nfrom crawlee._utils.recurring_task import RecurringTask\nfrom crawlee.statistics import FinalStatistics, StatisticsState\nfrom crawlee.statistics._error_tracker import ErrorTracker\n\nif TYPE_CHECKING:\n    from collections.abc import Callable, Coroutine\n    from types import TracebackType\n\n    from crawlee.storages import KeyValueStore\n\nTStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)\nTNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)\nlogger = getLogger(__name__)\n\n\nclass RequestProcessingRecord:\n    \"\"\"Tracks information about the processing of a request.\"\"\"\n\n    def __init__(self) -> None:\n        self._last_run_at_ns: int | None = None\n        self._runs = 0\n        self.duration: timedelta | None = None\n\n    def run(self) -> int:\n        \"\"\"Mark the job as started.\"\"\"\n        self._last_run_at_ns = time.perf_counter_ns()\n        self._runs += 1\n        return self._runs\n\n    def finish(self) -> timedelta:\n        \"\"\"Mark the job as finished.\"\"\"\n        if self._last_run_at_ns is None:\n            raise RuntimeError('Invalid state')\n\n        self.duration = timedelta(microseconds=math.ceil((time.perf_counter_ns() - self._last_run_at_ns) / 1000))\n        return self.duration\n\n    @property\n    def retry_count(self) -> int:\n        \"\"\"Number of times the job has been 
retried.\"\"\"\n        return max(0, self._runs - 1)\n\n\n@docs_group('Statistics')\nclass Statistics(Generic[TStatisticsState]):\n    \"\"\"A class for collecting, tracking, and logging runtime statistics for requests.\n\n    It is designed to record information such as request durations, retries, successes, and failures, enabling\n    analysis of crawler performance. The collected statistics are persisted to a `KeyValueStore`, ensuring they\n    remain available across crawler migrations, abortions, and restarts. This persistence allows for tracking\n    and evaluation of crawler behavior over its lifecycle.\n    \"\"\"\n\n    __next_id = 0\n\n    def __init__(\n        self,\n        *,\n        persistence_enabled: bool | Literal['explicit_only'] = False,\n        persist_state_kvs_name: str | None = None,\n        persist_state_key: str | None = None,\n        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,\n        log_message: str = 'Statistics',\n        periodic_message_logger: Logger | None = None,\n        log_interval: timedelta = timedelta(minutes=1),\n        state_model: type[TStatisticsState],\n        statistics_log_format: Literal['table', 'inline'] = 'table',\n        save_error_snapshots: bool = False,\n    ) -> None:\n        self._id = Statistics.__next_id\n        Statistics.__next_id += 1\n\n        self.error_tracker = ErrorTracker(\n            save_error_snapshots=save_error_snapshots,\n            snapshot_kvs_name=persist_state_kvs_name,\n        )\n        self.error_tracker_retry = ErrorTracker(save_error_snapshots=False)\n\n        self._requests_in_progress = dict[str, RequestProcessingRecord]()\n\n        self._state = RecoverableState(\n            default_state=state_model(stats_id=self._id),\n            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',\n            persistence_enabled=persistence_enabled,\n            
persist_state_kvs_name=persist_state_kvs_name,\n            persist_state_kvs_factory=persist_state_kvs_factory,\n            logger=logger,\n        )\n\n        self._log_message = log_message\n        self._statistics_log_format = statistics_log_format\n        self._periodic_message_logger = periodic_message_logger or logger\n        self._periodic_logger = RecurringTask(self._log, log_interval)\n\n        # Flag to indicate the context state.\n        self._active = False\n\n    def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:\n        \"\"\"Create near copy of the `Statistics` with replaced `state_model`.\"\"\"\n        new_statistics: Statistics[TNewStatisticsState] = Statistics(\n            persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001\n            persist_state_key=self._state._persist_state_key,  # noqa: SLF001\n            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001\n            log_message=self._log_message,\n            periodic_message_logger=self._periodic_message_logger,\n            state_model=state_model,\n        )\n        new_statistics._periodic_logger = self._periodic_logger  # Accessing private member to create copy like-object.\n        return new_statistics\n\n    @staticmethod\n    def with_default_state(\n        *,\n        persistence_enabled: bool = False,\n        persist_state_kvs_name: str | None = None,\n        persist_state_key: str | None = None,\n        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,\n        log_message: str = 'Statistics',\n        periodic_message_logger: Logger | None = None,\n        log_interval: timedelta = timedelta(minutes=1),\n        statistics_log_format: Literal['table', 'inline'] = 'table',\n        save_error_snapshots: bool = False,\n    ) -> Statistics[StatisticsState]:\n        \"\"\"Initialize a new instance with default state 
model `StatisticsState`.\"\"\"\n        return Statistics[StatisticsState](\n            persistence_enabled=persistence_enabled,\n            persist_state_kvs_name=persist_state_kvs_name,\n            persist_state_key=persist_state_key,\n            persist_state_kvs_factory=persist_state_kvs_factory,\n            log_message=log_message,\n            periodic_message_logger=periodic_message_logger,\n            log_interval=log_interval,\n            state_model=StatisticsState,\n            statistics_log_format=statistics_log_format,\n            save_error_snapshots=save_error_snapshots,\n        )\n\n    @property\n    def active(self) -> bool:\n        \"\"\"Indicate whether the context is active.\"\"\"\n        return self._active\n\n    async def __aenter__(self) -> Self:\n        \"\"\"Subscribe to events and start collecting statistics.\n\n        Raises:\n            RuntimeError: If the context manager is already active.\n        \"\"\"\n        if self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is already active.')\n\n        await self._state.initialize()\n        # Reset `crawler_finished_at` to indicate a new run in progress.\n        self.state.crawler_finished_at = None\n\n        # Start periodic logging and let it print initial state before activation.\n        self._periodic_logger.start()\n        await asyncio.sleep(0.01)\n        self._active = True\n\n        self.state.crawler_last_started_at = datetime.now(timezone.utc)\n        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Stop collecting statistics.\n\n        Raises:\n            RuntimeError: If the context manager is not active.\n        \"\"\"\n        if not 
self._active:\n            raise RuntimeError(f'The {self.__class__.__name__} is not active.')\n\n        if not self.state.crawler_last_started_at:\n            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')\n\n        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime\n        await self._periodic_logger.stop()\n        self.state.crawler_finished_at = datetime.now(timezone.utc)\n        self._active = False\n        await self._state.teardown()\n\n    @property\n    def state(self) -> TStatisticsState:\n        return self._state.current_value\n\n    @ensure_context\n    def register_status_code(self, code: int) -> None:\n        \"\"\"Increment the number of times a status code has been received.\"\"\"\n        state = self._state.current_value\n        state.requests_with_status_code.setdefault(str(code), 0)\n        state.requests_with_status_code[str(code)] += 1\n\n    @ensure_context\n    def record_request_processing_start(self, request_id_or_key: str) -> None:\n        \"\"\"Mark a request as started.\"\"\"\n        record = self._requests_in_progress.get(request_id_or_key, RequestProcessingRecord())\n        record.run()\n        self._requests_in_progress[request_id_or_key] = record\n\n    @ensure_context\n    def record_request_processing_finish(self, request_id_or_key: str) -> None:\n        \"\"\"Mark a request as finished.\"\"\"\n        record = self._requests_in_progress.get(request_id_or_key)\n        if record is None:\n            return\n\n        state = self._state.current_value\n        duration = record.finish()\n\n        state.requests_finished += 1\n        state.request_total_finished_duration += duration\n        self._save_retry_count_for_request(record)\n        state.request_min_duration = min(\n            state.request_min_duration if state.request_min_duration is not None else timedelta.max, duration\n        )\n        state.request_max_duration = max(\n           
 state.request_max_duration if state.request_max_duration is not None else timedelta(), duration\n        )\n\n        del self._requests_in_progress[request_id_or_key]\n\n    @ensure_context\n    def record_request_processing_failure(self, request_id_or_key: str) -> None:\n        \"\"\"Mark a request as failed.\"\"\"\n        record = self._requests_in_progress.get(request_id_or_key)\n        if record is None:\n            return\n\n        state = self._state.current_value\n\n        state.request_total_failed_duration += record.finish()\n        state.requests_failed += 1\n        self._save_retry_count_for_request(record)\n\n        del self._requests_in_progress[request_id_or_key]\n\n    def calculate(self) -> FinalStatistics:\n        \"\"\"Calculate the current statistics.\"\"\"\n        total_minutes = self.state.crawler_runtime.total_seconds() / 60\n        state = self._state.current_value\n        serialized_state = state.model_dump(by_alias=False)\n\n        return FinalStatistics(\n            request_avg_failed_duration=state.request_avg_failed_duration,\n            request_avg_finished_duration=state.request_avg_finished_duration,\n            requests_finished_per_minute=round(state.requests_finished / total_minutes) if total_minutes else 0,\n            requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,\n            request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,\n            requests_total=state.requests_failed + state.requests_finished,\n            crawler_runtime=state.crawler_runtime,\n            requests_finished=state.requests_finished,\n            requests_failed=state.requests_failed,\n            retry_histogram=serialized_state['request_retry_histogram'],\n        )\n\n    async def reset(self) -> None:\n        \"\"\"Reset the statistics to their defaults and remove any persistent state.\"\"\"\n        await self._state.reset()\n  
      self.error_tracker = ErrorTracker()\n        self.error_tracker_retry = ErrorTracker()\n        self._requests_in_progress.clear()\n\n    def _log(self) -> None:\n        stats = self.calculate()\n        if self._statistics_log_format == 'table':\n            self._periodic_message_logger.info(f'{self._log_message}\\n{stats.to_table()}')\n        else:\n            self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())\n\n    def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:\n        retry_count = record.retry_count\n        state = self._state.current_value\n\n        if retry_count:\n            state.requests_retries += 1\n\n        state.request_retry_histogram.setdefault(retry_count, 0)\n        state.request_retry_histogram[retry_count] += 1\n"
  },
  {
    "path": "src/crawlee/storage_clients/__init__.py",
    "content": "from crawlee._utils.try_import import install_import_hook as _install_import_hook\nfrom crawlee._utils.try_import import try_import as _try_import\n\n# These imports have only mandatory dependencies, so they are imported directly.\nfrom ._base import StorageClient\nfrom ._file_system import FileSystemStorageClient\nfrom ._memory import MemoryStorageClient\n\n_install_import_hook(__name__)\n\n# The following imports are wrapped in try_import to handle optional dependencies,\n# ensuring the module can still function even if these dependencies are missing.\nwith _try_import(__name__, 'SqlStorageClient'):\n    from ._sql import SqlStorageClient\n\nwith _try_import(__name__, 'RedisStorageClient'):\n    from ._redis import RedisStorageClient\n\n__all__ = [\n    'FileSystemStorageClient',\n    'MemoryStorageClient',\n    'RedisStorageClient',\n    'SqlStorageClient',\n    'StorageClient',\n]\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/__init__.py",
    "content": "from ._dataset_client import DatasetClient\nfrom ._key_value_store_client import KeyValueStoreClient\nfrom ._request_queue_client import RequestQueueClient\nfrom ._storage_client import StorageClient\n\n__all__ = [\n    'DatasetClient',\n    'KeyValueStoreClient',\n    'RequestQueueClient',\n    'StorageClient',\n]\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/_dataset_client.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n    from typing import Any\n\n    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\n\nclass DatasetClient(ABC):\n    \"\"\"An abstract class for dataset storage clients.\n\n    Dataset clients provide an interface for accessing and manipulating dataset storage. They handle\n    operations like adding and getting dataset items across different storage backends.\n\n    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n    `RequestQueue`), and can operate with various storage systems including memory, file system,\n    databases, and cloud storage solutions.\n\n    This abstract class defines the interface that all specific dataset clients must implement.\n    \"\"\"\n\n    @abstractmethod\n    async def get_metadata(self) -> DatasetMetadata:\n        \"\"\"Get the metadata of the dataset.\"\"\"\n\n    @abstractmethod\n    async def drop(self) -> None:\n        \"\"\"Drop the whole dataset and remove all its items.\n\n        The backend method for the `Dataset.drop` call.\n        \"\"\"\n\n    @abstractmethod\n    async def purge(self) -> None:\n        \"\"\"Purge all items from the dataset.\n\n        The backend method for the `Dataset.purge` call.\n        \"\"\"\n\n    @abstractmethod\n    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:\n        \"\"\"Push data to the dataset.\n\n        The backend method for the `Dataset.push_data` call.\n        \"\"\"\n\n    @abstractmethod\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = 
None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        \"\"\"Get data from the dataset with various filtering options.\n\n        The backend method for the `Dataset.get_data` call.\n        \"\"\"\n\n    @abstractmethod\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = None,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        \"\"\"Iterate over the dataset items with filtering options.\n\n        The backend method for the `Dataset.iterate_items` call.\n        \"\"\"\n        # This syntax is to make type checker properly work with abstract AsyncIterator.\n        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators\n        raise NotImplementedError\n        if False:\n            yield 0\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/_key_value_store_client.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata\n\n\nclass KeyValueStoreClient(ABC):\n    \"\"\"An abstract class for key-value store (KVS) storage clients.\n\n    Key-value stores clients provide an interface for accessing and manipulating KVS storage. They handle\n    operations like getting, setting, deleting KVS values across different storage backends.\n\n    Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`,\n    `RequestQueue`), and can operate with various storage systems including memory, file system,\n    databases, and cloud storage solutions.\n\n    This abstract class defines the interface that all specific KVS clients must implement.\n    \"\"\"\n\n    @abstractmethod\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        \"\"\"Get the metadata of the key-value store.\"\"\"\n\n    @abstractmethod\n    async def drop(self) -> None:\n        \"\"\"Drop the whole key-value store and remove all its values.\n\n        The backend method for the `KeyValueStore.drop` call.\n        \"\"\"\n\n    @abstractmethod\n    async def purge(self) -> None:\n        \"\"\"Purge all items from the key-value store.\n\n        The backend method for the `KeyValueStore.purge` call.\n        \"\"\"\n\n    @abstractmethod\n    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:\n        \"\"\"Retrieve the given record from the key-value store.\n\n        The backend method for the `KeyValueStore.get_value` call.\n        \"\"\"\n\n    @abstractmethod\n    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:\n        \"\"\"Set a value in the key-value store by its key.\n\n        The backend method 
for the `KeyValueStore.set_value` call.\n        \"\"\"\n\n    @abstractmethod\n    async def delete_value(self, *, key: str) -> None:\n        \"\"\"Delete a value from the key-value store by its key.\n\n        The backend method for the `KeyValueStore.delete_value` call.\n        \"\"\"\n\n    @abstractmethod\n    async def iterate_keys(\n        self,\n        *,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        \"\"\"Iterate over all the existing keys in the key-value store.\n\n        The backend method for the `KeyValueStore.iterate_keys` call.\n        \"\"\"\n        # This syntax is to make type checker properly work with abstract AsyncIterator.\n        # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators\n        raise NotImplementedError\n        if False:\n            yield 0\n\n    @abstractmethod\n    async def get_public_url(self, *, key: str) -> str:\n        \"\"\"Get the public URL for the given key.\n\n        The backend method for the `KeyValueStore.get_public_url` call.\n        \"\"\"\n\n    @abstractmethod\n    async def record_exists(self, *, key: str) -> bool:\n        \"\"\"Check if a record with the given key exists in the key-value store.\n\n        The backend method for the `KeyValueStore.record_exists` call.\n\n        Args:\n            key: The key to check for existence.\n\n        Returns:\n            True if a record with the given key exists, False otherwise.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/_request_queue_client.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee import Request\n    from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata\n\n\nclass RequestQueueClient(ABC):\n    \"\"\"An abstract class for request queue resource clients.\n\n    These clients are specific to the type of resource they manage and operate under a designated storage\n    client, like a memory storage client.\n    \"\"\"\n\n    @abstractmethod\n    async def get_metadata(self) -> RequestQueueMetadata:\n        \"\"\"Get the metadata of the request queue.\"\"\"\n\n    @abstractmethod\n    async def drop(self) -> None:\n        \"\"\"Drop the whole request queue and remove all its values.\n\n        The backend method for the `RequestQueue.drop` call.\n        \"\"\"\n\n    @abstractmethod\n    async def purge(self) -> None:\n        \"\"\"Purge all items from the request queue.\n\n        The backend method for the `RequestQueue.purge` call.\n        \"\"\"\n\n    @abstractmethod\n    async def add_batch_of_requests(\n        self,\n        requests: Sequence[Request],\n        *,\n        forefront: bool = False,\n    ) -> AddRequestsResponse:\n        \"\"\"Add batch of requests to the queue.\n\n        This method adds a batch of requests to the queue. Each request is processed based on its uniqueness\n        (determined by `unique_key`). 
Duplicates will be identified but not re-added to the queue.\n\n        Args:\n            requests: The collection of requests to add to the queue.\n            forefront: Whether to put the added requests at the beginning (True) or the end (False) of the queue.\n                When True, the requests will be processed sooner than previously added requests.\n\n        Returns:\n            A response object containing information about which requests were successfully\n            processed and which failed (if any).\n        \"\"\"\n\n    @abstractmethod\n    async def get_request(self, unique_key: str) -> Request | None:\n        \"\"\"Retrieve a request from the queue.\n\n        Args:\n            unique_key: Unique key of the request to retrieve.\n\n        Returns:\n            The retrieved request, or None, if it did not exist.\n        \"\"\"\n\n    @abstractmethod\n    async def fetch_next_request(self) -> Request | None:\n        \"\"\"Return the next request in the queue to be processed.\n\n        Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\n        to mark the request as handled in the queue. 
If there was some error in processing the request, call\n        `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\n        in another call to the `fetch_next_request` method.\n\n        Note that the `None` return value does not mean the queue processing finished, it means there are currently\n        no pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished`\n        instead.\n\n        Returns:\n            The request or `None` if there are no more pending requests.\n        \"\"\"\n\n    @abstractmethod\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        \"\"\"Mark a request as handled after successful processing.\n\n        Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method.\n\n        Args:\n            request: The request to mark as handled.\n\n        Returns:\n            Information about the queue operation. `None` if the given request was not in progress.\n        \"\"\"\n\n    @abstractmethod\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        \"\"\"Reclaim a failed request back to the queue.\n\n        The request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.\n\n        Args:\n            request: The request to return to the queue.\n            forefront: Whether to add the request to the head or the end of the queue.\n\n        Returns:\n            Information about the queue operation. `None` if the given request was not in progress.\n        \"\"\"\n\n    @abstractmethod\n    async def is_empty(self) -> bool:\n        \"\"\"Check if the request queue is empty.\n\n        Returns:\n            True if the request queue is empty, False otherwise.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/_storage_client.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from collections.abc import Hashable\n\n    from crawlee.configuration import Configuration\n\n    from ._dataset_client import DatasetClient\n    from ._key_value_store_client import KeyValueStoreClient\n    from ._request_queue_client import RequestQueueClient\n\n\n@docs_group('Storage clients')\nclass StorageClient(ABC):\n    \"\"\"Base class for storage clients.\n\n    The `StorageClient` serves as an abstract base class that defines the interface for accessing Crawlee's\n    storage types: datasets, key-value stores, and request queues. It provides methods to open clients for\n    each of these storage types and handles common functionality.\n\n    Storage clients implementations can be provided for various backends (file system, memory, databases,\n    various cloud providers, etc.) to support different use cases from development to production environments.\n\n    Each storage client implementation is responsible for ensuring proper initialization, data persistence\n    (where applicable), and consistent access patterns across all storage types it supports.\n    \"\"\"\n\n    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002\n        \"\"\"Return a cache key that can differentiate between different storages of this and other clients.\n\n        Can be based on configuration or on the client itself. 
By default, returns a module and name of the client\n        class.\n        \"\"\"\n        return f'{self.__class__.__module__}.{self.__class__.__name__}'\n\n    @abstractmethod\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> DatasetClient:\n        \"\"\"Create a dataset client.\"\"\"\n\n    @abstractmethod\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> KeyValueStoreClient:\n        \"\"\"Create a key-value store client.\"\"\"\n\n    @abstractmethod\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> RequestQueueClient:\n        \"\"\"Create a request queue client.\"\"\"\n\n    def get_rate_limit_errors(self) -> dict[int, int]:\n        \"\"\"Return statistics about rate limit errors encountered by the HTTP client in storage client.\"\"\"\n        return {}\n\n    async def _purge_if_needed(\n        self,\n        client: DatasetClient | KeyValueStoreClient | RequestQueueClient,\n        configuration: Configuration,\n    ) -> None:\n        \"\"\"Purge the client if needed.\n\n        The purge is only performed if the configuration indicates that it should be done and the client\n        is not a named storage. 
Named storages are considered global and will typically outlive the run,\n        so they are not purged.\n\n        Args:\n            client: The storage client to potentially purge.\n            configuration: Configuration that determines whether purging should occur.\n        \"\"\"\n        metadata = await client.get_metadata()\n        if configuration.purge_on_start and metadata.name is None:\n            await client.purge()\n"
  },
  {
    "path": "src/crawlee/storage_clients/_base/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/__init__.py",
    "content": "from ._dataset_client import FileSystemDatasetClient\nfrom ._key_value_store_client import FileSystemKeyValueStoreClient\nfrom ._request_queue_client import FileSystemRequestQueueClient\nfrom ._storage_client import FileSystemStorageClient\n\n__all__ = [\n    'FileSystemDatasetClient',\n    'FileSystemKeyValueStoreClient',\n    'FileSystemRequestQueueClient',\n    'FileSystemStorageClient',\n]\n"
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/_dataset_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nimport shutil\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nfrom pydantic import ValidationError\nfrom typing_extensions import Self, override\n\nfrom crawlee._consts import METADATA_FILENAME\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.file import atomic_write, json_dumps\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import DatasetClient\nfrom crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from crawlee.configuration import Configuration\n\nlogger = getLogger(__name__)\n\n\nclass FileSystemDatasetClient(DatasetClient):\n    \"\"\"File system implementation of the dataset client.\n\n    This client persists dataset items to the file system as individual JSON files within a structured\n    directory hierarchy following the pattern:\n\n    ```\n    {STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json\n    ```\n\n    Each item is stored as a separate file, which allows for durability and the ability to\n    recover after process termination. 
Dataset operations like filtering, sorting, and pagination are\n    implemented by processing the stored files according to the requested parameters.\n\n    This implementation is ideal for long-running crawlers where data persistence is important,\n    and for development environments where you want to easily inspect the collected data between runs.\n    \"\"\"\n\n    _STORAGE_SUBDIR = 'datasets'\n    \"\"\"The name of the subdirectory where datasets are stored.\"\"\"\n\n    _STORAGE_SUBSUBDIR_DEFAULT = 'default'\n    \"\"\"The name of the subdirectory for the default dataset.\"\"\"\n\n    _ITEM_FILENAME_DIGITS = 9\n    \"\"\"Number of digits used for the dataset item file names (e.g., 000000019.json).\"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: DatasetMetadata,\n        path_to_dataset: Path,\n        lock: asyncio.Lock,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `FileSystemDatasetClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._path_to_dataset = path_to_dataset\n        \"\"\"The full path to the dataset directory.\"\"\"\n\n        self._lock = lock\n        \"\"\"A lock to ensure that only one operation is performed at a time.\"\"\"\n\n    @override\n    async def get_metadata(self) -> DatasetMetadata:\n        return self._metadata\n\n    @property\n    def path_to_dataset(self) -> Path:\n        \"\"\"The full path to the dataset directory.\"\"\"\n        return self._path_to_dataset\n\n    @property\n    def path_to_metadata(self) -> Path:\n        \"\"\"The full path to the dataset metadata file.\"\"\"\n        return self.path_to_dataset / METADATA_FILENAME\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        configuration: Configuration,\n    ) -> Self:\n        \"\"\"Open or create a file system dataset 
client.\n\n        This method attempts to open an existing dataset from the file system. If a dataset with the specified ID\n        or name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.\n            name: The name of the dataset for named (global scope) storages.\n            alias: The alias of the dataset for unnamed (run scope) storages.\n            configuration: The configuration object containing storage directory settings.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,\n                or if both name and alias are provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR\n\n        if not dataset_base_path.exists():\n            await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)\n\n        # Get a new instance by ID.\n        if id:\n            found = False\n            for dataset_dir in dataset_base_path.iterdir():\n                if not dataset_dir.is_dir():\n                    continue\n\n                path_to_metadata = dataset_dir / METADATA_FILENAME\n                if not path_to_metadata.exists():\n                    continue\n\n                try:\n                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')\n                    try:\n                        file_content = json.load(file)\n                        metadata = DatasetMetadata(**file_content)\n                        if metadata.id == id:\n                            client = cls(\n                                
metadata=metadata,\n                                path_to_dataset=dataset_base_path / dataset_dir,\n                                lock=asyncio.Lock(),\n                            )\n                            await client._update_metadata(update_accessed_at=True)\n                            found = True\n                            break\n                    finally:\n                        await asyncio.to_thread(file.close)\n                except (json.JSONDecodeError, ValidationError):\n                    continue\n\n            if not found:\n                raise ValueError(f'Dataset with ID \"{id}\" not found')\n\n        # Get a new instance by name or alias.\n        else:\n            dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')\n            path_to_dataset = dataset_base_path / dataset_dir\n            path_to_metadata = path_to_dataset / METADATA_FILENAME\n\n            # If the dataset directory exists, reconstruct the client from the metadata file.\n            if path_to_dataset.exists() and path_to_metadata.exists():\n                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')\n                try:\n                    file_content = json.load(file)\n                finally:\n                    await asyncio.to_thread(file.close)\n                try:\n                    metadata = DatasetMetadata(**file_content)\n                except ValidationError as exc:\n                    raise ValueError(f'Invalid metadata file for dataset \"{name or alias}\"') from exc\n\n                client = cls(\n                    metadata=metadata,\n                    path_to_dataset=path_to_dataset,\n                    lock=asyncio.Lock(),\n                )\n\n                await client._update_metadata(update_accessed_at=True)\n\n            # Otherwise, create a new dataset client.\n            else:\n                now = datetime.now(timezone.utc)\n                
metadata = DatasetMetadata(\n                    id=crypto_random_object_id(),\n                    name=name,\n                    created_at=now,\n                    accessed_at=now,\n                    modified_at=now,\n                    item_count=0,\n                )\n                client = cls(\n                    metadata=metadata,\n                    path_to_dataset=path_to_dataset,\n                    lock=asyncio.Lock(),\n                )\n                await client._update_metadata()\n\n        return client\n\n    @override\n    async def drop(self) -> None:\n        async with self._lock:\n            if self.path_to_dataset.exists():\n                await asyncio.to_thread(shutil.rmtree, self.path_to_dataset)\n\n    @override\n    async def purge(self) -> None:\n        async with self._lock:\n            for file_path in await self._get_sorted_data_files():\n                await asyncio.to_thread(file_path.unlink, missing_ok=True)\n\n            await self._update_metadata(\n                update_accessed_at=True,\n                update_modified_at=True,\n                new_item_count=0,\n            )\n\n    @override\n    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:\n        async with self._lock:\n            new_item_count = self._metadata.item_count\n            if isinstance(data, list):\n                for item in data:\n                    new_item_count += 1\n                    await self._push_item(item, new_item_count)\n            else:\n                new_item_count += 1\n                await self._push_item(data, new_item_count)\n\n            # now update metadata under the same lock\n            await self._update_metadata(\n                update_accessed_at=True,\n                update_modified_at=True,\n                new_item_count=new_item_count,\n            )\n\n    @override\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: 
int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        # Check for unsupported arguments and log a warning if found.\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n            'flatten': flatten,\n            'view': view,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of get_data are not supported by the '\n                f'{self.__class__.__name__} client.'\n            )\n\n        # If the dataset directory does not exist, log a warning and return an empty page.\n        if not self.path_to_dataset.exists():\n            logger.warning(f'Dataset directory not found: {self.path_to_dataset}')\n            return DatasetItemsListPage(\n                count=0,\n                offset=offset,\n                limit=limit or 0,\n                total=0,\n                desc=desc,\n                items=[],\n            )\n\n        # Get the list of sorted data files.\n        async with self._lock:\n            try:\n                data_files = await self._get_sorted_data_files()\n            except FileNotFoundError:\n                # directory was dropped mid-check\n                return DatasetItemsListPage(count=0, offset=offset, limit=limit or 0, total=0, desc=desc, items=[])\n\n        total = len(data_files)\n\n        # Reverse the order if descending order is requested.\n        if 
desc:\n            data_files.reverse()\n\n        # Apply offset and limit slicing.\n        selected_files = data_files[offset:]\n        if limit is not None:\n            selected_files = selected_files[:limit]\n\n        # Read and parse each data file.\n        items = list[dict[str, Any]]()\n        for file_path in selected_files:\n            try:\n                file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')\n            except FileNotFoundError:\n                logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping')\n                continue\n\n            try:\n                item = json.loads(file_content)\n            except json.JSONDecodeError:\n                logger.exception(f'Corrupt JSON in {file_path}, skipping')\n                continue\n\n            # Skip empty items if requested.\n            if skip_empty and not item:\n                continue\n\n            items.append(item)\n\n        async with self._lock:\n            await self._update_metadata(update_accessed_at=True)\n\n        # Return a paginated list page of dataset items.\n        return DatasetItemsListPage(\n            count=len(items),\n            offset=offset,\n            limit=limit or total - offset,\n            total=total,\n            desc=desc,\n            items=items,\n        )\n\n    @override\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = None,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        # Check for unsupported arguments and log a warning if found.\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': 
omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of iterate are not supported '\n                f'by the {self.__class__.__name__} client.'\n            )\n\n        # If the dataset directory does not exist, log a warning and return immediately.\n        if not self.path_to_dataset.exists():\n            logger.warning(f'Dataset directory not found: {self.path_to_dataset}')\n            return\n\n        # Get the list of sorted data files.\n        async with self._lock:\n            try:\n                data_files = await self._get_sorted_data_files()\n            except FileNotFoundError:\n                return\n\n        # Reverse the order if descending order is requested.\n        if desc:\n            data_files.reverse()\n\n        # Apply offset and limit slicing.\n        selected_files = data_files[offset:]\n        if limit is not None:\n            selected_files = selected_files[:limit]\n\n        # Iterate over each data file, reading and yielding its parsed content.\n        for file_path in selected_files:\n            try:\n                file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')\n            except FileNotFoundError:\n                logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping')\n                continue\n\n            try:\n                item = json.loads(file_content)\n            except json.JSONDecodeError:\n                logger.exception(f'Corrupt JSON in {file_path}, skipping')\n                continue\n\n            # Skip empty items if requested.\n            if skip_empty and not item:\n                continue\n\n            yield item\n\n        async with self._lock:\n            await 
self._update_metadata(update_accessed_at=True)\n\n    async def _update_metadata(\n        self,\n        *,\n        new_item_count: int | None = None,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n    ) -> None:\n        \"\"\"Update the dataset metadata file with current information.\n\n        Args:\n            new_item_count: If provided, update the item count to this value.\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n        if update_modified_at:\n            self._metadata.modified_at = now\n        if new_item_count is not None:\n            self._metadata.item_count = new_item_count\n\n        # Ensure the parent directory for the metadata file exists.\n        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)\n\n        # Dump the serialized metadata to the file.\n        data = await json_dumps(self._metadata.model_dump())\n        await atomic_write(self.path_to_metadata, data)\n\n    async def _push_item(self, item: dict[str, Any], item_id: int) -> None:\n        \"\"\"Push a single item to the dataset.\n\n        This method writes the item as a JSON file with a zero-padded numeric filename\n        that reflects its position in the dataset sequence.\n\n        Args:\n            item: The data item to add to the dataset.\n            item_id: The sequential ID to use for this item's filename.\n        \"\"\"\n        # Generate the filename for the new item using zero-padded numbering.\n        filename = f'{str(item_id).zfill(self._ITEM_FILENAME_DIGITS)}.json'\n        file_path = self.path_to_dataset / filename\n\n        # Ensure the dataset directory exists.\n        await 
asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True)\n\n        # Dump the serialized item to the file.\n        data = await json_dumps(item)\n        await atomic_write(file_path, data)\n\n    async def _get_sorted_data_files(self) -> list[Path]:\n        \"\"\"Retrieve and return a sorted list of data files in the dataset directory.\n\n        The files are sorted numerically based on the filename (without extension),\n        which corresponds to the order items were added to the dataset.\n\n        Returns:\n            A list of `Path` objects pointing to data files, sorted by numeric filename.\n        \"\"\"\n        # Retrieve and sort all JSON files in the dataset directory numerically.\n        files = await asyncio.to_thread(\n            lambda: sorted(\n                self.path_to_dataset.glob('*.json'),\n                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,\n            )\n        )\n\n        # Remove the metadata file from the list if present.\n        if self.path_to_metadata in files:\n            files.remove(self.path_to_metadata)\n\n        return files\n"
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/_key_value_store_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport functools\nimport json\nimport shutil\nimport urllib.parse\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nfrom pydantic import ValidationError\nfrom typing_extensions import Self, override\n\nfrom crawlee._consts import METADATA_FILENAME\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.file import atomic_write, infer_mime_type, json_dumps\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import KeyValueStoreClient\nfrom crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from crawlee.configuration import Configuration\n\n\nlogger = getLogger(__name__)\n\n\nclass FileSystemKeyValueStoreClient(KeyValueStoreClient):\n    \"\"\"File system implementation of the key-value store client.\n\n    This client persists data to the file system, making it suitable for scenarios where data needs to\n    survive process restarts. 
Keys are mapped to file paths in a directory structure following the pattern:\n\n    ```\n    {STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}\n    ```\n\n    Binary data is stored as-is, while JSON and text data are stored in human-readable format.\n    The implementation automatically handles serialization based on the content type and\n    maintains metadata about each record.\n\n    This implementation is ideal for long-running crawlers where persistence is important and\n    for development environments where you want to easily inspect the stored data between runs.\n    \"\"\"\n\n    _STORAGE_SUBDIR = 'key_value_stores'\n    \"\"\"The name of the subdirectory where key-value stores are stored.\"\"\"\n\n    _STORAGE_SUBSUBDIR_DEFAULT = 'default'\n    \"\"\"The name of the subdirectory for the default key-value store.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: KeyValueStoreMetadata,\n        path_to_kvs: Path,\n        lock: asyncio.Lock,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._path_to_kvs = path_to_kvs\n        \"\"\"The full path to the key-value store directory.\"\"\"\n\n        self._lock = lock\n        \"\"\"A lock to ensure that only one operation is performed at a time.\"\"\"\n\n    @override\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        return self._metadata\n\n    @property\n    def path_to_kvs(self) -> Path:\n        \"\"\"The full path to the key-value store directory.\"\"\"\n        return self._path_to_kvs\n\n    @property\n    def path_to_metadata(self) -> Path:\n        \"\"\"The full path to the key-value store metadata file.\"\"\"\n        return self.path_to_kvs / METADATA_FILENAME\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n     
   alias: str | None,\n        configuration: Configuration,\n    ) -> Self:\n        \"\"\"Open or create a file system key-value store client.\n\n        This method attempts to open an existing key-value store from the file system. If a KVS with the specified\n        ID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the key-value store to open. If provided, searches for existing store by ID.\n            name: The name of the key-value store for named (global scope) storages.\n            alias: The alias of the key-value store for unnamed (run scope) storages.\n            configuration: The configuration object containing storage directory settings.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If a store with the specified ID is not found, if metadata is invalid,\n                or if both name and alias are provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR\n\n        if not kvs_base_path.exists():\n            await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)\n\n        # Get a new instance by ID.\n        if id:\n            found = False\n            for kvs_dir in kvs_base_path.iterdir():\n                if not kvs_dir.is_dir():\n                    continue\n\n                path_to_metadata = kvs_dir / METADATA_FILENAME\n                if not path_to_metadata.exists():\n                    continue\n\n                try:\n                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')\n                    try:\n                        file_content = json.load(file)\n                        metadata = 
KeyValueStoreMetadata(**file_content)\n                        if metadata.id == id:\n                            client = cls(\n                                metadata=metadata,\n                                path_to_kvs=kvs_base_path / kvs_dir,\n                                lock=asyncio.Lock(),\n                            )\n                            await client._update_metadata(update_accessed_at=True)\n                            found = True\n                            break\n                    finally:\n                        await asyncio.to_thread(file.close)\n                except (json.JSONDecodeError, ValidationError):\n                    continue\n\n            if not found:\n                raise ValueError(f'Key-value store with ID \"{id}\" not found.')\n\n        # Get a new instance by name or alias.\n        else:\n            kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')\n            path_to_kvs = kvs_base_path / kvs_dir\n            path_to_metadata = path_to_kvs / METADATA_FILENAME\n\n            # If the key-value store directory exists, reconstruct the client from the metadata file.\n            if path_to_kvs.exists() and path_to_metadata.exists():\n                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')\n                try:\n                    file_content = json.load(file)\n                finally:\n                    await asyncio.to_thread(file.close)\n                try:\n                    metadata = KeyValueStoreMetadata(**file_content)\n                except ValidationError as exc:\n                    raise ValueError(f'Invalid metadata file for key-value store \"{name or alias}\"') from exc\n\n                client = cls(\n                    metadata=metadata,\n                    path_to_kvs=path_to_kvs,\n                    lock=asyncio.Lock(),\n                )\n\n                await 
client._update_metadata(update_accessed_at=True)\n\n            # Otherwise, create a new key-value store client.\n            else:\n                now = datetime.now(timezone.utc)\n                metadata = KeyValueStoreMetadata(\n                    id=crypto_random_object_id(),\n                    name=name,\n                    created_at=now,\n                    accessed_at=now,\n                    modified_at=now,\n                )\n                client = cls(\n                    metadata=metadata,\n                    path_to_kvs=path_to_kvs,\n                    lock=asyncio.Lock(),\n                )\n                await client._update_metadata()\n\n        return client\n\n    @override\n    async def drop(self) -> None:\n        # If the client directory exists, remove it recursively.\n        if self.path_to_kvs.exists():\n            async with self._lock:\n                await asyncio.to_thread(shutil.rmtree, self.path_to_kvs)\n\n    @override\n    async def purge(self) -> None:\n        async with self._lock:\n            for file_path in self.path_to_kvs.glob('*'):\n                if file_path.name == METADATA_FILENAME:\n                    continue\n                await asyncio.to_thread(file_path.unlink, missing_ok=True)\n\n            await self._update_metadata(\n                update_accessed_at=True,\n                update_modified_at=True,\n            )\n\n    @override\n    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:\n        # Update the metadata to record access\n        async with self._lock:\n            await self._update_metadata(update_accessed_at=True)\n\n        record_path = self.path_to_kvs / self._encode_key(key)\n\n        if not record_path.exists():\n            return None\n\n        # Found a file for this key, now look for its metadata\n        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')\n        if not 
record_metadata_filepath.exists():\n            logger.warning(f'Found value file for key \"{key}\" but no metadata file.')\n            return None\n\n        # Read the metadata file\n        async with self._lock:\n            try:\n                file = await asyncio.to_thread(\n                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),\n                )\n            except FileNotFoundError:\n                logger.warning(f'Metadata file disappeared for key \"{key}\", aborting get_value')\n                return None\n\n            try:\n                metadata_content = json.load(file)\n            except json.JSONDecodeError:\n                logger.warning(f'Invalid metadata file for key \"{key}\"')\n                return None\n            finally:\n                await asyncio.to_thread(file.close)\n\n        try:\n            metadata = KeyValueStoreRecordMetadata(**metadata_content)\n        except ValidationError:\n            logger.warning(f'Invalid metadata schema for key \"{key}\"')\n            return None\n\n        # Read the actual value\n        try:\n            value_bytes = await asyncio.to_thread(record_path.read_bytes)\n        except FileNotFoundError:\n            logger.warning(f'Value file disappeared for key \"{key}\"')\n            return None\n\n        # Handle None values\n        if metadata.content_type == 'application/x-none':\n            value = None\n        # Handle JSON values\n        elif 'application/json' in metadata.content_type:\n            try:\n                value = json.loads(value_bytes.decode('utf-8'))\n            except (json.JSONDecodeError, UnicodeDecodeError):\n                logger.warning(f'Failed to decode JSON value for key \"{key}\"')\n                return None\n        # Handle text values\n        elif metadata.content_type.startswith('text/'):\n            try:\n                value = value_bytes.decode('utf-8')\n            except 
UnicodeDecodeError:\n                logger.warning(f'Failed to decode text value for key \"{key}\"')\n                return None\n        # Handle binary values\n        else:\n            value = value_bytes\n\n        # Calculate the size of the value in bytes\n        size = len(value_bytes)\n\n        return KeyValueStoreRecord(\n            key=metadata.key,\n            value=value,\n            content_type=metadata.content_type,\n            size=size,\n        )\n\n    @override\n    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:\n        # Special handling for None values\n        if value is None:\n            content_type = 'application/x-none'  # Special content type to identify None values\n            value_bytes = b''\n        else:\n            content_type = content_type or infer_mime_type(value)\n\n            # Serialize the value to bytes.\n            if 'application/json' in content_type:\n                value_bytes = (await json_dumps(value)).encode('utf-8')\n            elif isinstance(value, str):\n                value_bytes = value.encode('utf-8')\n            elif isinstance(value, (bytes, bytearray)):\n                value_bytes = value\n            else:\n                # Fallback: attempt to convert to string and encode.\n                value_bytes = str(value).encode('utf-8')\n\n        record_path = self.path_to_kvs / self._encode_key(key)\n\n        # Prepare the metadata\n        size = len(value_bytes)\n        record_metadata = KeyValueStoreRecordMetadata(key=key, content_type=content_type, size=size)\n        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')\n        record_metadata_content = await json_dumps(record_metadata.model_dump())\n\n        async with self._lock:\n            # Ensure the key-value store directory exists.\n            await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True)\n\n            
# Write the value to the file.\n            await atomic_write(record_path, value_bytes)\n\n            # Write the record metadata to the file.\n            await atomic_write(record_metadata_filepath, record_metadata_content)\n\n            # Update the KVS metadata to record the access and modification.\n            await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def delete_value(self, *, key: str) -> None:\n        record_path = self.path_to_kvs / self._encode_key(key)\n        metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')\n        deleted = False\n\n        async with self._lock:\n            # Delete the value file and its metadata if found\n            if record_path.exists():\n                await asyncio.to_thread(record_path.unlink, missing_ok=True)\n\n                # Delete the metadata file if it exists\n                if metadata_path.exists():\n                    await asyncio.to_thread(metadata_path.unlink, missing_ok=True)\n                else:\n                    logger.warning(f'Found value file for key \"{key}\" but no metadata file when trying to delete it.')\n\n                deleted = True\n\n            # If we deleted something, update the KVS metadata\n            if deleted:\n                await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def iterate_keys(\n        self,\n        *,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        # Check if the KVS directory exists\n        if not self.path_to_kvs.exists():\n            return\n\n        # List and sort all files *inside* a brief lock, then release it immediately:\n        async with self._lock:\n            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))\n\n        count = 0\n\n        for file_path in 
files:\n            # Skip the main metadata file\n            if file_path.name == METADATA_FILENAME:\n                continue\n\n            # Only process metadata files for records\n            if not file_path.name.endswith(f'.{METADATA_FILENAME}'):\n                continue\n\n            # Extract the base key name from the metadata filename\n            key_name = self._decode_key(file_path.name[: -len(f'.{METADATA_FILENAME}')])\n\n            # Apply exclusive_start_key filter if provided\n            if exclusive_start_key is not None and key_name <= exclusive_start_key:\n                continue\n\n            # Try to read and parse the metadata file\n            try:\n                metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')\n            except FileNotFoundError:\n                logger.warning(f'Metadata file disappeared for key \"{key_name}\", skipping it.')\n                continue\n\n            try:\n                metadata_dict = json.loads(metadata_content)\n            except json.JSONDecodeError:\n                logger.warning(f'Failed to decode metadata file for key \"{key_name}\", skipping it.')\n                continue\n\n            try:\n                record_metadata = KeyValueStoreRecordMetadata(**metadata_dict)\n            except ValidationError:\n                logger.warning(f'Invalid metadata schema for key \"{key_name}\", skipping it.')\n\n            yield record_metadata\n\n            count += 1\n            if limit and count >= limit:\n                break\n\n        # Update accessed_at timestamp\n        async with self._lock:\n            await self._update_metadata(update_accessed_at=True)\n\n    @override\n    async def get_public_url(self, *, key: str) -> str:\n        \"\"\"Return a file:// URL for the given key.\n\n        Args:\n            key: The key to get the public URL for.\n\n        Returns:\n            A file:// URL pointing to the file on the local 
filesystem.\n        \"\"\"\n        record_path = self.path_to_kvs / self._encode_key(key)\n        absolute_path = record_path.absolute()\n        return absolute_path.as_uri()\n\n    @override\n    async def record_exists(self, *, key: str) -> bool:\n        \"\"\"Check if a record with the given key exists in the key-value store.\n\n        Args:\n            key: The key to check for existence.\n\n        Returns:\n            True if a record with the given key exists, False otherwise.\n        \"\"\"\n        # Update the metadata to record access\n        async with self._lock:\n            await self._update_metadata(update_accessed_at=True)\n\n        record_path = self.path_to_kvs / self._encode_key(key)\n        record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}')\n\n        # Both the value file and metadata file must exist for a record to be considered existing\n        return record_path.exists() and record_metadata_filepath.exists()\n\n    async def _update_metadata(\n        self,\n        *,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n    ) -> None:\n        \"\"\"Update the KVS metadata file with current information.\n\n        Args:\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n        if update_modified_at:\n            self._metadata.modified_at = now\n\n        # Ensure the parent directory for the metadata file exists.\n        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)\n\n        # Dump the serialized metadata to the file.\n        data = await json_dumps(self._metadata.model_dump())\n        await 
atomic_write(self.path_to_metadata, data)\n\n    def _encode_key(self, key: str) -> str:\n        \"\"\"Encode a key to make it safe for use in a file path.\"\"\"\n        return urllib.parse.quote(key, safe='')\n\n    def _decode_key(self, encoded_key: str) -> str:\n        \"\"\"Decode a key that was encoded to make it safe for use in a file path.\"\"\"\n        return urllib.parse.unquote(encoded_key)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/_request_queue_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport functools\nimport json\nimport shutil\nfrom collections import deque\nfrom datetime import datetime, timezone\nfrom hashlib import sha256\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nfrom pydantic import BaseModel, ValidationError\nfrom typing_extensions import Self, override\n\nfrom crawlee import Request\nfrom crawlee._consts import METADATA_FILENAME\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.file import atomic_write, json_dumps\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee._utils.recoverable_state import RecoverableState\nfrom crawlee.storage_clients._base import RequestQueueClient\nfrom crawlee.storage_clients.models import (\n    AddRequestsResponse,\n    ProcessedRequest,\n    RequestQueueMetadata,\n    UnprocessedRequest,\n)\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee.configuration import Configuration\n    from crawlee.storages import KeyValueStore\n\nlogger = getLogger(__name__)\n\n\nclass RequestQueueState(BaseModel):\n    \"\"\"State model for the `FileSystemRequestQueueClient`.\"\"\"\n\n    sequence_counter: int = 0\n    \"\"\"Counter for regular request ordering.\"\"\"\n\n    forefront_sequence_counter: int = 0\n    \"\"\"Counter for forefront request ordering.\"\"\"\n\n    forefront_requests: dict[str, int] = {}\n    \"\"\"Mapping of forefront request unique keys to their sequence numbers.\"\"\"\n\n    regular_requests: dict[str, int] = {}\n    \"\"\"Mapping of regular request unique keys to their sequence numbers.\"\"\"\n\n    in_progress_requests: set[str] = set()\n    \"\"\"Set of request unique keys currently being processed.\"\"\"\n\n    handled_requests: set[str] = set()\n    \"\"\"Set of request unique keys that have been handled.\"\"\"\n\n\nclass FileSystemRequestQueueClient(RequestQueueClient):\n    
\"\"\"A file system implementation of the request queue client.\n\n    This client persists requests to the file system as individual JSON files, making it suitable for scenarios\n    where data needs to survive process restarts. Each request is stored as a separate file in a directory\n    structure following the pattern:\n\n    ```\n    {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json\n    ```\n\n    The implementation uses `RecoverableState` to maintain ordering information, in-progress status, and\n    request handling status. This allows for proper state recovery across process restarts without\n    embedding metadata in individual request files. File system storage provides durability at the cost of\n    slower I/O operations compared to memory only-based storage.\n\n    This implementation is ideal for long-running crawlers where persistence is important and for situations\n    where you need to resume crawling after process termination.\n    \"\"\"\n\n    _STORAGE_SUBDIR = 'request_queues'\n    \"\"\"The name of the subdirectory where request queues are stored.\"\"\"\n\n    _STORAGE_SUBSUBDIR_DEFAULT = 'default'\n    \"\"\"The name of the subdirectory for the default request queue.\"\"\"\n\n    _MAX_REQUESTS_IN_CACHE = 100_000\n    \"\"\"Maximum number of requests to keep in cache for faster access.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: RequestQueueMetadata,\n        path_to_rq: Path,\n        lock: asyncio.Lock,\n        recoverable_state: RecoverableState[RequestQueueState],\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `FileSystemRequestQueueClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._path_to_rq = path_to_rq\n        \"\"\"The full path to the request queue directory.\"\"\"\n\n        self._lock = lock\n        \"\"\"A lock to ensure that only one operation is performed at a time.\"\"\"\n\n      
  self._request_cache = deque[Request]()\n        \"\"\"Cache for requests: forefront requests at the beginning, regular requests at the end.\"\"\"\n\n        self._request_cache_needs_refresh = True\n        \"\"\"Flag indicating whether the cache needs to be refreshed from filesystem.\"\"\"\n\n        self._is_empty_cache: bool | None = None\n        \"\"\"Cache for is_empty result: None means unknown, True/False is cached state.\"\"\"\n\n        self._state = recoverable_state\n        \"\"\"Recoverable state to maintain request ordering, in-progress status, and handled status.\"\"\"\n\n    @override\n    async def get_metadata(self) -> RequestQueueMetadata:\n        return self._metadata\n\n    @property\n    def path_to_rq(self) -> Path:\n        \"\"\"The full path to the request queue directory.\"\"\"\n        return self._path_to_rq\n\n    @property\n    def path_to_metadata(self) -> Path:\n        \"\"\"The full path to the request queue metadata file.\"\"\"\n        return self.path_to_rq / METADATA_FILENAME\n\n    @classmethod\n    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:\n        async def kvs_factory() -> KeyValueStore:\n            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import\n            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import\n\n            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)\n\n        return RecoverableState[RequestQueueState](\n            default_state=RequestQueueState(),\n            persist_state_key=f'__RQ_STATE_{id}',\n            persist_state_kvs_factory=kvs_factory,\n            persistence_enabled=True,\n            logger=logger,\n        )\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        configuration: 
Configuration,\n    ) -> Self:\n        \"\"\"Open or create a file system request queue client.\n\n        This method attempts to open an existing request queue from the file system. If a queue with the specified\n        ID or name exists, it loads the metadata and state from the stored files. If no existing queue is found,\n        a new one is created.\n\n        Args:\n            id: The ID of the request queue to open. If provided, searches for existing queue by ID.\n            name: The name of the request queue for named (global scope) storages.\n            alias: The alias of the request queue for unnamed (run scope) storages.\n            configuration: The configuration object containing storage directory settings.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If a queue with the specified ID is not found, if metadata is invalid,\n                or if both name and alias are provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR\n\n        if not rq_base_path.exists():\n            await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)\n\n        # Open an existing RQ by its ID, raise an error if not found.\n        if id:\n            found = False\n            for rq_dir in rq_base_path.iterdir():\n                if not rq_dir.is_dir():\n                    continue\n\n                path_to_metadata = rq_dir / METADATA_FILENAME\n                if not path_to_metadata.exists():\n                    continue\n\n                try:\n                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')\n                    try:\n                        file_content = json.load(file)\n                        metadata = RequestQueueMetadata(**file_content)\n\n      
                  if metadata.id == id:\n                            client = cls(\n                                metadata=metadata,\n                                path_to_rq=rq_base_path / rq_dir,\n                                lock=asyncio.Lock(),\n                                recoverable_state=await cls._create_recoverable_state(\n                                    id=id, configuration=configuration\n                                ),\n                            )\n                            await client._state.initialize()\n                            await client._discover_existing_requests()\n                            await client._update_metadata(update_accessed_at=True)\n                            found = True\n                            break\n                    finally:\n                        await asyncio.to_thread(file.close)\n                except (json.JSONDecodeError, ValidationError):\n                    continue\n\n            if not found:\n                raise ValueError(f'Request queue with ID \"{id}\" not found')\n\n        # Open an existing RQ by its name or alias, or create a new one if not found.\n        else:\n            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')\n            path_to_rq = rq_base_path / rq_dir\n            path_to_metadata = path_to_rq / METADATA_FILENAME\n\n            # If the RQ directory exists, reconstruct the client from the metadata file.\n            if path_to_rq.exists() and path_to_metadata.exists():\n                file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')\n                try:\n                    file_content = json.load(file)\n                finally:\n                    await asyncio.to_thread(file.close)\n                try:\n                    metadata = RequestQueueMetadata(**file_content)\n                except ValidationError as exc:\n                    raise ValueError(f'Invalid metadata file for request queue 
\"{name or alias}\"') from exc\n\n                client = cls(\n                    metadata=metadata,\n                    path_to_rq=path_to_rq,\n                    lock=asyncio.Lock(),\n                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),\n                )\n\n                await client._state.initialize()\n                await client._discover_existing_requests()\n                await client._update_metadata(update_accessed_at=True)\n\n            # Otherwise, create a new request queue client.\n            else:\n                now = datetime.now(timezone.utc)\n                metadata = RequestQueueMetadata(\n                    id=crypto_random_object_id(),\n                    name=name,\n                    created_at=now,\n                    accessed_at=now,\n                    modified_at=now,\n                    had_multiple_clients=False,\n                    handled_request_count=0,\n                    pending_request_count=0,\n                    total_request_count=0,\n                )\n                client = cls(\n                    metadata=metadata,\n                    path_to_rq=path_to_rq,\n                    lock=asyncio.Lock(),\n                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),\n                )\n                await client._state.initialize()\n                await client._update_metadata()\n\n        return client\n\n    @override\n    async def drop(self) -> None:\n        async with self._lock:\n            # Remove the RQ dir recursively if it exists.\n            if self.path_to_rq.exists():\n                await asyncio.to_thread(shutil.rmtree, self.path_to_rq)\n\n            # Clear recoverable state\n            await self._state.reset()\n            await self._state.teardown()\n            self._request_cache.clear()\n            self._request_cache_needs_refresh = True\n\n           
 # Invalidate is_empty cache.\n            self._is_empty_cache = None\n\n    @override\n    async def purge(self) -> None:\n        async with self._lock:\n            request_files = await self._get_request_files(self.path_to_rq)\n\n            for file_path in request_files:\n                await asyncio.to_thread(file_path.unlink, missing_ok=True)\n\n            # Clear recoverable state\n            await self._state.reset()\n            self._request_cache.clear()\n            self._request_cache_needs_refresh = True\n\n            await self._update_metadata(\n                update_modified_at=True,\n                update_accessed_at=True,\n                new_pending_request_count=0,\n                new_handled_request_count=0,\n                new_total_request_count=0,\n            )\n\n            # Invalidate is_empty cache.\n            self._is_empty_cache = None\n\n    @override\n    async def add_batch_of_requests(\n        self,\n        requests: Sequence[Request],\n        *,\n        forefront: bool = False,\n    ) -> AddRequestsResponse:\n        async with self._lock:\n            self._is_empty_cache = None\n            new_total_request_count = self._metadata.total_request_count\n            new_pending_request_count = self._metadata.pending_request_count\n            processed_requests = list[ProcessedRequest]()\n            unprocessed_requests = list[UnprocessedRequest]()\n            state = self._state.current_value\n\n            all_requests = state.forefront_requests | state.regular_requests\n\n            requests_to_enqueue = {}\n\n            # Determine which requests can be added or are modified.\n            for request in requests:\n                # Check if the request has already been handled.\n                if request.unique_key in state.handled_requests:\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n             
               was_already_present=True,\n                            was_already_handled=True,\n                        )\n                    )\n                # Check if the request is already in progress.\n                # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the\n                # position of the request.\n                elif (request.unique_key in state.in_progress_requests) or (\n                    request.unique_key in all_requests and not forefront\n                ):\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=True,\n                            was_already_handled=False,\n                        )\n                    )\n                # These requests must either be added or update their position.\n                else:\n                    requests_to_enqueue[request.unique_key] = request\n\n            # Process each request in the batch.\n            for request in requests_to_enqueue.values():\n                # If the request is not already in the RQ, this is a new request.\n                if request.unique_key not in all_requests:\n                    request_path = self._get_request_path(request.unique_key)\n                    # Add sequence number to ensure FIFO ordering using state.\n                    if forefront:\n                        sequence_number = state.forefront_sequence_counter\n                        state.forefront_sequence_counter += 1\n                        state.forefront_requests[request.unique_key] = sequence_number\n                    else:\n                        sequence_number = state.sequence_counter\n                        state.sequence_counter += 1\n                        state.regular_requests[request.unique_key] = sequence_number\n\n                    # Save the clean request without 
extra fields\n                    request_data = await json_dumps(request.model_dump())\n                    await atomic_write(request_path, request_data)\n\n                    # Update the metadata counts.\n                    new_total_request_count += 1\n                    new_pending_request_count += 1\n\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=False,\n                            was_already_handled=False,\n                        )\n                    )\n\n                # If the request already exists in the RQ and use the forefront flag to update its position\n                elif forefront:\n                    # If the request is among `regular`, remove it from its current position.\n                    if request.unique_key in state.regular_requests:\n                        state.regular_requests.pop(request.unique_key)\n\n                    # If the request is already in `forefront`, we just need to update its position.\n                    state.forefront_requests[request.unique_key] = state.forefront_sequence_counter\n                    state.forefront_sequence_counter += 1\n\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=True,\n                            was_already_handled=False,\n                        )\n                    )\n\n                else:\n                    logger.warning(f'Request with unique key \"{request.unique_key}\" could not be processed.')\n                    unprocessed_requests.append(\n                        UnprocessedRequest(\n                            unique_key=request.unique_key,\n                            url=request.url,\n                            method=request.method,\n                       
 )\n                    )\n\n            await self._update_metadata(\n                update_modified_at=True,\n                update_accessed_at=True,\n                new_total_request_count=new_total_request_count,\n                new_pending_request_count=new_pending_request_count,\n            )\n\n            # Invalidate the cache if we added forefront requests.\n            if forefront:\n                self._request_cache_needs_refresh = True\n\n            # Invalidate is_empty cache.\n            self._is_empty_cache = None\n\n            return AddRequestsResponse(\n                processed_requests=processed_requests,\n                unprocessed_requests=unprocessed_requests,\n            )\n\n    @override\n    async def get_request(self, unique_key: str) -> Request | None:\n        async with self._lock:\n            request_path = self._get_request_path(unique_key)\n            request = await self._parse_request_file(request_path)\n\n            if request is None:\n                logger.warning(f'Request with unique key \"{unique_key}\" not found in the queue.')\n                return None\n\n            await self._update_metadata(update_accessed_at=True)\n            return request\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        async with self._lock:\n            # Refresh cache if needed or if it's empty.\n            if self._request_cache_needs_refresh or not self._request_cache:\n                await self._refresh_cache()\n\n            next_request: Request | None = None\n            state = self._state.current_value\n\n            # Fetch from the front of the deque (forefront requests are at the beginning).\n            while self._request_cache and next_request is None:\n                candidate = self._request_cache.popleft()\n\n                # Skip requests that are already in progress, however this should not happen.\n                if candidate.unique_key not in 
state.in_progress_requests:\n                    next_request = candidate\n\n            if next_request is not None:\n                state.in_progress_requests.add(next_request.unique_key)\n\n            return next_request\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        async with self._lock:\n            self._is_empty_cache = None\n            state = self._state.current_value\n\n            # Check if the request is in progress.\n            if request.unique_key not in state.in_progress_requests:\n                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')\n                return None\n\n            # Update the request's handled_at timestamp.\n            if request.handled_at is None:\n                request.handled_at = datetime.now(timezone.utc)\n\n            # Dump the updated request to the file.\n            request_path = self._get_request_path(request.unique_key)\n\n            if not await asyncio.to_thread(request_path.exists):\n                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')\n                return None\n\n            request_data = await json_dumps(request.model_dump())\n            await atomic_write(request_path, request_data)\n\n            # Update state: remove from in-progress and add to handled.\n            state.in_progress_requests.discard(request.unique_key)\n            state.handled_requests.add(request.unique_key)\n\n            # Update RQ metadata.\n            await self._update_metadata(\n                update_modified_at=True,\n                update_accessed_at=True,\n                new_handled_request_count=self._metadata.handled_request_count + 1,\n                new_pending_request_count=self._metadata.pending_request_count - 1,\n            )\n\n            return ProcessedRequest(\n                unique_key=request.unique_key,\n         
       was_already_present=True,\n                was_already_handled=True,\n            )\n\n    @override\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        async with self._lock:\n            self._is_empty_cache = None\n            state = self._state.current_value\n\n            # Check if the request is in progress.\n            if request.unique_key not in state.in_progress_requests:\n                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')\n                return None\n\n            request_path = self._get_request_path(request.unique_key)\n\n            if not await asyncio.to_thread(request_path.exists):\n                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')\n                return None\n\n            # Update sequence number and state to ensure proper ordering.\n            if forefront:\n                # Remove from regular requests if it was there\n                state.regular_requests.pop(request.unique_key, None)\n                sequence_number = state.forefront_sequence_counter\n                state.forefront_sequence_counter += 1\n                state.forefront_requests[request.unique_key] = sequence_number\n            else:\n                # Remove from forefront requests if it was there\n                state.forefront_requests.pop(request.unique_key, None)\n                sequence_number = state.sequence_counter\n                state.sequence_counter += 1\n                state.regular_requests[request.unique_key] = sequence_number\n\n            # Save the clean request without extra fields\n            request_data = await json_dumps(request.model_dump())\n            await atomic_write(request_path, request_data)\n\n            # Remove from in-progress.\n            state.in_progress_requests.discard(request.unique_key)\n\n          
  # Update RQ metadata.\n            await self._update_metadata(\n                update_modified_at=True,\n                update_accessed_at=True,\n            )\n\n            # Add the request back to the cache.\n            if forefront:\n                self._request_cache.appendleft(request)\n            else:\n                self._request_cache.append(request)\n\n            return ProcessedRequest(\n                unique_key=request.unique_key,\n                was_already_present=True,\n                was_already_handled=False,\n            )\n\n    @override\n    async def is_empty(self) -> bool:\n        async with self._lock:\n            # If we have a cached value, return it immediately.\n            if self._is_empty_cache is not None:\n                return self._is_empty_cache\n\n            state = self._state.current_value\n\n            # If there are in-progress requests, return False immediately.\n            if len(state.in_progress_requests) > 0:\n                self._is_empty_cache = False\n                return False\n\n            # If we have cached requests, check them first (fast path).\n            if self._request_cache:\n                for req in self._request_cache:\n                    if req.unique_key not in state.handled_requests:\n                        self._is_empty_cache = False\n                        return False\n                self._is_empty_cache = True\n                return len(state.in_progress_requests) == 0\n\n            # Fallback: check state for unhandled requests.\n            await self._update_metadata(update_accessed_at=True)\n\n            # Check if there are any requests that are not handled\n            all_requests = set(state.forefront_requests.keys()) | set(state.regular_requests.keys())\n            unhandled_requests = all_requests - state.handled_requests\n\n            if unhandled_requests:\n                self._is_empty_cache = False\n                return False\n\n            
self._is_empty_cache = True\n            return True\n\n    def _get_request_path(self, unique_key: str) -> Path:\n        \"\"\"Get the path to a specific request file.\n\n        Args:\n            unique_key: Unique key of the request.\n\n        Returns:\n            The path to the request file.\n        \"\"\"\n        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'\n\n    async def _update_metadata(\n        self,\n        *,\n        new_handled_request_count: int | None = None,\n        new_pending_request_count: int | None = None,\n        new_total_request_count: int | None = None,\n        update_had_multiple_clients: bool = False,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n    ) -> None:\n        \"\"\"Update the request queue metadata file with current information.\n\n        Args:\n            new_handled_request_count: If provided, update the handled_request_count to this value.\n            new_pending_request_count: If provided, update the pending_request_count to this value.\n            new_total_request_count: If provided, update the total_request_count to this value.\n            update_had_multiple_clients: If True, set had_multiple_clients to True.\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n        \"\"\"\n        # Always create a new timestamp to ensure it's truly updated\n        now = datetime.now(timezone.utc)\n\n        # Update timestamps according to parameters\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n\n        if update_modified_at:\n            self._metadata.modified_at = now\n\n        # Update request counts if provided\n        if new_handled_request_count is not None:\n            self._metadata.handled_request_count = new_handled_request_count\n\n        if 
new_pending_request_count is not None:\n            self._metadata.pending_request_count = new_pending_request_count\n\n        if new_total_request_count is not None:\n            self._metadata.total_request_count = new_total_request_count\n\n        if update_had_multiple_clients:\n            self._metadata.had_multiple_clients = True\n\n        # Ensure the parent directory for the metadata file exists.\n        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)\n\n        # Dump the serialized metadata to the file.\n        data = await json_dumps(self._metadata.model_dump())\n        await atomic_write(self.path_to_metadata, data)\n\n    async def _refresh_cache(self) -> None:\n        \"\"\"Refresh the request cache from filesystem.\n\n        This method loads up to _MAX_REQUESTS_IN_CACHE requests from the filesystem,\n        prioritizing forefront requests and maintaining proper ordering.\n        \"\"\"\n        self._request_cache.clear()\n        state = self._state.current_value\n\n        forefront_requests = list[tuple[Request, int]]()  # (request, sequence)\n        regular_requests = list[tuple[Request, int]]()  # (request, sequence)\n\n        request_files = await self._get_request_files(self.path_to_rq)\n\n        for request_file in request_files:\n            request = await self._parse_request_file(request_file)\n\n            if request is None:\n                continue\n\n            # Skip handled requests\n            if request.unique_key in state.handled_requests:\n                continue\n\n            # Skip in-progress requests\n            if request.unique_key in state.in_progress_requests:\n                continue\n\n            # Determine if request is forefront or regular based on state\n            if request.unique_key in state.forefront_requests:\n                sequence = state.forefront_requests[request.unique_key]\n                forefront_requests.append((request, sequence))\n 
           elif request.unique_key in state.regular_requests:\n                sequence = state.regular_requests[request.unique_key]\n                regular_requests.append((request, sequence))\n            else:\n                # Request not in state, skip it (might be orphaned)\n                logger.warning(f'Request {request.unique_key} not found in state, skipping.')\n                continue\n\n        # Sort forefront requests by sequence (newest first for LIFO behavior).\n        forefront_requests.sort(key=lambda item: item[1], reverse=True)\n\n        # Sort regular requests by sequence (oldest first for FIFO behavior).\n        regular_requests.sort(key=lambda item: item[1], reverse=False)\n\n        # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted\n        # by sequence (newest first), we need to add them in reverse order to maintain correct priority.\n        for request, _ in reversed(forefront_requests):\n            if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:\n                break\n            self._request_cache.appendleft(request)\n\n        # Add regular requests to the end of the cache (right side).\n        for request, _ in regular_requests:\n            if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:\n                break\n            self._request_cache.append(request)\n\n        self._request_cache_needs_refresh = False\n\n    @classmethod\n    async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:\n        \"\"\"Get all request files from the RQ.\n\n        Args:\n            path_to_rq: The path to the request queue directory.\n\n        Returns:\n            A list of paths to all request files.\n        \"\"\"\n        # Create the requests directory if it doesn't exist.\n        await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)\n\n        # List all the json files.\n        files = list(await 
asyncio.to_thread(path_to_rq.glob, '*.json'))\n\n        # Filter out metadata file and non-file entries.\n        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)\n\n        return list(filtered)\n\n    @classmethod\n    async def _parse_request_file(cls, file_path: Path) -> Request | None:\n        \"\"\"Parse a request file and return the `Request` object.\n\n        Args:\n            file_path: The path to the request file.\n\n        Returns:\n            The parsed `Request` object or `None` if the file could not be read or parsed.\n        \"\"\"\n        # Open the request file.\n        try:\n            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))\n        except FileNotFoundError:\n            logger.warning(f'Request file \"{file_path}\" not found.')\n            return None\n\n        # Read the file content and parse it as JSON.\n        try:\n            file_content = json.load(file)\n        except json.JSONDecodeError as exc:\n            logger.warning(f'Failed to parse request file {file_path}: {exc!s}')\n            return None\n        finally:\n            await asyncio.to_thread(file.close)\n\n        # Validate the content against the Request model.\n        try:\n            return Request.model_validate(file_content)\n        except ValidationError as exc:\n            logger.warning(f'Failed to validate request file {file_path}: {exc!s}')\n            return None\n\n    async def _discover_existing_requests(self) -> None:\n        \"\"\"Discover and load existing requests into the state when opening an existing request queue.\"\"\"\n        request_files = await self._get_request_files(self.path_to_rq)\n        state = self._state.current_value\n\n        for request_file in request_files:\n            request = await self._parse_request_file(request_file)\n            if request is None:\n                continue\n\n         
   # Add request to state as regular request (assign sequence numbers)\n            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:\n                # Assign as regular request with current sequence counter\n                state.regular_requests[request.unique_key] = state.sequence_counter\n                state.sequence_counter += 1\n\n                # Check if request was already handled\n                if request.handled_at is not None:\n                    state.handled_requests.add(request.unique_key)\n\n    @staticmethod\n    def _get_file_base_name_from_unique_key(unique_key: str) -> str:\n        \"\"\"Generate a deterministic file name for a unique_key.\n\n        Args:\n            unique_key: Unique key to be used to generate filename.\n\n        Returns:\n            A file name based on the unique_key.\n        \"\"\"\n        # hexdigest produces filename-compliant strings\n        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()\n        name_length = 15\n        # Truncate the key to the desired length\n        return hashed_key[:name_length]\n"
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/_storage_client.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients._base import StorageClient\n\nfrom ._dataset_client import FileSystemDatasetClient\nfrom ._key_value_store_client import FileSystemKeyValueStoreClient\nfrom ._request_queue_client import FileSystemRequestQueueClient\n\nif TYPE_CHECKING:\n    from collections.abc import Hashable\n\n\n@docs_group('Storage clients')\nclass FileSystemStorageClient(StorageClient):\n    \"\"\"File system implementation of the storage client.\n\n    This storage client provides access to datasets, key-value stores, and request queues that persist data\n    to the local file system. Each storage type is implemented with its own specific file system client\n    that stores data in a structured directory hierarchy.\n\n    Data is stored in JSON format in predictable file paths, making it easy to inspect and manipulate\n    the stored data outside of the Crawlee application if needed.\n\n    All data persists between program runs but is limited to access from the local machine\n    where the files are stored.\n\n    Warning: This storage client is not safe for concurrent access from multiple crawler processes.\n    Use it only when running a single crawler process at a time.\n    \"\"\"\n\n    @override\n    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:\n        # Even different client instances should return same storage if the storage_dir is the same.\n        return super().get_storage_client_cache_key(configuration), configuration.storage_dir\n\n    @override\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> 
FileSystemDatasetClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await FileSystemDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> FileSystemKeyValueStoreClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await FileSystemKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> FileSystemRequestQueueClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await FileSystemRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)\n        await self._purge_if_needed(client, configuration)\n        return client\n"
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/_utils.py",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/_file_system/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/_memory/__init__.py",
    "content": "from ._dataset_client import MemoryDatasetClient\nfrom ._key_value_store_client import MemoryKeyValueStoreClient\nfrom ._request_queue_client import MemoryRequestQueueClient\nfrom ._storage_client import MemoryStorageClient\n\n__all__ = [\n    'MemoryDatasetClient',\n    'MemoryKeyValueStoreClient',\n    'MemoryRequestQueueClient',\n    'MemoryStorageClient',\n]\n"
  },
  {
    "path": "src/crawlee/storage_clients/_memory/_dataset_client.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any\n\nfrom typing_extensions import Self, override\n\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import DatasetClient\nfrom crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\nlogger = getLogger(__name__)\n\n\nclass MemoryDatasetClient(DatasetClient):\n    \"\"\"Memory implementation of the dataset client.\n\n    This client stores dataset items in memory using Python lists and dictionaries. No data is persisted\n    between process runs, meaning all stored data is lost when the program terminates. This implementation\n    is primarily useful for testing, development, and short-lived crawler operations where persistent\n    storage is not required.\n\n    The memory implementation provides fast access to data but is limited by available memory and\n    does not support data sharing across different processes. It supports all dataset operations including\n    sorting, filtering, and pagination, but performs them entirely in memory.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: DatasetMetadata,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `MemoryDatasetClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._records = list[dict[str, Any]]()\n        \"\"\"List to hold dataset items. 
Each item is a dictionary representing a record.\"\"\"\n\n    @override\n    async def get_metadata(self) -> DatasetMetadata:\n        return self._metadata\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n    ) -> Self:\n        \"\"\"Open or create a new memory dataset client.\n\n        This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory\n        datasets don't check for existing datasets with the same name or ID since all data exists only in memory\n        and is lost when the process terminates.\n\n        Alias does not have any effect on the memory storage client implementation, because unnamed storages\n        are supported by default, since data are not persisted.\n\n        Args:\n            id: The ID of the dataset. If not provided, a random ID will be generated.\n            name: The name of the dataset for named (global scope) storages.\n            alias: The alias of the dataset for unnamed (run scope) storages.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        # Create a new dataset\n        dataset_id = id or crypto_random_object_id()\n        now = datetime.now(timezone.utc)\n\n        metadata = DatasetMetadata(\n            id=dataset_id,\n            name=name,\n            created_at=now,\n            accessed_at=now,\n            modified_at=now,\n            item_count=0,\n        )\n\n        return cls(metadata=metadata)\n\n    @override\n    async def drop(self) -> None:\n        self._records.clear()\n        await self._update_metadata(\n            update_accessed_at=True,\n            
update_modified_at=True,\n            new_item_count=0,\n        )\n\n    @override\n    async def purge(self) -> None:\n        self._records.clear()\n        await self._update_metadata(\n            update_accessed_at=True,\n            update_modified_at=True,\n            new_item_count=0,\n        )\n\n    @override\n    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:\n        metadata = await self.get_metadata()\n        new_item_count = metadata.item_count\n\n        if isinstance(data, list):\n            for item in data:\n                new_item_count += 1\n                await self._push_item(item)\n        else:\n            new_item_count += 1\n            await self._push_item(data)\n\n        await self._update_metadata(\n            update_accessed_at=True,\n            update_modified_at=True,\n            new_item_count=new_item_count,\n        )\n\n    @override\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        # Check for unsupported arguments and log a warning if found\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n            'flatten': flatten,\n            'view': view,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of get_data are not 
supported '\n                f'by the {self.__class__.__name__} client.'\n            )\n\n        total = len(self._records)\n        items = self._records.copy()\n\n        # Apply skip_empty filter if requested\n        if skip_empty:\n            items = [item for item in items if item]\n\n        # Apply sorting\n        if desc:\n            items = list(reversed(items))\n\n        # Apply pagination\n        sliced_items = items[offset : (offset + limit) if limit is not None else total]\n\n        await self._update_metadata(update_accessed_at=True)\n\n        return DatasetItemsListPage(\n            count=len(sliced_items),\n            offset=offset,\n            limit=limit or (total - offset),\n            total=total,\n            desc=desc,\n            items=sliced_items,\n        )\n\n    @override\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = None,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        # Check for unsupported arguments and log a warning if found\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of iterate are not supported '\n                f'by the {self.__class__.__name__} client.'\n            )\n\n        items = self._records.copy()\n\n        # Apply sorting\n        if desc:\n            items = list(reversed(items))\n\n        # Apply 
pagination\n        sliced_items = items[offset : (offset + limit) if limit is not None else len(items)]\n\n        # Yield items one by one\n        for item in sliced_items:\n            if skip_empty and not item:\n                continue\n            yield item\n\n        await self._update_metadata(update_accessed_at=True)\n\n    async def _update_metadata(\n        self,\n        *,\n        new_item_count: int | None = None,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n    ) -> None:\n        \"\"\"Update the dataset metadata with current information.\n\n        Args:\n            new_item_count: If provided, update the item count to this value.\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n        if update_modified_at:\n            self._metadata.modified_at = now\n        if new_item_count is not None:\n            self._metadata.item_count = new_item_count\n\n    async def _push_item(self, item: dict[str, Any]) -> None:\n        \"\"\"Push a single item to the dataset.\n\n        Args:\n            item: The data item to add to the dataset.\n        \"\"\"\n        self._records.append(item)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_memory/_key_value_store_client.py",
    "content": "from __future__ import annotations\n\nimport sys\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING, Any\n\nfrom typing_extensions import Self, override\n\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.file import infer_mime_type\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import KeyValueStoreClient\nfrom crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n\nclass MemoryKeyValueStoreClient(KeyValueStoreClient):\n    \"\"\"Memory implementation of the key-value store client.\n\n    This client stores data in memory as Python dictionaries. No data is persisted between\n    process runs, meaning all stored data is lost when the program terminates. This implementation\n    is primarily useful for testing, development, and short-lived crawler operations where\n    persistence is not required.\n\n    The memory implementation provides fast access to data but is limited by available memory and\n    does not support data sharing across different processes.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: KeyValueStoreMetadata,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._records = dict[str, KeyValueStoreRecord]()\n        \"\"\"Dictionary to hold key-value records.\"\"\"\n\n    @override\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        return self._metadata\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n    ) -> Self:\n        \"\"\"Open or create a new memory key-value store 
client.\n\n        This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,\n        memory KVS don't check for existing stores with the same name or ID since all data exists only in memory\n        and is lost when the process terminates.\n\n        Alias does not have any effect on the memory storage client implementation, because unnamed storages\n        are supported by default, since data are not persisted.\n\n        Args:\n            id: The ID of the key-value store. If not provided, a random ID will be generated.\n            name: The name of the key-value store for named (global scope) storages.\n            alias: The alias of the key-value store for unnamed (run scope) storages.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If both name and alias are provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        # Create a new key-value store\n        store_id = id or crypto_random_object_id()\n        now = datetime.now(timezone.utc)\n\n        metadata = KeyValueStoreMetadata(\n            id=store_id,\n            name=name,\n            created_at=now,\n            accessed_at=now,\n            modified_at=now,\n        )\n\n        return cls(metadata=metadata)\n\n    @override\n    async def drop(self) -> None:\n        self._records.clear()\n        await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def purge(self) -> None:\n        self._records.clear()\n        await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:\n        await self._update_metadata(update_accessed_at=True)\n\n        # Return None if key doesn't exist\n        return self._records.get(key, None)\n\n 
   @override\n    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:\n        content_type = content_type or infer_mime_type(value)\n        size = sys.getsizeof(value)\n\n        # Create and store the record\n        record = KeyValueStoreRecord(\n            key=key,\n            value=value,\n            content_type=content_type,\n            size=size,\n        )\n\n        self._records[key] = record\n\n        await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def delete_value(self, *, key: str) -> None:\n        if key in self._records:\n            del self._records[key]\n            await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n    @override\n    async def iterate_keys(\n        self,\n        *,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        await self._update_metadata(update_accessed_at=True)\n\n        # Get all keys, sorted alphabetically\n        keys = sorted(self._records.keys())\n\n        # Apply exclusive_start_key filter if provided\n        if exclusive_start_key is not None:\n            keys = [k for k in keys if k > exclusive_start_key]\n\n        # Apply limit if provided\n        if limit is not None:\n            keys = keys[:limit]\n\n        # Yield metadata for each key\n        for key in keys:\n            record = self._records[key]\n            yield KeyValueStoreRecordMetadata(\n                key=key,\n                content_type=record.content_type,\n                size=record.size,\n            )\n\n    @override\n    async def get_public_url(self, *, key: str) -> str:\n        raise NotImplementedError('Public URLs are not supported for memory key-value stores.')\n\n    @override\n    async def record_exists(self, *, key: str) -> bool:\n        await 
self._update_metadata(update_accessed_at=True)\n        return key in self._records\n\n    async def _update_metadata(\n        self,\n        *,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n    ) -> None:\n        \"\"\"Update the key-value store metadata with current information.\n\n        Args:\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n        if update_modified_at:\n            self._metadata.modified_at = now\n"
  },
  {
    "path": "src/crawlee/storage_clients/_memory/_request_queue_client.py",
    "content": "from __future__ import annotations\n\nfrom collections import deque\nfrom contextlib import suppress\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING\n\nfrom typing_extensions import Self, override\n\nfrom crawlee import Request\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import RequestQueueClient\nfrom crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\nlogger = getLogger(__name__)\n\n\nclass MemoryRequestQueueClient(RequestQueueClient):\n    \"\"\"Memory implementation of the request queue client.\n\n    No data is persisted between process runs, which means all requests are lost when the program terminates.\n    This implementation is primarily useful for testing, development, and short-lived crawler runs where\n    persistence is not required.\n\n    This client provides fast access to request data but is limited by available memory and does not support\n    data sharing across different processes.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        metadata: RequestQueueMetadata,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `MemoryRequestQueueClient.open` class method to create a new instance.\n        \"\"\"\n        self._metadata = metadata\n\n        self._pending_requests = deque[Request]()\n        \"\"\"Pending requests are those that have been added to the queue but not yet fetched for processing.\"\"\"\n\n        self._handled_requests = dict[str, Request]()\n        \"\"\"Handled requests are those that have been processed and marked as handled.\"\"\"\n\n        self._in_progress_requests = dict[str, Request]()\n        \"\"\"In-progress requests are those that have been fetched but 
not yet marked as handled or reclaimed.\"\"\"\n\n        self._requests_by_unique_key = dict[str, Request]()\n        \"\"\"Unique key -> Request mapping for fast lookup by unique key.\"\"\"\n\n    @override\n    async def get_metadata(self) -> RequestQueueMetadata:\n        return self._metadata\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n    ) -> Self:\n        \"\"\"Open or create a new memory request queue client.\n\n        This method creates a new in-memory request queue instance. Unlike persistent storage implementations,\n        memory queues don't check for existing queues with the same name or ID since all data exists only\n        in memory and is lost when the process terminates.\n\n        Alias does not have any effect on the memory storage client implementation, because unnamed storages\n        are supported by default, since data are not persisted.\n\n        Args:\n            id: The ID of the request queue. 
If not provided, a random ID will be generated.\n            name: The name of the request queue for named (global scope) storages.\n            alias: The alias of the request queue for unnamed (run scope) storages.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If both name and alias are provided.\n        \"\"\"\n        # Validate input parameters.\n        raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n        # Create a new queue\n        queue_id = id or crypto_random_object_id()\n        now = datetime.now(timezone.utc)\n\n        metadata = RequestQueueMetadata(\n            id=queue_id,\n            name=name,\n            created_at=now,\n            accessed_at=now,\n            modified_at=now,\n            had_multiple_clients=False,\n            handled_request_count=0,\n            pending_request_count=0,\n            total_request_count=0,\n        )\n\n        return cls(metadata=metadata)\n\n    @override\n    async def drop(self) -> None:\n        self._pending_requests.clear()\n        self._handled_requests.clear()\n        self._requests_by_unique_key.clear()\n        self._in_progress_requests.clear()\n\n        await self._update_metadata(\n            update_modified_at=True,\n            update_accessed_at=True,\n            new_handled_request_count=0,\n            new_pending_request_count=0,\n            new_total_request_count=0,\n        )\n\n    @override\n    async def purge(self) -> None:\n        self._pending_requests.clear()\n        self._handled_requests.clear()\n        self._requests_by_unique_key.clear()\n        self._in_progress_requests.clear()\n\n        await self._update_metadata(\n            update_modified_at=True,\n            update_accessed_at=True,\n            new_pending_request_count=0,\n            new_handled_request_count=0,\n            new_total_request_count=0,\n        )\n\n    @override\n    async def 
add_batch_of_requests(\n        self,\n        requests: Sequence[Request],\n        *,\n        forefront: bool = False,\n    ) -> AddRequestsResponse:\n        processed_requests = []\n        for request in requests:\n            # Check if the request is already in the queue by unique_key.\n            existing_request = self._requests_by_unique_key.get(request.unique_key)\n\n            was_already_present = existing_request is not None\n            was_already_handled = was_already_present and existing_request and existing_request.handled_at is not None\n            is_in_progress = request.unique_key in self._in_progress_requests\n\n            # If the request is already in the queue and handled, don't add it again.\n            if was_already_handled:\n                processed_requests.append(\n                    ProcessedRequest(\n                        unique_key=request.unique_key,\n                        was_already_present=True,\n                        was_already_handled=True,\n                    )\n                )\n                continue\n\n            # If the request is already in progress, don't add it again.\n            if is_in_progress:\n                processed_requests.append(\n                    ProcessedRequest(\n                        unique_key=request.unique_key,\n                        was_already_present=True,\n                        was_already_handled=False,\n                    )\n                )\n                continue\n\n            # If the request is already in the queue but not handled, update it.\n            if was_already_present and existing_request:\n                # Update indexes.\n                self._requests_by_unique_key[request.unique_key] = request\n\n                # We only update `forefront` by updating its position by shifting it to the left.\n                if forefront:\n                    # Update the existing request with any new data and\n                    # remove old request 
from pending queue if it's there.\n                    with suppress(ValueError):\n                        self._pending_requests.remove(existing_request)\n\n                    # Add updated request back to queue.\n                    self._pending_requests.appendleft(request)\n\n                processed_requests.append(\n                    ProcessedRequest(\n                        unique_key=request.unique_key,\n                        was_already_present=True,\n                        was_already_handled=False,\n                    )\n                )\n\n            # Add the new request to the queue.\n            else:\n                if forefront:\n                    self._pending_requests.appendleft(request)\n                else:\n                    self._pending_requests.append(request)\n\n                # Update indexes.\n                self._requests_by_unique_key[request.unique_key] = request\n\n                await self._update_metadata(\n                    new_total_request_count=self._metadata.total_request_count + 1,\n                    new_pending_request_count=self._metadata.pending_request_count + 1,\n                )\n\n            processed_requests.append(\n                ProcessedRequest(\n                    unique_key=request.unique_key,\n                    was_already_present=was_already_present,\n                    was_already_handled=False,\n                )\n            )\n\n        await self._update_metadata(update_accessed_at=True, update_modified_at=True)\n\n        return AddRequestsResponse(\n            processed_requests=processed_requests,\n            unprocessed_requests=[],\n        )\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        while self._pending_requests:\n            request = self._pending_requests.popleft()\n\n            # Skip if already handled (shouldn't happen, but safety check).\n            if request.was_already_handled:\n                continue\n\n       
     # Skip if already in progress (shouldn't happen, but safety check).\n            if request.unique_key in self._in_progress_requests:\n                continue\n\n            # Mark as in progress.\n            self._in_progress_requests[request.unique_key] = request\n            return request\n\n        return None\n\n    @override\n    async def get_request(self, unique_key: str) -> Request | None:\n        await self._update_metadata(update_accessed_at=True)\n        return self._requests_by_unique_key.get(unique_key)\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        # Check if the request is in progress.\n        if request.unique_key not in self._in_progress_requests:\n            return None\n\n        # Set handled_at timestamp if not already set.\n        if not request.was_already_handled:\n            request.handled_at = datetime.now(timezone.utc)\n\n        # Move request to handled storage.\n        self._handled_requests[request.unique_key] = request\n\n        # Update index (keep the request in indexes for get_request to work).\n        self._requests_by_unique_key[request.unique_key] = request\n\n        # Remove from in-progress.\n        del self._in_progress_requests[request.unique_key]\n\n        # Update metadata.\n        await self._update_metadata(\n            new_handled_request_count=self._metadata.handled_request_count + 1,\n            new_pending_request_count=self._metadata.pending_request_count - 1,\n            update_modified_at=True,\n        )\n\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=True,\n        )\n\n    @override\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        # Check if the request is in progress.\n        if request.unique_key 
not in self._in_progress_requests:\n            return None\n\n        # Remove from in-progress.\n        del self._in_progress_requests[request.unique_key]\n\n        # Add request back to pending queue.\n        if forefront:\n            self._pending_requests.appendleft(request)\n        else:\n            self._pending_requests.append(request)\n\n        # Update metadata timestamps.\n        await self._update_metadata(update_modified_at=True)\n\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=False,\n        )\n\n    @override\n    async def is_empty(self) -> bool:\n        \"\"\"Check if the queue is empty.\n\n        Returns:\n            True if the queue is empty, False otherwise.\n        \"\"\"\n        await self._update_metadata(update_accessed_at=True)\n\n        # Queue is empty if there are no pending requests and no requests in progress.\n        return len(self._pending_requests) == 0 and len(self._in_progress_requests) == 0\n\n    async def _update_metadata(\n        self,\n        *,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n        new_handled_request_count: int | None = None,\n        new_pending_request_count: int | None = None,\n        new_total_request_count: int | None = None,\n    ) -> None:\n        \"\"\"Update the request queue metadata with current information.\n\n        Args:\n            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.\n            update_modified_at: If True, update the `modified_at` timestamp to the current time.\n            new_handled_request_count: If provided, set the handled request count to this value.\n            new_pending_request_count: If provided, set the pending request count to this value.\n            new_total_request_count: If provided, set the total request count to this value.\n        \"\"\"\n        now = 
datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            self._metadata.accessed_at = now\n        if update_modified_at:\n            self._metadata.modified_at = now\n        if new_handled_request_count is not None:\n            self._metadata.handled_request_count = new_handled_request_count\n        if new_pending_request_count is not None:\n            self._metadata.pending_request_count = new_pending_request_count\n        if new_total_request_count is not None:\n            self._metadata.total_request_count = new_total_request_count\n"
  },
  {
    "path": "src/crawlee/storage_clients/_memory/_storage_client.py",
    "content": "from __future__ import annotations\n\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients._base import StorageClient\n\nfrom ._dataset_client import MemoryDatasetClient\nfrom ._key_value_store_client import MemoryKeyValueStoreClient\nfrom ._request_queue_client import MemoryRequestQueueClient\n\n\n@docs_group('Storage clients')\nclass MemoryStorageClient(StorageClient):\n    \"\"\"Memory implementation of the storage client.\n\n    This storage client provides access to datasets, key-value stores, and request queues that store all data\n    in memory using Python data structures (lists and dictionaries). No data is persisted between process runs,\n    meaning all stored data is lost when the program terminates.\n\n    The memory implementation provides fast access to data but is limited by available memory and does not\n    support data sharing across different processes. 
All storage operations happen entirely in memory with\n    no disk operations.\n\n    The memory storage client is useful for testing and development environments, or short-lived crawler\n    operations where persistence is not required.\n    \"\"\"\n\n    @override\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> MemoryDatasetClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await MemoryDatasetClient.open(id=id, name=name, alias=alias)\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> MemoryKeyValueStoreClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await MemoryKeyValueStoreClient.open(id=id, name=name, alias=alias)\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> MemoryRequestQueueClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        client = await MemoryRequestQueueClient.open(id=id, name=name, alias=alias)\n        await self._purge_if_needed(client, configuration)\n        return client\n"
  },
  {
    "path": "src/crawlee/storage_clients/_memory/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/_redis/__init__.py",
    "content": "from ._dataset_client import RedisDatasetClient\nfrom ._key_value_store_client import RedisKeyValueStoreClient\nfrom ._request_queue_client import RedisRequestQueueClient\nfrom ._storage_client import RedisStorageClient\n\n__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_client_mixin.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom contextlib import asynccontextmanager\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload\n\nfrom crawlee._utils.crypto import crypto_random_object_id\n\nfrom ._utils import await_redis_response, read_lua_script\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from redis.asyncio import Redis\n    from redis.asyncio.client import Pipeline\n    from redis.commands.core import AsyncScript\n    from typing_extensions import NotRequired, Self\n\n    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata\n\n\nlogger = getLogger(__name__)\n\n\nclass MetadataUpdateParams(TypedDict, total=False):\n    \"\"\"Parameters for updating metadata.\"\"\"\n\n    update_accessed_at: NotRequired[bool]\n    update_modified_at: NotRequired[bool]\n\n\nclass RedisClientMixin:\n    \"\"\"Mixin class for Redis clients.\n\n    This mixin provides common Redis operations and basic methods for Redis storage clients.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default storage name in key prefix when none provided.\"\"\"\n\n    _MAIN_KEY: ClassVar[str]\n    \"\"\"Main Redis key prefix for this storage type.\"\"\"\n\n    _CLIENT_TYPE: ClassVar[str]\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:\n        self._storage_name = storage_name\n        self._storage_id = storage_id\n        self._redis = redis\n\n        self._scripts_loaded = False\n\n    @property\n    def redis(self) -> Redis:\n        \"\"\"Return the Redis client instance.\"\"\"\n        return self._redis\n\n    @property\n    def metadata_key(self) -> str:\n        \"\"\"Return the Redis key for the metadata of this storage.\"\"\"\n        return 
f'{self._MAIN_KEY}:{self._storage_name}:metadata'\n\n    @classmethod\n    async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None:\n        \"\"\"Retrieve metadata by storage name.\n\n        Args:\n            name: The name of the storage.\n            redis: The Redis client instance.\n            with_wait: Whether to wait for the storage to be created if it doesn't exist.\n        \"\"\"\n        if with_wait:\n            # Wait for the creation signal (max 30 seconds)\n            await await_redis_response(redis.blpop([f'{cls._MAIN_KEY}:{name}:created_signal'], timeout=30))\n            # Signal consumed, push it back for other waiters\n            await await_redis_response(redis.lpush(f'{cls._MAIN_KEY}:{name}:created_signal', 1))\n\n        response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata'))\n        data = response[0] if response is not None and isinstance(response, list) else response\n        if data is not None and not isinstance(data, dict):\n            raise TypeError('The metadata data was received in an incorrect format.')\n        return data\n\n    @classmethod\n    async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None:\n        \"\"\"Retrieve storage name by ID from id_to_name index.\n\n        Args:\n            id: The ID of the storage.\n            redis: The Redis client instance.\n        \"\"\"\n        name = await await_redis_response(redis.hget(f'{cls._MAIN_KEY}:id_to_name', id))\n        if isinstance(name, str) or name is None:\n            return name\n        if isinstance(name, bytes):\n            return name.decode('utf-8')\n        return None\n\n    @classmethod\n    async def _open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],\n        redis: Redis,\n        
extra_metadata_fields: dict[str, Any],\n        instance_kwargs: dict[str, Any],\n    ) -> Self:\n        \"\"\"Open or create a new Redis storage client.\n\n        Args:\n            id: The ID of the storage. If not provided, a random ID will be generated.\n            name: The name of the storage for named (global scope) storages.\n            alias: The alias of the storage for unnamed (run scope) storages.\n            redis: Redis client instance.\n            metadata_model: Pydantic model for metadata validation.\n            extra_metadata_fields: Storage-specific metadata fields.\n            instance_kwargs: Additional arguments for the client constructor.\n\n        Returns:\n            An instance for the opened or created storage client.\n        \"\"\"\n        internal_name = name or alias or cls._DEFAULT_NAME\n        storage_id: str | None = None\n        # Determine if storage exists by ID or name\n        if id:\n            storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis)\n            storage_id = id\n            if storage_name is None:\n                raise ValueError(f'{cls._CLIENT_TYPE} with ID \"{id}\" does not exist.')\n        else:\n            metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis)\n            storage_name = internal_name if metadata_data is not None else None\n            storage_id = metadata_data['id'] if metadata_data is not None else None\n        # If both storage_name and storage_id are found, open existing storage\n        if storage_name and storage_id:\n            client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs)\n            async with client._get_pipeline() as pipe:\n                await client._update_metadata(pipe, update_accessed_at=True)\n        # Otherwise, create a new storage\n        else:\n            now = datetime.now(timezone.utc)\n            metadata = metadata_model(\n                
id=crypto_random_object_id(),\n                name=name,\n                created_at=now,\n                accessed_at=now,\n                modified_at=now,\n                **extra_metadata_fields,\n            )\n            client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)\n            created = await client._create_metadata_and_storage(internal_name, metadata.model_dump())\n            # The client was probably not created due to a race condition. Let's try to open it using the name.\n            if not created:\n                metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True)\n                client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)\n\n        # Ensure Lua scripts are loaded\n        await client._ensure_scripts_loaded()\n        return client\n\n    async def _load_scripts(self) -> None:\n        \"\"\"Load Lua scripts in Redis.\"\"\"\n        return\n\n    async def _ensure_scripts_loaded(self) -> None:\n        \"\"\"Ensure Lua scripts are loaded in Redis.\"\"\"\n        if not self._scripts_loaded:\n            await self._load_scripts()\n            self._scripts_loaded = True\n\n    @asynccontextmanager\n    async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]:\n        \"\"\"Create a new Redis pipeline.\"\"\"\n        async with self._redis.pipeline() as pipe:\n            try:\n                pipe.multi()\n                yield pipe\n            finally:\n                if with_execute:\n                    await pipe.execute()\n\n    async def _create_storage(self, pipeline: Pipeline) -> None:\n        \"\"\"Create the actual storage structure in Redis.\"\"\"\n\n    async def _create_script(self, script_name: str) -> AsyncScript:\n        \"\"\"Load a Lua script from a file and return a Script object.\"\"\"\n        script_content = await 
asyncio.to_thread(read_lua_script, script_name)\n\n        return self._redis.register_script(script_content)\n\n    async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) -> bool:\n        index_id_to_name = f'{self._MAIN_KEY}:id_to_name'\n        index_name_to_id = f'{self._MAIN_KEY}:name_to_id'\n        metadata['created_at'] = metadata['created_at'].isoformat()\n        metadata['accessed_at'] = metadata['accessed_at'].isoformat()\n        metadata['modified_at'] = metadata['modified_at'].isoformat()\n\n        # Try to create name_to_id index entry, if it already exists, return False.\n        name_to_id = await await_redis_response(self._redis.hsetnx(index_name_to_id, storage_name, metadata['id']))\n        # If name already exists, return False. Probably an attempt at parallel creation.\n        if not name_to_id:\n            return False\n\n        # Create id_to_name index entry, metadata, and storage structure in a transaction.\n        async with self._get_pipeline() as pipe:\n            await await_redis_response(pipe.hsetnx(index_id_to_name, metadata['id'], storage_name))\n            await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata))\n            await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1))\n\n            await self._create_storage(pipe)\n\n        return True\n\n    async def _drop(self, extra_keys: list[str]) -> None:\n        async with self._get_pipeline() as pipe:\n            await pipe.delete(self.metadata_key)\n            await pipe.delete(f'{self._MAIN_KEY}:id_to_name', self._storage_id)\n            await pipe.delete(f'{self._MAIN_KEY}:name_to_id', self._storage_name)\n            await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal')\n            for key in extra_keys:\n                await pipe.delete(key)\n\n    async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None:\n        
async with self._get_pipeline() as pipe:\n            for key in extra_keys:\n                await pipe.delete(key)\n            await self._update_metadata(pipe, **metadata_kwargs)\n            await self._create_storage(pipe)\n\n    @overload\n    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...\n    @overload\n    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...\n    @overload\n    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...\n\n    async def _get_metadata(\n        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]\n    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:\n        \"\"\"Retrieve client metadata.\"\"\"\n        metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis)\n        if metadata_dict is None:\n            raise ValueError(f'{self._CLIENT_TYPE} with name \"{self._storage_name}\" does not exist.')\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, update_accessed_at=True)\n\n        return metadata_model.model_validate(metadata_dict)\n\n    async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None:\n        \"\"\"Pipeline operations storage-specific metadata updates.\n\n        Must be implemented by concrete classes.\n\n        Args:\n            pipeline: The Redis pipeline to use for the update.\n            **kwargs: Storage-specific update parameters.\n        \"\"\"\n\n    async def _update_metadata(\n        self,\n        pipeline: Pipeline,\n        *,\n        update_accessed_at: bool = False,\n        update_modified_at: bool = False,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Update storage metadata combining common and specific fields.\n\n        Args:\n            pipeline: The Redis pipeline to 
use for the update.\n            update_accessed_at: Whether to update accessed_at timestamp.\n            update_modified_at: Whether to update modified_at timestamp.\n            **kwargs: Additional arguments for _specific_update_metadata.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n\n        if update_accessed_at:\n            await await_redis_response(\n                pipeline.json().set(self.metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True)\n            )\n        if update_modified_at:\n            await await_redis_response(\n                pipeline.json().set(self.metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True)\n            )\n\n        await self._specific_update_metadata(pipeline, **kwargs)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_dataset_client.py",
    "content": "from __future__ import annotations\n\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom typing_extensions import NotRequired, override\n\nfrom crawlee.storage_clients._base import DatasetClient\nfrom crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\nfrom ._client_mixin import MetadataUpdateParams, RedisClientMixin\nfrom ._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from redis.asyncio import Redis\n    from redis.asyncio.client import Pipeline\n\nlogger = getLogger(__name__)\n\n\nclass _DatasetMetadataUpdateParams(MetadataUpdateParams):\n    \"\"\"Parameters for updating dataset metadata.\"\"\"\n\n    new_item_count: NotRequired[int]\n    delta_item_count: NotRequired[int]\n\n\nclass RedisDatasetClient(DatasetClient, RedisClientMixin):\n    \"\"\"Redis implementation of the dataset client.\n\n    This client persists dataset items to Redis using JSON arrays for efficient storage and retrieval.\n    Items are stored as JSON objects with automatic ordering preservation through Redis list operations.\n\n    The dataset data is stored in Redis using the following key pattern:\n    - `datasets:{name}:items` - Redis JSON array containing all dataset items.\n    - `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.\n\n    Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.\n    The item ordering is preserved through Redis JSON array operations. 
All operations provide atomic consistency\n    through Redis transactions and pipeline operations.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default Dataset name key prefix when none provided.\"\"\"\n\n    _MAIN_KEY = 'datasets'\n    \"\"\"Main Redis key prefix for Dataset.\"\"\"\n\n    _CLIENT_TYPE = 'Dataset'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `RedisDatasetClient.open` class method to create a new instance.\n\n        Args:\n            storage_name: Internal storage name used for Redis keys.\n            storage_id: Unique identifier for the dataset.\n            redis: Redis client instance.\n        \"\"\"\n        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)\n\n    @property\n    def _items_key(self) -> str:\n        \"\"\"Return the Redis key for the items of this dataset.\"\"\"\n        return f'{self._MAIN_KEY}:{self._storage_name}:items'\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        redis: Redis,\n    ) -> RedisDatasetClient:\n        \"\"\"Open or create a new Redis dataset client.\n\n        This method attempts to open an existing dataset from the Redis database. If a dataset with the specified\n        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the dataset. 
If not provided, a random ID will be generated.\n            name: The name of the dataset for named (global scope) storages.\n            alias: The alias of the dataset for unnamed (run scope) storages.\n            redis: Redis client instance.\n\n        Returns:\n            An instance for the opened or created storage client.\n        \"\"\"\n        return await cls._open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=redis,\n            metadata_model=DatasetMetadata,\n            extra_metadata_fields={'item_count': 0},\n            instance_kwargs={},\n        )\n\n    @override\n    async def get_metadata(self) -> DatasetMetadata:\n        return await self._get_metadata(DatasetMetadata)\n\n    @override\n    async def drop(self) -> None:\n        await self._drop(extra_keys=[self._items_key])\n\n    @override\n    async def purge(self) -> None:\n        await self._purge(\n            extra_keys=[self._items_key],\n            metadata_kwargs=_DatasetMetadataUpdateParams(\n                new_item_count=0, update_accessed_at=True, update_modified_at=True\n            ),\n        )\n\n    @override\n    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:\n        if isinstance(data, dict):\n            data = [data]\n\n        async with self._get_pipeline() as pipe:\n            pipe.json().arrappend(self._items_key, '$', *data)\n            await self._update_metadata(\n                pipe,\n                **_DatasetMetadataUpdateParams(\n                    update_accessed_at=True, update_modified_at=True, delta_item_count=len(data)\n                ),\n            )\n\n    @override\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = 
None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        # Check for unsupported arguments and log a warning if found\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n            'flatten': flatten,\n            'view': view,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of get_data are not supported '\n                f'by the {self.__class__.__name__} client.'\n            )\n\n        metadata = await self.get_metadata()\n\n        total = metadata.item_count\n        json_path = '$'\n\n        # Apply sorting and pagination\n        match (desc, offset, limit):\n            case (True, 0, int()):\n                json_path += f'[-{limit}:]'\n            case (True, int(), None):\n                json_path += f'[:-{offset}]'\n            case (True, int(), int()):\n                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.\n                json_path += f'[-{offset + limit}:-{offset}]'  # ty: ignore[unsupported-operator]\n            case (False, 0, int()):\n                json_path += f'[:{limit}]'\n            case (False, int(), None):\n                json_path += f'[{offset}:]'\n            case (False, int(), int()):\n                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.\n                json_path += f'[{offset}:{offset + limit}]'  # ty: ignore[unsupported-operator]\n\n        if json_path == '$':\n            json_path = '$[*]'\n\n        data = await 
await_redis_response(self._redis.json().get(self._items_key, json_path))\n\n        if data is None:\n            data = []\n\n        data = [item for item in data if isinstance(item, dict)]\n\n        if skip_empty:\n            data = [item for item in data if item]\n\n        if desc:\n            data = list(reversed(data))\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))\n\n        return DatasetItemsListPage(\n            count=len(data),\n            offset=offset,\n            limit=limit or (total - offset),\n            total=total,\n            desc=desc,\n            items=data,\n        )\n\n    @override\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = None,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        \"\"\"Iterate over dataset items one by one.\n\n        This method yields items individually instead of loading all items at once,\n        which is more memory efficient for large datasets.\n        \"\"\"\n        # Log warnings for unsupported arguments\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of iterate_items are not supported '\n                f'by the {self.__class__.__name__} client.'\n            )\n\n        metadata = await 
self.get_metadata()\n        total_items = metadata.item_count\n\n        # Calculate actual range based on parameters\n        start_idx = offset\n        end_idx = min(total_items, offset + limit) if limit is not None else total_items\n\n        # Update accessed_at timestamp\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))\n\n        # Process items in batches for better network efficiency\n        batch_size = 100\n\n        for batch_start in range(start_idx, end_idx, batch_size):\n            batch_end = min(batch_start + batch_size, end_idx)\n\n            # Build JsonPath for batch slice\n            if desc:\n                # For descending order, we need to reverse the slice calculation\n                desc_batch_start = total_items - batch_end\n                desc_batch_end = total_items - batch_start\n                json_path = f'$[{desc_batch_start}:{desc_batch_end}]'\n            else:\n                json_path = f'$[{batch_start}:{batch_end}]'\n\n            # Get batch of items\n            batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path))\n\n            # Handle case where batch_items might be None or not a list\n            if batch_items is None:\n                continue\n\n            # Reverse batch if desc order (since we got items in normal order but need desc)\n            items_iter = reversed(batch_items) if desc else iter(batch_items)\n\n            # Yield items from batch\n            for item in items_iter:\n                # Apply skip_empty filter\n                if skip_empty and not item:\n                    continue\n\n                yield cast('dict[str, Any]', item)\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))\n\n    @override\n    async def _create_storage(self, 
pipeline: Pipeline) -> None:\n        \"\"\"Create the main dataset keys in Redis.\"\"\"\n        # Create an empty JSON array for items\n        await await_redis_response(pipeline.json().set(self._items_key, '$', []))\n\n    @override\n    async def _specific_update_metadata(\n        self,\n        pipeline: Pipeline,\n        *,\n        new_item_count: int | None = None,\n        delta_item_count: int | None = None,\n        **_kwargs: Any,\n    ) -> None:\n        \"\"\"Update the dataset metadata in the database.\n\n        Args:\n            pipeline: The Redis pipeline to use for the update.\n            new_item_count: If provided, update the item count to this value.\n            delta_item_count: If provided, increment the item count by this value.\n        \"\"\"\n        if new_item_count is not None:\n            await await_redis_response(\n                pipeline.json().set(self.metadata_key, '$.item_count', new_item_count, nx=False, xx=True)\n            )\n        elif delta_item_count is not None:\n            await await_redis_response(pipeline.json().numincrby(self.metadata_key, '$.item_count', delta_item_count))\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_key_value_store_client.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any\n\nfrom typing_extensions import override\n\nfrom crawlee._utils.file import infer_mime_type\nfrom crawlee.storage_clients._base import KeyValueStoreClient\nfrom crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata\n\nfrom ._client_mixin import MetadataUpdateParams, RedisClientMixin\nfrom ._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from redis.asyncio import Redis\n\nlogger = getLogger(__name__)\n\n\nclass RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):\n    \"\"\"Redis implementation of the key-value store client.\n\n    This client persists key-value data to Redis using hash data structures for efficient storage and retrieval.\n    Keys are mapped to values with automatic content type detection and size tracking for metadata management.\n\n    The key-value store data is stored in Redis using the following key pattern:\n    - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).\n    - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.\n    - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.\n\n    Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,\n    text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles\n    content type detection and maintains metadata about each record including size and MIME type information.\n\n    All operations are atomic through Redis hash operations and pipeline transactions. 
The client supports\n    concurrent access through Redis's built-in atomic operations for hash fields.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default Key-Value Store name key prefix when none provided.\"\"\"\n\n    _MAIN_KEY = 'key_value_stores'\n    \"\"\"Main Redis key prefix for Key-Value Store.\"\"\"\n\n    _CLIENT_TYPE = 'Key-value store'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance.\n        \"\"\"\n        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)\n\n    @property\n    def _items_key(self) -> str:\n        \"\"\"Return the Redis key for the items of KVS.\"\"\"\n        return f'{self._MAIN_KEY}:{self._storage_name}:items'\n\n    @property\n    def _metadata_items_key(self) -> str:\n        \"\"\"Return the Redis key for the items metadata of KVS.\"\"\"\n        return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items'\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        redis: Redis,\n    ) -> RedisKeyValueStoreClient:\n        \"\"\"Open or create a new Redis key-value store client.\n\n        This method attempts to open an existing key-value store from the Redis database. If a store with the specified\n        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the key-value store. 
If not provided, a random ID will be generated.\n            name: The name of the key-value store for named (global scope) storages.\n            alias: The alias of the key-value store for unnamed (run scope) storages.\n            redis: Redis client instance.\n\n        Returns:\n            An instance for the opened or created storage client.\n        \"\"\"\n        return await cls._open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=redis,\n            metadata_model=KeyValueStoreMetadata,\n            extra_metadata_fields={},\n            instance_kwargs={},\n        )\n\n    @override\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        return await self._get_metadata(KeyValueStoreMetadata)\n\n    @override\n    async def drop(self) -> None:\n        await self._drop(extra_keys=[self._items_key, self._metadata_items_key])\n\n    @override\n    async def purge(self) -> None:\n        await self._purge(\n            extra_keys=[self._items_key, self._metadata_items_key],\n            metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True),\n        )\n\n    @override\n    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:\n        # Special handling for None values\n        if value is None:\n            content_type = 'application/x-none'  # Special content type to identify None values\n            value_bytes = b''\n        else:\n            content_type = content_type or infer_mime_type(value)\n\n            # Serialize the value to bytes.\n            if 'application/json' in content_type:\n                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')\n            elif isinstance(value, str):\n                value_bytes = value.encode('utf-8')\n            elif isinstance(value, (bytes, bytearray)):\n                value_bytes = value\n            else:\n                # Fallback: 
attempt to convert to string and encode.\n                value_bytes = str(value).encode('utf-8')\n\n        size = len(value_bytes)\n        item_metadata = KeyValueStoreRecordMetadata(\n            key=key,\n            content_type=content_type,\n            size=size,\n        )\n\n        async with self._get_pipeline() as pipe:\n            # redis-py typing issue\n            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]\n\n            await await_redis_response(\n                pipe.hset(\n                    self._metadata_items_key,\n                    key,\n                    item_metadata.model_dump_json(),\n                )\n            )\n            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))\n\n    @override\n    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:\n        serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key))\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True))\n\n        if not isinstance(serialized_metadata_item, (str, bytes, bytearray)):\n            logger.warning(f'Metadata for key \"{key}\" is missing or invalid.')\n            return None\n\n        metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item)\n\n        # Handle None values\n        if metadata_item.content_type == 'application/x-none':\n            return KeyValueStoreRecord(value=None, **metadata_item.model_dump())\n\n        # Query the record by key\n        # redis-py typing issue\n        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]\n\n        if value_bytes is None:\n            logger.warning(f'Value for key \"{key}\" is missing.')\n            return 
None\n\n        # Handle JSON values\n        if 'application/json' in metadata_item.content_type:\n            try:\n                value = json.loads(value_bytes.decode('utf-8'))\n            except (json.JSONDecodeError, UnicodeDecodeError):\n                logger.warning(f'Failed to decode JSON value for key \"{key}\"')\n                return None\n        # Handle text values\n        elif metadata_item.content_type.startswith('text/'):\n            try:\n                value = value_bytes.decode('utf-8')\n            except UnicodeDecodeError:\n                logger.warning(f'Failed to decode text value for key \"{key}\"')\n                return None\n        # Handle binary values\n        else:\n            value = value_bytes\n\n        return KeyValueStoreRecord(value=value, **metadata_item.model_dump())\n\n    @override\n    async def delete_value(self, *, key: str) -> None:\n        async with self._get_pipeline() as pipe:\n            await await_redis_response(pipe.hdel(self._items_key, key))\n            await await_redis_response(pipe.hdel(self._metadata_items_key, key))\n            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))\n\n    @override\n    async def iterate_keys(\n        self,\n        *,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key))\n\n        if not items_data:\n            return  # No items to iterate over\n\n        if not isinstance(items_data, dict):\n            raise TypeError('The items data was received in an incorrect format.')\n\n        # Get all keys, sorted alphabetically\n        keys = sorted(items_data.keys())\n\n        # Apply exclusive_start_key filter if provided\n        if exclusive_start_key is not None:\n            bytes_exclusive_start_key = 
exclusive_start_key.encode()\n            keys = [k for k in keys if k > bytes_exclusive_start_key]\n\n        # Apply limit if provided\n        if limit is not None:\n            keys = keys[:limit]\n\n        # Yield metadata for each key\n        for key in keys:\n            record = items_data[key]\n            if not isinstance(record, (str, bytes)):\n                raise TypeError(f'Expected str or bytes, got {type(record)}')\n            yield KeyValueStoreRecordMetadata.model_validate_json(record)\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(\n                pipe,\n                **MetadataUpdateParams(update_accessed_at=True),\n            )\n\n    @override\n    async def get_public_url(self, *, key: str) -> str:\n        raise NotImplementedError('Public URLs are not supported for Redis key-value stores.')\n\n    @override\n    async def record_exists(self, *, key: str) -> bool:\n        async with self._get_pipeline(with_execute=False) as pipe:\n            await await_redis_response(pipe.hexists(self._items_key, key))\n            await self._update_metadata(\n                pipe,\n                **MetadataUpdateParams(update_accessed_at=True),\n            )\n            results = await pipe.execute()\n\n        return bool(results[0])\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_request_queue_client.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom collections import deque\nfrom datetime import datetime, timedelta, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, Literal\n\nfrom typing_extensions import NotRequired, override\n\nfrom crawlee import Request\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee.storage_clients._base import RequestQueueClient\nfrom crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata\n\nfrom ._client_mixin import MetadataUpdateParams, RedisClientMixin\nfrom ._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from redis.asyncio import Redis\n    from redis.asyncio.client import Pipeline\n    from redis.commands.core import AsyncScript\n\nlogger = getLogger(__name__)\n\n\nclass _QueueMetadataUpdateParams(MetadataUpdateParams):\n    \"\"\"Parameters for updating queue metadata.\"\"\"\n\n    new_handled_request_count: NotRequired[int]\n    new_pending_request_count: NotRequired[int]\n    new_total_request_count: NotRequired[int]\n    delta_handled_request_count: NotRequired[int]\n    delta_pending_request_count: NotRequired[int]\n    delta_total_request_count: NotRequired[int]\n    recalculate: NotRequired[bool]\n    update_had_multiple_clients: NotRequired[bool]\n\n\nclass RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):\n    \"\"\"Redis implementation of the request queue client.\n\n    This client persists requests to Redis using multiple data structures for efficient queue operations,\n    deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support\n    both regular and forefront (high-priority) insertion modes.\n\n    The implementation uses Bloom filters for efficient request deduplication and Redis lists for\n    queue operations. 
Request blocking and client coordination is handled through Redis hashes\n    with timestamp-based expiration for stale request recovery.\n\n    The request queue data is stored in Redis using the following key patterns:\n    - `request_queues:{name}:queue` - Redis list for FIFO request ordering\n    - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key\n    - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed\n    - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)\n    - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`\n        dedup_strategy)\n    - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)\n    - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)\n    - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata\n\n    Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list\n    operations. 
The implementation provides concurrent access safety through atomic Lua scripts,\n    Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default Request Queue name key prefix when none provided.\"\"\"\n\n    _MAIN_KEY = 'request_queues'\n    \"\"\"Main Redis key prefix for Request Queue.\"\"\"\n\n    _CLIENT_TYPE = 'Request queue'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    _MAX_BATCH_FETCH_SIZE = 10\n    \"\"\"Maximum number of requests to fetch in a single batch operation.\"\"\"\n\n    _BLOCK_REQUEST_TIME = 300_000  # milliseconds\n    \"\"\"Time in milliseconds to block a fetched request for other clients before it can be autoreclaimed.\"\"\"\n\n    _RECLAIM_INTERVAL = timedelta(seconds=30)\n    \"\"\"Interval to check for stale requests to reclaim.\"\"\"\n\n    def __init__(\n        self,\n        storage_name: str,\n        storage_id: str,\n        redis: Redis,\n        dedup_strategy: Literal['default', 'bloom'] = 'default',\n        bloom_error_rate: float = 1e-7,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `RedisRequestQueueClient.open` class method to create a new instance.\n        \"\"\"\n        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)\n\n        self._dedup_strategy = dedup_strategy\n        \"\"\"Deduplication strategy for the queue.\"\"\"\n\n        self._bloom_error_rate = bloom_error_rate\n        \"\"\"Desired false positive rate for Bloom filters.\"\"\"\n\n        self._pending_fetch_cache: deque[Request] = deque()\n        \"\"\"Cache for requests: ordered by sequence number.\"\"\"\n\n        self.client_key = crypto_random_object_id(length=32)[:32]\n        \"\"\"Unique identifier for this client instance.\"\"\"\n\n        # Lua scripts for atomic operations\n        self._fetch_script: AsyncScript | None = None\n        
self._reclaim_stale_script: AsyncScript | None = None\n        self._add_requests_script: AsyncScript | None = None\n\n        self._next_reclaim_stale: None | datetime = None\n\n    @property\n    def _added_filter_key(self) -> str:\n        \"\"\"Return the Redis key for the added requests Bloom filter.\"\"\"\n        if self._dedup_strategy != 'bloom':\n            raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.')\n        return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter'\n\n    @property\n    def _handled_filter_key(self) -> str:\n        \"\"\"Return the Redis key for the handled requests Bloom filter.\"\"\"\n        if self._dedup_strategy != 'bloom':\n            raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.')\n        return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter'\n\n    @property\n    def _pending_set_key(self) -> str:\n        \"\"\"Return the Redis key for the pending requests set.\"\"\"\n        if self._dedup_strategy != 'default':\n            raise RuntimeError('The pending requests set is only available with the default deduplication strategy.')\n        return f'{self._MAIN_KEY}:{self._storage_name}:pending_set'\n\n    @property\n    def _handled_set_key(self) -> str:\n        \"\"\"Return the Redis key for the handled requests set.\"\"\"\n        if self._dedup_strategy != 'default':\n            raise RuntimeError('The handled requests set is only available with the default deduplication strategy.')\n        return f'{self._MAIN_KEY}:{self._storage_name}:handled_set'\n\n    @property\n    def _queue_key(self) -> str:\n        \"\"\"Return the Redis key for the request queue.\"\"\"\n        return f'{self._MAIN_KEY}:{self._storage_name}:queue'\n\n    @property\n    def _data_key(self) -> str:\n        \"\"\"Return the Redis key for the request data hash.\"\"\"\n        return 
f'{self._MAIN_KEY}:{self._storage_name}:data'\n\n    @property\n    def _in_progress_key(self) -> str:\n        \"\"\"Return the Redis key for the in-progress requests hash.\"\"\"\n        return f'{self._MAIN_KEY}:{self._storage_name}:in_progress'\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        redis: Redis,\n        dedup_strategy: Literal['default', 'bloom'] = 'default',\n        bloom_error_rate: float = 1e-7,\n    ) -> RedisRequestQueueClient:\n        \"\"\"Open or create a new Redis request queue client.\n\n        This method attempts to open an existing request queue from the Redis database. If a queue with the specified\n        ID or name exists, it loads the metadata from the database. If no existing queue is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the request queue. If not provided, a random ID will be generated.\n            name: The name of the request queue for named (global scope) storages.\n            alias: The alias of the request queue for unnamed (run scope) storages.\n            redis: Redis client instance.\n            dedup_strategy: Strategy for request queue deduplication. Options are:\n                - 'default': Uses Redis sets for exact deduplication.\n                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\n                    this approach, there is a small probability (`bloom_error_rate`, 1e-7 by default) that a new\n                    request is mistaken for an already seen one and skipped.\n            bloom_error_rate: Desired false positive rate for Bloom filter deduplication. 
Only relevant if\n                `dedup_strategy` is set to 'bloom'.\n\n        Returns:\n            An instance for the opened or created storage client.\n        \"\"\"\n        return await cls._open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=redis,\n            metadata_model=RequestQueueMetadata,\n            extra_metadata_fields={\n                'had_multiple_clients': False,\n                'handled_request_count': 0,\n                'pending_request_count': 0,\n                'total_request_count': 0,\n            },\n            instance_kwargs={'dedup_strategy': dedup_strategy, 'bloom_error_rate': bloom_error_rate},\n        )\n\n    @override\n    async def get_metadata(self) -> RequestQueueMetadata:\n        return await self._get_metadata(RequestQueueMetadata)\n\n    @override\n    async def drop(self) -> None:\n        if self._dedup_strategy == 'bloom':\n            extra_keys = [self._added_filter_key, self._handled_filter_key]\n        elif self._dedup_strategy == 'default':\n            extra_keys = [self._pending_set_key, self._handled_set_key]\n        else:\n            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')\n        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])\n        await self._drop(extra_keys=extra_keys)\n\n    @override\n    async def purge(self) -> None:\n        if self._dedup_strategy == 'bloom':\n            extra_keys = [self._added_filter_key, self._handled_filter_key]\n        elif self._dedup_strategy == 'default':\n            extra_keys = [self._pending_set_key, self._handled_set_key]\n        else:\n            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')\n        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])\n        await self._purge(\n            extra_keys=extra_keys,\n            metadata_kwargs=_QueueMetadataUpdateParams(\n               
 update_accessed_at=True,\n                update_modified_at=True,\n                new_pending_request_count=0,\n                new_handled_request_count=0,\n                new_total_request_count=0,\n            ),\n        )\n\n    @override\n    async def add_batch_of_requests(\n        self,\n        requests: Sequence[Request],\n        *,\n        forefront: bool = False,\n    ) -> AddRequestsResponse:\n        if self._add_requests_script is None:\n            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')\n\n        processed_requests = []\n\n        delta_pending = 0\n        delta_total = 0\n\n        requests_by_unique_key = {req.unique_key: req for req in requests}\n        unique_keys = list(requests_by_unique_key.keys())\n        # Check which requests are already added or handled\n        async with self._get_pipeline(with_execute=False) as pipe:\n            if self._dedup_strategy == 'default':\n                await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys))\n                await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys))\n            elif self._dedup_strategy == 'bloom':\n                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))\n                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))\n\n            pipe_results = await pipe.execute()\n\n        added_pending_flags = pipe_results[0]\n        handled_flags = pipe_results[1]\n\n        new_unique_keys = []\n        new_request_data = {}\n        delta_pending = 0\n        delta_total = 0\n\n        for i, unique_key in enumerate(unique_keys):\n            # Already handled - skip\n            if handled_flags[i]:\n                processed_requests.append(\n                    ProcessedRequest(\n                        unique_key=unique_key,\n                        was_already_present=True,\n     
                   was_already_handled=True,\n                    )\n                )\n                continue\n\n            # Already in queue - skip\n            if added_pending_flags[i]:\n                processed_requests.append(\n                    ProcessedRequest(\n                        unique_key=unique_key,\n                        was_already_present=True,\n                        was_already_handled=False,\n                    )\n                )\n                continue\n\n            # New request - will add to queue\n            request = requests_by_unique_key[unique_key]\n\n            new_unique_keys.append(unique_key)\n            new_request_data[unique_key] = request.model_dump_json()\n\n        if new_unique_keys:\n            # Add new requests to the queue atomically, get back which were actually added\n            script_results = await self._add_requests_script(\n                keys=[\n                    self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key,\n                    self._queue_key,\n                    self._data_key,\n                ],\n                args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)],\n            )\n            actually_added = set(json.loads(script_results))\n\n            delta_pending = len(actually_added)\n            delta_total = len(actually_added)\n\n            processed_requests.extend(\n                [\n                    ProcessedRequest(\n                        unique_key=unique_key,\n                        was_already_present=unique_key not in actually_added,\n                        was_already_handled=False,\n                    )\n                    for unique_key in new_unique_keys\n                ]\n            )\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(\n                pipe,\n                **_QueueMetadataUpdateParams(\n                    
update_accessed_at=True,\n                    update_modified_at=True,\n                    delta_pending_request_count=delta_pending,\n                    delta_total_request_count=delta_total,\n                ),\n            )\n\n        return AddRequestsResponse(\n            processed_requests=processed_requests,\n            unprocessed_requests=[],\n        )\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        if self._pending_fetch_cache:\n            return self._pending_fetch_cache.popleft()\n\n        if self._fetch_script is None:\n            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')\n\n        blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME\n\n        # The script retrieves requests from the queue and places them in the in_progress hash.\n        requests_json = await self._fetch_script(\n            keys=[self._queue_key, self._in_progress_key, self._data_key],\n            args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE],\n        )\n\n        async with self._get_pipeline() as pipe:\n            await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True))\n\n        if not requests_json:\n            return None\n\n        requests = [Request.model_validate_json(req_json) for req_json in requests_json]\n\n        self._pending_fetch_cache.extend(requests[1:])\n\n        return requests[0]\n\n    @override\n    async def get_request(self, unique_key: str) -> Request | None:\n        request_data = await await_redis_response(self._redis.hget(self._data_key, unique_key))\n\n        if isinstance(request_data, (str, bytes, bytearray)):\n            return Request.model_validate_json(request_data)\n\n        return None\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        # Check if the 
request is in progress.\n        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))\n        if not check_in_progress:\n            logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')\n            return None\n\n        # Update the request's handled_at timestamp.\n        if request.handled_at is None:\n            request.handled_at = datetime.now(timezone.utc)\n\n        async with self._get_pipeline() as pipe:\n            if self._dedup_strategy == 'default':\n                await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key))\n                await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key))\n            elif self._dedup_strategy == 'bloom':\n                await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key))\n\n            await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))\n            await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json()))\n\n            await self._update_metadata(\n                pipe,\n                **_QueueMetadataUpdateParams(\n                    update_accessed_at=True,\n                    update_modified_at=True,\n                    delta_handled_request_count=1,\n                    delta_pending_request_count=-1,\n                ),\n            )\n\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=True,\n        )\n\n    @override\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))\n        if not check_in_progress:\n            
logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')\n            return None\n\n        async with self._get_pipeline() as pipe:\n            if forefront:\n                blocked_until_timestamp = (\n                    int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME\n                )\n\n                await await_redis_response(\n                    pipe.hset(\n                        self._in_progress_key,\n                        request.unique_key,\n                        f'{{\"client_id\":\"{self.client_key}\",\"blocked_until_timestamp\":{blocked_until_timestamp}}}',\n                    )\n                )\n                await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json()))\n                self._pending_fetch_cache.appendleft(request)\n            else:\n                await await_redis_response(pipe.rpush(self._queue_key, request.unique_key))\n                await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json()))\n                await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))\n            await self._update_metadata(\n                pipe,\n                **_QueueMetadataUpdateParams(\n                    update_modified_at=True,\n                    update_accessed_at=True,\n                ),\n            )\n\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=False,\n        )\n\n    @override\n    async def is_empty(self) -> bool:\n        \"\"\"Check if the queue is empty.\n\n        Returns:\n            True if the queue is empty, False otherwise.\n        \"\"\"\n        if self._pending_fetch_cache:\n            return False\n\n        # Reclaim stale requests if needed\n        if self._next_reclaim_stale is None or datetime.now(tz=timezone.utc) >= 
self._next_reclaim_stale:\n            await self._reclaim_stale_requests()\n            self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL\n\n        metadata = await self.get_metadata()\n\n        return metadata.pending_request_count == 0\n\n    async def _load_scripts(self) -> None:\n        \"\"\"Ensure Lua scripts are loaded in Redis.\"\"\"\n        self._fetch_script = await self._create_script('atomic_fetch_request.lua')\n        self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua')\n        if self._dedup_strategy == 'bloom':\n            self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua')\n        elif self._dedup_strategy == 'default':\n            self._add_requests_script = await self._create_script('atomic_set_add_requests.lua')\n\n    @override\n    async def _create_storage(self, pipeline: Pipeline) -> None:\n        # Create Bloom filters for added and handled requests\n        if self._dedup_strategy == 'bloom':\n            await await_redis_response(\n                pipeline.bf().create(\n                    self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10\n                )\n            )\n            await await_redis_response(\n                pipeline.bf().create(\n                    self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10\n                )\n            )\n\n    async def _reclaim_stale_requests(self) -> None:\n        \"\"\"Reclaim requests that have been in progress for too long.\"\"\"\n        if self._reclaim_stale_script is None:\n            raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.')\n\n        current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000)\n\n        await self._reclaim_stale_script(\n            keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time]\n        )\n\n    @override\n    async def _specific_update_metadata(\n        self,\n        pipeline: Pipeline,\n        *,\n        delta_handled_request_count: int | None = None,\n        new_handled_request_count: int | None = None,\n        delta_pending_request_count: int | None = None,\n        new_pending_request_count: int | None = None,\n        delta_total_request_count: int | None = None,\n        new_total_request_count: int | None = None,\n        update_had_multiple_clients: bool = False,\n        **_kwargs: Any,\n    ) -> None:\n        \"\"\"Update the request queue metadata with current information.\n\n        Args:\n            pipeline: The Redis pipeline to use for the update.\n            new_handled_request_count: If provided, update the handled_request_count to this value.\n            new_pending_request_count: If provided, update the pending_request_count to this value.\n            new_total_request_count: If provided, update the total_request_count to this value.\n            delta_handled_request_count: If provided, add this value to the handled_request_count.\n            delta_pending_request_count: If provided, add this value to the pending_request_count.\n            delta_total_request_count: If provided, add this value to the total_request_count.\n            update_had_multiple_clients: If True, set had_multiple_clients to True.\n        \"\"\"\n        if new_pending_request_count is not None:\n            await await_redis_response(\n                pipeline.json().set(\n                    self.metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True\n                )\n            )\n        elif delta_pending_request_count is 
not None:\n            await await_redis_response(\n                pipeline.json().numincrby(self.metadata_key, '$.pending_request_count', delta_pending_request_count)\n            )\n\n        if new_handled_request_count is not None:\n            await await_redis_response(\n                pipeline.json().set(\n                    self.metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True\n                )\n            )\n        elif delta_handled_request_count is not None:\n            await await_redis_response(\n                pipeline.json().numincrby(self.metadata_key, '$.handled_request_count', delta_handled_request_count)\n            )\n\n        if new_total_request_count is not None:\n            await await_redis_response(\n                pipeline.json().set(\n                    self.metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True\n                )\n            )\n        elif delta_total_request_count is not None:\n            await await_redis_response(\n                pipeline.json().numincrby(self.metadata_key, '$.total_request_count', delta_total_request_count)\n            )\n\n        if update_had_multiple_clients:\n            await await_redis_response(\n                pipeline.json().set(\n                    self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True\n                )\n            )\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_storage_client.py",
    "content": "from __future__ import annotations\n\nimport warnings\nfrom typing import Literal\n\nfrom redis.asyncio import Redis\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients._base import StorageClient\n\nfrom ._dataset_client import RedisDatasetClient\nfrom ._key_value_store_client import RedisKeyValueStoreClient\nfrom ._request_queue_client import RedisRequestQueueClient\n\n\n@docs_group('Storage clients')\nclass RedisStorageClient(StorageClient):\n    \"\"\"Redis implementation of the storage client.\n\n    This storage client provides access to datasets, key-value stores, and request queues that persist data\n    to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for\n    efficient storage and retrieval.\n\n    The client accepts either a Redis connection string or a pre-configured Redis client instance.\n    Exactly one of these parameters must be provided during initialization.\n\n    Storage types use the following Redis data structures:\n    - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects\n    - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage\n    - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,\n      and sets (default) or Bloom filters for request deduplication\n\n    Warning:\n        This is an experimental feature. 
The behavior and interface may change in future versions.\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        connection_string: str | None = None,\n        redis: Redis | None = None,\n        queue_dedup_strategy: Literal['default', 'bloom'] = 'default',\n        queue_bloom_error_rate: float = 1e-7,\n    ) -> None:\n        \"\"\"Initialize the Redis storage client.\n\n        Args:\n            connection_string: Redis connection string (e.g., \"redis://localhost:6379\").\n                Supports standard Redis URL format with optional database selection.\n            redis: Pre-configured Redis client instance.\n            queue_dedup_strategy: Strategy for request queue deduplication. Options are:\n                - 'default': Uses Redis sets for exact deduplication.\n                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using\n                    this approach, approximately 1 in 1e7 requests will be falsely considered duplicate.\n            queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. 
Only relevant if\n                `queue_dedup_strategy` is set to 'bloom'.\n        \"\"\"\n        if redis is None and connection_string is None:\n            raise ValueError('Either redis or connection_string must be provided.')\n\n        if redis is not None and connection_string is not None:\n            raise ValueError('Either redis or connection_string must be provided, not both.')\n\n        if isinstance(redis, Redis) and connection_string is None:\n            self._redis = redis\n\n        if isinstance(connection_string, str) and redis is None:\n            self._redis = Redis.from_url(connection_string)\n\n        self._redis: Redis  # to help type checker\n        self._queue_dedup_strategy = queue_dedup_strategy\n        self._queue_bloom_error_rate = queue_bloom_error_rate\n\n        # Call the notification only once\n        warnings.warn(\n            (\n                'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '\n                'releases.'\n            ),\n            category=UserWarning,\n            stacklevel=2,\n        )\n\n    @override\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> RedisDatasetClient:\n        configuration = configuration or Configuration.get_global_configuration()\n\n        client = await RedisDatasetClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=self._redis,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> RedisKeyValueStoreClient:\n        
configuration = configuration or Configuration.get_global_configuration()\n\n        client = await RedisKeyValueStoreClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=self._redis,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> RedisRequestQueueClient:\n        configuration = configuration or Configuration.get_global_configuration()\n\n        client = await RedisRequestQueueClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            redis=self._redis,\n            dedup_strategy=self._queue_dedup_strategy,\n            bloom_error_rate=self._queue_bloom_error_rate,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/_utils.py",
    "content": "from collections.abc import Awaitable\nfrom pathlib import Path\nfrom typing import TypeVar, cast, overload\n\nT = TypeVar('T')\n\n\n@overload\nasync def await_redis_response(response: Awaitable[T]) -> T: ...\n@overload\nasync def await_redis_response(response: T) -> T: ...\n\n\nasync def await_redis_response(response: Awaitable[T] | T) -> T:\n    \"\"\"Solve the problem of ambiguous typing for redis.\"\"\"\n    if isinstance(response, Awaitable):\n        return cast('T', await response)\n    return response\n\n\ndef read_lua_script(script_name: str) -> str:\n    \"\"\"Read a Lua script from a file.\"\"\"\n    file_path = Path(__file__).parent / 'lua_scripts' / script_name\n    with file_path.open(mode='r', encoding='utf-8') as file:\n        return file.read()\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua",
    "content": "local added_filter_key = KEYS[1]\nlocal queue_key = KEYS[2]\nlocal data_key = KEYS[3]\n\nlocal forefront = ARGV[1] == '1'\nlocal unique_keys = cjson.decode(ARGV[2])\nlocal requests_data = cjson.decode(ARGV[3])\n\n-- Add and check which unique keys are actually new using Bloom filter\nlocal bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))\n\nlocal actually_added = {}\nlocal hset_args = {}\n\n-- Process the results\nfor i, unique_key in ipairs(unique_keys) do\n    if bf_results[i] == 1 then\n        -- This key was added by us (did not exist before)\n        table.insert(hset_args, unique_key)\n        table.insert(hset_args, requests_data[unique_key])\n        table.insert(actually_added, unique_key)\n    end\nend\n\n-- Add only those that are actually new\nif #actually_added > 0 then\n    redis.call('hset', data_key, unpack(hset_args))\n\n    if forefront then\n        redis.call('lpush', queue_key, unpack(actually_added))\n    else\n        redis.call('rpush', queue_key, unpack(actually_added))\n    end\nend\n\nreturn cjson.encode(actually_added)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua",
    "content": "local queue_key = KEYS[1]\nlocal in_progress_key = KEYS[2]\nlocal data_key = KEYS[3]\nlocal client_id = ARGV[1]\nlocal blocked_until_timestamp = ARGV[2]\nlocal batch_size = tonumber(ARGV[3])\n\n-- Pop batch unique_key from queue\nlocal batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)\nif not batch_result then\n    return nil\nend\nlocal unique_keys = batch_result[2]\n\n-- Get requests data\nlocal requests_data = redis.call('HMGET', data_key, unpack(unique_keys))\nif not requests_data then\n    -- Data missing, skip this request\n    return nil\nend\n\n-- Prepare results and update in_progress\nlocal final_result = {}\nlocal in_progress_hmset = {}\nlocal pending_decrement = 0\nlocal in_progress_data = cjson.encode({\n    client_id = client_id,\n    blocked_until_timestamp = tonumber(blocked_until_timestamp)\n})\nfor i = 1, #unique_keys do\n    local unique_key = unique_keys[i]\n    local request_data = requests_data[i]\n\n    if request_data then\n        -- Add to in_progress hash\n        table.insert(in_progress_hmset, unique_key)\n        table.insert(in_progress_hmset, in_progress_data)\n\n        table.insert(final_result, request_data)\n    end\nend\n\n-- Update in_progress hash\nif #in_progress_hmset > 0 then\n    redis.call('HMSET', in_progress_key, unpack(in_progress_hmset))\nend\n\n-- Return result with requests data\nreturn final_result\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua",
    "content": "local added_filter_key = KEYS[1]\nlocal queue_key = KEYS[2]\nlocal data_key = KEYS[3]\n\nlocal forefront = ARGV[1] == '1'\nlocal unique_keys = cjson.decode(ARGV[2])\nlocal requests_data = cjson.decode(ARGV[3])\n\n-- Add and check which unique keys are actually new using Redis set\nlocal actually_added = {}\nlocal hset_args = {}\n\n-- Process each unique key\nfor _, unique_key in ipairs(unique_keys) do\n    -- Try to add the key to the set, returns 1 if added, 0 if already existed\n    local set_result = redis.call('sadd', added_filter_key, unique_key)\n\n    if set_result == 1 then\n        -- This key was added by us (did not exist before)\n        table.insert(hset_args, unique_key)\n        table.insert(hset_args, requests_data[unique_key])\n        table.insert(actually_added, unique_key)\n    end\nend\n\n-- Add only those that are actually new\nif #actually_added > 0 then\n    redis.call('hset', data_key, unpack(hset_args))\n\n    if forefront then\n        redis.call('lpush', queue_key, unpack(actually_added))\n    else\n        redis.call('rpush', queue_key, unpack(actually_added))\n    end\nend\n\nreturn cjson.encode(actually_added)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua",
    "content": "local in_progress_key = KEYS[1]\nlocal queue_key = KEYS[2]\nlocal data_key = KEYS[3]\nlocal current_time = tonumber(ARGV[1])\n\nlocal max_reclaim = 1000\n\nlocal cursor = \"0\"\nlocal count = 0\n\nrepeat\n    local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100)\n    cursor = result[1]\n    local entries = result[2]\n\n    for i = 1, #entries, 2 do\n        if count >= max_reclaim then\n            break\n        end\n\n        local unique_key = entries[i]\n        local data = cjson.decode(entries[i + 1])\n\n        -- Check if timed out\n        if current_time > data.blocked_until_timestamp then\n            -- Atomically remove from in_progress and add back to queue\n            redis.call('hdel', in_progress_key, unique_key)\n            redis.call('rpush', queue_key, unique_key)\n            count = count + 1\n        end\n    end\nuntil cursor == \"0\" or count >= max_reclaim\n\nreturn count\n"
  },
  {
    "path": "src/crawlee/storage_clients/_redis/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/_sql/__init__.py",
    "content": "from ._dataset_client import SqlDatasetClient\nfrom ._key_value_store_client import SqlKeyValueStoreClient\nfrom ._request_queue_client import SqlRequestQueueClient\nfrom ._storage_client import SqlStorageClient\n\n__all__ = ['SqlDatasetClient', 'SqlKeyValueStoreClient', 'SqlRequestQueueClient', 'SqlStorageClient']\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_client_mixin.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom contextlib import asynccontextmanager\nfrom datetime import datetime, timedelta, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast, overload\n\nfrom sqlalchemy import CursorResult, delete, select, text, update\nfrom sqlalchemy import func as sql_func\nfrom sqlalchemy.dialects.mysql import insert as mysql_insert\nfrom sqlalchemy.dialects.postgresql import insert as pg_insert\nfrom sqlalchemy.dialects.sqlite import insert as lite_insert\nfrom sqlalchemy.exc import OperationalError, SQLAlchemyError\n\nfrom crawlee._utils.crypto import crypto_random_object_id\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from sqlalchemy import Insert\n    from sqlalchemy.ext.asyncio import AsyncSession\n    from sqlalchemy.orm import DeclarativeBase\n    from typing_extensions import NotRequired, Self\n\n    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata\n\n    from ._db_models import (\n        DatasetItemDb,\n        DatasetMetadataBufferDb,\n        DatasetMetadataDb,\n        KeyValueStoreMetadataBufferDb,\n        KeyValueStoreMetadataDb,\n        KeyValueStoreRecordDb,\n        RequestDb,\n        RequestQueueMetadataBufferDb,\n        RequestQueueMetadataDb,\n    )\n    from ._storage_client import SqlStorageClient\n\n\nlogger = getLogger(__name__)\n\n\nclass MetadataUpdateParams(TypedDict, total=False):\n    \"\"\"Parameters for updating metadata.\"\"\"\n\n    accessed_at: NotRequired[datetime]\n    modified_at: NotRequired[datetime]\n\n\nclass SqlClientMixin(ABC):\n    \"\"\"Mixin class for SQL clients.\n\n    This mixin provides common SQL operations and basic methods for SQL storage clients.\n    \"\"\"\n\n    _DEFAULT_NAME: ClassVar[str]\n    \"\"\"Default name when none provided.\"\"\"\n\n    _METADATA_TABLE: 
ClassVar[type[DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb]]\n    \"\"\"SQLAlchemy model for metadata.\"\"\"\n\n    _BUFFER_TABLE: ClassVar[\n        type[KeyValueStoreMetadataBufferDb | DatasetMetadataBufferDb | RequestQueueMetadataBufferDb]\n    ]\n    \"\"\"SQLAlchemy model for metadata buffer.\"\"\"\n\n    _ITEM_TABLE: ClassVar[type[DatasetItemDb | KeyValueStoreRecordDb | RequestDb]]\n    \"\"\"SQLAlchemy model for items.\"\"\"\n\n    _CLIENT_TYPE: ClassVar[str]\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    _BLOCK_BUFFER_TIME = timedelta(seconds=1)\n    \"\"\"Time interval that blocks buffer reading to update metadata.\"\"\"\n\n    def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None:\n        self._id = id\n        self._storage_client = storage_client\n\n    @classmethod\n    async def _open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        internal_name: str,\n        storage_client: SqlStorageClient,\n        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],\n        session: AsyncSession,\n        extra_metadata_fields: dict[str, Any],\n    ) -> Self:\n        \"\"\"Open existing storage or create new one.\n\n        Internal method used by _safely_open.\n\n        Args:\n            id: Storage ID to open (takes precedence over name).\n            name: The name of the storage.\n            internal_name: The database name for the storage based on name or alias.\n            storage_client: SQL storage client instance.\n            metadata_model: Pydantic model for metadata validation.\n            session: Active database session.\n            extra_metadata_fields: Storage-specific metadata fields.\n        \"\"\"\n        orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None = None\n        if id:\n            orm_metadata = await session.get(cls._METADATA_TABLE, id)\n        
    if not orm_metadata:\n                raise ValueError(f'{cls._CLIENT_TYPE} with ID \"{id}\" not found.')\n        else:\n            stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)\n            result = await session.execute(stmt)\n            orm_metadata = result.scalar_one_or_none()\n\n        if orm_metadata:\n            client = cls(id=orm_metadata.id, storage_client=storage_client)\n            await client._add_buffer_record(session)\n            # Ensure any pending buffer updates are processed\n            await client._process_buffers()\n        else:\n            now = datetime.now(timezone.utc)\n            metadata = metadata_model(\n                id=crypto_random_object_id(),\n                name=name,\n                created_at=now,\n                accessed_at=now,\n                modified_at=now,\n                **extra_metadata_fields,\n            )\n            client = cls(id=metadata.id, storage_client=storage_client)\n            session.add(cls._METADATA_TABLE(**metadata.model_dump(), internal_name=internal_name))\n\n        return client\n\n    @classmethod\n    async def _safely_open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None = None,\n        storage_client: SqlStorageClient,\n        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],\n        extra_metadata_fields: dict[str, Any],\n    ) -> Self:\n        \"\"\"Safely open storage with transaction handling.\n\n        Args:\n            id: Storage ID to open (takes precedence over name).\n            name: The name of the storage for named (global scope) storages.\n            alias: The alias of the storage for unnamed (run scope) storages.\n            storage_client: SQL storage client instance.\n            client_class: Concrete client class to instantiate.\n            metadata_model: Pydantic model for metadata validation.\n     
       extra_metadata_fields: Storage-specific metadata fields.\n        \"\"\"\n        # Validate input parameters.\n        specified_params = sum(1 for param in [id, name, alias] if param is not None)\n        if specified_params > 1:\n            raise ValueError('Only one of \"id\", \"name\", or \"alias\" can be specified, not multiple.')\n\n        internal_name = name or alias or cls._DEFAULT_NAME\n\n        async with storage_client.create_session() as session:\n            try:\n                client = await cls._open(\n                    id=id,\n                    name=name,\n                    internal_name=internal_name,\n                    storage_client=storage_client,\n                    metadata_model=metadata_model,\n                    session=session,\n                    extra_metadata_fields=extra_metadata_fields,\n                )\n                await session.commit()\n            except SQLAlchemyError:\n                await session.rollback()\n\n                stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)\n                result = await session.execute(stmt)\n                orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None\n                orm_metadata = cast(\n                    'DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None',\n                    result.scalar_one_or_none(),\n                )\n\n                if not orm_metadata:\n                    raise ValueError(f'{cls._CLIENT_TYPE} with Name \"{internal_name}\" not found.') from None\n\n                client = cls(id=orm_metadata.id, storage_client=storage_client)\n\n        return client\n\n    @asynccontextmanager\n    async def get_session(self, *, with_simple_commit: bool = False) -> AsyncIterator[AsyncSession]:\n        \"\"\"Create a new SQLAlchemy session for this storage.\"\"\"\n        async with self._storage_client.create_session() as 
session:\n            # For operations where a final commit is mandatory and does not require specific processing conditions\n            if with_simple_commit:\n                try:\n                    yield session\n                    await session.commit()\n                except SQLAlchemyError as e:\n                    logger.warning(f'Error occurred during session transaction: {e}')\n                    await session.rollback()\n            else:\n                yield session\n\n    def _build_insert_stmt_with_ignore(\n        self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]]\n    ) -> Insert:\n        \"\"\"Build an insert statement with ignore for the SQL dialect.\n\n        Args:\n            table_model: SQLAlchemy table model.\n            insert_values: Single dict or list of dicts to insert.\n        \"\"\"\n        if isinstance(insert_values, dict):\n            insert_values = [insert_values]\n\n        dialect = self._storage_client.get_dialect_name()\n\n        if dialect == 'postgresql':\n            return pg_insert(table_model).values(insert_values).on_conflict_do_nothing()\n\n        if dialect == 'sqlite':\n            return lite_insert(table_model).values(insert_values).on_conflict_do_nothing()\n\n        if dialect in {'mysql', 'mariadb'}:\n            return mysql_insert(table_model).values(insert_values).prefix_with('IGNORE')\n\n        raise NotImplementedError(f'Insert with ignore not supported for dialect: {dialect}')\n\n    def _build_upsert_stmt(\n        self,\n        table_model: type[DeclarativeBase],\n        insert_values: dict[str, Any] | list[dict[str, Any]],\n        update_columns: list[str],\n        conflict_cols: list[str] | None = None,\n    ) -> Insert:\n        \"\"\"Build an upsert statement for the SQL dialect.\n\n        Args:\n            table_model: SQLAlchemy table model.\n            insert_values: Single dict or list of dicts to upsert.\n            
update_columns: Column names to update on conflict.\n            conflict_cols: Column names that define uniqueness (for PostgreSQL/SQLite).\n\n        \"\"\"\n        if isinstance(insert_values, dict):\n            insert_values = [insert_values]\n\n        dialect = self._storage_client.get_dialect_name()\n\n        if dialect == 'postgresql':\n            pg_stmt = pg_insert(table_model).values(insert_values)\n            set_ = {col: getattr(pg_stmt.excluded, col) for col in update_columns}\n            return pg_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)\n\n        if dialect == 'sqlite':\n            lite_stmt = lite_insert(table_model).values(insert_values)\n            set_ = {col: getattr(lite_stmt.excluded, col) for col in update_columns}\n            return lite_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)\n\n        if dialect in {'mysql', 'mariadb'}:\n            mysql_stmt = mysql_insert(table_model).values(insert_values)\n            set_ = {col: getattr(mysql_stmt.inserted, col) for col in update_columns}\n            return mysql_stmt.on_duplicate_key_update(**set_)\n\n        raise NotImplementedError(f'Upsert not supported for dialect: {dialect}')\n\n    async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None:\n        \"\"\"Drop all items in storage and update metadata.\n\n        Args:\n            metadata_kwargs: Arguments to pass to _update_metadata.\n        \"\"\"\n        # Process buffers to ensure metadata is up to date before purging\n        await self._process_buffers()\n\n        stmt_records = delete(self._ITEM_TABLE).where(self._ITEM_TABLE.storage_id == self._id)\n        async with self.get_session(with_simple_commit=True) as session:\n            await session.execute(stmt_records)\n            await self._update_metadata(session, **metadata_kwargs)\n\n    async def _drop(self) -> None:\n        \"\"\"Delete this storage and all its data.\n\n        This operation is 
irreversible. Uses CASCADE deletion to remove all related items.\n        \"\"\"\n        stmt = delete(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)\n        # Delete the buffer records with a separate query, since tables don't link via foreign key.\n        buffer_stmt = delete(self._BUFFER_TABLE).where(self._BUFFER_TABLE.storage_id == self._id)\n\n        async with self.get_session(with_simple_commit=True) as session:\n            if self._storage_client.get_dialect_name() == 'sqlite':\n                # foreign_keys=ON is set at the connection level. Required for cascade deletion.\n                await session.execute(text('PRAGMA foreign_keys=ON'))\n            await session.execute(stmt)\n            await session.execute(buffer_stmt)\n\n    @overload\n    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...\n    @overload\n    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...\n    @overload\n    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...\n\n    async def _get_metadata(\n        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]\n    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:\n        \"\"\"Retrieve client metadata.\"\"\"\n        # Process any pending buffer updates first\n        await self._process_buffers()\n\n        async with self.get_session() as session:\n            orm_metadata = await session.get(self._METADATA_TABLE, self._id)\n            if not orm_metadata:\n                raise ValueError(f'{self._CLIENT_TYPE} with ID \"{self._id}\" not found.')\n\n            return metadata_model.model_validate(orm_metadata)\n\n    @abstractmethod\n    def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]:\n        \"\"\"Prepare storage-specific metadata updates.\n\n        Must be implemented by 
concrete classes.\n\n        Args:\n            **kwargs: Storage-specific update parameters.\n        \"\"\"\n\n    @abstractmethod\n    def _prepare_buffer_data(self, **kwargs: Any) -> dict[str, Any]:\n        \"\"\"Prepare storage-specific buffer data. Must be implemented by concrete classes.\"\"\"\n\n    @abstractmethod\n    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:\n        \"\"\"Apply aggregated buffer updates to metadata. Must be implemented by concrete classes.\n\n        Args:\n            session: Active database session.\n            max_buffer_id: Maximum buffer record ID to process.\n        \"\"\"\n\n    async def _update_metadata(\n        self,\n        session: AsyncSession,\n        *,\n        accessed_at: datetime | None = None,\n        modified_at: datetime | None = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Directly update storage metadata combining common and specific fields.\n\n        Args:\n            session: Active database session.\n            accessed_at: Datetime to set as accessed_at timestamp.\n            modified_at: Datetime to set as modified_at timestamp.\n            **kwargs: Additional arguments for _specific_update_metadata.\n        \"\"\"\n        values_to_set: dict[str, Any] = {}\n\n        if accessed_at is not None:\n            values_to_set['accessed_at'] = accessed_at\n\n        if modified_at is not None:\n            values_to_set['modified_at'] = modified_at\n\n        values_to_set.update(self._specific_update_metadata(**kwargs))\n\n        if values_to_set:\n            if (stmt := values_to_set.pop('custom_stmt', None)) is None:\n                stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)\n\n            stmt = stmt.values(**values_to_set)\n            await session.execute(stmt)\n\n    async def _add_buffer_record(\n        self,\n        session: AsyncSession,\n        *,\n        update_modified_at: bool 
= False,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Add a record to the buffer table and update metadata.\n\n        Args:\n            session: Active database session.\n            update_modified_at: Whether to update modified_at timestamp.\n            **kwargs: Additional arguments for _prepare_buffer_data.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n        values_to_set = {\n            'storage_id': self._id,\n            'accessed_at': now,  # All entries in the buffer require updating `accessed_at`\n            'modified_at': now if update_modified_at else None,\n        }\n        values_to_set.update(self._prepare_buffer_data(**kwargs))\n\n        session.add(self._BUFFER_TABLE(**values_to_set))\n\n    async def _try_acquire_buffer_lock(self, session: AsyncSession) -> bool:\n        \"\"\"Try to acquire buffer processing lock for a short period.\n\n        Args:\n            session: Active database session.\n\n        Returns:\n            True if lock was acquired, False if already locked by another process.\n        \"\"\"\n        capture_error_code = 1020  # MariaDB error code for \"Record has changed since last read\"\n        now = datetime.now(timezone.utc)\n        lock_until = now + self._BLOCK_BUFFER_TIME\n        dialect = self._storage_client.get_dialect_name()\n\n        if dialect in {'postgresql', 'mysql', 'mariadb'}:\n            select_stmt = (\n                select(self._METADATA_TABLE)\n                .where(\n                    self._METADATA_TABLE.id == self._id,\n                    (self._METADATA_TABLE.buffer_locked_until.is_(None))\n                    | (self._METADATA_TABLE.buffer_locked_until < now),\n                    select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(),\n                )\n                .with_for_update(skip_locked=True)\n            )\n\n            try:\n                result = await session.execute(select_stmt)\n            except 
OperationalError as e:\n                # MariaDB raises error 1020 (\"Record has changed since last read\") instead of\n                # silently skipping locked rows like MySQL/PostgreSQL. Treat it as lock not acquired.\n                error_code = getattr(e.orig, 'args', [None])[0]\n                if error_code == capture_error_code:\n                    return False\n                raise\n\n            metadata_row = result.scalar_one_or_none()\n\n            if metadata_row is None:\n                # Either conditions not met OR row is locked by another process\n                return False\n\n        # Acquire lock only if not currently locked or lock has expired\n        update_stmt = (\n            update(self._METADATA_TABLE)\n            .where(\n                self._METADATA_TABLE.id == self._id,\n                (self._METADATA_TABLE.buffer_locked_until.is_(None)) | (self._METADATA_TABLE.buffer_locked_until < now),\n                select(self._BUFFER_TABLE.id).where(self._BUFFER_TABLE.storage_id == self._id).exists(),\n            )\n            .values(buffer_locked_until=lock_until)\n        )\n\n        result = await session.execute(update_stmt)\n        result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n\n        if result.rowcount > 0:\n            await session.flush()\n            return True\n\n        return False\n\n    async def _release_buffer_lock(self, session: AsyncSession) -> None:\n        \"\"\"Release buffer processing lock by setting buffer_locked_until to NULL.\n\n        Args:\n            session: Active database session.\n        \"\"\"\n        stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id).values(buffer_locked_until=None)\n\n        await session.execute(stmt)\n\n    async def _has_pending_buffer_updates(self, session: AsyncSession) -> bool:\n        \"\"\"Check if there are pending buffer updates not yet applied to metadata.\n\n        Returns 
False only when buffer_locked_until is NULL (metadata is consistent).\n\n        Returns:\n            True if metadata might be inconsistent due to pending buffer updates.\n        \"\"\"\n        result = await session.execute(\n            select(self._METADATA_TABLE.buffer_locked_until).where(self._METADATA_TABLE.id == self._id)\n        )\n\n        locked_until = result.scalar()\n\n        # Any non-NULL value means there are pending updates\n        return locked_until is not None\n\n    async def _process_buffers(self) -> None:\n        \"\"\"Process pending buffer updates and apply them to metadata.\"\"\"\n        async with self.get_session(with_simple_commit=True) as session:\n            # Try to acquire buffer processing lock\n            if not await self._try_acquire_buffer_lock(session):\n                # Another process is currently processing buffers or lock acquisition failed\n                return\n\n            # Get the maximum buffer ID at this moment\n            # This creates a consistent snapshot - records added during processing won't be included\n            max_buffer_id_stmt = select(sql_func.max(self._BUFFER_TABLE.id)).where(\n                self._BUFFER_TABLE.storage_id == self._id\n            )\n\n            result = await session.execute(max_buffer_id_stmt)\n            max_buffer_id = result.scalar()\n\n            if max_buffer_id is None:\n                # No buffer records to process. 
Release the lock and exit.\n                await self._release_buffer_lock(session)\n                return\n\n            # Apply aggregated buffer updates to metadata using only records <= max_buffer_id\n            # This method is implemented by concrete storage classes\n            await self._apply_buffer_updates(session, max_buffer_id=max_buffer_id)\n\n            # Clean up only the processed buffer records (those <= max_buffer_id)\n            delete_stmt = delete(self._BUFFER_TABLE).where(\n                self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id\n            )\n\n            await session.execute(delete_stmt)\n\n            # Release the lock after successful processing\n            await self._release_buffer_lock(session)\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_dataset_client.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any\n\nfrom sqlalchemy import Select, insert, select\nfrom sqlalchemy import func as sql_func\nfrom typing_extensions import Self, override\n\nfrom crawlee.storage_clients._base import DatasetClient\nfrom crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\nfrom ._client_mixin import MetadataUpdateParams, SqlClientMixin\nfrom ._db_models import DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from sqlalchemy import Select\n    from sqlalchemy.ext.asyncio import AsyncSession\n    from typing_extensions import NotRequired\n\n    from ._storage_client import SqlStorageClient\n\n\nlogger = getLogger(__name__)\n\n\nclass _DatasetMetadataUpdateParams(MetadataUpdateParams):\n    \"\"\"Parameters for updating dataset metadata.\"\"\"\n\n    new_item_count: NotRequired[int]\n    delta_item_count: NotRequired[int]\n\n\nclass SqlDatasetClient(DatasetClient, SqlClientMixin):\n    \"\"\"SQL implementation of the dataset client.\n\n    This client persists dataset items to a SQL database using two tables for storage\n    and retrieval. Items are stored as JSON with automatic ordering preservation.\n\n    The dataset data is stored in SQL database tables following the pattern:\n    - `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)\n    - `dataset_records` table: Contains individual items with JSON data and auto-increment ordering\n    - `dataset_metadata_buffer` table: Buffers metadata updates for performance optimization\n\n    Items are stored as a JSON object in SQLite and as JSONB in PostgreSQL. 
These objects must be JSON-serializable.\n    The `item_id` auto-increment primary key ensures insertion order is preserved.\n    All operations are wrapped in database transactions with CASCADE deletion support.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default dataset name used when no name is provided.\"\"\"\n\n    _METADATA_TABLE = DatasetMetadataDb\n    \"\"\"SQLAlchemy model for dataset metadata.\"\"\"\n\n    _ITEM_TABLE = DatasetItemDb\n    \"\"\"SQLAlchemy model for dataset items.\"\"\"\n\n    _CLIENT_TYPE = 'Dataset'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    _BUFFER_TABLE = DatasetMetadataBufferDb\n    \"\"\"SQLAlchemy model for metadata buffer.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        id: str,\n        storage_client: SqlStorageClient,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `SqlDatasetClient.open` class method to create a new instance.\n        \"\"\"\n        super().__init__(id=id, storage_client=storage_client)\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        storage_client: SqlStorageClient,\n    ) -> Self:\n        \"\"\"Open an existing dataset or create a new one.\n\n        Args:\n            id: The ID of the dataset to open. 
If provided, searches for existing dataset by ID.\n            name: The name of the dataset for named (global scope) storages.\n            alias: The alias of the dataset for unnamed (run scope) storages.\n            storage_client: The SQL storage client instance.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If a dataset with the specified ID is not found.\n        \"\"\"\n        return await cls._safely_open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=storage_client,\n            metadata_model=DatasetMetadata,\n            extra_metadata_fields={'item_count': 0},\n        )\n\n    @override\n    async def get_metadata(self) -> DatasetMetadata:\n        # The database is a single place of truth\n        return await self._get_metadata(DatasetMetadata)\n\n    @override\n    async def drop(self) -> None:\n        \"\"\"Delete this dataset and all its items from the database.\n\n        This operation is irreversible. 
Uses CASCADE deletion to remove all related items.\n        \"\"\"\n        await self._drop()\n\n    @override\n    async def purge(self) -> None:\n        \"\"\"Remove all items from this dataset while keeping the dataset structure.\n\n        Resets item_count to 0 and deletes all records from dataset_records table.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n        await self._purge(\n            metadata_kwargs=_DatasetMetadataUpdateParams(\n                new_item_count=0,\n                accessed_at=now,\n                modified_at=now,\n            )\n        )\n\n    @override\n    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:\n        if not isinstance(data, list):\n            data = [data]\n\n        db_items = [{'dataset_id': self._id, 'data': item} for item in data]\n        stmt = insert(self._ITEM_TABLE).values(db_items)\n\n        async with self.get_session(with_simple_commit=True) as session:\n            await session.execute(stmt)\n\n            await self._add_buffer_record(session, update_modified_at=True, delta_item_count=len(data))\n\n    @override\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        stmt = self._prepare_get_stmt(\n            offset=offset,\n            limit=limit,\n            clean=clean,\n            desc=desc,\n            fields=fields,\n            omit=omit,\n            unwind=unwind,\n            skip_empty=skip_empty,\n            skip_hidden=skip_hidden,\n            flatten=flatten,\n            view=view,\n        )\n\n    
    async with self.get_session(with_simple_commit=True) as session:\n            result = await session.execute(stmt)\n            db_items = result.scalars().all()\n\n            await self._add_buffer_record(session)\n\n        items = [db_item.data for db_item in db_items]\n        metadata = await self.get_metadata()\n        return DatasetItemsListPage(\n            items=items,\n            count=len(items),\n            desc=desc,\n            limit=limit or 0,\n            offset=offset or 0,\n            total=metadata.item_count,\n        )\n\n    @override\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = None,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        stmt = self._prepare_get_stmt(\n            offset=offset,\n            limit=limit,\n            clean=clean,\n            desc=desc,\n            fields=fields,\n            omit=omit,\n            unwind=unwind,\n            skip_empty=skip_empty,\n            skip_hidden=skip_hidden,\n        )\n\n        async with self.get_session(with_simple_commit=True) as session:\n            db_items = await session.stream_scalars(stmt)\n\n            async for db_item in db_items:\n                yield db_item.data\n\n            await self._add_buffer_record(session)\n\n    def _prepare_get_stmt(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n     
   view: str | None = None,\n    ) -> Select:\n        # Check for unsupported arguments and log a warning if found.\n        unsupported_args: dict[str, Any] = {\n            'clean': clean,\n            'fields': fields,\n            'omit': omit,\n            'unwind': unwind,\n            'skip_hidden': skip_hidden,\n            'flatten': flatten,\n            'view': view,\n        }\n        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}\n\n        if unsupported:\n            logger.warning(\n                f'The arguments {list(unsupported.keys())} of get_data are not supported by the '\n                f'{self.__class__.__name__} client.'\n            )\n\n        stmt = select(self._ITEM_TABLE).where(self._ITEM_TABLE.dataset_id == self._id)\n\n        if skip_empty:\n            # Skip items that are empty JSON objects\n            stmt = stmt.where(self._ITEM_TABLE.data != {})\n\n        # Apply ordering by insertion order (item_id)\n        stmt = stmt.order_by(self._ITEM_TABLE.item_id.desc()) if desc else stmt.order_by(self._ITEM_TABLE.item_id.asc())\n\n        return stmt.offset(offset).limit(limit)\n\n    @override\n    def _specific_update_metadata(\n        self,\n        new_item_count: int | None = None,\n        delta_item_count: int | None = None,\n        **_kwargs: dict[str, Any],\n    ) -> dict[str, Any]:\n        \"\"\"Directly update the dataset metadata in the database.\n\n        Args:\n            session: The SQLAlchemy AsyncSession to use for the update.\n            new_item_count: If provided, set item count to this value.\n            delta_item_count: If provided, add this value to the current item count.\n        \"\"\"\n        values_to_set: dict[str, Any] = {}\n\n        if new_item_count is not None:\n            values_to_set['item_count'] = new_item_count\n        elif delta_item_count:\n            # Use database-level for atomic updates\n            values_to_set['item_count'] = 
self._METADATA_TABLE.item_count + delta_item_count\n\n        return values_to_set\n\n    @override\n    def _prepare_buffer_data(self, delta_item_count: int | None = None, **_kwargs: Any) -> dict[str, Any]:\n        \"\"\"Prepare dataset specific buffer data.\n\n        Args:\n            delta_item_count: If provided, add this value to the current item count.\n        \"\"\"\n        buffer_data = {}\n        if delta_item_count is not None:\n            buffer_data['delta_item_count'] = delta_item_count\n\n        return buffer_data\n\n    @override\n    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:\n        aggregation_stmt = select(\n            sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'),\n            sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'),\n            sql_func.sum(self._BUFFER_TABLE.delta_item_count).label('delta_item_count'),\n        ).where(self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id)\n\n        result = await session.execute(aggregation_stmt)\n        row = result.first()\n\n        if not row:\n            return\n\n        await self._update_metadata(\n            session,\n            **_DatasetMetadataUpdateParams(\n                accessed_at=row.max_accessed_at,\n                modified_at=row.max_modified_at,\n                delta_item_count=row.delta_item_count,\n            ),\n        )\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_db_models.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING, Any\n\nfrom sqlalchemy import JSON, BigInteger, Boolean, ForeignKey, Index, Integer, LargeBinary, String, Text, text\nfrom sqlalchemy.dialects.postgresql import JSONB\nfrom sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, synonym\nfrom sqlalchemy.types import DateTime, TypeDecorator\nfrom typing_extensions import override\n\nif TYPE_CHECKING:\n    from sqlalchemy.engine import Dialect\n    from sqlalchemy.types import TypeEngine\n\n\nclass AwareDateTime(TypeDecorator):\n    \"\"\"Custom SQLAlchemy type for timezone-aware datetime handling.\n\n    Ensures all datetime values are timezone-aware by adding UTC timezone to\n    naive datetime values from databases that don't store timezone information.\n    \"\"\"\n\n    impl = DateTime(timezone=True)\n    cache_ok = True\n\n    @override\n    def process_result_value(self, value: datetime | None, dialect: Dialect) -> datetime | None:\n        \"\"\"Add UTC timezone to naive datetime values.\"\"\"\n        if value is not None and value.tzinfo is None:\n            return value.replace(tzinfo=timezone.utc)\n        return value\n\n\nclass JsonField(TypeDecorator):\n    \"\"\"Uses JSONB for PostgreSQL and JSON for other databases.\"\"\"\n\n    impl = JSON\n    cache_ok = True\n\n    def load_dialect_impl(self, dialect: Dialect) -> TypeEngine[JSON | JSONB]:\n        \"\"\"Load the appropriate dialect implementation for the JSON type.\"\"\"\n        if dialect.name == 'postgresql':\n            return dialect.type_descriptor(JSONB())\n        return dialect.type_descriptor(JSON())\n\n\nclass Base(DeclarativeBase):\n    \"\"\"Base class for all database models for correct type annotations.\"\"\"\n\n\nclass StorageMetadataDb:\n    \"\"\"Base database model for storage metadata.\"\"\"\n\n    internal_name: Mapped[str] = mapped_column(String(255), nullable=False, 
index=True, unique=True)\n    \"\"\"Internal unique name for a storage instance based on a name or alias.\"\"\"\n\n    name: Mapped[str | None] = mapped_column(String(255), nullable=True, unique=True)\n    \"\"\"Human-readable name. None becomes 'default' in database to enforce uniqueness.\"\"\"\n\n    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)\n    \"\"\"Last access datetime for usage tracking.\"\"\"\n\n    created_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)\n    \"\"\"Creation datetime.\"\"\"\n\n    modified_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)\n    \"\"\"Last modification datetime.\"\"\"\n\n    buffer_locked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)\n    \"\"\"Timestamp until which buffer processing is locked for this storage. NULL = unlocked.\"\"\"\n\n\nclass DatasetMetadataDb(StorageMetadataDb, Base):\n    \"\"\"Metadata table for datasets.\"\"\"\n\n    __tablename__ = 'datasets'\n\n    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)\n    \"\"\"Unique identifier for the dataset.\"\"\"\n\n    item_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)\n    \"\"\"Number of items in the dataset.\"\"\"\n\n    # Relationship to dataset items with cascade deletion\n    items: Mapped[list[DatasetItemDb]] = relationship(\n        back_populates='dataset', cascade='all, delete-orphan', lazy='noload'\n    )\n\n    id = synonym('dataset_id')\n    \"\"\"Alias for dataset_id to match Pydantic expectations.\"\"\"\n\n\nclass RequestQueueMetadataDb(StorageMetadataDb, Base):\n    \"\"\"Metadata table for request queues.\"\"\"\n\n    __tablename__ = 'request_queues'\n\n    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)\n    \"\"\"Unique identifier for the request queue.\"\"\"\n\n    had_multiple_clients: Mapped[bool] = mapped_column(Boolean, nullable=False, 
default=False)\n    \"\"\"Flag indicating if multiple clients have accessed this queue.\"\"\"\n\n    handled_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)\n    \"\"\"Number of requests processed.\"\"\"\n\n    pending_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)\n    \"\"\"Number of requests waiting to be processed.\"\"\"\n\n    total_request_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)\n    \"\"\"Total number of requests ever added to this queue.\"\"\"\n\n    # Relationship to queue requests with cascade deletion\n    requests: Mapped[list[RequestDb]] = relationship(\n        back_populates='queue', cascade='all, delete-orphan', lazy='noload'\n    )\n    # Relationship to queue state\n    state: Mapped[RequestQueueStateDb] = relationship(\n        back_populates='queue', cascade='all, delete-orphan', lazy='noload'\n    )\n\n    id = synonym('request_queue_id')\n    \"\"\"Alias for request_queue_id to match Pydantic expectations.\"\"\"\n\n\nclass KeyValueStoreMetadataDb(StorageMetadataDb, Base):\n    \"\"\"Metadata table for key-value stores.\"\"\"\n\n    __tablename__ = 'key_value_stores'\n\n    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, primary_key=True)\n    \"\"\"Unique identifier for the key-value store.\"\"\"\n\n    # Relationship to store records with cascade deletion\n    records: Mapped[list[KeyValueStoreRecordDb]] = relationship(\n        back_populates='kvs', cascade='all, delete-orphan', lazy='noload'\n    )\n\n    id = synonym('key_value_store_id')\n    \"\"\"Alias for key_value_store_id to match Pydantic expectations.\"\"\"\n\n\nclass KeyValueStoreRecordDb(Base):\n    \"\"\"Records table for key-value stores.\"\"\"\n\n    __tablename__ = 'key_value_store_records'\n\n    key_value_store_id: Mapped[str] = mapped_column(\n        String(20),\n        ForeignKey('key_value_stores.key_value_store_id', ondelete='CASCADE'),\n        
primary_key=True,\n        index=True,\n        nullable=False,\n    )\n    \"\"\"Foreign key to metadata key-value store record.\"\"\"\n\n    key: Mapped[str] = mapped_column(String(255), primary_key=True)\n    \"\"\"The key part of the key-value pair.\"\"\"\n\n    value: Mapped[bytes] = mapped_column(LargeBinary, nullable=False)\n    \"\"\"Value stored as binary data to support any content type.\"\"\"\n\n    content_type: Mapped[str] = mapped_column(String(50), nullable=False)\n    \"\"\"MIME type for proper value deserialization.\"\"\"\n\n    size: Mapped[int | None] = mapped_column(Integer, nullable=False, default=0)\n    \"\"\"Size of stored value in bytes.\"\"\"\n\n    # Relationship back to parent store\n    kvs: Mapped[KeyValueStoreMetadataDb] = relationship(back_populates='records')\n\n    storage_id = synonym('key_value_store_id')\n    \"\"\"Alias for key_value_store_id to match SqlClientMixin expectations.\"\"\"\n\n\nclass DatasetItemDb(Base):\n    \"\"\"Items table for datasets.\"\"\"\n\n    __tablename__ = 'dataset_records'\n\n    item_id: Mapped[int] = mapped_column(Integer, primary_key=True)\n    \"\"\"Auto-increment primary key preserving insertion order.\"\"\"\n\n    dataset_id: Mapped[str] = mapped_column(\n        String(20),\n        ForeignKey('datasets.dataset_id', ondelete='CASCADE'),\n        index=True,\n    )\n    \"\"\"Foreign key to metadata dataset record.\"\"\"\n\n    data: Mapped[list[dict[str, Any]] | dict[str, Any]] = mapped_column(JsonField, nullable=False)\n    \"\"\"JSON serializable item data.\"\"\"\n\n    # Relationship back to parent dataset\n    dataset: Mapped[DatasetMetadataDb] = relationship(back_populates='items')\n\n    storage_id = synonym('dataset_id')\n    \"\"\"Alias for dataset_id to match SqlClientMixin expectations.\"\"\"\n\n\nclass RequestDb(Base):\n    \"\"\"Requests table for request queues.\"\"\"\n\n    __tablename__ = 'request_queue_records'\n    __table_args__ = (\n        Index(\n            
'idx_fetch_available',\n            'request_queue_id',\n            'is_handled',\n            'sequence_number',\n            postgresql_where=text('is_handled = false'),\n        ),\n        Index(\n            'idx_count_aggregate',\n            'request_queue_id',\n            'is_handled',\n        ),\n    )\n\n    request_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)\n    \"\"\"Unique identifier for the request representing the unique_key.\"\"\"\n\n    request_queue_id: Mapped[str] = mapped_column(\n        String(20), ForeignKey('request_queues.request_queue_id', ondelete='CASCADE'), primary_key=True\n    )\n    \"\"\"Foreign key to metadata request queue record.\"\"\"\n\n    data: Mapped[str] = mapped_column(Text, nullable=False)\n    \"\"\"JSON-serialized Request object.\"\"\"\n\n    sequence_number: Mapped[int] = mapped_column(Integer, nullable=False)\n    \"\"\"Ordering sequence: negative for forefront, positive for regular.\"\"\"\n\n    is_handled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)\n    \"\"\"Processing status flag.\"\"\"\n\n    time_blocked_until: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)\n    \"\"\"Timestamp until which this request is considered blocked for processing by other clients.\"\"\"\n\n    client_key: Mapped[str | None] = mapped_column(String(32), nullable=True)\n    \"\"\"Identifier of the client that has currently locked this request for processing.\"\"\"\n\n    # Relationship back to metadata table\n    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='requests')\n\n    storage_id = synonym('request_queue_id')\n    \"\"\"Alias for request_queue_id to match SqlClientMixin expectations.\"\"\"\n\n\nclass RequestQueueStateDb(Base):\n    \"\"\"State table for request queues.\"\"\"\n\n    __tablename__ = 'request_queue_state'\n\n    request_queue_id: Mapped[str] = mapped_column(\n        String(20), ForeignKey('request_queues.request_queue_id', 
ondelete='CASCADE'), primary_key=True\n    )\n    \"\"\"Foreign key to metadata request queue record.\"\"\"\n\n    sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=1)\n    \"\"\"Counter for regular request ordering (positive).\"\"\"\n\n    forefront_sequence_counter: Mapped[int] = mapped_column(Integer, nullable=False, default=-1)\n    \"\"\"Counter for forefront request ordering (negative).\"\"\"\n\n    # Relationship back to metadata table\n    queue: Mapped[RequestQueueMetadataDb] = relationship(back_populates='state')\n\n\nclass VersionDb(Base):\n    \"\"\"Table for storing the database schema version.\"\"\"\n\n    __tablename__ = 'version'\n\n    version: Mapped[str] = mapped_column(String(10), nullable=False, primary_key=True)\n\n\nclass MetadataBufferDb:\n    \"\"\"Base model for metadata update buffer tables.\"\"\"\n\n    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)\n    \"\"\"Auto-increment primary key for ordering.\"\"\"\n\n    # Timestamp fields - use max value when aggregating\n    accessed_at: Mapped[datetime] = mapped_column(AwareDateTime, nullable=False)\n    \"\"\"New accessed_at timestamp, if being updated.\"\"\"\n\n    modified_at: Mapped[datetime | None] = mapped_column(AwareDateTime, nullable=True)\n    \"\"\"New modified_at timestamp, if being updated.\"\"\"\n\n\nclass KeyValueStoreMetadataBufferDb(MetadataBufferDb, Base):\n    \"\"\"Buffer table for deferred key-value store metadata updates to reduce concurrent access issues.\"\"\"\n\n    __tablename__ = 'key_value_store_metadata_buffer'\n\n    # Don't use foreign key constraint to avoid DB locks on high concurrency.\n    key_value_store_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)\n    \"\"\"ID of the key-value store being updated.\"\"\"\n\n    storage_id = synonym('key_value_store_id')\n    \"\"\"Alias for key_value_store_id to match SqlClientMixin expectations.\"\"\"\n\n\nclass 
DatasetMetadataBufferDb(MetadataBufferDb, Base):\n    \"\"\"Buffer table for deferred dataset metadata updates to reduce concurrent access issues.\"\"\"\n\n    __tablename__ = 'dataset_metadata_buffer'\n\n    # Don't use foreign key constraint to avoid DB locks on high concurrency.\n    dataset_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)\n    \"\"\"ID of the dataset being updated.\"\"\"\n\n    # Counter deltas - use SUM when aggregating.\n    delta_item_count: Mapped[int | None] = mapped_column(Integer, nullable=True)\n    \"\"\"Delta for dataset item_count.\"\"\"\n\n    storage_id = synonym('dataset_id')\n    \"\"\"Alias for dataset_id to match SqlClientMixin expectations.\"\"\"\n\n\nclass RequestQueueMetadataBufferDb(MetadataBufferDb, Base):\n    \"\"\"Buffer table for deferred request queue metadata updates to reduce concurrent access issues.\"\"\"\n\n    __tablename__ = 'request_queue_metadata_buffer'\n\n    __table_args__ = (Index('idx_rq_client', 'request_queue_id', 'client_id'),)\n\n    # Don't use foreign key constraint to avoid DB locks on high concurrency.\n    request_queue_id: Mapped[str] = mapped_column(String(20), nullable=False, index=True)\n    \"\"\"ID of the request queue being updated.\"\"\"\n\n    client_id: Mapped[str] = mapped_column(String(32), nullable=False)\n    \"\"\"Identifier of the client making this update.\"\"\"\n\n    # Counter deltas - use SUM when aggregating.\n    delta_handled_count: Mapped[int | None] = mapped_column(Integer, nullable=True)\n    \"\"\"Delta for handled_request_count.\"\"\"\n\n    delta_pending_count: Mapped[int | None] = mapped_column(Integer, nullable=True)\n    \"\"\"Delta for pending_request_count.\"\"\"\n\n    delta_total_count: Mapped[int | None] = mapped_column(Integer, nullable=True)\n    \"\"\"Delta for total_request_count.\"\"\"\n\n    need_recalc: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)\n    \"\"\"Flag indicating that counters need recalculation 
from actual data.\"\"\"\n\n    storage_id = synonym('request_queue_id')\n    \"\"\"Alias for request_queue_id to match SqlClientMixin expectations.\"\"\"\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_key_value_store_client.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom datetime import datetime, timezone\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom sqlalchemy import CursorResult, delete, select\nfrom sqlalchemy import func as sql_func\nfrom typing_extensions import Self, override\n\nfrom crawlee._utils.file import infer_mime_type\nfrom crawlee.storage_clients._base import KeyValueStoreClient\nfrom crawlee.storage_clients.models import (\n    KeyValueStoreMetadata,\n    KeyValueStoreRecord,\n    KeyValueStoreRecordMetadata,\n)\n\nfrom ._client_mixin import MetadataUpdateParams, SqlClientMixin\nfrom ._db_models import KeyValueStoreMetadataBufferDb, KeyValueStoreMetadataDb, KeyValueStoreRecordDb\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from sqlalchemy.ext.asyncio import AsyncSession\n\n    from ._storage_client import SqlStorageClient\n\n\nlogger = getLogger(__name__)\n\n\nclass SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):\n    \"\"\"SQL implementation of the key-value store client.\n\n    This client persists key-value data to a SQL database with transaction support and\n    concurrent access safety. 
Keys are mapped to rows in database tables with proper indexing\n    for efficient retrieval.\n\n    The key-value store data is stored in SQL database tables following the pattern:\n    - `key_value_stores` table: Contains store metadata (id, name, timestamps)\n    - `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,\n    and size information\n    - `key_value_store_metadata_buffer` table: Buffers metadata updates for performance optimization\n\n    Values are serialized based on their type: JSON objects are stored as formatted JSON,\n    text values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.\n    The implementation automatically handles content type detection and maintains metadata\n    about each record including size and MIME type information.\n\n    All database operations are wrapped in transactions with proper error handling and rollback\n    mechanisms. The client supports atomic upsert operations and handles race conditions when\n    multiple clients access the same store using composite primary keys (key_value_store_id, key).\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default key-value store name used when no name is provided.\"\"\"\n\n    _METADATA_TABLE = KeyValueStoreMetadataDb\n    \"\"\"SQLAlchemy model for key-value store metadata.\"\"\"\n\n    _ITEM_TABLE = KeyValueStoreRecordDb\n    \"\"\"SQLAlchemy model for key-value store items.\"\"\"\n\n    _CLIENT_TYPE = 'Key-value store'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    _BUFFER_TABLE = KeyValueStoreMetadataBufferDb\n    \"\"\"SQLAlchemy model for metadata buffer.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        storage_client: SqlStorageClient,\n        id: str,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `SqlKeyValueStoreClient.open` class method to create a new instance.\n        \"\"\"\n        
super().__init__(id=id, storage_client=storage_client)\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        alias: str | None,\n        storage_client: SqlStorageClient,\n    ) -> Self:\n        \"\"\"Open or create a SQL key-value store client.\n\n        This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified\n        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one\n        is created.\n\n        Args:\n            id: The ID of the key-value store to open. If provided, searches for existing store by ID.\n            name: The name of the key-value store for named (global scope) storages.\n            alias: The alias of the key-value store for unnamed (run scope) storages.\n            storage_client: The SQL storage client used to access the database.\n\n        Returns:\n            An instance for the opened or created storage client.\n\n        Raises:\n            ValueError: If a store with the specified ID is not found, or if metadata is invalid.\n        \"\"\"\n        return await cls._safely_open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=storage_client,\n            metadata_model=KeyValueStoreMetadata,\n            extra_metadata_fields={},\n        )\n\n    @override\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        # The database is a single place of truth\n        return await self._get_metadata(KeyValueStoreMetadata)\n\n    @override\n    async def drop(self) -> None:\n        \"\"\"Delete this key-value store and all its records from the database.\n\n        This operation is irreversible. 
Uses CASCADE deletion to remove all related records.\n        \"\"\"\n        await self._drop()\n\n    @override\n    async def purge(self) -> None:\n        \"\"\"Remove all items from this key-value store while keeping the key-value store structure.\n\n        Remove all records from key_value_store_records table.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n        await self._purge(metadata_kwargs=MetadataUpdateParams(accessed_at=now, modified_at=now))\n\n    @override\n    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:\n        # Special handling for None values\n        if value is None:\n            content_type = 'application/x-none'  # Special content type to identify None values\n            value_bytes = b''\n        else:\n            content_type = content_type or infer_mime_type(value)\n\n            # Serialize the value to bytes.\n            if 'application/json' in content_type:\n                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')\n            elif isinstance(value, str):\n                value_bytes = value.encode('utf-8')\n            elif isinstance(value, (bytes, bytearray)):\n                value_bytes = value\n            else:\n                # Fallback: attempt to convert to string and encode.\n                value_bytes = str(value).encode('utf-8')\n\n        size = len(value_bytes)\n        insert_values = {\n            'key_value_store_id': self._id,\n            'key': key,\n            'value': value_bytes,\n            'content_type': content_type,\n            'size': size,\n        }\n\n        upsert_stmt = self._build_upsert_stmt(\n            self._ITEM_TABLE,\n            insert_values=insert_values,\n            update_columns=['value', 'content_type', 'size'],\n            conflict_cols=['key_value_store_id', 'key'],\n        )\n\n        async with self.get_session(with_simple_commit=True) as session:\n            
await session.execute(upsert_stmt)\n\n            await self._add_buffer_record(session, update_modified_at=True)\n\n    @override\n    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:\n        # Query the record by key\n        stmt = select(self._ITEM_TABLE).where(\n            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key\n        )\n        async with self.get_session(with_simple_commit=True) as session:\n            result = await session.execute(stmt)\n            record_db = result.scalar_one_or_none()\n\n            await self._add_buffer_record(session)\n\n        if not record_db:\n            return None\n\n        # Deserialize the value based on content type\n        value_bytes = record_db.value\n\n        # Handle None values\n        if record_db.content_type == 'application/x-none':\n            value = None\n        # Handle JSON values\n        elif 'application/json' in record_db.content_type:\n            try:\n                value = json.loads(value_bytes.decode('utf-8'))\n            except (json.JSONDecodeError, UnicodeDecodeError):\n                logger.warning(f'Failed to decode JSON value for key \"{key}\"')\n                return None\n        # Handle text values\n        elif record_db.content_type.startswith('text/'):\n            try:\n                value = value_bytes.decode('utf-8')\n            except UnicodeDecodeError:\n                logger.warning(f'Failed to decode text value for key \"{key}\"')\n                return None\n        # Handle binary values\n        else:\n            value = value_bytes\n\n        return KeyValueStoreRecord(\n            key=record_db.key,\n            value=value,\n            content_type=record_db.content_type,\n            size=record_db.size,\n        )\n\n    @override\n    async def delete_value(self, *, key: str) -> None:\n        stmt = delete(self._ITEM_TABLE).where(\n            self._ITEM_TABLE.key_value_store_id == 
self._id, self._ITEM_TABLE.key == key\n        )\n        async with self.get_session(with_simple_commit=True) as session:\n            # Delete the record if it exists\n            result = await session.execute(stmt)\n            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n\n            # Update metadata if we actually deleted something\n            if result.rowcount > 0:\n                await self._add_buffer_record(session, update_modified_at=True)\n\n    @override\n    async def iterate_keys(\n        self,\n        *,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        # Build query for record metadata\n        stmt = (\n            select(self._ITEM_TABLE.key, self._ITEM_TABLE.content_type, self._ITEM_TABLE.size)\n            .where(self._ITEM_TABLE.key_value_store_id == self._id)\n            .order_by(self._ITEM_TABLE.key)\n        )\n\n        # Apply exclusive_start_key filter\n        if exclusive_start_key is not None:\n            stmt = stmt.where(self._ITEM_TABLE.key > exclusive_start_key)\n\n        # Apply limit\n        if limit is not None:\n            stmt = stmt.limit(limit)\n\n        async with self.get_session(with_simple_commit=True) as session:\n            result = await session.stream(stmt.execution_options(stream_results=True))\n\n            async for row in result:\n                yield KeyValueStoreRecordMetadata(\n                    key=row.key,\n                    content_type=row.content_type,\n                    size=row.size,\n                )\n\n            await self._add_buffer_record(session)\n\n    @override\n    async def record_exists(self, *, key: str) -> bool:\n        stmt = select(self._ITEM_TABLE.key).where(\n            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key\n        )\n        async with self.get_session(with_simple_commit=True) 
as session:\n            # Check if record exists\n            result = await session.execute(stmt)\n\n            await self._add_buffer_record(session)\n\n            return result.scalar_one_or_none() is not None\n\n    @override\n    async def get_public_url(self, *, key: str) -> str:\n        raise NotImplementedError('Public URLs are not supported for SQL key-value stores.')\n\n    @override\n    def _specific_update_metadata(self, **_kwargs: dict[str, Any]) -> dict[str, Any]:\n        return {}\n\n    @override\n    def _prepare_buffer_data(self, **_kwargs: Any) -> dict[str, Any]:\n        \"\"\"Prepare key-value store specific buffer data.\n\n        For KeyValueStore, we don't have specific metadata fields to track in buffer,\n        so we just return empty dict. The base buffer will handle accessed_at/modified_at.\n        \"\"\"\n        return {}\n\n    @override\n    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:\n        aggregation_stmt = select(\n            sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'),\n            sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'),\n        ).where(self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id)\n\n        result = await session.execute(aggregation_stmt)\n        row = result.first()\n\n        if not row:\n            return\n\n        await self._update_metadata(\n            session,\n            **MetadataUpdateParams(\n                accessed_at=row.max_accessed_at,\n                modified_at=row.max_modified_at,\n            ),\n        )\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_request_queue_client.py",
    "content": "from __future__ import annotations\n\nfrom collections import deque\nfrom datetime import datetime, timedelta, timezone\nfrom functools import lru_cache\nfrom hashlib import sha256\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, cast\n\nfrom sqlalchemy import CursorResult, exists, func, or_, select, update\nfrom sqlalchemy import func as sql_func\nfrom sqlalchemy.exc import SQLAlchemyError\nfrom sqlalchemy.orm import load_only\nfrom typing_extensions import NotRequired, Self, override\n\nfrom crawlee import Request\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom crawlee.storage_clients._base import RequestQueueClient\nfrom crawlee.storage_clients.models import (\n    AddRequestsResponse,\n    ProcessedRequest,\n    RequestQueueMetadata,\n    UnprocessedRequest,\n)\n\nfrom ._client_mixin import MetadataUpdateParams, SqlClientMixin\nfrom ._db_models import RequestDb, RequestQueueMetadataBufferDb, RequestQueueMetadataDb, RequestQueueStateDb\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from sqlalchemy.ext.asyncio import AsyncSession\n    from sqlalchemy.sql import ColumnElement\n\n    from ._storage_client import SqlStorageClient\n\n\nlogger = getLogger(__name__)\n\n\nclass _QueueMetadataUpdateParams(MetadataUpdateParams):\n    \"\"\"Parameters for updating queue metadata.\"\"\"\n\n    new_handled_request_count: NotRequired[int]\n    new_pending_request_count: NotRequired[int]\n    new_total_request_count: NotRequired[int]\n    delta_handled_request_count: NotRequired[int]\n    delta_pending_request_count: NotRequired[int]\n    delta_total_request_count: NotRequired[int]\n    recalculate: NotRequired[bool]\n    update_had_multiple_clients: NotRequired[bool]\n\n\nclass SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):\n    \"\"\"SQL implementation of the request queue client.\n\n    This client persists requests to a SQL database with transaction handling and\n    concurrent access 
safety. Requests are stored with sequence-based ordering and\n    efficient querying capabilities.\n\n    The implementation uses negative sequence numbers for forefront (high-priority) requests\n    and positive sequence numbers for regular requests, allowing for efficient single-query\n    ordering. A cache mechanism reduces database queries.\n\n    The request queue data is stored in SQL database tables following the pattern:\n    - `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag)\n    - `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication,\n    sequence numbers for ordering, and processing status flags\n    - `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests.\n    - `request_queue_metadata_buffer` table: Buffers metadata updates for performance optimization\n\n    Requests are serialized to JSON for storage and maintain proper ordering through sequence\n    numbers. 
The implementation provides concurrent access safety through transaction\n    handling, locking mechanisms, and optimized database indexes for efficient querying.\n    \"\"\"\n\n    _DEFAULT_NAME = 'default'\n    \"\"\"Default request queue name used when no name is provided.\"\"\"\n\n    _MAX_BATCH_FETCH_SIZE = 10\n    \"\"\"Maximum number of requests to fetch from the database in a single batch operation.\n\n    Used to limit the number of requests loaded and locked for processing at once (improves efficiency and reduces\n    database load).\n    \"\"\"\n\n    _METADATA_TABLE = RequestQueueMetadataDb\n    \"\"\"SQLAlchemy model for request queue metadata.\"\"\"\n\n    _ITEM_TABLE = RequestDb\n    \"\"\"SQLAlchemy model for request items.\"\"\"\n\n    _CLIENT_TYPE = 'Request queue'\n    \"\"\"Human-readable client type for error messages.\"\"\"\n\n    _BLOCK_REQUEST_TIME = 300\n    \"\"\"Number of seconds for which a request is considered blocked in the database after being fetched for processing.\n    \"\"\"\n\n    _BUFFER_TABLE = RequestQueueMetadataBufferDb\n    \"\"\"SQLAlchemy model for metadata buffer.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        id: str,\n        storage_client: SqlStorageClient,\n    ) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `SqlRequestQueueClient.open` class method to create a new instance.\n        \"\"\"\n        super().__init__(id=id, storage_client=storage_client)\n\n        self._pending_fetch_cache: deque[Request] = deque()\n        \"\"\"Cache for requests: ordered by sequence number.\"\"\"\n\n        self.client_key = crypto_random_object_id(length=32)[:32]\n        \"\"\"Unique identifier for this client instance.\"\"\"\n\n        self._had_multiple_clients = False\n        \"\"\"Indicates whether the queue has been accessed by multiple clients.\"\"\"\n\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None,\n        name: str | None,\n        
alias: str | None,\n        storage_client: SqlStorageClient,\n    ) -> Self:\n        \"\"\"Open an existing request queue or create a new one.\n\n        This method first tries to find an existing queue by ID or name.\n        If found, it returns a client for that queue. If not found, it creates\n        a new queue with the specified parameters.\n\n        Args:\n            id: The ID of the request queue to open. Takes precedence over name.\n            name: The name of the request queue for named (global scope) storages.\n            alias: The alias of the request queue for unnamed (run scope) storages.\n            storage_client: The SQL storage client used to access the database.\n\n        Returns:\n            An instance for the opened or created request queue.\n\n        Raises:\n            ValueError: If a queue with the specified ID is not found.\n        \"\"\"\n        return await cls._safely_open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=storage_client,\n            metadata_model=RequestQueueMetadata,\n            extra_metadata_fields={\n                'had_multiple_clients': False,\n                'handled_request_count': 0,\n                'pending_request_count': 0,\n                'total_request_count': 0,\n            },\n        )\n\n    @override\n    async def get_metadata(self) -> RequestQueueMetadata:\n        # The database is a single place of truth\n        metadata = await self._get_metadata(RequestQueueMetadata)\n        self._had_multiple_clients = metadata.had_multiple_clients\n        return metadata\n\n    @override\n    async def drop(self) -> None:\n        \"\"\"Delete this request queue and all its records from the database.\n\n        This operation is irreversible. 
Uses CASCADE deletion to remove all related records.\n        \"\"\"\n        await self._drop()\n\n        self._pending_fetch_cache.clear()\n\n    @override\n    async def purge(self) -> None:\n        \"\"\"Remove all items from this request queue while keeping the request queue structure.\n\n        Resets pending_request_count and handled_request_count to 0 and deletes all records from request_queue_records\n        table.\n        \"\"\"\n        now = datetime.now(timezone.utc)\n        await self._purge(\n            metadata_kwargs=_QueueMetadataUpdateParams(\n                accessed_at=now,\n                modified_at=now,\n                new_pending_request_count=0,\n                new_handled_request_count=0,\n                new_total_request_count=0,\n            )\n        )\n\n        # Clear recoverable state\n        self._pending_fetch_cache.clear()\n\n    @override\n    async def add_batch_of_requests(\n        self,\n        requests: Sequence[Request],\n        *,\n        forefront: bool = False,\n    ) -> AddRequestsResponse:\n        if not requests:\n            return AddRequestsResponse(processed_requests=[], unprocessed_requests=[])\n\n        # Accumulators for the outcome of each request in this batch\n        processed_requests = []\n        unprocessed_requests = []\n        transaction_processed_requests = []\n        transaction_processed_requests_unique_keys = set()\n\n        approximate_new_request = 0\n\n        # Deduplicate requests by unique_key upfront\n        unique_requests = {}\n        unique_key_by_request_id = {}\n        for req in requests:\n            if req.unique_key not in unique_requests:\n                request_id = self._get_int_id_from_unique_key(req.unique_key)\n                unique_requests[request_id] = req\n                unique_key_by_request_id[request_id] = req.unique_key\n\n        # Get existing requests by unique keys\n        stmt = (\n            select(self._ITEM_TABLE)\n            .where(\n                
self._ITEM_TABLE.request_queue_id == self._id,\n                self._ITEM_TABLE.request_id.in_(set(unique_requests.keys())),\n            )\n            .options(\n                load_only(\n                    self._ITEM_TABLE.request_id,\n                    self._ITEM_TABLE.is_handled,\n                    self._ITEM_TABLE.time_blocked_until,\n                )\n            )\n        )\n\n        async with self.get_session() as session:\n            result = await session.execute(stmt)\n            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n            existing_requests = {req.request_id: req for req in result.scalars()}\n            state = await self._get_state(session)\n            insert_values: list[dict] = []\n\n            for request_id, request in sorted(unique_requests.items()):\n                existing_req_db = existing_requests.get(request_id)\n                # New Request, add it\n                if existing_req_db is None:\n                    value = {\n                        'request_id': request_id,\n                        'request_queue_id': self._id,\n                        'data': request.model_dump_json(),\n                        'is_handled': False,\n                    }\n                    if forefront:\n                        value['sequence_number'] = state.forefront_sequence_counter\n                        state.forefront_sequence_counter -= 1\n                    else:\n                        value['sequence_number'] = state.sequence_counter\n                        state.sequence_counter += 1\n\n                    insert_values.append(value)\n                    transaction_processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=False,\n                            was_already_handled=False,\n                        )\n                    )\n           
         transaction_processed_requests_unique_keys.add(request.unique_key)\n                # Already handled request, skip adding\n                elif existing_req_db and existing_req_db.is_handled:\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=True,\n                            was_already_handled=True,\n                        )\n                    )\n                # Already in progress in one of the clients\n                elif existing_req_db and existing_req_db.time_blocked_until:\n                    processed_requests.append(\n                        ProcessedRequest(\n                            unique_key=request.unique_key,\n                            was_already_present=True,\n                            was_already_handled=False,\n                        )\n                    )\n                # Request in database but not yet handled and not in progress\n                elif existing_req_db and not existing_req_db.is_handled and not existing_req_db.time_blocked_until:\n                    # Forefront request, update its sequence number\n                    if forefront:\n                        insert_values.append(\n                            {\n                                'request_queue_id': self._id,\n                                'request_id': request_id,\n                                'sequence_number': state.forefront_sequence_counter,\n                                'data': request.model_dump_json(),\n                                'is_handled': False,\n                            }\n                        )\n                        state.forefront_sequence_counter -= 1\n                        transaction_processed_requests.append(\n                            ProcessedRequest(\n                                unique_key=request.unique_key,\n                               
 was_already_present=True,\n                                was_already_handled=False,\n                            )\n                        )\n                        transaction_processed_requests_unique_keys.add(request.unique_key)\n                    # Regular request, keep its position\n                    else:\n                        processed_requests.append(\n                            ProcessedRequest(\n                                unique_key=request.unique_key,\n                                was_already_present=True,\n                                was_already_handled=False,\n                            )\n                        )\n                # Unexpected condition\n                else:\n                    unprocessed_requests.append(\n                        UnprocessedRequest(\n                            unique_key=request.unique_key,\n                            url=request.url,\n                            method=request.method,\n                        )\n                    )\n\n            try:\n                if insert_values:\n                    if forefront:\n                        # If the request already exists in the database, we update the sequence_number\n                        # by shifting request to the left.\n                        upsert_stmt = self._build_upsert_stmt(\n                            self._ITEM_TABLE,\n                            insert_values,\n                            update_columns=['sequence_number'],\n                            conflict_cols=['request_id', 'request_queue_id'],\n                        )\n                        result = await session.execute(upsert_stmt)\n                    else:\n                        # If the request already exists in the database, we ignore this request when inserting.\n                        insert_stmt_with_ignore = self._build_insert_stmt_with_ignore(self._ITEM_TABLE, insert_values)\n                        result = await 
session.execute(insert_stmt_with_ignore)\n\n                    result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n                    approximate_new_request += result.rowcount\n\n                await self._add_buffer_record(\n                    session,\n                    update_modified_at=True,\n                    delta_pending_request_count=approximate_new_request,\n                    delta_total_request_count=approximate_new_request,\n                )\n\n                await session.commit()\n                processed_requests.extend(transaction_processed_requests)\n            except SQLAlchemyError as e:\n                await session.rollback()\n                logger.debug(f'Failed add requests to DB with error: {e}')\n                await self._add_buffer_record(\n                    session,\n                    update_modified_at=True,\n                    recalculate=True,\n                )\n                await session.commit()\n                transaction_processed_requests.clear()\n                unprocessed_requests.extend(\n                    [\n                        UnprocessedRequest(\n                            unique_key=request.unique_key,\n                            url=request.url,\n                            method=request.method,\n                        )\n                        for request in requests\n                        if request.unique_key in transaction_processed_requests_unique_keys\n                    ]\n                )\n\n        return AddRequestsResponse(\n            processed_requests=processed_requests,\n            unprocessed_requests=unprocessed_requests,\n        )\n\n    @override\n    async def get_request(self, unique_key: str) -> Request | None:\n        request_id = self._get_int_id_from_unique_key(unique_key)\n\n        stmt = select(self._ITEM_TABLE).where(\n            self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == 
request_id\n        )\n        async with self.get_session(with_simple_commit=True) as session:\n            result = await session.execute(stmt)\n            request_db = result.scalar_one_or_none()\n\n            if request_db is None:\n                logger.warning(f'Request with ID \"{unique_key}\" not found in the queue.')\n                return None\n\n            await self._add_buffer_record(session)\n\n        return Request.model_validate_json(request_db.data)\n\n    @override\n    async def fetch_next_request(self) -> Request | None:\n        if self._pending_fetch_cache:\n            return self._pending_fetch_cache.popleft()\n\n        now = datetime.now(timezone.utc)\n        block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)\n        dialect = self._storage_client.get_dialect_name()\n\n        # Get available requests not blocked by another client\n        stmt = (\n            select(self._ITEM_TABLE)\n            .where(\n                self._ITEM_TABLE.request_queue_id == self._id,\n                self._ITEM_TABLE.is_handled == False,  # noqa: E712\n                or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),\n            )\n            .order_by(self._ITEM_TABLE.sequence_number.asc())\n            .limit(self._MAX_BATCH_FETCH_SIZE)\n        )\n\n        async with self.get_session(with_simple_commit=True) as session:\n            # We use the `skip_locked` database mechanism to prevent the 'interception' of requests by another client\n            if dialect in {'postgresql', 'mysql', 'mariadb'}:\n                stmt = stmt.with_for_update(skip_locked=True)\n                result = await session.execute(stmt)\n                requests_db = result.scalars().all()\n\n                if not requests_db:\n                    return None\n\n                # All requests received have already been reserved for update with the help of `skip_locked`.\n                request_ids = 
{r.request_id for r in requests_db}\n\n                update_stmt = (\n                    update(self._ITEM_TABLE)\n                    .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id.in_(request_ids))\n                    .values(time_blocked_until=block_until, client_key=self.client_key)\n                )\n                await session.execute(update_stmt)\n\n                blocked_ids = request_ids\n            else:\n                # For other databases, we first select the requests, then try to update them to be blocked.\n                result = await session.execute(stmt)\n                requests_db = result.scalars().all()\n\n                if not requests_db:\n                    return None\n\n                request_ids = {r.request_id for r in requests_db}\n\n                update_stmt = (\n                    update(self._ITEM_TABLE)\n                    .where(\n                        self._ITEM_TABLE.request_queue_id == self._id,\n                        self._ITEM_TABLE.request_id.in_(request_ids),\n                        self._ITEM_TABLE.is_handled == False,  # noqa: E712\n                        or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),\n                    )\n                    .values(time_blocked_until=block_until, client_key=self.client_key)\n                    .returning(self._ITEM_TABLE.request_id)\n                )\n\n                update_result = await session.execute(update_stmt)\n                blocked_ids = {row[0] for row in update_result.fetchall()}\n\n                if not blocked_ids:\n                    await session.rollback()\n                    return None\n\n            await self._add_buffer_record(session)\n\n        requests = [Request.model_validate_json(r.data) for r in requests_db if r.request_id in blocked_ids]\n\n        if not requests:\n            return None\n\n        
self._pending_fetch_cache.extend(requests[1:])\n\n        return requests[0]\n\n    @override\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        request_id = self._get_int_id_from_unique_key(request.unique_key)\n\n        # Update the request's handled_at timestamp.\n        if request.handled_at is None:\n            request.handled_at = datetime.now(timezone.utc)\n\n        # Update request in Db\n        stmt = (\n            update(self._ITEM_TABLE)\n            .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id)\n            .values(is_handled=True, time_blocked_until=None, client_key=None, data=request.model_dump_json())\n        )\n        async with self.get_session(with_simple_commit=True) as session:\n            result = await session.execute(stmt)\n            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n\n            if result.rowcount == 0:\n                logger.warning(f'Request {request.unique_key} not found in database.')\n                return None\n\n            await self._add_buffer_record(\n                session, update_modified_at=True, delta_pending_request_count=-1, delta_handled_request_count=1\n            )\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=True,\n        )\n\n    @override\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        request_id = self._get_int_id_from_unique_key(request.unique_key)\n\n        stmt = update(self._ITEM_TABLE).where(\n            self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id\n        )\n\n        async with self.get_session(with_simple_commit=True) as session:\n            state = await 
self._get_state(session)\n\n            # Update sequence number if changing priority\n            if forefront:\n                new_sequence = state.forefront_sequence_counter\n                state.forefront_sequence_counter -= 1\n                now = datetime.now(timezone.utc)\n                block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)\n                # Extend blocking for forefront request, it is considered blocked by the current client.\n                stmt = stmt.values(\n                    sequence_number=new_sequence,\n                    time_blocked_until=block_until,\n                    client_key=self.client_key,\n                    data=request.model_dump_json(),\n                )\n            else:\n                new_sequence = state.sequence_counter\n                state.sequence_counter += 1\n                stmt = stmt.values(\n                    sequence_number=new_sequence,\n                    time_blocked_until=None,\n                    client_key=None,\n                    data=request.model_dump_json(),\n                )\n\n            result = await session.execute(stmt)\n            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result\n\n            if result.rowcount == 0:\n                logger.warning(f'Request {request.unique_key} not found in database.')\n                return None\n            await self._add_buffer_record(session, update_modified_at=True)\n\n        # put the forefront request at the beginning of the cache\n        if forefront:\n            self._pending_fetch_cache.appendleft(request)\n\n        return ProcessedRequest(\n            unique_key=request.unique_key,\n            was_already_present=True,\n            was_already_handled=False,\n        )\n\n    @override\n    async def is_empty(self) -> bool:\n        # Check in-memory cache for requests\n        if self._pending_fetch_cache:\n            return False\n\n        metadata = await 
self.get_metadata()\n\n        async with self.get_session(with_simple_commit=True) as session:\n            # If there are no pending requests, check if there are any buffered updates\n            if metadata.pending_request_count == 0:\n                # Check for active buffer lock (indicates pending buffer processing)\n                buffer_lock_stmt = select(self._METADATA_TABLE.buffer_locked_until).where(\n                    self._METADATA_TABLE.id == self._id\n                )\n                buffer_lock_result = await session.execute(buffer_lock_stmt)\n                buffer_locked_until = buffer_lock_result.scalar()\n\n                # If buffer is locked, there are pending updates being processed\n                if buffer_locked_until is not None:\n                    await self._add_buffer_record(session)\n                    return False\n\n                # Check if there are any buffered updates that might change the pending count\n                buffer_check_stmt = select(\n                    exists().where(\n                        (self._BUFFER_TABLE.storage_id == self._id)\n                        & (\n                            (self._BUFFER_TABLE.delta_pending_count != 0) | (self._BUFFER_TABLE.need_recalc == True)  # noqa: E712\n                        )\n                    )\n                )\n                buffer_result = await session.execute(buffer_check_stmt)\n                has_pending_buffer_updates = buffer_result.scalar()\n\n                await self._add_buffer_record(session)\n                # If there are no pending requests and no buffered updates, the queue is empty\n                return not has_pending_buffer_updates\n\n            # There are pending requests (may be inaccurate), ensure recalculated metadata\n            await self._add_buffer_record(session, update_modified_at=True, recalculate=True)\n\n        return False\n\n    async def _get_state(self, session: AsyncSession) -> RequestQueueStateDb:\n       
 \"\"\"Get the current state of the request queue.\"\"\"\n        orm_state: RequestQueueStateDb | None = await session.get(RequestQueueStateDb, self._id)\n        if not orm_state:\n            insert_values = {'request_queue_id': self._id}\n            # Create a new state if it doesn't exist\n            # This is a safeguard against race conditions where multiple clients might try to create the state\n            # simultaneously.\n            insert_stmt = self._build_insert_stmt_with_ignore(RequestQueueStateDb, insert_values)\n            await session.execute(insert_stmt)\n            await session.flush()\n            orm_state = await session.get(RequestQueueStateDb, self._id)\n            if not orm_state:\n                raise RuntimeError(f'Failed to create or retrieve state for queue {self._id}')\n        return orm_state\n\n    @override\n    def _specific_update_metadata(\n        self,\n        new_handled_request_count: int | None = None,\n        new_pending_request_count: int | None = None,\n        new_total_request_count: int | None = None,\n        delta_handled_request_count: int | None = None,\n        delta_pending_request_count: int | None = None,\n        delta_total_request_count: int | None = None,\n        *,\n        recalculate: bool = False,\n        update_had_multiple_clients: bool = False,\n        **_kwargs: dict[str, Any],\n    ) -> dict[str, Any]:\n        \"\"\"Update the request queue metadata in the database.\n\n        Args:\n            session: The SQLAlchemy session to use for database operations.\n            new_handled_request_count: If provided, update the handled_request_count to this value.\n            new_pending_request_count: If provided, update the pending_request_count to this value.\n            new_total_request_count: If provided, update the total_request_count to this value.\n            delta_handled_request_count: If provided, add this value to the handled_request_count.\n            
delta_pending_request_count: If provided, add this value to the pending_request_count.\n            delta_total_request_count: If provided, add this value to the total_request_count.\n            recalculate: If True, recalculate the pending_request_count, and total_request_count on request table.\n            update_had_multiple_clients: If True, set had_multiple_clients to True.\n        \"\"\"\n        values_to_set: dict[str, Any] = {}\n\n        if update_had_multiple_clients:\n            values_to_set['had_multiple_clients'] = True\n\n        if recalculate:\n            stmt = (\n                update(self._METADATA_TABLE)\n                .where(self._METADATA_TABLE.request_queue_id == self._id)\n                .values(\n                    pending_request_count=(\n                        select(func.count())\n                        .select_from(self._ITEM_TABLE)\n                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(False))\n                        .scalar_subquery()\n                    ),\n                    total_request_count=(\n                        select(func.count())\n                        .select_from(self._ITEM_TABLE)\n                        .where(self._ITEM_TABLE.request_queue_id == self._id)\n                        .scalar_subquery()\n                    ),\n                    handled_request_count=(\n                        select(func.count())\n                        .select_from(self._ITEM_TABLE)\n                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(True))\n                        .scalar_subquery()\n                    ),\n                )\n            )\n\n            values_to_set['custom_stmt'] = stmt\n\n        else:\n            if new_handled_request_count is not None:\n                values_to_set['handled_request_count'] = new_handled_request_count\n            elif delta_handled_request_count is not 
None:\n                values_to_set['handled_request_count'] = (\n                    self._METADATA_TABLE.handled_request_count + delta_handled_request_count\n                )\n\n            if new_pending_request_count is not None:\n                values_to_set['pending_request_count'] = new_pending_request_count\n            elif delta_pending_request_count is not None:\n                values_to_set['pending_request_count'] = (\n                    self._METADATA_TABLE.pending_request_count + delta_pending_request_count\n                )\n\n            if new_total_request_count is not None:\n                values_to_set['total_request_count'] = new_total_request_count\n            elif delta_total_request_count is not None:\n                values_to_set['total_request_count'] = (\n                    self._METADATA_TABLE.total_request_count + delta_total_request_count\n                )\n\n        return values_to_set\n\n    @staticmethod\n    @lru_cache(maxsize=10000)\n    def _get_int_id_from_unique_key(unique_key: str) -> int:\n        \"\"\"Generate a deterministic integer ID for a unique_key.\n\n        Args:\n            unique_key: Unique key to be used to generate ID.\n\n        Returns:\n            An integer ID based on the unique_key.\n        \"\"\"\n        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()\n        name_length = 15\n        return int(hashed_key[:name_length], 16)\n\n    @override\n    def _prepare_buffer_data(\n        self,\n        delta_handled_request_count: int | None = None,\n        delta_pending_request_count: int | None = None,\n        delta_total_request_count: int | None = None,\n        *,\n        recalculate: bool = False,\n        **_kwargs: Any,\n    ) -> dict[str, Any]:\n        \"\"\"Prepare request queue specific buffer data.\n\n        Args:\n            delta_handled_request_count: If provided, add this value to the handled_request_count.\n            delta_pending_request_count: If 
provided, add this value to the pending_request_count.\n            delta_total_request_count: If provided, add this value to the total_request_count.\n            recalculate: If True, recalculate the pending_request_count, and total_request_count on request table.\n        \"\"\"\n        buffer_data: dict[str, Any] = {\n            'client_id': self.client_key,\n        }\n\n        if delta_handled_request_count:\n            buffer_data['delta_handled_count'] = delta_handled_request_count\n\n        if delta_pending_request_count:\n            buffer_data['delta_pending_count'] = delta_pending_request_count\n\n        if delta_total_request_count:\n            buffer_data['delta_total_count'] = delta_total_request_count\n\n        if recalculate:\n            buffer_data['need_recalc'] = True\n\n        return buffer_data\n\n    @override\n    async def _apply_buffer_updates(self, session: AsyncSession, max_buffer_id: int) -> None:\n        aggregations: list[ColumnElement[Any]] = [\n            sql_func.max(self._BUFFER_TABLE.accessed_at).label('max_accessed_at'),\n            sql_func.max(self._BUFFER_TABLE.modified_at).label('max_modified_at'),\n            sql_func.sum(self._BUFFER_TABLE.delta_handled_count).label('delta_handled_count'),\n            sql_func.sum(self._BUFFER_TABLE.delta_pending_count).label('delta_pending_count'),\n            sql_func.sum(self._BUFFER_TABLE.delta_total_count).label('delta_total_count'),\n        ]\n\n        if not self._had_multiple_clients:\n            aggregations.append(\n                sql_func.count(sql_func.distinct(self._BUFFER_TABLE.client_id)).label('unique_clients_count')\n            )\n\n        if self._storage_client.get_dialect_name() == 'postgresql':\n            aggregations.append(sql_func.bool_or(self._BUFFER_TABLE.need_recalc).label('need_recalc'))\n        else:\n            aggregations.append(sql_func.max(self._BUFFER_TABLE.need_recalc).label('need_recalc'))\n\n        aggregation_stmt = 
select(*aggregations).where(\n            self._BUFFER_TABLE.storage_id == self._id, self._BUFFER_TABLE.id <= max_buffer_id\n        )\n\n        result = await session.execute(aggregation_stmt)\n        row = result.first()\n\n        if not row:\n            return\n\n        await self._update_metadata(\n            session,\n            **_QueueMetadataUpdateParams(\n                accessed_at=row.max_accessed_at,\n                modified_at=row.max_modified_at,\n                update_had_multiple_clients=not self._had_multiple_clients and row.unique_clients_count > 1,\n                delta_handled_request_count=row.delta_handled_count,\n                delta_pending_request_count=row.delta_pending_count,\n                delta_total_request_count=row.delta_total_count,\n                recalculate=bool(row.need_recalc),\n            ),\n        )\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/_storage_client.py",
    "content": "from __future__ import annotations\n\nimport warnings\nfrom logging import getLogger\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar\n\nfrom sqlalchemy.exc import IntegrityError, OperationalError\nfrom sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine\nfrom sqlalchemy.sql import insert, select, text\nfrom typing_extensions import override\n\nfrom crawlee._utils.docs import docs_group\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients._base import StorageClient\n\nfrom ._dataset_client import SqlDatasetClient\nfrom ._db_models import Base, VersionDb\nfrom ._key_value_store_client import SqlKeyValueStoreClient\nfrom ._request_queue_client import SqlRequestQueueClient\n\nif TYPE_CHECKING:\n    from types import TracebackType\n\n    from sqlalchemy.ext.asyncio import AsyncSession\n\n\nlogger = getLogger(__name__)\n\n\n@docs_group('Storage clients')\nclass SqlStorageClient(StorageClient):\n    \"\"\"SQL implementation of the storage client.\n\n    This storage client provides access to datasets, key-value stores, and request queues that persist data\n    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for\n    records.\n\n    The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is\n    provided, it creates a default SQLite database 'crawlee.db' in the storage directory.\n\n    Database schema is automatically created during initialization. SQLite databases receive performance\n    optimizations including WAL mode and increased cache size.\n\n    Warning:\n        This is an experimental feature. 
The behavior and interface may change in future versions.\n    \"\"\"\n\n    _DEFAULT_DB_NAME = 'crawlee.db'\n    \"\"\"Default database name if not specified in connection string.\"\"\"\n\n    _SUPPORTED_DIALECTS: ClassVar[set[str]] = {'sqlite', 'postgresql', 'mysql', 'mariadb'}\n\n    def __init__(\n        self,\n        *,\n        connection_string: str | None = None,\n        engine: AsyncEngine | None = None,\n    ) -> None:\n        \"\"\"Initialize the SQL storage client.\n\n        Args:\n            connection_string: Database connection string (e.g., \"sqlite+aiosqlite:///crawlee.db\").\n                If not provided, defaults to SQLite database in the storage directory.\n            engine: Pre-configured AsyncEngine instance. Cannot be combined with connection_string.\n        \"\"\"\n        if engine is not None and connection_string is not None:\n            raise ValueError('Either connection_string or engine must be provided, not both.')\n\n        self._connection_string = connection_string\n        self._engine = engine\n        self._initialized = False\n        self.session_maker: None | async_sessionmaker[AsyncSession] = None\n\n        # Flag needed to apply optimizations only for default database\n        self._default_flag = self._engine is None and self._connection_string is None\n        self._dialect_name: str | None = None\n\n        # Call the notification only once\n        warnings.warn(\n            'The SqlStorageClient is experimental and may change or be removed in future releases.',\n            category=UserWarning,\n            stacklevel=2,\n        )\n\n    async def __aenter__(self) -> SqlStorageClient:\n        \"\"\"Async context manager entry.\"\"\"\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: type[BaseException] | None,\n        exc_value: BaseException | None,\n        exc_traceback: TracebackType | None,\n    ) -> None:\n        \"\"\"Async context manager exit.\"\"\"\n        
await self.close()\n\n    @property\n    def engine(self) -> AsyncEngine:\n        \"\"\"Get the SQLAlchemy AsyncEngine instance.\"\"\"\n        if self._engine is None:\n            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')\n        return self._engine\n\n    def get_dialect_name(self) -> str | None:\n        \"\"\"Get the database dialect name.\"\"\"\n        return self._dialect_name\n\n    async def initialize(self, configuration: Configuration) -> None:\n        \"\"\"Initialize the database schema.\n\n        This method creates all necessary tables if they don't exist.\n        Should be called before using the storage client.\n        \"\"\"\n        if not self._initialized:\n            engine = self._get_or_create_engine(configuration)\n            async with engine.begin() as conn:\n                self._dialect_name = engine.dialect.name\n\n                if self._dialect_name not in self._SUPPORTED_DIALECTS:\n                    raise ValueError(\n                        f'Unsupported database dialect: {self._dialect_name}. Supported: '\n                        f'{\", \".join(self._SUPPORTED_DIALECTS)}. 
Consider using a different database.',\n                    )\n\n                # Create tables if they don't exist.\n                # Rollback the transaction when an exception occurs.\n                # This is likely an attempt to create a database from several parallel processes.\n                try:\n                    # Set SQLite pragmas for performance and consistency\n                    if self._default_flag:\n                        await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency\n                        await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed\n                        await conn.execute(text('PRAGMA cache_size=100000'))  # 100000 pages (~400MB)\n                        await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage\n                        await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping\n                        await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints\n                        await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout\n                    await conn.run_sync(Base.metadata.create_all, checkfirst=True)\n                    from crawlee import __version__  # Noqa: PLC0415\n\n                    db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none()\n\n                    # Warn when the version recorded in the database differs from the library version.\n                    if db_version and db_version != __version__:\n                        warnings.warn(\n                            f'Database version {db_version} does not match library version {__version__}. '\n                            'This may lead to unexpected behavior. 
Drop the db if you want to make sure that '\n                            'everything will work fine.',\n                            category=UserWarning,\n                            stacklevel=2,\n                        )\n                    elif not db_version:\n                        await conn.execute(insert(VersionDb).values(version=__version__))\n                except (IntegrityError, OperationalError):\n                    await conn.rollback()\n\n            self._initialized = True\n\n    async def close(self) -> None:\n        \"\"\"Close the database connection pool.\"\"\"\n        if self._engine is not None:\n            await self._engine.dispose()\n        self._engine = None\n\n    def create_session(self) -> AsyncSession:\n        \"\"\"Create a new database session.\n\n        Returns:\n            A new AsyncSession instance.\n        \"\"\"\n        if self.session_maker is None:\n            self.session_maker = async_sessionmaker(self._engine, expire_on_commit=False, autoflush=False)\n        return self.session_maker()\n\n    @override\n    async def create_dataset_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> SqlDatasetClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        await self.initialize(configuration)\n\n        client = await SqlDatasetClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=self,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_kvs_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> SqlKeyValueStoreClient:\n        configuration = 
configuration or Configuration.get_global_configuration()\n        await self.initialize(configuration)\n\n        client = await SqlKeyValueStoreClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=self,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    @override\n    async def create_rq_client(\n        self,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n    ) -> SqlRequestQueueClient:\n        configuration = configuration or Configuration.get_global_configuration()\n        await self.initialize(configuration)\n\n        client = await SqlRequestQueueClient.open(\n            id=id,\n            name=name,\n            alias=alias,\n            storage_client=self,\n        )\n\n        await self._purge_if_needed(client, configuration)\n        return client\n\n    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:\n        \"\"\"Get or create the database engine based on configuration.\"\"\"\n        if self._engine is not None:\n            return self._engine\n\n        if self._connection_string is not None:\n            connection_string = self._connection_string\n        else:\n            # Create SQLite database in the storage directory\n            storage_dir = Path(configuration.storage_dir)\n            if not storage_dir.exists():\n                storage_dir.mkdir(parents=True, exist_ok=True)\n\n            db_path = storage_dir / self._DEFAULT_DB_NAME\n\n            # Create connection string with path to default database\n            connection_string = f'sqlite+aiosqlite:///{db_path}'\n\n        if not any(connection_string.startswith(dialect) for dialect in self._SUPPORTED_DIALECTS):\n            raise ValueError(\n                f'Unsupported database. 
Supported: {\", \".join(self._SUPPORTED_DIALECTS)}. Consider using a different '\n                'database.'\n            )\n\n        kwargs: dict[str, Any] = {}\n        if 'mysql' in connection_string or 'mariadb' in connection_string:\n            connect_args: dict[str, Any] = {'connect_timeout': 30}\n            # MySQL/MariaDB require READ COMMITTED isolation level for correct behavior in concurrent environments\n            # without deadlocks.\n            kwargs['isolation_level'] = 'READ COMMITTED'\n        else:\n            connect_args = {'timeout': 30}\n\n        self._engine = create_async_engine(\n            connection_string,\n            future=True,\n            pool_size=5,\n            max_overflow=10,\n            pool_timeout=60,\n            pool_recycle=600,\n            pool_pre_ping=True,\n            echo=False,\n            connect_args=connect_args,\n            **kwargs,\n        )\n        return self._engine\n"
  },
  {
    "path": "src/crawlee/storage_clients/_sql/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storage_clients/models.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING, Annotated, Any, Generic\n\nfrom pydantic import BaseModel, BeforeValidator, ConfigDict, Field\nfrom typing_extensions import TypeVar\n\nfrom crawlee._types import HttpMethod\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.urls import validate_http_url\n\nKvsValueType = TypeVar('KvsValueType', default=Any)\n\n\n@docs_group('Storage data')\nclass StorageMetadata(BaseModel):\n    \"\"\"Represents the base model for storage metadata.\n\n    It contains common fields shared across all specific storage types.\n    \"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)\n\n    id: Annotated[str, Field(alias='id')]\n    \"\"\"The unique identifier of the storage.\"\"\"\n\n    name: Annotated[str | None, Field(alias='name', default=None)]\n    \"\"\"The name of the storage.\"\"\"\n\n    accessed_at: Annotated[datetime, Field(alias='accessedAt')]\n    \"\"\"The timestamp when the storage was last accessed.\"\"\"\n\n    created_at: Annotated[datetime, Field(alias='createdAt')]\n    \"\"\"The timestamp when the storage was created.\"\"\"\n\n    modified_at: Annotated[datetime, Field(alias='modifiedAt')]\n    \"\"\"The timestamp when the storage was last modified.\"\"\"\n\n\n@docs_group('Storage data')\nclass DatasetMetadata(StorageMetadata):\n    \"\"\"Model for a dataset metadata.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    item_count: Annotated[int, Field(alias='itemCount')]\n    \"\"\"The number of items in the dataset.\"\"\"\n\n\n@docs_group('Storage data')\nclass KeyValueStoreMetadata(StorageMetadata):\n    \"\"\"Model for a key-value store metadata.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n\n@docs_group('Storage data')\nclass 
RequestQueueMetadata(StorageMetadata):\n    \"\"\"Model for a request queue metadata.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]\n    \"\"\"Indicates whether the queue has been accessed by multiple clients (consumers).\"\"\"\n\n    handled_request_count: Annotated[int, Field(alias='handledRequestCount')]\n    \"\"\"The number of requests that have been handled from the queue.\"\"\"\n\n    pending_request_count: Annotated[int, Field(alias='pendingRequestCount')]\n    \"\"\"The number of requests that are still pending in the queue.\"\"\"\n\n    total_request_count: Annotated[int, Field(alias='totalRequestCount')]\n    \"\"\"The total number of requests that have been added to the queue.\"\"\"\n\n\n@docs_group('Storage data')\nclass KeyValueStoreRecordMetadata(BaseModel):\n    \"\"\"Model for a key-value store record metadata.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    key: Annotated[str, Field(alias='key')]\n    \"\"\"The key of the record.\n\n    A unique identifier for the record in the key-value store.\n    \"\"\"\n\n    content_type: Annotated[str, Field(alias='contentType')]\n    \"\"\"The MIME type of the record.\n\n    Describe the format and type of data stored in the record, following the MIME specification.\n    \"\"\"\n\n    size: Annotated[int | None, Field(alias='size', default=None)] = None\n    \"\"\"The size of the record in bytes.\"\"\"\n\n\n@docs_group('Storage data')\nclass KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):\n    \"\"\"Model for a key-value store record.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    value: Annotated[KvsValueType, Field(alias='value')]\n    \"\"\"The value of the record.\"\"\"\n\n\n@docs_group('Storage 
data')\nclass DatasetItemsListPage(BaseModel):\n    \"\"\"Model for a single page of dataset items returned from a collection list method.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    count: Annotated[int, Field(default=0)]\n    \"\"\"The number of objects returned on this page.\"\"\"\n\n    offset: Annotated[int, Field(default=0)]\n    \"\"\"The starting position of the first object returned, as specified in the API call.\"\"\"\n\n    limit: Annotated[int, Field(default=0)]\n    \"\"\"The maximum number of objects to return, as specified in the API call.\"\"\"\n\n    total: Annotated[int, Field(default=0)]\n    \"\"\"The total number of objects that match the criteria of the API call.\"\"\"\n\n    desc: Annotated[bool, Field(default=False)]\n    \"\"\"Indicates if the returned list is in descending order.\"\"\"\n\n    # Workaround for Pydantic and type checkers when using Annotated with default_factory\n    if TYPE_CHECKING:\n        items: list[dict] = []\n        \"\"\"The list of dataset items returned on this page.\"\"\"\n    else:\n        items: Annotated[list[dict], Field(default_factory=list)]\n        \"\"\"The list of dataset items returned on this page.\"\"\"\n\n\n@docs_group('Storage data')\nclass ProcessedRequest(BaseModel):\n    \"\"\"Represents a processed request.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    id: Annotated[str | None, Field(alias='requestId', default=None)] = None\n    \"\"\"Internal representation of the request by the storage client. 
Only some clients use id.\"\"\"\n\n    unique_key: Annotated[str, Field(alias='uniqueKey')]\n    was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]\n    was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]\n\n\n@docs_group('Storage data')\nclass UnprocessedRequest(BaseModel):\n    \"\"\"Represents an unprocessed request.\"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    unique_key: Annotated[str, Field(alias='uniqueKey')]\n    url: Annotated[str, BeforeValidator(validate_http_url), Field()]\n    method: Annotated[HttpMethod | None, Field()] = None\n\n\n@docs_group('Storage data')\nclass AddRequestsResponse(BaseModel):\n    \"\"\"Model for a response to add requests to a queue.\n\n    Contains detailed information about the processing results when adding multiple requests\n    to a queue. This includes which requests were successfully processed and which ones\n    encountered issues during processing.\n    \"\"\"\n\n    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)\n\n    processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]\n    \"\"\"Successfully processed requests, including information about whether they were\n    already present in the queue and whether they had been handled previously.\"\"\"\n\n    unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')]\n    \"\"\"Requests that could not be processed, typically due to validation errors or other issues.\"\"\"\n"
  },
  {
    "path": "src/crawlee/storage_clients/py.typed",
    "content": ""
  },
  {
    "path": "src/crawlee/storages/__init__.py",
    "content": "from ._dataset import Dataset\nfrom ._key_value_store import KeyValueStore\nfrom ._request_queue import RequestQueue\n\n__all__ = [\n    'Dataset',\n    'KeyValueStore',\n    'RequestQueue',\n]\n"
  },
  {
    "path": "src/crawlee/storages/_base.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.docs import docs_group\n\nif TYPE_CHECKING:\n    from crawlee.configuration import Configuration\n    from crawlee.storage_clients._base import StorageClient\n    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata\n\n\n@docs_group('Storages')\nclass Storage(ABC):\n    \"\"\"Base class for storages.\"\"\"\n\n    @property\n    @abstractmethod\n    def id(self) -> str:\n        \"\"\"Get the storage ID.\"\"\"\n\n    @property\n    @abstractmethod\n    def name(self) -> str | None:\n        \"\"\"Get the storage name.\"\"\"\n\n    @abstractmethod\n    async def get_metadata(self) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:\n        \"\"\"Get the storage metadata.\"\"\"\n\n    @classmethod\n    @abstractmethod\n    async def open(\n        cls,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n        storage_client: StorageClient | None = None,\n    ) -> Storage:\n        \"\"\"Open a storage, either restore existing or create a new one.\n\n        Args:\n            id: The storage ID.\n            name: The storage name (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\n                the digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n                (e.g. \"my-value-1\").\n            alias: The storage alias (run scope, creates unnamed storage).\n            configuration: Configuration object used during the storage creation or restoration process.\n            storage_client: Underlying storage client to use. 
If not provided, the default global storage client\n                from the service locator will be used.\n        \"\"\"\n\n    @abstractmethod\n    async def drop(self) -> None:\n        \"\"\"Drop the storage, removing it from the underlying storage client and clearing the cache.\"\"\"\n\n    @abstractmethod\n    async def purge(self) -> None:\n        \"\"\"Purge the storage, removing all items from the underlying storage client.\n\n        This method does not remove the storage itself, e.g. it does not remove the metadata,\n        but clears all items within it.\n        \"\"\"\n"
  },
  {
    "path": "src/crawlee/storages/_dataset.py",
    "content": "from __future__ import annotations\n\nimport logging\nfrom io import StringIO\nfrom typing import TYPE_CHECKING, overload\n\nfrom typing_extensions import override\n\nfrom crawlee import service_locator\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.file import export_csv_to_stream, export_json_to_stream\n\nfrom ._base import Storage\nfrom ._key_value_store import KeyValueStore\nfrom ._utils import validate_storage_name\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n    from typing import Any, Literal\n\n    from typing_extensions import Unpack\n\n    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs\n    from crawlee.configuration import Configuration\n    from crawlee.storage_clients import StorageClient\n    from crawlee.storage_clients._base import DatasetClient\n    from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata\n\nlogger = logging.getLogger(__name__)\n\n\n@docs_group('Storages')\nclass Dataset(Storage):\n    \"\"\"Dataset is a storage for managing structured tabular data.\n\n    The dataset class provides a high-level interface for storing and retrieving structured data\n    with consistent schema, similar to database tables or spreadsheets. It abstracts the underlying\n    storage implementation details, offering a consistent API regardless of where the data is\n    physically stored.\n\n    Dataset operates in an append-only mode, allowing new records to be added but not modified\n    or deleted after creation. This makes it particularly suitable for storing crawling results\n    and other data that should be immutable once collected.\n\n    The class provides methods for adding data, retrieving data with various filtering options,\n    and exporting data to different formats. You can create a dataset using the `open` class method,\n    specifying either a name or ID. 
The underlying storage implementation is determined by\n    the configured storage client.\n\n    ### Usage\n\n    ```python\n    from crawlee.storages import Dataset\n\n    # Open a dataset\n    dataset = await Dataset.open(name='my-dataset')\n\n    # Add data\n    await dataset.push_data({'title': 'Example Product', 'price': 99.99})\n\n    # Retrieve filtered data\n    results = await dataset.get_data(limit=10, desc=True)\n\n    # Export data\n    await dataset.export_to('results.json', content_type='json')\n    ```\n    \"\"\"\n\n    def __init__(self, client: DatasetClient, id: str, name: str | None) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `Dataset.open` constructor to create a new instance.\n\n        Args:\n            client: An instance of a storage client.\n            id: The unique identifier of the storage.\n            name: The name of the storage, if available.\n        \"\"\"\n        validate_storage_name(name)\n\n        self._client = client\n        self._id = id\n        self._name = name\n\n    @property\n    @override\n    def id(self) -> str:\n        return self._id\n\n    @property\n    @override\n    def name(self) -> str | None:\n        return self._name\n\n    @override\n    async def get_metadata(self) -> DatasetMetadata:\n        return await self._client.get_metadata()\n\n    @override\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n        storage_client: StorageClient | None = None,\n    ) -> Dataset:\n        configuration = service_locator.get_configuration() if configuration is None else configuration\n        storage_client = service_locator.get_storage_client() if storage_client is None else storage_client\n\n        client_opener_coro = storage_client.create_dataset_client(\n            id=id, name=name, 
alias=alias, configuration=configuration\n        )\n        storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)\n\n        return await service_locator.storage_instance_manager.open_storage_instance(\n            cls,\n            id=id,\n            name=name,\n            alias=alias,\n            client_opener_coro=client_opener_coro,\n            storage_client_cache_key=storage_client_cache_key,\n        )\n\n    @override\n    async def drop(self) -> None:\n        storage_instance_manager = service_locator.storage_instance_manager\n        storage_instance_manager.remove_from_cache(self)\n        await self._client.drop()\n\n    @override\n    async def purge(self) -> None:\n        await self._client.purge()\n\n    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:\n        \"\"\"Store an object or an array of objects to the dataset.\n\n        The size of the data is limited by the receiving API and therefore `push_data()` will only\n        allow objects whose JSON representation is smaller than 9MB. When an array is passed,\n        none of the included objects may be larger than 9MB, but the array itself may be of any size.\n\n        Args:\n            data: A JSON serializable data structure to be stored in the dataset. 
The JSON representation\n                of each item must be smaller than 9MB.\n        \"\"\"\n        await self._client.push_data(data=data)\n\n    async def get_data(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n        flatten: list[str] | None = None,\n        view: str | None = None,\n    ) -> DatasetItemsListPage:\n        \"\"\"Retrieve a paginated list of items from a dataset based on various filtering parameters.\n\n        This method provides the flexibility to filter, sort, and modify the appearance of dataset items\n        when listed. Each parameter modifies the result set according to its purpose. The method also\n        supports pagination through 'offset' and 'limit' parameters.\n\n        Args:\n            offset: Skips the specified number of items at the start.\n            limit: The maximum number of items to retrieve. Unlimited if None.\n            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.\n            desc: Set to True to sort results in descending order.\n            fields: Fields to include in each item. 
Sorts fields as specified if provided.\n            omit: Fields to exclude from each item.\n            unwind: Unwinds items by a specified array field, turning each element into a separate item.\n            skip_empty: Excludes empty items from the results if True.\n            skip_hidden: Excludes fields starting with '#' if True.\n            flatten: Fields to be flattened in returned items.\n            view: Specifies the dataset view to be used.\n\n        Returns:\n            An object with filtered, sorted, and paginated dataset items plus pagination details.\n        \"\"\"\n        return await self._client.get_data(\n            offset=offset,\n            limit=limit,\n            clean=clean,\n            desc=desc,\n            fields=fields,\n            omit=omit,\n            unwind=unwind,\n            skip_empty=skip_empty,\n            skip_hidden=skip_hidden,\n            flatten=flatten,\n            view=view,\n        )\n\n    async def iterate_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> AsyncIterator[dict[str, Any]]:\n        \"\"\"Iterate over items in the dataset according to specified filters and sorting.\n\n        This method allows for asynchronously iterating through dataset items while applying various filters such as\n        skipping empty items, hiding specific fields, and sorting. 
It supports pagination via `offset` and `limit`\n        parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n        `skip_hidden` parameters.\n\n        Args:\n            offset: Skips the specified number of items at the start.\n            limit: The maximum number of items to retrieve. Unlimited if None.\n            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.\n            desc: Set to True to sort results in descending order.\n            fields: Fields to include in each item. Sorts fields as specified if provided.\n            omit: Fields to exclude from each item.\n            unwind: Unwinds items by a specified array field, turning each element into a separate item.\n            skip_empty: Excludes empty items from the results if True.\n            skip_hidden: Excludes fields starting with '#' if True.\n\n        Yields:\n            An asynchronous iterator of dictionary objects, each representing a dataset item after applying\n            the specified filters and transformations.\n        \"\"\"\n        async for item in self._client.iterate_items(\n            offset=offset,\n            limit=limit,\n            clean=clean,\n            desc=desc,\n            fields=fields,\n            omit=omit,\n            unwind=unwind,\n            skip_empty=skip_empty,\n            skip_hidden=skip_hidden,\n        ):\n            yield item\n\n    async def list_items(\n        self,\n        *,\n        offset: int = 0,\n        limit: int | None = 999_999_999_999,\n        clean: bool = False,\n        desc: bool = False,\n        fields: list[str] | None = None,\n        omit: list[str] | None = None,\n        unwind: list[str] | None = None,\n        skip_empty: bool = False,\n        skip_hidden: bool = False,\n    ) -> list[dict[str, Any]]:\n        \"\"\"Retrieve a list of all items from the dataset according to specified 
filters and sorting.\n\n        This method collects all dataset items into a list while applying various filters such as\n        skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit`\n        parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and\n        `skip_hidden` parameters.\n\n        Args:\n            offset: Skips the specified number of items at the start.\n            limit: The maximum number of items to retrieve. Unlimited if None.\n            clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.\n            desc: Set to True to sort results in descending order.\n            fields: Fields to include in each item. Sorts fields as specified if provided.\n            omit: Fields to exclude from each item.\n            unwind: Unwinds items by a specified array field, turning each element into a separate item.\n            skip_empty: Excludes empty items from the results if True.\n            skip_hidden: Excludes fields starting with '#' if True.\n\n        Returns:\n            A list of dictionary objects, each representing a dataset item after applying\n            the specified filters and transformations.\n        \"\"\"\n        return [\n            item\n            async for item in self.iterate_items(\n                offset=offset,\n                limit=limit,\n                clean=clean,\n                desc=desc,\n                fields=fields,\n                omit=omit,\n                unwind=unwind,\n                skip_empty=skip_empty,\n                skip_hidden=skip_hidden,\n            )\n        ]\n\n    @overload\n    async def export_to(\n        self,\n        key: str,\n        content_type: Literal['json'],\n        to_kvs_id: str | None = None,\n        to_kvs_name: str | None = None,\n        to_kvs_storage_client: StorageClient | None = None,\n        
to_kvs_configuration: Configuration | None = None,\n        **kwargs: Unpack[ExportDataJsonKwargs],\n    ) -> None: ...\n\n    @overload\n    async def export_to(\n        self,\n        key: str,\n        content_type: Literal['csv'],\n        to_kvs_id: str | None = None,\n        to_kvs_name: str | None = None,\n        to_kvs_storage_client: StorageClient | None = None,\n        to_kvs_configuration: Configuration | None = None,\n        **kwargs: Unpack[ExportDataCsvKwargs],\n    ) -> None: ...\n\n    async def export_to(\n        self,\n        key: str,\n        content_type: Literal['json', 'csv'] = 'json',\n        to_kvs_id: str | None = None,\n        to_kvs_name: str | None = None,\n        to_kvs_storage_client: StorageClient | None = None,\n        to_kvs_configuration: Configuration | None = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Export the entire dataset into a specified file stored under a key in a key-value store.\n\n        This method consolidates all entries from a specified dataset into one file, which is then saved under a\n        given key in a key-value store. 
The format of the exported file is determined by the `content_type` parameter.\n        The dataset being exported is always this instance; the target key-value store is selected by\n        `to_kvs_id` or `to_kvs_name` (specify at most one of them).\n\n        Args:\n            key: The key under which to save the data in the key-value store.\n            content_type: The format in which to export the data.\n            to_kvs_id: ID of the key-value store to save the exported file.\n                Specify only one of ID or name.\n            to_kvs_name: Name of the key-value store to save the exported file.\n                Specify only one of ID or name.\n            to_kvs_storage_client: Storage client to use for the key-value store.\n            to_kvs_configuration: Configuration for the key-value store.\n            kwargs: Additional parameters for the export operation, specific to the chosen content type.\n        \"\"\"\n        kvs = await KeyValueStore.open(\n            id=to_kvs_id,\n            name=to_kvs_name,\n            configuration=to_kvs_configuration,\n            storage_client=to_kvs_storage_client,\n        )\n        dst = StringIO()\n\n        if content_type == 'csv':\n            await export_csv_to_stream(self.iterate_items(), dst, **kwargs)\n            await kvs.set_value(key, dst.getvalue(), 'text/csv')\n        elif content_type == 'json':\n            await export_json_to_stream(self.iterate_items(), dst, **kwargs)\n            await kvs.set_value(key, dst.getvalue(), 'application/json')\n        else:\n            raise ValueError('Unsupported content type, expecting CSV or JSON')\n"
  },
  {
    "path": "src/crawlee/storages/_key_value_store.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom collections.abc import AsyncIterator\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload\n\nfrom pydantic import RootModel\nfrom typing_extensions import override\n\nfrom crawlee import service_locator\nfrom crawlee._types import JsonSerializable  # noqa: TC001\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.recoverable_state import RecoverableState\nfrom crawlee.storage_clients.models import KeyValueStoreMetadata\n\nfrom ._base import Storage\nfrom ._utils import validate_storage_name\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncIterator\n\n    from crawlee.configuration import Configuration\n    from crawlee.storage_clients import StorageClient\n    from crawlee.storage_clients._base import KeyValueStoreClient\n    from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata\nelse:\n    from crawlee._utils.recoverable_state import RecoverableState\n\nT = TypeVar('T')\n\nlogger = getLogger(__name__)\n\n\nclass AutosavedValue(RootModel):\n    root: dict[str, JsonSerializable]\n\n\n@docs_group('Storages')\nclass KeyValueStore(Storage):\n    \"\"\"Key-value store is a storage for reading and writing data records with unique key identifiers.\n\n    The key-value store class acts as a high-level interface for storing, retrieving, and managing data records\n    identified by unique string keys. It abstracts away the underlying storage implementation details,\n    allowing you to work with the same API regardless of whether data is stored in memory, on disk,\n    or in the cloud.\n\n    Each data record is associated with a specific MIME content type, allowing storage of various\n    data formats such as JSON, text, images, HTML snapshots or any binary data. 
This class is\n    commonly used to store inputs, outputs, and other artifacts of crawler operations.\n\n    You can instantiate a key-value store using the `open` class method, which will create a store\n    with the specified name or id. The underlying storage implementation is determined by the configured\n    storage client.\n\n    ### Usage\n\n    ```python\n    from crawlee.storages import KeyValueStore\n\n    # Open a named key-value store\n    kvs = await KeyValueStore.open(name='my-store')\n\n    # Store and retrieve data\n    await kvs.set_value('product-1234', [{'name': 'Smartphone', 'price': 799.99}])\n    product = await kvs.get_value('product-1234')\n    ```\n    \"\"\"\n\n    _autosaved_values: ClassVar[\n        dict[\n            str,\n            dict[str, RecoverableState[AutosavedValue]],\n        ]\n    ] = {}\n    \"\"\"Cache for recoverable (auto-saved) values.\"\"\"\n\n    def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `KeyValueStore.open` constructor to create a new instance.\n\n        Args:\n            client: An instance of a storage client.\n            id: The unique identifier of the storage.\n            name: The name of the storage, if available.\n        \"\"\"\n        validate_storage_name(name)\n\n        self._client = client\n        self._id = id\n        self._name = name\n\n        self._autosave_lock = asyncio.Lock()\n        \"\"\"Lock for autosaving values to prevent concurrent modifications.\"\"\"\n\n    @property\n    @override\n    def id(self) -> str:\n        return self._id\n\n    @property\n    @override\n    def name(self) -> str | None:\n        return self._name\n\n    @override\n    async def get_metadata(self) -> KeyValueStoreMetadata:\n        return await self._client.get_metadata()\n\n    @override\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None = 
None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n        storage_client: StorageClient | None = None,\n    ) -> KeyValueStore:\n        configuration = service_locator.get_configuration() if configuration is None else configuration\n        storage_client = service_locator.get_storage_client() if storage_client is None else storage_client\n\n        client_opener_coro = storage_client.create_kvs_client(\n            id=id, name=name, alias=alias, configuration=configuration\n        )\n        additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)\n\n        return await service_locator.storage_instance_manager.open_storage_instance(\n            cls,\n            id=id,\n            name=name,\n            alias=alias,\n            client_opener_coro=client_opener_coro,\n            storage_client_cache_key=additional_cache_key,\n        )\n\n    @override\n    async def drop(self) -> None:\n        storage_instance_manager = service_locator.storage_instance_manager\n        storage_instance_manager.remove_from_cache(self)\n\n        await self._clear_cache()  # Clear cache with persistent values.\n        await self._client.drop()\n\n    @override\n    async def purge(self) -> None:\n        await self._client.purge()\n\n    @overload\n    async def get_value(self, key: str) -> Any: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T) -> T: ...\n\n    @overload\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...\n\n    async def get_value(self, key: str, default_value: T | None = None) -> T | None:\n        \"\"\"Get a value from the KVS.\n\n        Args:\n            key: Key of the record to retrieve.\n            default_value: Default value returned in case the record does not exist.\n\n        Returns:\n            The value associated with the given key. 
`default_value` is used in case the record does not exist.\n        \"\"\"\n        record = await self._client.get_value(key=key)\n        return record.value if record else default_value\n\n    async def set_value(\n        self,\n        key: str,\n        value: Any,\n        content_type: str | None = None,\n    ) -> None:\n        \"\"\"Set a value in the KVS.\n\n        Args:\n            key: Key of the record to set.\n            value: Value to set.\n            content_type: The MIME content type string.\n        \"\"\"\n        await self._client.set_value(key=key, value=value, content_type=content_type)\n\n    async def delete_value(self, key: str) -> None:\n        \"\"\"Delete a value from the KVS.\n\n        Args:\n            key: Key of the record to delete.\n        \"\"\"\n        await self._client.delete_value(key=key)\n\n    async def iterate_keys(\n        self,\n        exclusive_start_key: str | None = None,\n        limit: int | None = None,\n    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:\n        \"\"\"Iterate over the existing keys in the KVS.\n\n        Args:\n            exclusive_start_key: Key to start the iteration from.\n            limit: Maximum number of keys to return. 
None means no limit.\n\n        Yields:\n            Information about the key.\n        \"\"\"\n        async for item in self._client.iterate_keys(\n            exclusive_start_key=exclusive_start_key,\n            limit=limit,\n        ):\n            yield item\n\n    async def list_keys(\n        self,\n        exclusive_start_key: str | None = None,\n        limit: int = 1000,\n    ) -> list[KeyValueStoreRecordMetadata]:\n        \"\"\"List all the existing keys in the KVS.\n\n        It uses client's `iterate_keys` method to get the keys.\n\n        Args:\n            exclusive_start_key: Key to start the iteration from.\n            limit: Maximum number of keys to return.\n\n        Returns:\n            A list of keys in the KVS.\n        \"\"\"\n        return [\n            key\n            async for key in self._client.iterate_keys(\n                exclusive_start_key=exclusive_start_key,\n                limit=limit,\n            )\n        ]\n\n    async def record_exists(self, key: str) -> bool:\n        \"\"\"Check if a record with the given key exists in the key-value store.\n\n        Args:\n            key: Key of the record to check for existence.\n\n        Returns:\n            True if a record with the given key exists, False otherwise.\n        \"\"\"\n        return await self._client.record_exists(key=key)\n\n    async def get_public_url(self, key: str) -> str:\n        \"\"\"Get the public URL for the given key.\n\n        Args:\n            key: Key of the record for which URL is required.\n\n        Returns:\n            The public URL for the given key.\n        \"\"\"\n        return await self._client.get_public_url(key=key)\n\n    async def get_auto_saved_value(\n        self,\n        key: str,\n        default_value: dict[str, JsonSerializable] | None = None,\n    ) -> dict[str, JsonSerializable]:\n        \"\"\"Get a value from KVS that will be automatically saved on changes.\n\n        Args:\n            key: Key of the 
record, to store the value.\n            default_value: Value to be used if the record does not exist yet. Should be a dictionary.\n\n        Returns:\n            Return the value of the key.\n        \"\"\"\n        default_value = {} if default_value is None else default_value\n\n        async with self._autosave_lock:\n            cache = self._autosaved_values.setdefault(self.id, {})\n\n            if key in cache:\n                return cache[key].current_value.root\n\n            async def kvs_factory() -> KeyValueStore:\n                return self\n\n            cache[key] = recoverable_state = RecoverableState(\n                default_state=AutosavedValue(default_value),\n                persist_state_key=key,\n                persistence_enabled=True,\n                persist_state_kvs_factory=kvs_factory,\n                logger=logger,\n            )\n\n            await recoverable_state.initialize()\n\n        return recoverable_state.current_value.root\n\n    async def persist_autosaved_values(self) -> None:\n        \"\"\"Force autosaved values to be saved without waiting for an event in Event Manager.\"\"\"\n        if self.id in self._autosaved_values:\n            cache = self._autosaved_values[self.id]\n            for value in cache.values():\n                await value.persist_state()\n\n    async def _clear_cache(self) -> None:\n        \"\"\"Clear cache with autosaved values.\"\"\"\n        if self.id in self._autosaved_values:\n            cache = self._autosaved_values[self.id]\n            for value in cache.values():\n                await value.teardown()\n            cache.clear()\n"
  },
  {
    "path": "src/crawlee/storages/_request_queue.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom logging import getLogger\nfrom typing import TYPE_CHECKING, TypeVar\n\nfrom typing_extensions import override\n\nfrom crawlee import Request, service_locator\nfrom crawlee._utils.docs import docs_group\nfrom crawlee._utils.wait import wait_for_all_tasks_for_finish\nfrom crawlee.request_loaders import RequestManager\n\nfrom ._base import Storage\nfrom ._utils import validate_storage_name\n\nif TYPE_CHECKING:\n    from collections.abc import Sequence\n\n    from crawlee import Request\n    from crawlee.configuration import Configuration\n    from crawlee.storage_clients import StorageClient\n    from crawlee.storage_clients._base import RequestQueueClient\n    from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata\n\nlogger = getLogger(__name__)\n\nT = TypeVar('T')\n\n\n@docs_group('Storages')\nclass RequestQueue(Storage, RequestManager):\n    \"\"\"Request queue is a storage for managing HTTP requests.\n\n    The request queue class serves as a high-level interface for organizing and managing HTTP requests\n    during web crawling. It provides methods for adding, retrieving, and manipulating requests throughout\n    the crawling lifecycle, abstracting away the underlying storage implementation details.\n\n    Request queue maintains the state of each URL to be crawled, tracking whether it has been processed,\n    is currently being handled, or is waiting in the queue. Each URL in the queue is uniquely identified\n    by a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise.\n\n    The class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter\n    when adding requests. 
It also provides mechanisms for error handling and request reclamation when\n    processing fails.\n\n    You can open a request queue using the `open` class method, specifying either a name or ID to identify\n    the queue. The underlying storage implementation is determined by the configured storage client.\n\n    ### Usage\n\n    ```python\n    from crawlee.storages import RequestQueue\n\n    # Open a request queue\n    rq = await RequestQueue.open(name='my-queue')\n\n    # Add a request\n    await rq.add_request('https://example.com')\n\n    # Process requests\n    request = await rq.fetch_next_request()\n    if request:\n        try:\n            # Process the request\n            # ...\n            await rq.mark_request_as_handled(request)\n        except Exception:\n            await rq.reclaim_request(request)\n    ```\n    \"\"\"\n\n    def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> None:\n        \"\"\"Initialize a new instance.\n\n        Preferably use the `RequestQueue.open` constructor to create a new instance.\n\n        Args:\n            client: An instance of a storage client.\n            id: The unique identifier of the storage.\n            name: The name of the storage, if available.\n        \"\"\"\n        validate_storage_name(name)\n\n        self._client = client\n        self._id = id\n        self._name = name\n\n        self._add_requests_tasks = list[asyncio.Task]()\n        \"\"\"A list of tasks for adding requests to the queue.\"\"\"\n\n    @property\n    @override\n    def id(self) -> str:\n        return self._id\n\n    @property\n    @override\n    def name(self) -> str | None:\n        return self._name\n\n    @override\n    async def get_metadata(self) -> RequestQueueMetadata:\n        return await self._client.get_metadata()\n\n    @override\n    async def get_handled_count(self) -> int:\n        metadata = await self._client.get_metadata()\n        return metadata.handled_request_count\n\n    
@override\n    async def get_total_count(self) -> int:\n        metadata = await self._client.get_metadata()\n        return metadata.total_request_count\n\n    @override\n    @classmethod\n    async def open(\n        cls,\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        configuration: Configuration | None = None,\n        storage_client: StorageClient | None = None,\n    ) -> RequestQueue:\n        configuration = service_locator.get_configuration() if configuration is None else configuration\n        storage_client = service_locator.get_storage_client() if storage_client is None else storage_client\n\n        client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)\n        additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)\n\n        return await service_locator.storage_instance_manager.open_storage_instance(\n            cls,\n            id=id,\n            name=name,\n            alias=alias,\n            client_opener_coro=client_opener_coro,\n            storage_client_cache_key=additional_cache_key,\n        )\n\n    @override\n    async def drop(self) -> None:\n        # Remove from cache before dropping\n        storage_instance_manager = service_locator.storage_instance_manager\n        storage_instance_manager.remove_from_cache(self)\n\n        await self._client.drop()\n\n    @override\n    async def purge(self) -> None:\n        await self._client.purge()\n\n    @override\n    async def add_request(\n        self,\n        request: str | Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        request = self._transform_request(request)\n        response = await self._client.add_batch_of_requests([request], forefront=forefront)\n\n        if response.processed_requests:\n            return response.processed_requests[0]\n\n        if 
response.unprocessed_requests:\n            logger.warning(\n                f'Request {request.url} was not processed by storage client \"{self._client.__class__.__name__}\".'\n            )\n        else:\n            logger.warning(\n                f'Request {request.url} was not processed by storage client \"{self._client.__class__.__name__}\" '\n                'received empty response.'\n            )\n        return None\n\n    @override\n    async def add_requests(\n        self,\n        requests: Sequence[str | Request],\n        *,\n        forefront: bool = False,\n        batch_size: int = 1000,\n        wait_time_between_batches: timedelta = timedelta(seconds=1),\n        wait_for_all_requests_to_be_added: bool = False,\n        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,\n    ) -> None:\n        transformed_requests = self._transform_requests(requests)\n        wait_time_secs = wait_time_between_batches.total_seconds()\n\n        # Wait for the first batch to be added\n        first_batch = transformed_requests[:batch_size]\n        if first_batch:\n            await self._process_batch(\n                first_batch,\n                base_retry_wait=wait_time_between_batches,\n                forefront=forefront,\n            )\n\n        async def _process_remaining_batches() -> None:\n            for i in range(batch_size, len(transformed_requests), batch_size):\n                batch = transformed_requests[i : i + batch_size]\n                await self._process_batch(\n                    batch,\n                    base_retry_wait=wait_time_between_batches,\n                    forefront=forefront,\n                )\n                if i + batch_size < len(transformed_requests):\n                    await asyncio.sleep(wait_time_secs)\n\n        # Create and start the task to process remaining batches in the background\n        remaining_batches_task = asyncio.create_task(\n            _process_remaining_batches(),\n    
        name='request_queue_process_remaining_batches_task',\n        )\n\n        self._add_requests_tasks.append(remaining_batches_task)\n        remaining_batches_task.add_done_callback(lambda _: self._add_requests_tasks.remove(remaining_batches_task))\n\n        # Wait for all tasks to finish if requested\n        if wait_for_all_requests_to_be_added:\n            await wait_for_all_tasks_for_finish(\n                (remaining_batches_task,),\n                logger=logger,\n                timeout=wait_for_all_requests_to_be_added_timeout,\n            )\n\n    async def fetch_next_request(self) -> Request | None:\n        \"\"\"Return the next request in the queue to be processed.\n\n        Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled`\n        to mark the request as handled in the queue. If there was some error in processing the request, call\n        `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer\n        in another call to the `fetch_next_request` method.\n\n        Note that the `None` return value does not mean the queue processing finished, it means there are currently\n        no pending requests. 
To check whether all requests in queue were finished, use `RequestQueue.is_finished`\n        instead.\n\n        Returns:\n            The next request to process, or `None` if there are no more pending requests.\n        \"\"\"\n        return await self._client.fetch_next_request()\n\n    async def get_request(self, unique_key: str) -> Request | None:\n        \"\"\"Retrieve a specific request from the queue by its ID.\n\n        Args:\n            unique_key: Unique key of the request to retrieve.\n\n        Returns:\n            The request with the specified ID, or `None` if no such request exists.\n        \"\"\"\n        return await self._client.get_request(unique_key)\n\n    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:\n        \"\"\"Mark a request as handled after successful processing.\n\n        This method should be called after a request has been successfully processed.\n        Once marked as handled, the request will be removed from the queue and will\n        not be returned in subsequent calls to `fetch_next_request` method.\n\n        Args:\n            request: The request to mark as handled.\n\n        Returns:\n            Information about the queue operation.\n        \"\"\"\n        return await self._client.mark_request_as_handled(request)\n\n    async def reclaim_request(\n        self,\n        request: Request,\n        *,\n        forefront: bool = False,\n    ) -> ProcessedRequest | None:\n        \"\"\"Reclaim a failed request back to the queue for later processing.\n\n        If a request fails during processing, this method can be used to return it to the queue.\n        The request will be returned for processing again in a subsequent call\n        to `RequestQueue.fetch_next_request`.\n\n        Args:\n            request: The request to return to the queue.\n            forefront: If true, the request will be added to the beginning of the queue.\n                Otherwise, it will be 
added to the end.\n\n        Returns:\n            Information about the queue operation.\n        \"\"\"\n        return await self._client.reclaim_request(request, forefront=forefront)\n\n    async def is_empty(self) -> bool:\n        \"\"\"Check if the request queue is empty.\n\n        An empty queue means that there are no requests currently in the queue, either pending or being processed.\n        However, this does not necessarily mean that the crawling operation is finished, as there still might be\n        tasks that could add additional requests to the queue.\n\n        Returns:\n            True if the request queue is empty, False otherwise.\n        \"\"\"\n        return await self._client.is_empty()\n\n    async def is_finished(self) -> bool:\n        \"\"\"Check if the request queue is finished.\n\n        A finished queue means that all requests in the queue have been processed (the queue is empty) and there\n        are no more tasks that could add additional requests to the queue. 
This is the definitive way to check\n        if a crawling operation is complete.\n\n        Returns:\n            True if the request queue is finished (empty and no pending add operations), False otherwise.\n        \"\"\"\n        if self._add_requests_tasks:\n            logger.debug('Background add requests tasks are still in progress.')\n            return False\n\n        if await self.is_empty():\n            logger.debug('The request queue is empty.')\n            return True\n\n        return False\n\n    async def _process_batch(\n        self,\n        batch: Sequence[Request],\n        *,\n        base_retry_wait: timedelta,\n        attempt: int = 1,\n        forefront: bool = False,\n    ) -> None:\n        \"\"\"Process a batch of requests with automatic retry mechanism.\"\"\"\n        max_attempts = 5\n        response = await self._client.add_batch_of_requests(batch, forefront=forefront)\n\n        if response.unprocessed_requests:\n            logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.')\n            if attempt > max_attempts:\n                logger.warning(\n                    f'Following requests were not processed even after {max_attempts} attempts:\\n'\n                    f'{response.unprocessed_requests}'\n                )\n            else:\n                logger.debug('Retry to add requests.')\n                unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests}\n                retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys]\n                await asyncio.sleep((base_retry_wait * attempt).total_seconds())\n                await self._process_batch(retry_batch, base_retry_wait=base_retry_wait, attempt=attempt + 1)\n\n        request_count = len(batch) - len(response.unprocessed_requests)\n\n        if request_count:\n            logger.debug(\n                f'Added {request_count} 
requests to the queue. Processed requests: {response.processed_requests}'\n            )\n"
  },
  {
    "path": "src/crawlee/storages/_storage_instance_manager.py",
    "content": "from __future__ import annotations\n\nfrom asyncio import Lock\nfrom collections import defaultdict\nfrom collections.abc import Coroutine, Hashable\nfrom dataclasses import dataclass, field\nfrom typing import TYPE_CHECKING, TypeVar\nfrom weakref import WeakValueDictionary\n\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\nfrom crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient\n\nfrom ._utils import validate_storage_name\n\nif TYPE_CHECKING:\n    from ._base import Storage\n\nT = TypeVar('T', bound='Storage')\n\n\n@dataclass\nclass _StorageCache:\n    \"\"\"Cache for storage instances.\"\"\"\n\n    by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(\n        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))\n    )\n    \"\"\"Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key'].\"\"\"\n\n    by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(\n        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))\n    )\n    \"\"\"Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']\"\"\"\n\n    by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(\n        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))\n    )\n    \"\"\"Cache for storage instances by alias. 
Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']\"\"\"\n\n    def remove_from_cache(self, storage_instance: Storage) -> None:\n        \"\"\"Remove a storage instance from the cache.\n\n        Args:\n            storage_instance: The storage instance to remove.\n        \"\"\"\n        storage_type = type(storage_instance)\n\n        # Remove from ID cache\n        for additional_key in self.by_id[storage_type][storage_instance.id]:\n            del self.by_id[storage_type][storage_instance.id][additional_key]\n            break\n\n        # Remove from name cache or alias cache. It can never be in both.\n        if storage_instance.name is not None:\n            for additional_key in self.by_name[storage_type][storage_instance.name]:\n                del self.by_name[storage_type][storage_instance.name][additional_key]\n                break\n        else:\n            for alias_key in self.by_alias[storage_type]:\n                for additional_key in self.by_alias[storage_type][alias_key]:\n                    del self.by_alias[storage_type][alias_key][additional_key]\n                    break\n\n\nClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]\n\"\"\"Type alias for the client opener function.\"\"\"\n\n\nclass StorageInstanceManager:\n    \"\"\"Manager for caching and managing storage instances.\n\n    This class centralizes the caching logic for all storage types (Dataset, KeyValueStore, RequestQueue)\n    and provides a unified interface for opening and managing storage instances.\n    \"\"\"\n\n    _DEFAULT_STORAGE_ALIAS = '__default__'\n    \"\"\"Reserved alias for default unnamed storage.\"\"\"\n\n    def __init__(self) -> None:\n        self._cache: _StorageCache = _StorageCache()\n        self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()\n\n    async def open_storage_instance(\n        self,\n        cls: type[T],\n        *,\n        id: str | None,\n    
    name: str | None,\n        alias: str | None,\n        client_opener_coro: ClientOpenerCoro,\n        storage_client_cache_key: Hashable = '',\n    ) -> T:\n        \"\"\"Open a storage instance with caching support.\n\n        Args:\n            cls: The storage class to instantiate.\n            id: Storage ID.\n            name: Storage name. (global scope, persists across runs). Name can only contain letters \"a\" through \"z\",\n                the digits \"0\" through \"9\", and the hyphen (\"-\") but only in the middle of the string\n                (e.g. \"my-value-1\").\n            alias: Storage alias (run scope, creates unnamed storage).\n            client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.\n            storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.\n\n        Returns:\n            The storage instance.\n\n        Raises:\n            ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified.\n        \"\"\"\n        try:\n            if name == self._DEFAULT_STORAGE_ALIAS:\n                raise ValueError(\n                    f'Storage name cannot be \"{self._DEFAULT_STORAGE_ALIAS}\" as it is reserved for default alias.'\n                )\n\n            # Validate input parameters.\n            raise_if_too_many_kwargs(id=id, name=name, alias=alias)\n\n            # Auto-set alias='__default__' when no parameters are specified.\n            if not any([name, alias, id]):\n                alias = self._DEFAULT_STORAGE_ALIAS\n\n            # Check cache without lock first for performance.\n            if cached_instance := self._get_from_cache(\n                cls,\n                id=id,\n                name=name,\n                alias=alias,\n                storage_client_cache_key=storage_client_cache_key,\n            ):\n                return cached_instance\n\n            # Validate storage name\n      
      if name is not None:\n                validate_storage_name(name)\n\n            # Acquire lock for this opener\n            opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)\n            if not (lock := self._opener_locks.get(opener_lock_key)):\n                lock = Lock()\n                self._opener_locks[opener_lock_key] = lock\n\n            async with lock:\n                # Another task could have created the storage while we were waiting for the lock - check if that\n                # happened\n                if cached_instance := self._get_from_cache(\n                    cls,\n                    id=id,\n                    name=name,\n                    alias=alias,\n                    storage_client_cache_key=storage_client_cache_key,\n                ):\n                    return cached_instance\n\n                # Check for conflicts between named and alias storages\n                self._check_name_alias_conflict(\n                    cls,\n                    name=name,\n                    alias=alias,\n                    storage_client_cache_key=storage_client_cache_key,\n                )\n\n                # Create new instance\n                client: KeyValueStoreClient | DatasetClient | RequestQueueClient\n                client = await client_opener_coro\n\n                metadata = await client.get_metadata()\n\n                instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]\n                instance_name = getattr(instance, 'name', None)\n\n                # Cache the instance.\n                # Note: No awaits in this section. 
All cache entries must be written\n                # atomically to ensure pre-checks outside the lock see consistent state.\n\n                # Always cache by id.\n                self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance\n\n                # Cache named storage.\n                if instance_name is not None:\n                    self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance\n\n                # Cache unnamed storage.\n                if alias is not None:\n                    self._cache.by_alias[cls][alias][storage_client_cache_key] = instance\n\n                return instance\n\n        finally:\n            # Make sure the client opener is closed.\n            # If it was awaited, then closing is no operation, if it was not awaited, this is the cleanup.\n            client_opener_coro.close()\n\n    def remove_from_cache(self, storage_instance: Storage) -> None:\n        \"\"\"Remove a storage instance from the cache.\n\n        Args:\n            storage_instance: The storage instance to remove.\n        \"\"\"\n        self._cache.remove_from_cache(storage_instance)\n\n    def clear_cache(self) -> None:\n        \"\"\"Clear all cached storage instances.\"\"\"\n        self._cache = _StorageCache()\n\n    def _get_from_cache(\n        self,\n        cls: type[T],\n        *,\n        id: str | None = None,\n        name: str | None = None,\n        alias: str | None = None,\n        storage_client_cache_key: Hashable = '',\n    ) -> T | None:\n        \"\"\"Get a storage instance from the cache.\"\"\"\n        if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):\n            if isinstance(cached_instance, cls):\n                return cached_instance\n            raise RuntimeError('Cached instance type mismatch.')\n\n        if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):\n            if 
isinstance(cached_instance, cls):\n                return cached_instance\n            raise RuntimeError('Cached instance type mismatch.')\n\n        if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):\n            if isinstance(cached_instance, cls):\n                return cached_instance\n            raise RuntimeError('Cached instance type mismatch.')\n\n        return None\n\n    def _check_name_alias_conflict(\n        self,\n        cls: type[T],\n        *,\n        name: str | None = None,\n        alias: str | None = None,\n        storage_client_cache_key: Hashable = '',\n    ) -> None:\n        \"\"\"Check for conflicts between named and alias storages.\"\"\"\n        if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):\n            raise ValueError(\n                f'Cannot create alias storage \"{alias}\" because a named storage with the same name already exists. '\n                f'Use a different alias or drop the existing named storage first.'\n            )\n\n        if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):\n            raise ValueError(\n                f'Cannot create named storage \"{name}\" because an alias storage with the same name already exists. '\n                f'Use a different name or drop the existing alias storage first.'\n            )\n"
  },
  {
    "path": "src/crawlee/storages/_utils.py",
    "content": "import re\n\nNAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')\n\n\ndef validate_storage_name(name: str | None) -> None:\n    if name and not NAME_REGEX.match(name):\n        raise ValueError(\n            f'Invalid storage name \"{name}\". Name can only contain letters \"a\" through \"z\", the digits \"0\" through'\n            '\"9\", and the hyphen (\"-\") but only in the middle of the string (e.g. \"my-value-1\")'\n        )\n"
  },
  {
    "path": "src/crawlee/storages/py.typed",
    "content": ""
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/e2e/__init__.py",
    "content": ""
  },
  {
    "path": "tests/e2e/conftest.py",
    "content": "import subprocess\nfrom pathlib import Path\n\nimport pytest\nfrom _pytest.config import Config\nfrom filelock import FileLock\n\n_CRAWLEE_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()\n\n\ndef pytest_configure(config: Config) -> None:\n    for marker in [\n        'httpx',\n        'curl_impersonate',\n        'impit',\n        'playwright',\n        'playwright_camoufox',\n        'playwright_chrome',\n        'playwright_firefox',\n        'playwright_webkit',\n        'parsel',\n        'beautifulsoup',\n        'uv',\n        'poetry',\n        'pip',\n    ]:\n        config.addinivalue_line('markers', f'{marker}: Integration test parameter marker.')\n\n\n@pytest.fixture(scope='session')\ndef crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path:\n    \"\"\"Build the package wheel if it hasn't been built yet, and return the path to the wheel.\"\"\"\n    # Make sure the wheel is not being built concurrently across all the pytest-xdist runners,\n    # through locking the building process with a temp file.\n    with FileLock(tmp_path_factory.getbasetemp().parent / 'crawlee_wheel_build.lock'):\n        # Make sure the wheel is built exactly once across all the pytest-xdist runners,\n        # through an indicator file saying that the wheel was already built.\n        was_wheel_built_this_test_run_file = tmp_path_factory.getbasetemp() / f'wheel_was_built_in_run_{testrun_uid}'\n        if not was_wheel_built_this_test_run_file.exists():\n            subprocess.run(\n                args='python -m build',\n                cwd=_CRAWLEE_ROOT_PATH,\n                shell=True,\n                check=True,\n                capture_output=True,\n            )\n            was_wheel_built_this_test_run_file.touch()\n\n        # Read the current package version, necessary for getting the right wheel filename.\n        pyproject_toml_file = (_CRAWLEE_ROOT_PATH / 'pyproject.toml').read_text(encoding='utf-8')\n      
  for line in pyproject_toml_file.splitlines():\n            if line.startswith('version = '):\n                delim = '\"' if '\"' in line else \"'\"\n                crawlee_version = line.split(delim)[1]\n                break\n        else:\n            raise RuntimeError('Unable to find version string.')\n\n        wheel_path = _CRAWLEE_ROOT_PATH / 'dist' / f'crawlee-{crawlee_version}-py3-none-any.whl'\n\n        # Just to be sure.\n        assert wheel_path.exists()\n\n        return wheel_path\n"
  },
  {
    "path": "tests/e2e/project_template/test_static_crawlers_templates.py",
    "content": "import os\nimport re\nimport subprocess\nfrom pathlib import Path\nfrom typing import Literal\n\nimport pytest\nfrom apify_client import ApifyClientAsync\nfrom cookiecutter.main import cookiecutter\n\nfrom crawlee._cli import default_start_url, template_directory\nfrom crawlee._utils.crypto import crypto_random_object_id\nfrom tests.e2e.project_template.utils import patch_crawlee_version_in_project\n\n# To run these tests locally, make sure you have apify-cli installed and available in the path.\n# https://docs.apify.com/cli/docs/installation\n\n\n@pytest.mark.parametrize(\n    'crawler_type',\n    [\n        pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox),\n        pytest.param('playwright-chrome', marks=pytest.mark.playwright_chrome),\n        pytest.param('playwright-firefox', marks=pytest.mark.playwright_firefox),\n        pytest.param('playwright-webkit', marks=pytest.mark.playwright_webkit),\n        pytest.param('playwright', marks=pytest.mark.playwright),\n        pytest.param('parsel', marks=pytest.mark.parsel),\n        pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),\n    ],\n)\n@pytest.mark.parametrize(\n    'http_client',\n    [\n        pytest.param('httpx', marks=pytest.mark.httpx),\n        pytest.param('curl-impersonate', marks=pytest.mark.curl_impersonate),\n        pytest.param('impit', marks=pytest.mark.impit),\n    ],\n)\n@pytest.mark.parametrize(\n    'package_manager',\n    [\n        pytest.param('pip', marks=pytest.mark.pip),\n        pytest.param('uv', marks=pytest.mark.uv),\n        pytest.param('poetry', marks=pytest.mark.poetry),\n    ],\n)\nasync def test_static_crawler_actor_at_apify(\n    tmp_path: Path,\n    crawlee_wheel_path: Path,\n    package_manager: Literal['pip', 'uv', 'poetry'],\n    crawler_type: str,\n    http_client: str,\n) -> None:\n    # Generate new actor name\n    actor_name = f'crawlee-python-template-e2e-test-{crypto_random_object_id(8).lower()}'\n\n    # 
Create project from template\n    cookiecutter(\n        template=str(template_directory),\n        no_input=True,\n        extra_context={\n            'project_name': actor_name,\n            'package_manager': package_manager,\n            'crawler_type': crawler_type,\n            'http_client': http_client,\n            'enable_apify_integration': True,\n            'start_url': default_start_url,\n            'install_project': False,\n        },\n        output_dir=str(tmp_path),\n    )\n\n    patch_crawlee_version_in_project(\n        project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager\n    )\n\n    # Print apify version for debugging purposes in rare cases of CLI failures\n    subprocess.run(['apify', '--version'], check=True)  # noqa: ASYNC221, S607\n\n    # Build actor using sequence of cli commands as the user would\n    subprocess.run(  # noqa: ASYNC221, S603\n        ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']],  # noqa: S607\n        capture_output=True,\n        check=True,\n        cwd=tmp_path / actor_name,\n    )\n    subprocess.run(['apify', 'init', '-y', actor_name], capture_output=True, check=True, cwd=tmp_path / actor_name)  # noqa: ASYNC221, S603, S607\n\n    build_process = subprocess.run(['apify', 'push'], capture_output=True, check=False, cwd=tmp_path / actor_name)  # noqa: ASYNC221, S607\n    # Get actor ID from build log\n    actor_id_regexp = re.compile(r'https:\\/\\/console\\.apify\\.com\\/actors\\/(.*)#\\/builds\\/\\d*\\.\\d*\\.\\d*')\n\n    if match := re.findall(actor_id_regexp, build_process.stderr.decode()):\n        actor_id = match[0]\n    else:\n        raise AssertionError(f'Failed to find actor id in build log: {build_process.stderr.decode()}')\n\n    client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN'))\n    actor = client.actor(actor_id)\n\n    # Run actor\n    try:\n        assert build_process.returncode == 0\n        started_run_data = 
await actor.start(memory_mbytes=8192)\n        actor_run = client.run(started_run_data['id'])\n\n        finished_run_data = await actor_run.wait_for_finish()\n        actor_run_log = await actor_run.log().get()\n    finally:\n        # Delete the actor once it is no longer needed.\n        await actor.delete()\n\n    # Asserts\n    additional_run_info = f'Full actor run log: {actor_run_log}'\n    assert actor_run_log\n    assert finished_run_data\n    assert finished_run_data['status'] == 'SUCCEEDED', additional_run_info\n    assert (\n        'Crawler.stop() was called with following reason: The crawler has reached its limit of 10 requests per crawl.'\n    ) in actor_run_log, additional_run_info\n    assert int(re.findall(r'requests_finished\\s*│\\s*(\\d*)', actor_run_log)[-1]) >= 10, additional_run_info\n"
  },
  {
    "path": "tests/e2e/project_template/utils.py",
    "content": "import re\nimport shutil\nimport subprocess\nfrom pathlib import Path\nfrom typing import Literal\n\n\ndef patch_crawlee_version_in_project(\n    project_path: Path, wheel_path: Path, package_manager: Literal['pip', 'uv', 'poetry']\n) -> None:\n    \"\"\"Ensure that the test is using current version of the crawlee from the source and not from Pypi.\"\"\"\n    # Copy prepared .whl file\n    shutil.copy(wheel_path, project_path)\n\n    if package_manager in {'poetry', 'uv'}:\n        _patch_crawlee_version_in_pyproject_toml_based_project(project_path, wheel_path)\n    else:\n        _patch_crawlee_version_in_requirements_txt_based_project(project_path, wheel_path)\n\n\ndef _patch_crawlee_version_in_requirements_txt_based_project(project_path: Path, wheel_path: Path) -> None:\n    # Get any extras\n    requirements_path = project_path / 'requirements.txt'\n    with requirements_path.open() as f:\n        requirements = f.read()\n        crawlee_extras = re.findall(r'crawlee(\\[.*\\])', requirements)[0] or ''\n\n    # Modify requirements.txt to use crawlee from wheel file instead of from Pypi\n    with requirements_path.open() as f:\n        modified_lines = []\n        for line in f:\n            if 'crawlee' in line:\n                modified_lines.append(f'./{wheel_path.name}{crawlee_extras}\\n')\n            else:\n                modified_lines.append(line)\n    with requirements_path.open('w') as f:\n        f.write(''.join(modified_lines))\n\n    # Patch the dockerfile to have wheel file available\n    dockerfile_path = project_path / 'Dockerfile'\n    with dockerfile_path.open() as f:\n        modified_lines = []\n        for line in f:\n            modified_lines.append(line)\n            if line.startswith('COPY requirements.txt ./'):\n                modified_lines.extend(\n                    [\n                        f'COPY {wheel_path.name} ./\\n',\n                        # If no crawlee version bump, pip might be lazy and take existing 
pre-installed crawlee version,\n                        # make sure that one is patched as well.\n                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\\n',\n                    ]\n                )\n    with dockerfile_path.open('w') as f:\n        f.write(''.join(modified_lines))\n\n\ndef _patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, wheel_path: Path) -> None:\n    \"\"\"Ensure that the test is using current version of the crawlee from the source and not from Pypi.\"\"\"\n    # Get any extras\n    pyproject_path = project_path / 'pyproject.toml'\n    with pyproject_path.open() as f:\n        pyproject = f.read()\n        crawlee_extras = re.findall(r'crawlee(\\[.*\\])', pyproject)[0] or ''\n\n    # Inject crawlee wheel file to the docker image and update project to depend on it.\"\"\"\n    dockerfile_path = project_path / 'Dockerfile'\n    with dockerfile_path.open() as f:\n        modified_lines = []\n        for line in f:\n            modified_lines.append(line)\n            if line.startswith('COPY pyproject.toml'):\n                if 'uv.lock' in line:\n                    package_manager = 'uv'\n                elif 'poetry.lock' in line:\n                    package_manager = 'poetry'\n                else:\n                    raise RuntimeError('This does not look like a uv or poetry based project.')\n\n                # Create lock file that is expected by the docker to exist (even though it will be patched\n                # in the docker).\n                subprocess.run(\n                    args=[package_manager, 'lock'],\n                    cwd=str(project_path),\n                    check=True,\n                    capture_output=True,\n                )\n\n                # Add command to copy .whl to the docker image and update project with it.\n                # Patching in docker file due to the poetry not properly supporting relative paths for wheel packages\n            
    # and so the absolute path (in the container) is generated when running `add` command in the container.\n                modified_lines.extend(\n                    [\n                        f'COPY {wheel_path.name} ./\\n',\n                        # If no crawlee version bump, poetry might be lazy and take existing pre-installed crawlee\n                        # version, make sure that one is patched as well.\n                        f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\\n',\n                        f'RUN {package_manager} add ./{wheel_path.name}{crawlee_extras}\\n',\n                        f'RUN {package_manager} lock\\n',\n                    ]\n                )\n    with dockerfile_path.open('w') as f:\n        f.write(''.join(modified_lines))\n"
  },
  {
    "path": "tests/unit/README.md",
    "content": "# Unit tests\n\nSome tests may exhibit flaky behavior in CI. The reason for flaky behavior should be understood, as it can indicate a bug in the code or a design flaw in the test. There are other reasons related to test execution, such as some tests that are not (or cannot be) properly isolated, or limited resources of the test executor.\n\nHere are some suggested approaches to mitigate flakiness, sorted in order of preference:\n  - Investigate the root cause and fix the code or test.\n  - Apply one of the pytest marks to mitigate the flakiness:\n    - `@run_alone_on_mac` - A test with this mark will run alone on the macOS executor in CI (normally several tests run in parallel, which can cause resource-sensitive tests to fail). Use for resource-sensitive tests that are known to be flaky only on macOS.\n    - `@run_alone` - A test with this mark will run alone on any executor. Use for resource-sensitive tests that are known to be flaky on all platforms, or for tests that cannot be run in parallel with other tests due to their design (this should be extremely rare).\n    - `@pytest.mark.flaky` - A test with this mark will be retried several times if it fails. Use for tests that are known to be flaky, but the reason for flakiness is not understood or cannot be easily mitigated.\n    - `@pytest.mark.skip` - A test with this mark will be skipped. Use when none of the above approaches mitigates the test flakiness. Marking a test as skipped should be a last resort, as it can hide potential bugs and give a false sense of security. Skipped tests should be tracked in a GitHub issue.\n"
  },
  {
    "path": "tests/unit/__init__.py",
    "content": ""
  },
  {
    "path": "tests/unit/_autoscaling/test_autoscaled_pool.py",
    "content": "# ruff: noqa: FBT003 # Boolean positional value in function call\n\nfrom __future__ import annotations\n\nimport asyncio\nfrom contextlib import suppress\nfrom datetime import datetime, timedelta, timezone\nfrom itertools import chain, repeat\nfrom typing import TYPE_CHECKING, TypeVar, cast\nfrom unittest.mock import Mock\n\nimport pytest\n\nfrom crawlee._autoscaling import AutoscaledPool, SystemStatus\nfrom crawlee._autoscaling._types import LoadRatioInfo, SystemInfo\nfrom crawlee._types import ConcurrencySettings\nfrom crawlee._utils.time import measure_time\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable\n\n\n@pytest.fixture\ndef system_status() -> SystemStatus | Mock:\n    return Mock(spec=SystemStatus)\n\n\nT = TypeVar('T')\n\n\ndef future(value: T, /) -> Awaitable[T]:\n    f = asyncio.Future[T]()\n    f.set_result(value)\n    return f\n\n\n@pytest.mark.run_alone\nasync def test_runs_concurrently(system_status: SystemStatus | Mock) -> None:\n    done_count = 0\n\n    async def run() -> None:\n        await asyncio.sleep(0.1)\n        nonlocal done_count\n        done_count += 1\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: future(done_count >= 10),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=10,\n            max_concurrency=10,\n        ),\n    )\n\n    with measure_time() as elapsed:\n        await pool.run()\n\n    assert elapsed.wall is not None\n    assert elapsed.wall < 0.3\n\n    assert done_count >= 10\n\n\nasync def test_abort_works(system_status: SystemStatus | Mock) -> None:\n    async def run() -> None:\n        await asyncio.sleep(60)\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: 
future(False),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=10,\n            max_concurrency=10,\n        ),\n    )\n\n    with measure_time() as elapsed:\n        run_task = asyncio.create_task(pool.run(), name='pool run task')\n        await asyncio.sleep(0.1)\n        assert pool.current_concurrency == 10\n        await pool.abort()\n        assert pool.current_concurrency == 0\n        await run_task\n\n    assert elapsed.wall is not None\n    assert elapsed.wall < 5\n\n\nasync def test_propagates_exceptions(system_status: SystemStatus | Mock) -> None:\n    done_count = 0\n\n    async def run() -> None:\n        await asyncio.sleep(0.1)\n        nonlocal done_count\n        done_count += 1\n\n        if done_count > 5:\n            raise RuntimeError('Scheduled crash')\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: future(done_count >= 20),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=10,\n            max_concurrency=10,\n        ),\n    )\n\n    with pytest.raises(RuntimeError, match=r'Scheduled crash'):\n        await pool.run()\n\n    assert done_count < 20\n\n\nasync def test_propagates_exceptions_after_finished(system_status: SystemStatus | Mock) -> None:\n    started_count = 0\n\n    async def run() -> None:\n        nonlocal started_count\n        started_count += 1\n\n        await asyncio.sleep(1)\n\n        raise RuntimeError('Scheduled crash')\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: future(started_count > 0),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=1,\n            desired_concurrency=1,\n            max_concurrency=1,\n        ),\n    )\n\n    with 
pytest.raises(RuntimeError, match=r'Scheduled crash'):\n        await pool.run()\n\n\n@pytest.mark.flaky(\n    rerun=3,\n    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1655.',\n)\nasync def test_autoscales(\n    monkeypatch: pytest.MonkeyPatch,\n    system_status: SystemStatus | Mock,\n) -> None:\n    done_count = 0\n\n    async def run() -> None:\n        await asyncio.sleep(0.1)\n        nonlocal done_count\n        done_count += 1\n\n    start = datetime.now(timezone.utc)\n\n    def get_historical_system_info() -> SystemInfo:\n        result = SystemInfo(\n            cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n        )\n\n        # 0.5 seconds after the start of the test, pretend the CPU became overloaded\n        if result.created_at - start >= timedelta(seconds=0.5):\n            result.cpu_info = LoadRatioInfo(limit_ratio=0.9, actual_ratio=1.0)\n\n        return result\n\n    cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info\n\n    # Override AP class attributes using monkeypatch.\n    monkeypatch.setattr(AutoscaledPool, '_AUTOSCALE_INTERVAL', timedelta(seconds=0.1))\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: future(False),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=1,\n            desired_concurrency=1,\n            max_concurrency=4,\n        ),\n    )\n\n    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')\n\n    try:\n        # After 0.2s, there should be an increase in concurrency\n        await 
asyncio.sleep(0.2)\n        assert pool.desired_concurrency > 1\n\n        # After 0.5s, the concurrency should reach max concurrency\n        await asyncio.sleep(0.3)\n        assert pool.desired_concurrency == 4\n\n        # The concurrency should guarantee completion of more than 10 tasks (a single worker would complete ~5)\n        assert done_count > 10\n\n        # After 0.7s, the pretend overload should have kicked in and there should be a drop in desired concurrency\n        await asyncio.sleep(0.2)\n        assert pool.desired_concurrency < 4\n\n        # After a full second, the pool should scale down all the way to 1\n        await asyncio.sleep(0.3)\n        assert pool.desired_concurrency == 1\n    finally:\n        pool_run_task.cancel()\n        with suppress(asyncio.CancelledError):\n            await pool_run_task\n\n\nasync def test_autoscales_uses_desired_concurrency_ratio(\n    monkeypatch: pytest.MonkeyPatch,\n    system_status: SystemStatus | Mock,\n) -> None:\n    \"\"\"Test that desired concurrency ratio can limit desired concurrency.\n\n    This test creates situation where only one task is ready and then no other task is ever ready.\n    This creates situation where the system could scale up desired concurrency, but it will not do so because\n    desired_concurrency_ratio=1 means that first the system would have to increase current concurrency to same number as\n    desired concurrency and due to no other task ever being ready, it will never happen. Thus desired concurrency will\n    stay 2 as was the initial setup, even though other conditions would allow the increase. 
(max_concurrency=4,\n    system being idle).\n    \"\"\"\n\n    async def run() -> None:\n        await asyncio.sleep(0.1)\n\n    is_task_ready_iterator = chain([future(True)], repeat(future(False)))\n\n    def is_task_ready_function() -> Awaitable[bool]:\n        return next(is_task_ready_iterator)\n\n    def get_historical_system_info() -> SystemInfo:\n        return SystemInfo(\n            cpu_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            memory_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            event_loop_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n            client_info=LoadRatioInfo(limit_ratio=0.9, actual_ratio=0.3),\n        )\n\n    cast('Mock', system_status.get_historical_system_info).side_effect = get_historical_system_info\n\n    # Override AP class attributes using monkeypatch.\n    monkeypatch.setattr(AutoscaledPool, '_AUTOSCALE_INTERVAL', timedelta(seconds=0.1))\n    monkeypatch.setattr(AutoscaledPool, '_DESIRED_CONCURRENCY_RATIO', 1)\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=is_task_ready_function,\n        is_finished_function=lambda: future(False),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=2,\n            desired_concurrency=2,\n            max_concurrency=4,\n        ),\n    )\n\n    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')\n    try:\n        for _ in range(5):\n            assert pool.desired_concurrency == 2\n            await asyncio.sleep(0.1)\n\n    finally:\n        pool_run_task.cancel()\n        with suppress(asyncio.CancelledError):\n            await pool_run_task\n\n\nasync def test_max_tasks_per_minute_works(system_status: SystemStatus | Mock) -> None:\n    done_count = 0\n\n    async def run() -> None:\n        await asyncio.sleep(0.1)\n        nonlocal done_count\n        done_count += 1\n\n    pool = AutoscaledPool(\n        
system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(True),\n        is_finished_function=lambda: future(False),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=1,\n            desired_concurrency=1,\n            max_concurrency=1,\n            max_tasks_per_minute=120,\n        ),\n    )\n\n    pool_run_task = asyncio.create_task(pool.run(), name='pool run task')\n    try:\n        await asyncio.sleep(0.5)\n        assert done_count <= 1\n    finally:\n        pool_run_task.cancel()\n        with suppress(asyncio.CancelledError):\n            await pool_run_task\n\n\nasync def test_allows_multiple_run_calls(system_status: SystemStatus | Mock) -> None:\n    done_count = 0\n\n    async def run() -> None:\n        nonlocal done_count\n        done_count += 1\n        await asyncio.sleep(0.1)\n\n    pool = AutoscaledPool(\n        system_status=system_status,\n        run_task_function=run,\n        is_task_ready_function=lambda: future(done_count < 4),\n        is_finished_function=lambda: future(done_count >= 4),\n        concurrency_settings=ConcurrencySettings(\n            min_concurrency=4,\n            desired_concurrency=4,\n            max_concurrency=4,\n        ),\n    )\n\n    await pool.run()\n    assert done_count == 4\n\n    done_count = 0\n\n    await pool.run()\n    assert done_count == 4\n"
  },
  {
    "path": "tests/unit/_autoscaling/test_snapshotter.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport time\nfrom bisect import insort\nfrom datetime import datetime, timedelta, timezone\nfrom logging import getLogger\nfrom math import floor\nfrom typing import TYPE_CHECKING, Any, cast\nfrom unittest import mock\nfrom unittest.mock import MagicMock\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee._autoscaling import Snapshotter\nfrom crawlee._autoscaling._types import (\n    SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD,\n    ClientSnapshot,\n    CpuSnapshot,\n    MemorySnapshot,\n)\nfrom crawlee._autoscaling.snapshotter import SortedSnapshotList\nfrom crawlee._utils.byte_size import ByteSize\nfrom crawlee._utils.system import CpuInfo, MemoryInfo, get_memory_info\nfrom crawlee.configuration import Configuration\nfrom crawlee.events import LocalEventManager\nfrom crawlee.events._types import Event, EventSystemInfoData\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n\n@pytest.fixture\nasync def event_manager() -> AsyncGenerator[LocalEventManager, None]:\n    # Use a long interval to avoid interference from periodic system info events during tests and ensure the first\n    # automatic event is consumed before yielding.\n\n    event_manager = LocalEventManager(system_info_interval=timedelta(hours=9999))\n\n    initial_system_info_consumed = asyncio.Event()\n\n    async def consume_automatic_system_info(_: EventSystemInfoData) -> None:\n        initial_system_info_consumed.set()\n\n    event_manager.on(event=Event.SYSTEM_INFO, listener=consume_automatic_system_info)\n\n    async with event_manager:\n        await initial_system_info_consumed.wait()\n        event_manager.off(event=Event.SYSTEM_INFO, listener=consume_automatic_system_info)\n\n        yield event_manager\n\n\n@pytest.fixture\nasync def snapshotter(event_manager: LocalEventManager) -> AsyncGenerator[Snapshotter, None]:\n    config = Configuration(available_memory_ratio=0.25)\n    
service_locator.set_event_manager(event_manager)\n    async with Snapshotter.from_config(config) as snapshotter:\n        yield snapshotter\n\n\n@pytest.fixture\ndef default_cpu_info() -> CpuInfo:\n    return CpuInfo(used_ratio=0.5)\n\n\n@pytest.fixture\ndef default_memory_info() -> MemoryInfo:\n    return MemoryInfo(\n        total_size=ByteSize.from_gb(8),\n        current_size=ByteSize.from_gb(4),\n        system_wide_used_size=ByteSize.from_gb(5),\n    )\n\n\n@pytest.fixture\ndef event_system_data_info(default_cpu_info: CpuInfo, default_memory_info: MemoryInfo) -> EventSystemInfoData:\n    return EventSystemInfoData(\n        cpu_info=default_cpu_info,\n        memory_info=default_memory_info,\n    )\n\n\nasync def test_start_stop_lifecycle() -> None:\n    config = Configuration(available_memory_ratio=0.25)\n\n    async with Snapshotter.from_config(config):\n        pass\n\n\nasync def test_snapshot_cpu(\n    snapshotter: Snapshotter, event_system_data_info: EventSystemInfoData, event_manager: LocalEventManager\n) -> None:\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_data_info)\n    await event_manager.wait_for_all_listeners_to_complete()\n    cpu_snapshots = cast('list[CpuSnapshot]', snapshotter.get_cpu_sample())\n    assert len(cpu_snapshots) == 1\n    assert cpu_snapshots[0].used_ratio == event_system_data_info.cpu_info.used_ratio\n\n\nasync def test_snapshot_memory(\n    snapshotter: Snapshotter, event_system_data_info: EventSystemInfoData, event_manager: LocalEventManager\n) -> None:\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_data_info)\n    await event_manager.wait_for_all_listeners_to_complete()\n    memory_snapshots = cast('list[MemorySnapshot]', snapshotter.get_memory_sample())\n    assert len(memory_snapshots) == 1\n    assert memory_snapshots[0].current_size == event_system_data_info.memory_info.current_size\n\n\nasync def test_snapshot_memory_with_memory_info_sets_system_wide_fields(\n    
snapshotter: Snapshotter, event_manager: LocalEventManager\n) -> None:\n    memory_info = MemoryInfo(\n        total_size=ByteSize.from_gb(16),\n        current_size=ByteSize.from_gb(4),\n        system_wide_used_size=ByteSize.from_gb(12),\n    )\n\n    event_data = EventSystemInfoData(\n        cpu_info=CpuInfo(used_ratio=0.5),\n        memory_info=memory_info,\n    )\n\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n    await event_manager.wait_for_all_listeners_to_complete()\n\n    memory_snapshots = cast('list[MemorySnapshot]', snapshotter.get_memory_sample())\n\n    assert len(memory_snapshots) == 1\n    memory_snapshot = memory_snapshots[0]\n\n    # Test that system-wide fields are properly set\n    assert memory_snapshot.system_wide_used_size == memory_info.system_wide_used_size\n    assert memory_snapshot.system_wide_memory_size == memory_info.total_size\n\n\ndef test_snapshot_event_loop(snapshotter: Snapshotter) -> None:\n    # A first event loop snapshot is created when an instance is created.\n    event_loop_snapshots = snapshotter.get_event_loop_sample()\n    assert len(event_loop_snapshots) == 1\n\n\ndef test_snapshot_client(snapshotter: Snapshotter) -> None:\n    # A first client snapshot is created when an instance is created.\n    client_snapshots = snapshotter.get_client_sample()\n    assert len(client_snapshots) == 1\n\n\ndef test_snapshot_client_overloaded() -> None:\n    assert not ClientSnapshot(error_count=1, new_error_count=1, max_error_count=2).is_overloaded\n    assert not ClientSnapshot(error_count=2, new_error_count=1, max_error_count=2).is_overloaded\n    assert not ClientSnapshot(error_count=4, new_error_count=2, max_error_count=2).is_overloaded\n    assert ClientSnapshot(error_count=7, new_error_count=3, max_error_count=2).is_overloaded\n\n\n@pytest.mark.run_alone\nasync def test_get_cpu_sample(\n    snapshotter: Snapshotter, event_manager: LocalEventManager, default_memory_info: MemoryInfo\n) -> None:\n    now 
= datetime.now(timezone.utc)\n    snapshotter._SNAPSHOT_HISTORY = timedelta(hours=10)  # Extend history for testing\n\n    events_data = [\n        EventSystemInfoData(\n            cpu_info=CpuInfo(\n                used_ratio=0.5,\n                created_at=now - timedelta(hours=delta),\n            ),\n            memory_info=default_memory_info,\n        )\n        for delta in range(5, 0, -1)\n    ]\n    for event_data in events_data:\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n    await event_manager.wait_for_all_listeners_to_complete()\n\n    # When no sample duration is provided it should return all snapshots\n    samples = snapshotter.get_cpu_sample()\n    assert len(samples) == len(events_data)\n\n    duration = timedelta(hours=0.5)\n    samples = snapshotter.get_cpu_sample(duration)\n    assert len(samples) == 1\n\n    duration = timedelta(hours=2.5)\n    samples = snapshotter.get_cpu_sample(duration)\n    assert len(samples) == 3\n\n    duration = timedelta(hours=10)\n    samples = snapshotter.get_cpu_sample(duration)\n    assert len(samples) == len(events_data)\n\n\nasync def test_methods_raise_error_when_not_active() -> None:\n    snapshotter = Snapshotter.from_config(Configuration(available_memory_ratio=0.25))\n    assert snapshotter.active is False\n\n    with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'):\n        snapshotter.get_cpu_sample()\n\n    with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'):\n        snapshotter.get_memory_sample()\n\n    with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'):\n        snapshotter.get_event_loop_sample()\n\n    with pytest.raises(RuntimeError, match=r'Snapshotter is not active.'):\n        snapshotter.get_client_sample()\n\n    with pytest.raises(RuntimeError, match=r'Snapshotter is already active.'):\n        async with snapshotter, snapshotter:\n            pass\n\n    async with snapshotter:\n        
snapshotter.get_cpu_sample()\n        snapshotter.get_memory_sample()\n        snapshotter.get_event_loop_sample()\n        snapshotter.get_client_sample()\n\n        assert snapshotter.active is True\n\n\nasync def test_snapshot_pruning_removes_outdated_records(\n    snapshotter: Snapshotter, event_manager: LocalEventManager, default_memory_info: MemoryInfo\n) -> None:\n    # Set the snapshot history to 2 hours\n    snapshotter._SNAPSHOT_HISTORY = timedelta(hours=2)\n\n    # Create timestamps for testing\n    now = datetime.now(timezone.utc)\n\n    def randomly_delayed_insort(*args: Any, **kwargs: Any) -> None:\n        \"\"\"Sort with injected delay to provoke otherwise hard to reproduce race condition.\"\"\"\n        time.sleep(0.05)\n        return insort(*args, **kwargs)\n\n    with mock.patch('crawlee._autoscaling.snapshotter.insort', side_effect=randomly_delayed_insort):\n        events_data = [\n            EventSystemInfoData(\n                cpu_info=CpuInfo(used_ratio=0.5, created_at=now - timedelta(hours=delta)),\n                memory_info=default_memory_info,\n            )\n            for delta in [0, 3, 2, 5]  # Out of order timestamps. 
Snapshotter can not rely on natural ordering.\n        ]\n\n        for event_data in events_data:\n            event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n        await event_manager.wait_for_all_listeners_to_complete()\n\n    cpu_snapshots = cast('list[CpuSnapshot]', snapshotter.get_cpu_sample())\n\n    # Check that only the last two snapshots remain\n    assert len(cpu_snapshots) == 2\n    assert cpu_snapshots[0].created_at == now - timedelta(hours=2)\n    assert cpu_snapshots[1].created_at == now\n\n\nasync def test_memory_load_evaluation_logs_warning_on_high_usage(\n    caplog: pytest.LogCaptureFixture,\n    event_manager: LocalEventManager,\n    default_cpu_info: CpuInfo,\n) -> None:\n    config = Configuration(memory_mbytes=8192)\n\n    service_locator.set_event_manager(event_manager)\n    snapshotter = Snapshotter.from_config(config)\n\n    high_memory_usage = ByteSize.from_gb(8) * 0.95  # 95% of 8 GB\n\n    event_data = EventSystemInfoData(\n        cpu_info=default_cpu_info,\n        memory_info=MemoryInfo(\n            total_size=ByteSize.from_gb(8),\n            current_size=high_memory_usage,\n            system_wide_used_size=ByteSize.from_gb(7),\n        ),\n    )\n\n    async with snapshotter:\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        # Filter log records to only include those from snapshotter\n        log_records = [record for record in caplog.records if 'snapshotter' in record.pathname.lower()]\n\n        assert len(log_records) == 1\n        assert log_records[0].levelname.lower() == 'warning'\n        assert 'Memory is critically overloaded' in log_records[0].msg\n\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        log_records = [record for record in caplog.records if 'snapshotter' in record.pathname.lower()]\n\n 
       assert len(log_records) == 1\n\n\nasync def test_memory_load_evaluation_silent_on_acceptable_usage(\n    monkeypatch: pytest.MonkeyPatch,\n    event_manager: LocalEventManager,\n    default_cpu_info: CpuInfo,\n) -> None:\n    mock_logger_warn = MagicMock()\n    monkeypatch.setattr(getLogger('crawlee.autoscaling.snapshotter'), 'warning', mock_logger_warn)\n\n    service_locator.set_event_manager(event_manager)\n    snapshotter = Snapshotter.from_config(Configuration(memory_mbytes=8192))\n\n    low_memory_usage = ByteSize.from_gb(8) * 0.8  # 80% of 8 GB\n\n    event_data = EventSystemInfoData(\n        cpu_info=default_cpu_info,\n        memory_info=MemoryInfo(\n            total_size=ByteSize.from_gb(8),\n            current_size=low_memory_usage,\n            system_wide_used_size=ByteSize.from_gb(7),\n        ),\n    )\n\n    async with snapshotter:\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_data)\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        assert mock_logger_warn.call_count == 0\n\n\nasync def test_snapshots_time_ordered(snapshotter: Snapshotter, event_manager: LocalEventManager) -> None:\n    # All internal snapshot list should be ordered by creation time in ascending order.\n    # Scenario where older emitted event arrives after newer event.\n    # Snapshotter should not trust the event order and check events' times.\n    time_new = datetime.now(tz=timezone.utc)\n    time_old = datetime.now(tz=timezone.utc) - timedelta(milliseconds=50)\n\n    def create_event_data(creation_time: datetime) -> EventSystemInfoData:\n        return EventSystemInfoData(\n            cpu_info=CpuInfo(used_ratio=0.5, created_at=creation_time),\n            memory_info=MemoryInfo(\n                current_size=ByteSize(bytes=1),\n                created_at=creation_time,\n                total_size=ByteSize(bytes=2),\n                system_wide_used_size=ByteSize.from_gb(5),\n            ),\n        )\n\n    
event_manager.emit(event=Event.SYSTEM_INFO, event_data=create_event_data(time_new))\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=create_event_data(time_old))\n    await event_manager.wait_for_all_listeners_to_complete()\n\n    memory_samples = snapshotter.get_memory_sample()\n    cpu_samples = snapshotter.get_cpu_sample()\n    assert memory_samples[0].created_at == time_old\n    assert cpu_samples[0].created_at == time_old\n    assert memory_samples[1].created_at == time_new\n    assert cpu_samples[1].created_at == time_new\n\n\ndef test_sorted_snapshot_list_add_maintains_order() -> None:\n    \"\"\"Test that SortedSnapshotList.add method maintains sorted order by created_at with multiple items.\"\"\"\n    sorted_list = SortedSnapshotList[CpuSnapshot]()\n\n    # Create snapshots with different timestamps (more items to test binary search better)\n    now = datetime.now(timezone.utc)\n    snapshots = [\n        CpuSnapshot(used_ratio=0.1, max_used_ratio=0.95, created_at=now - timedelta(seconds=50)),  # oldest\n        CpuSnapshot(used_ratio=0.2, max_used_ratio=0.95, created_at=now - timedelta(seconds=40)),\n        CpuSnapshot(used_ratio=0.3, max_used_ratio=0.95, created_at=now - timedelta(seconds=30)),\n        CpuSnapshot(used_ratio=0.4, max_used_ratio=0.95, created_at=now - timedelta(seconds=20)),\n        CpuSnapshot(used_ratio=0.5, max_used_ratio=0.95, created_at=now - timedelta(seconds=10)),\n        CpuSnapshot(used_ratio=0.6, max_used_ratio=0.95, created_at=now - timedelta(seconds=5)),\n        CpuSnapshot(used_ratio=0.7, max_used_ratio=0.95, created_at=now),  # newest\n    ]\n\n    # Add snapshots in random order to test binary search insertion\n    add_order = [3, 0, 5, 1, 6, 2, 4]  # indices in random order\n    for i in add_order:\n        sorted_list.add(snapshots[i])\n\n    # Verify the list is sorted by created_at (should be in original order)\n    assert len(sorted_list) == 7\n    for i, snapshot in enumerate(sorted_list):\n        
assert snapshot == snapshots[i], f'Item at index {i} is not correctly sorted'\n        if i > 0:\n            prev_time = sorted_list[i - 1].created_at\n            curr_time = snapshot.created_at\n            assert prev_time <= curr_time, f'Items at indices {i - 1} and {i} are not in chronological order'\n\n\n@pytest.mark.parametrize('dynamic_memory', [True, False])\nasync def test_dynamic_memory(\n    *,\n    default_cpu_info: CpuInfo,\n    event_manager: LocalEventManager,\n    dynamic_memory: bool,\n) -> None:\n    \"\"\"Test dynamic memory scaling scenario where the system-wide memory can change.\n\n    Create two memory snapshots. They have same memory usage, but different available memory.\n    First snapshot is created with insufficient memory, so it is overloaded.\n    Second snapshot is created with sufficient memory.\n\n    Based on the Snapshotter configuration, it will either take into account the increased available memory or not.\n    \"\"\"\n    _initial_memory_info = get_memory_info()\n    ratio_just_below_system_wide_overload = 0.99 * SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD\n\n    memory_mbytes = 0 if dynamic_memory else floor(_initial_memory_info.total_size.to_mb())\n\n    service_locator.set_event_manager(event_manager)\n\n    async with Snapshotter.from_config(\n        Configuration(memory_mbytes=memory_mbytes, available_memory_ratio=ratio_just_below_system_wide_overload)\n    ) as snapshotter:\n        # Default state, memory usage exactly at the overload threshold -> overloaded, but not system-wide overloaded\n        memory_infos = [\n            # Overloaded sample\n            MemoryInfo(\n                total_size=_initial_memory_info.total_size,\n                current_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload,\n                system_wide_used_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload,\n            ),\n            # Same as first sample, with twice as memory 
available in the system\n            MemoryInfo(\n                total_size=_initial_memory_info.total_size * 2,  # Simulate increased total memory\n                current_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload,\n                system_wide_used_size=_initial_memory_info.total_size * ratio_just_below_system_wide_overload,\n            ),\n        ]\n\n        for memory_info in memory_infos:\n            event_manager.emit(\n                event=Event.SYSTEM_INFO,\n                event_data=EventSystemInfoData(\n                    cpu_info=default_cpu_info,\n                    memory_info=memory_info,\n                ),\n            )\n\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        memory_samples = snapshotter.get_memory_sample()\n        assert len(memory_samples) == 2\n        # First sample will be overloaded.\n        assert memory_samples[0].is_overloaded\n        # Second sample can reflect the increased available memory based on the configuration used to create Snapshotter\n        assert memory_samples[1].is_overloaded == (not dynamic_memory)\n"
  },
  {
    "path": "tests/unit/_autoscaling/test_system_status.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timedelta, timezone\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee._autoscaling import Snapshotter, SystemStatus\nfrom crawlee._autoscaling._types import (\n    ClientSnapshot,\n    CpuSnapshot,\n    EventLoopSnapshot,\n    LoadRatioInfo,\n    MemorySnapshot,\n    SystemInfo,\n)\nfrom crawlee._utils.byte_size import ByteSize\nfrom crawlee.configuration import Configuration\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n\n@pytest.fixture\nasync def snapshotter() -> AsyncGenerator[Snapshotter, None]:\n    config = Configuration(available_memory_ratio=0.25)\n    async with Snapshotter.from_config(config) as snapshotter:\n        yield snapshotter\n\n\n@pytest.fixture\ndef now() -> datetime:\n    return datetime.now(timezone.utc)\n\n\nasync def test_start_stop_lifecycle() -> None:\n    config = Configuration(available_memory_ratio=0.25)\n\n    async with Snapshotter.from_config(config) as snapshotter:\n        system_status = SystemStatus(snapshotter)\n        system_status.get_current_system_info()\n        system_status.get_historical_system_info()\n\n\ndef test_cpu_is_overloaded(snapshotter: Snapshotter, now: datetime) -> None:\n    system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5)\n    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)),\n            CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)),\n            CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)),\n            CpuSnapshot(used_ratio=0.9, max_used_ratio=0.75, created_at=now),\n        ]\n    )\n    cpu_info = system_status._is_cpu_overloaded()\n\n    assert cpu_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.667)\n    assert 
cpu_info.is_overloaded is True\n\n\ndef test_cpu_is_not_overloaded(snapshotter: Snapshotter, now: datetime) -> None:\n    system_status = SystemStatus(snapshotter, cpu_overload_threshold=0.5)\n    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)),\n            CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)),\n            CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)),\n            CpuSnapshot(used_ratio=0.5, max_used_ratio=0.75, created_at=now),\n        ]\n    )\n    cpu_info = system_status._is_cpu_overloaded()\n\n    assert cpu_info == LoadRatioInfo(limit_ratio=0.5, actual_ratio=0.333)\n    assert cpu_info.is_overloaded is False\n\n\ndef test_get_system_info(snapshotter: Snapshotter, now: datetime) -> None:\n    system_status = SystemStatus(\n        snapshotter,\n        max_snapshot_age=timedelta(minutes=1),\n        cpu_overload_threshold=0.5,\n        memory_overload_threshold=0.5,\n        event_loop_overload_threshold=0.5,\n        client_overload_threshold=0.5,\n    )\n\n    # Add CPU snapshots\n    system_status._snapshotter._cpu_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            CpuSnapshot(used_ratio=0.6, max_used_ratio=0.75, created_at=now - timedelta(minutes=3)),\n            CpuSnapshot(used_ratio=0.7, max_used_ratio=0.75, created_at=now - timedelta(minutes=2)),\n            CpuSnapshot(used_ratio=0.8, max_used_ratio=0.75, created_at=now - timedelta(minutes=1)),\n            CpuSnapshot(used_ratio=0.9, max_used_ratio=0.75, created_at=now),\n        ]\n    )\n\n    # Add memory snapshots\n    system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(4),\n                
max_memory_size=ByteSize.from_gb(12),\n                max_used_memory_ratio=0.8,\n                created_at=now - timedelta(seconds=90),\n                system_wide_used_size=None,\n                system_wide_memory_size=None,\n            ),\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(7),\n                max_memory_size=ByteSize.from_gb(8),\n                max_used_memory_ratio=0.8,\n                created_at=now - timedelta(seconds=60),\n                system_wide_used_size=None,\n                system_wide_memory_size=None,\n            ),\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(28),\n                max_memory_size=ByteSize.from_gb(30),\n                max_used_memory_ratio=0.8,\n                created_at=now - timedelta(seconds=30),\n                system_wide_used_size=None,\n                system_wide_memory_size=None,\n            ),\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(48),\n                max_memory_size=ByteSize.from_gb(60),\n                max_used_memory_ratio=0.8,\n                created_at=now,\n                system_wide_used_size=None,\n                system_wide_memory_size=None,\n            ),\n        ]\n    )\n\n    # Add event loop snapshots\n    system_status._snapshotter._event_loop_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            EventLoopSnapshot(\n                delay=timedelta(milliseconds=700),\n                max_delay=timedelta(milliseconds=500),\n                created_at=now - timedelta(minutes=3),\n            ),\n            EventLoopSnapshot(\n                delay=timedelta(milliseconds=600),\n                max_delay=timedelta(milliseconds=500),\n                created_at=now - timedelta(minutes=2),\n            ),\n            EventLoopSnapshot(\n                delay=timedelta(milliseconds=200),\n                max_delay=timedelta(milliseconds=500),\n                
created_at=now - timedelta(minutes=1),\n            ),\n            EventLoopSnapshot(\n                delay=timedelta(milliseconds=100),\n                max_delay=timedelta(milliseconds=500),\n                created_at=now,\n            ),\n        ]\n    )\n\n    # Add client snapshots\n    system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            ClientSnapshot(error_count=1, new_error_count=1, max_error_count=2, created_at=now - timedelta(minutes=3)),\n            ClientSnapshot(error_count=2, new_error_count=1, max_error_count=2, created_at=now - timedelta(minutes=2)),\n            ClientSnapshot(error_count=4, new_error_count=2, max_error_count=2, created_at=now - timedelta(minutes=1)),\n            ClientSnapshot(error_count=4, new_error_count=0, max_error_count=2, created_at=now),\n        ]\n    )\n\n    # Test current system info\n    current_system_info = system_status.get_current_system_info()\n    assert current_system_info == SystemInfo(\n        cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=1.0),\n        memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.5),\n        event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0),\n        client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0),\n        created_at=current_system_info.created_at,\n    )\n    assert current_system_info.is_system_idle is False\n\n    # Test historical system info\n    historical_system_info = system_status.get_historical_system_info()\n    assert historical_system_info == SystemInfo(\n        cpu_info=LoadRatioInfo(limit_ratio=system_status._cpu_overload_threshold, actual_ratio=0.667),\n        memory_info=LoadRatioInfo(limit_ratio=system_status._memory_overload_threshold, actual_ratio=0.667),\n        
event_loop_info=LoadRatioInfo(limit_ratio=system_status._event_loop_overload_threshold, actual_ratio=0.333),\n        client_info=LoadRatioInfo(limit_ratio=system_status._client_overload_threshold, actual_ratio=0),\n        created_at=historical_system_info.created_at,\n    )\n    assert historical_system_info.is_system_idle is False\n\n\n@pytest.mark.parametrize(('client_overload_threshold', 'is_overloaded'), [(0.66, True), (0.67, False)])\ndef test_client_overloaded(\n    *, snapshotter: Snapshotter, now: datetime, client_overload_threshold: float, is_overloaded: bool\n) -> None:\n    system_status = SystemStatus(\n        snapshotter,\n        max_snapshot_age=timedelta(minutes=1),\n        client_overload_threshold=client_overload_threshold,\n    )\n\n    system_status._snapshotter._client_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            ClientSnapshot(error_count=1, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=3)),\n            ClientSnapshot(error_count=2, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=2)),\n            ClientSnapshot(error_count=3, new_error_count=1, max_error_count=0, created_at=now - timedelta(minutes=1)),\n            ClientSnapshot(error_count=3, new_error_count=0, max_error_count=0, created_at=now),\n        ]\n    )\n\n    # Ratio of overloaded snapshots is 2/3 (2 minutes out of 3)\n    assert system_status._is_client_overloaded().is_overloaded == is_overloaded\n\n\ndef test_memory_overloaded_system_wide(snapshotter: Snapshotter, now: datetime) -> None:\n    \"\"\"Test that system-wide memory overload is detected when system-wide memory utilization exceeds threshold.\"\"\"\n    system_status = SystemStatus(\n        snapshotter,\n        max_snapshot_age=timedelta(minutes=1),\n        memory_overload_threshold=0.5,  # Set high threshold so process memory won't trigger overload\n    )\n\n    # Add memory snapshots with system-wide memory usage above 
threshold (97%)\n    system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at(\n        [\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(1),  # Process memory is low\n                max_memory_size=ByteSize.from_gb(8),  # Max memory is high\n                max_used_memory_ratio=0.8,  # Ratio is fine\n                created_at=now - timedelta(minutes=1),\n                system_wide_used_size=ByteSize.from_gb(31),  # System-wide used is high\n                system_wide_memory_size=ByteSize.from_gb(32),  # System-wide total (31/32 = 96.875% < 97%)\n            ),\n            MemorySnapshot(\n                current_size=ByteSize.from_gb(1),  # Process memory is low\n                max_memory_size=ByteSize.from_gb(8),  # Max memory is high\n                max_used_memory_ratio=0.8,  # Ratio is fine\n                created_at=now,\n                system_wide_used_size=ByteSize.from_gb(31.5),  # System-wide used is high\n                system_wide_memory_size=ByteSize.from_gb(32),  # System-wide total (31.5/32 = 98.4% > 97%)\n            ),\n        ]\n    )\n\n    memory_info = system_status._is_memory_overloaded()\n\n    # Should be overloaded due to system-wide memory usage exceeding 97% threshold\n    assert memory_info.is_overloaded is True\n    # The actual ratio should be 1.0 (the entire time period from first to second snapshot is overloaded)\n    assert memory_info.actual_ratio == 1.0\n    assert memory_info.limit_ratio == 0.5\n"
  },
  {
    "path": "tests/unit/_statistics/test_error_tracker.py",
    "content": "import traceback\n\nimport pytest\n\nfrom crawlee.statistics._error_tracker import ErrorTracker\n\n\n@pytest.mark.parametrize(\n    ('error_tracker', 'expected_unique_errors'),\n    [\n        (ErrorTracker(), 5),\n        (ErrorTracker(show_file_and_line_number=False), 4),\n        (ErrorTracker(show_error_name=False), 4),\n        (ErrorTracker(show_error_message=False), 3),\n        (ErrorTracker(show_error_name=False, show_file_and_line_number=False), 3),\n        (ErrorTracker(show_file_and_line_number=False, show_error_message=False), 2),\n        (ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_error_message=False), 1),\n    ],\n)\nasync def test_error_tracker_counts(error_tracker: ErrorTracker, expected_unique_errors: int) -> None:\n    \"\"\"Use different settings of `error_tracker` and test unique errors count.\"\"\"\n\n    for error in [\n        Exception('Some value error abc'),\n        ValueError('Some value error abc'),  # Different type, different error\n        ValueError('Some value error cde'),  # Same type and similar message to previous, considered the same.\n        ValueError(\n            'Another value error efg'\n        ),  # Same type, but too different message to previous, considered different.\n        ValueError(),  # Same type but don't have message, considered different.\n    ]:\n        try:\n            raise error  # Errors raised on same line\n        except Exception as e:  # noqa:PERF203\n            await error_tracker.add(e)\n\n    try:\n        raise ValueError('Some value error abc')  # Same as one previous error, but different line.\n    except Exception as e:\n        await error_tracker.add(e)\n\n    assert error_tracker.total == 6\n    assert error_tracker.unique_error_count == expected_unique_errors\n\n\n@pytest.mark.parametrize(\n    ('message_1', 'message_2', 'expected_generic_message'),\n    [\n        ('Some error number 123', 'Some error number 456', 'Some error number 
***'),\n        ('Some error number 123 456', 'Some error number 123 456 789', 'Some error number 123 456 ***'),\n        ('Some error number 0 0 0', 'Some error number 1 0 1', 'Some error number *** 0 ***'),\n    ],\n)\nasync def test_error_tracker_similar_messages_full_stack(\n    message_1: str, message_2: str, expected_generic_message: str\n) -> None:\n    \"\"\"Test that similar messages collapse into same group with generic name that contains wildcard symbols.\"\"\"\n    error_tracker = ErrorTracker()\n    for error in [\n        KeyError(message_1),\n        KeyError(message_1),\n        KeyError(message_1),\n        ValueError(message_1),\n        ValueError(message_2),\n        RuntimeError(message_2),\n    ]:\n        try:\n            raise error  # Errors raised on the same line\n        except Exception as e:  # noqa:PERF203\n            await error_tracker.add(e)\n            line = traceback.extract_tb(e.__traceback__)[0].lineno\n\n    file_name = __file__.split('/')[-1]\n    errors = error_tracker.get_most_common_errors()\n    assert errors[0][0] == f'{file_name}:{line}:KeyError:{message_1}'\n    assert errors[0][1] == 3\n    assert errors[1][0] == f'{file_name}:{line}:ValueError:{expected_generic_message}'\n    assert errors[1][1] == 2\n    assert errors[2][0] == f'{file_name}:{line}:RuntimeError:{message_2}'\n    assert errors[2][1] == 1\n\n\n@pytest.mark.parametrize(\n    ('show_full_message', 'expected_message'),\n    [\n        (True, 'Error line 1\\n Error line 2'),\n        (False, 'Error line 1'),\n    ],\n)\nasync def test_show_full_message(*, show_full_message: bool, expected_message: str) -> None:\n    \"\"\"Test error message settings with both options of `show_full_message`.\"\"\"\n    error_tracker = ErrorTracker(\n        show_error_name=False, show_file_and_line_number=False, show_full_message=show_full_message\n    )\n\n    try:\n        raise RuntimeError('Error line 1\\n Error line 2')  # Errors raised on the same line\n    except 
Exception as e:\n        await error_tracker.add(e)\n\n    assert error_tracker.get_most_common_errors()[0][0] == expected_message\n\n\nasync def test_error_tracker_with_errors_chain() -> None:\n    \"\"\"Test error tracker with errors chain.\"\"\"\n    error_tracker = ErrorTracker(show_error_name=False, show_file_and_line_number=False, show_full_message=True)\n\n    try:\n        raise ZeroDivisionError('Zero division error')  # Errors raised on the same line\n    except Exception as e:\n        try:\n            raise ValueError from e\n        except Exception as e:\n            await error_tracker.add(e)\n\n    assert error_tracker.get_most_common_errors()[0][0] == 'Zero division error'\n"
  },
  {
    "path": "tests/unit/_statistics/test_periodic_logging.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\n\nfrom crawlee.statistics import Statistics\n\nif TYPE_CHECKING:\n    import pytest\n\n\nasync def test_periodic_logging(caplog: pytest.LogCaptureFixture) -> None:\n    caplog.set_level(logging.INFO)\n\n    log_message = 'Periodic statistics XYZ'\n    statistics = Statistics.with_default_state(log_interval=timedelta(milliseconds=50), log_message=log_message)\n\n    async with statistics:\n        await asyncio.sleep(0.1)\n\n    matching_records = [rec for rec in caplog.records if rec.message.startswith(log_message)]\n    assert len(matching_records) >= 1\n"
  },
  {
    "path": "tests/unit/_statistics/test_persistence.py",
    "content": "from __future__ import annotations\n\nfrom crawlee.statistics import Statistics\n\n\nasync def test_basic_persistence() -> None:\n    key = 'statistics_foo'\n\n    async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics:\n        statistics.state.requests_failed = 42\n\n    async with Statistics.with_default_state(persistence_enabled=True, persist_state_key=key) as statistics:\n        pass\n\n    assert statistics.state.requests_failed == 42\n"
  },
  {
    "path": "tests/unit/_statistics/test_request_max_duration.py",
    "content": "from __future__ import annotations\n\nimport asyncio\n\nfrom crawlee.statistics import Statistics\n\n\nasync def test_request_max_duration_tracks_maximum() -> None:\n    \"\"\"Test that request_max_duration correctly tracks the maximum duration, not the minimum.\"\"\"\n\n    # asyncio.sleep() can sleep slightly shorter than expected https://bugs.python.org/issue31539#msg302699\n    asyncio_sleep_time_tolerance = 0.015\n    sleep_time = 0.05\n\n    async with Statistics.with_default_state() as statistics:\n        # Record a short request\n        statistics.record_request_processing_start('request_1')\n        statistics.record_request_processing_finish('request_1')\n        first_duration = statistics.state.request_max_duration\n\n        # Record a longer request\n        statistics.record_request_processing_start('request_2')\n        await asyncio.sleep(sleep_time)  # 50ms delay\n        statistics.record_request_processing_finish('request_2')\n        second_duration = statistics.state.request_max_duration\n\n        # The max duration should be updated to the longer request's duration\n        assert second_duration is not None\n        assert first_duration is not None\n        assert second_duration >= first_duration\n        assert second_duration.total_seconds() >= (sleep_time - asyncio_sleep_time_tolerance)\n\n        # Record another short request - max should NOT decrease\n        statistics.record_request_processing_start('request_3')\n        statistics.record_request_processing_finish('request_3')\n        third_duration = statistics.state.request_max_duration\n\n        # The max duration should remain unchanged (still the longest request)\n        assert third_duration == second_duration\n"
  },
  {
    "path": "tests/unit/_statistics/test_request_processing_record.py",
    "content": "from datetime import timedelta\n\nfrom crawlee.statistics._statistics import RequestProcessingRecord\n\n\ndef test_tracking_time_resolution() -> None:\n    \"\"\"Test that `RequestProcessingRecord` tracks time with sufficient resolution.\n\n    This is generally not an issue on Linux, but on Windows some packages in older Python versions might be using system\n    timers with not so granular resolution - some sources estimate 15ms. This test will start failing on Windows\n    if unsuitable source of time measurement is selected due to two successive time measurements possibly using same\n    timing sample.\"\"\"\n    record = RequestProcessingRecord()\n    record.run()\n    record.finish()\n    assert record.duration\n    assert record.duration > timedelta(seconds=0)\n"
  },
  {
    "path": "tests/unit/_utils/test_byte_size.py",
    "content": "from __future__ import annotations\n\nimport pytest\n\nfrom crawlee._utils.byte_size import ByteSize\n\n\ndef test_initializations() -> None:\n    assert ByteSize(1024).bytes == 1024\n    assert ByteSize.from_kb(1).bytes == 1024\n    assert ByteSize.from_mb(1).bytes == 1024**2\n    assert ByteSize.from_gb(1).bytes == 1024**3\n    assert ByteSize.from_tb(1).bytes == 1024**4\n\n    with pytest.raises(ValueError, match=r'ByteSize cannot be negative'):\n        ByteSize(-1)\n\n\ndef test_conversions() -> None:\n    size = ByteSize.from_mb(2)\n    assert size.to_kb() == 2 * 1024\n    assert size.to_mb() == 2.0\n    assert size.to_gb() == 2 / 1024\n    assert size.to_tb() == 2 / (1024**2)\n\n\ndef test_string_representation() -> None:\n    assert str(ByteSize(512)) == '512 B'\n    assert str(ByteSize(2 * 1024)) == '2.00 KB'\n    assert str(ByteSize(3 * 1024**2)) == '3.00 MB'\n    assert str(ByteSize(4 * 1024**3)) == '4.00 GB'\n    assert str(ByteSize(5 * 1024**4)) == '5.00 TB'\n\n\ndef test_comparisons() -> None:\n    size1 = ByteSize(1024)\n    size2 = ByteSize(512)\n\n    assert size1 > size2\n    assert size1 >= size2\n    assert size2 < size1\n    assert size2 <= size1\n    assert size1 == ByteSize(1024)\n    assert size1 != size2\n\n\ndef test_additions() -> None:\n    # Addition of ByteSize instances\n    size1 = ByteSize(1024)\n    size2 = ByteSize(2048)\n    assert (size1 + size2).bytes == 3072\n\n    # Addition of ByteSize instance and an int\n    with pytest.raises(TypeError):\n        _ = size1 + 1024\n\n    # Addition of ByteSize instance and an float\n    with pytest.raises(TypeError):\n        _ = size2 + 123.45\n\n\ndef test_subtractions() -> None:\n    # Direct subtraction of ByteSize instances\n    size1 = ByteSize(2048)\n    size2 = ByteSize(1024)\n    assert (size1 - size2).bytes == 1024\n\n    # Subtraction resulting in a negative value raises ValueError\n    with pytest.raises(ValueError, match=r'Resulting ByteSize cannot be 
negative'):\n        _ = size2 - size1\n\n    # Subtraction of ByteSize instance and an int\n    with pytest.raises(TypeError):\n        _ = size1 - 1024\n\n    # Subtraction of ByteSize instance and a float\n    with pytest.raises(TypeError):\n        _ = size2 - 123.45\n\n\ndef test_multiplication() -> None:\n    # Multiplication of ByteSize by an int\n    size = ByteSize(1024)\n    result = size * 2\n    assert result.bytes == 2048\n\n    # Multiplication of ByteSize by a float\n    size_float = ByteSize(1024)\n    result_float = size_float * 1.5\n    assert result_float.bytes == 1536\n\n    # Test reflected multiplication\n    size_reflected = ByteSize(1024)\n    reflected_result = 3 * size_reflected\n    assert reflected_result.bytes == 3072\n\n\ndef test_divisions() -> None:\n    # Division of ByteSize by another ByteSize\n    size1 = ByteSize(2048)\n    size2 = ByteSize(1024)\n    assert (size1 / size2) == 2\n\n    # Division by zero when the divisor is a ByteSize with zero bytes\n    with pytest.raises(ZeroDivisionError):\n        _ = size1 / ByteSize(0)\n\n    # Division of ByteSize - multiplying by a float\n    assert (size1 * 0.5).bytes == 1024\n"
  },
  {
    "path": "tests/unit/_utils/test_console.py",
    "content": "from __future__ import annotations\n\nfrom crawlee._utils.console import make_table\n\n\ndef test_empty_input() -> None:\n    assert make_table([]) == ''\n\n\ndef test_empty_row() -> None:\n    assert make_table([()]) == ''\n\n\ndef test_single_column() -> None:\n    result = make_table([('test',)])\n    lines = result.split('\\n')\n    assert len(lines) == 3\n    assert lines[1] == '│ test │'\n\n\ndef test_two_columns() -> None:\n    data = [('Name', 'Age'), ('Alice', '30'), ('Bob', '25')]\n    result = make_table(data)\n    lines = result.split('\\n')\n    # fmt: off\n    assert lines == ['┌───────┬─────┐',\n                     '│ Name  │ Age │',\n                     '│ Alice │ 30  │',\n                     '│ Bob   │ 25  │',\n                     '└───────┴─────┘']\n    # fmt: on\n\n\ndef test_long_content_truncation() -> None:\n    data = [('Short', 'VeryVeryVeryLongContent')]\n    result = make_table(data, width=25)\n    lines = result.split('\\n')\n    # fmt: off\n    assert lines == ['┌───────────┬───────────┐',\n                     '│ Short     │ VeryVe... │',\n                     '└───────────┴───────────┘']\n    # fmt: on\n"
  },
  {
    "path": "tests/unit/_utils/test_crypto.py",
    "content": "from __future__ import annotations\n\nfrom crawlee._utils.crypto import compute_short_hash, crypto_random_object_id\n\n\ndef test_crypto_random_object_id_default_length() -> None:\n    object_id = crypto_random_object_id()\n    assert len(object_id) == 17, 'Default generated object ID should have a length of 17 characters.'\n\n\ndef test_crypto_random_object_id_custom_length() -> None:\n    for length in [5, 10, 20, 100]:\n        object_id = crypto_random_object_id(length)\n        assert len(object_id) == length, f'Generated object ID should have a length of {length} characters.'\n\n\ndef test_crypto_random_object_id_character_set() -> None:\n    long_random_object_id = crypto_random_object_id(1000)\n    allowed_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'\n    for char in long_random_object_id:\n        assert char in allowed_chars, f\"Character '{char}' is not in the expected alphanumeric range.\"\n\n\ndef test_compute_short_hash_with_known_input() -> None:\n    data = b'Hello world!'\n    expected_hash = 'c0535e4b'\n    assert compute_short_hash(data) == expected_hash, 'The hash does not match the expected output.'\n\n\ndef test_compute_short_hash_with_empty_input() -> None:\n    data = b''\n    expected_hash = 'e3b0c442'\n    assert compute_short_hash(data) == expected_hash, 'The hash for an empty input should follow the expected pattern.'\n\n\ndef test_compute_short_hash_output_length() -> None:\n    data = b'some random data'\n    assert len(compute_short_hash(data)) == 8, 'The output hash should be 8 characters long.'\n\n\ndef test_compute_short_hash_differentiates_input() -> None:\n    data1 = b'input 1'\n    data2 = b'input 2'\n    assert compute_short_hash(data1) != compute_short_hash(data2), 'Different inputs should produce different hashes.'\n"
  },
  {
    "path": "tests/unit/_utils/test_file.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timezone\n\nfrom crawlee._utils.file import json_dumps\n\n\nasync def test_json_dumps() -> None:\n    assert await json_dumps({'key': 'value'}) == '{\\n  \"key\": \"value\"\\n}'\n    assert await json_dumps(['one', 2, 3.0]) == '[\\n  \"one\",\\n  2,\\n  3.0\\n]'\n    assert await json_dumps('string') == '\"string\"'\n    assert await json_dumps(123) == '123'\n    assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '\"2022-01-01 00:00:00+00:00\"'\n"
  },
  {
    "path": "tests/unit/_utils/test_globs.py",
    "content": "from __future__ import annotations\n\nfrom crawlee._utils.globs import Glob\n\n\ndef test_asterisk() -> None:\n    glob = Glob('foo/*')\n    assert glob.regexp.match('bar/') is None\n    assert glob.regexp.match('foo/bar') is not None\n    assert glob.regexp.match('foo/bar/baz') is None\n\n\ndef test_double_asterisk() -> None:\n    glob = Glob('foo/**')\n    assert glob.regexp.match('bar/') is None\n    assert glob.regexp.match('foo/bar') is not None\n    assert glob.regexp.match('foo/bar/baz') is not None\n"
  },
  {
    "path": "tests/unit/_utils/test_html_to_text.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom bs4 import BeautifulSoup\nfrom parsel import Selector\n\nfrom crawlee.crawlers._beautifulsoup._utils import html_to_text as html_to_text_beautifulsoup\nfrom crawlee.crawlers._parsel._utils import html_to_text as html_to_text_parsel\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n\n_EXPECTED_TEXT = (\n    \"Let's start with a simple text. \\n\"\n    \"The ships hung in the sky, much the way that bricks don't. \\n\"\n    \"These aren't the Droids you're looking for\\n\"\n    \"I'm sorry, Dave. I'm afraid I can't do that.\\n\"\n    \"I'm sorry, Dave. I'm afraid I can't do that.\\n\"\n    'A1\\tA2\\tA3\\t\\n'\n    'B1\\tB2\\tB3\\tB 4\\t\\n'\n    'This is some text with inline elements and HTML entities (>bla<) \\n'\n    'Test\\n'\n    'a\\n'\n    'few\\n'\n    'line\\n'\n    'breaks\\n'\n    'Spaces in an inline text should be completely ignored. \\n'\n    'But,\\n'\n    '    a pre-formatted\\n'\n    '                block  should  be  kept\\n'\n    '                                       pre-formatted.\\n'\n    'The Greatest Science Fiction Quotes Of All Time \\n'\n    \"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You \"\n    'Nexus, huh? I design your eyes.'\n)\n\n_EXAMPLE_HTML = \"\"\"\n<html>\n<head>\n    <title>Title SHOULD NOT be converted</title>\n\n    <!-- Comments SHOULD NOT be converted -->\n</head>\n<body with='some attributes'>\nLet's start with a        simple text.\n<p>\n    The ships hung in the sky, much the <a class=\"click\" href=\"https://example.com/a/b/first\">way that</a> bricks don't.\n</p>\n<ul>\n    <li>These aren't the Droids you're looking for</li>\n    <li some=\"attribute\"><a href=\"https://example.com/a/second\">I'm sorry, Dave. I'm afraid I can't do that.</a></li>\n    <li><a class=\"click\" href=\"https://example.com/a/b/third\">I'm sorry, Dave. 
I'm afraid I can't do that.</a></li>\n</ul>\n\n<img src=\"something\" alt=\"This should be ignored\" />\n\n<!-- Comments SHOULD NOT be converted -->\n\n<table>\n    <tr class=\"something\">\n        <td>A1</td>\n        <td attributes=\"are ignored\">A2</td>\n        <td>A3</td>\n    </tr>\n    <tr class=\"something\">\n        <td>B1</td>\n        <td attributes=\"are ignored\" even=\"second attribute\">B2</td>\n        <td>B3</td>\n        <td>B     4</td>\n    </tr>\n</table>\n\n<p>\n    This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)\n</p>\n\n<div>\n    Test<br>\n    a<br />\n    few<br>\n    line<br>\n    breaks<br>\n</div>\n\n\n\n\n    Spaces\n\n\n    in\n\n\n    an inline text                                should be\n\n\n    completely ignored.\n\n\n\n<pre>\nBut,\n    a pre-formatted\n                block  should  be  kept\n                                       pre-formatted.\n</pre>\n\n<svg>\n    These special elements SHOULD NOT BE CONVERTED.\n</svg>\n\n<script>\n    // These special elements should be completely skipped.\n    skipThis();\n</script>\n\n<style>\n    /* These special elements should be completely skipped. */\n    .skip_this {}\n</style>\n\n<canvas>\n    This should be skipped too.\n</canvas>\n\n<a class=\"click\" href=\"https://another.com/a/fifth\">The Greatest Science Fiction Quotes Of All Time</a>\n<p>\n    Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,\n    just eyes. You Nexus, huh? 
I design your <a class=\"click\" href=\"http://cool.com/\">eyes</a>.\n</p>\n</body>\n</html>\n\"\"\"\n\n\n@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])\n@pytest.mark.parametrize(\n    ('source', 'expected_text'),\n    [\n        pytest.param(_EXAMPLE_HTML, _EXPECTED_TEXT, id='Complex html'),\n        ('   Plain    text     node    ', 'Plain text node'),\n        ('   \\nPlain    text     node  \\n  ', 'Plain text node'),\n        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\\nHeader 2'),\n        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\\nHeader 2'),\n        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\\nHeader 2'),\n        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\\nHeader 2'),\n        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\\n\\nHeader 2'),\n        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\\n\\nHeader 2'),\n        ('<h1>Header 1</h1>  \\n <br>\\n<h2>Header 2</h2><br><br><br>', 'Header 1\\n\\nHeader 2'),\n        ('<h1>Header 1</h1>  \\n <br>\\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\\n\\n\\nHeader 2'),\n        ('<h1>Header 1</h1>  \\n <br>\\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\\n\\n\\n\\nHeader 2'),\n        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\\nParagraph'),\n        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\\nDiv2'),\n        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),\n        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),\n        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),\n        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),\n        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),\n        ('<b>A  B  C  D  E\\n\\nF  G</b>', 'A B C D E F G'),\n        ('<pre>A  B  C  D  E\\n\\nF  G</pre>', 'A  B  C  D  E\\n\\nF  G'),\n        (\n            '<h1>Heading 1</h1><div><div><div><div>Deep  
Div</div></div></div></div><h2>Heading       2</h2>',\n            'Heading 1\\nDeep Div\\nHeading 2',\n        ),\n        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),\n        ('<span attributes=\"should\" be=\"ignored\">some <span>text</span></span>', 'some text'),\n        pytest.param(\n            (\n                \"\"\"<table>\n    <tr>\n        <td>Cell    A1</td><td>Cell A2</td>\n        <td>    Cell A3    </td>\n    </tr>\n    <tr>\n        <td>Cell    B1</td><td>Cell B2</td>\n    </tr>\n</table>\"\"\"\n            ),\n            'Cell A1\\tCell A2\\tCell A3 \\t\\nCell B1\\tCell B2',\n            id='Table',\n        ),\n        ('<span>&aacute; &eacute;</span>', 'á é'),\n    ],\n)\ndef test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:\n    assert html_to_text(source) == expected_text\n\n\n@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])\ndef test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:\n    with pytest.raises(TypeError):\n        # Intentional wrong type test.\n        html_to_text(1)  # ty: ignore[invalid-argument-type]\n\n\ndef test_html_to_text_parsel() -> None:\n    assert html_to_text_parsel(Selector(_EXAMPLE_HTML)) == _EXPECTED_TEXT\n\n\ndef test_html_to_text_beautifulsoup() -> None:\n    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML, features='lxml')) == _EXPECTED_TEXT\n"
  },
  {
    "path": "tests/unit/_utils/test_measure_time.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport time\n\nfrom crawlee._utils.time import measure_time\n\n\ndef test_measure_time_wall_sync() -> None:\n    with measure_time() as elapsed:\n        time.sleep(0.1)\n\n    assert elapsed.cpu is not None\n    assert elapsed.wall is not None\n    assert elapsed.wall >= 0.09\n\n\ndef test_measure_time_cpu_sync() -> None:\n    with measure_time() as elapsed:\n        start = time.time()\n        acc = 0\n\n        while time.time() - start < 0.1:\n            acc += 1\n            acc *= acc\n\n    assert elapsed.cpu is not None\n    assert elapsed.wall is not None\n    # Just verify that CPU time is measured and is positive.\n    assert elapsed.cpu > 0\n\n\nasync def test_measure_time_wall_async() -> None:\n    with measure_time() as elapsed:\n        await asyncio.sleep(0.1)\n\n    assert elapsed.cpu is not None\n    assert elapsed.wall is not None\n    assert elapsed.wall >= 0.09\n"
  },
  {
    "path": "tests/unit/_utils/test_raise_if_too_many_kwargs.py",
    "content": "from contextlib import nullcontext\nfrom typing import Any\n\nimport pytest\n\nfrom crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs\n\n\n@pytest.mark.parametrize(\n    ('kwargs', 'should_raise'),\n    [\n        ({'alias': 'alias', 'name': None, 'id': None}, False),\n        ({'alias': None, 'name': 'name', 'id': None}, False),\n        ({'alias': None, 'name': None, 'id': 'id'}, False),\n        ({'alias': 'alias', 'name': 'name', 'id': None}, True),\n        ({'alias': 'alias', 'name': None, 'id': 'id'}, True),\n        ({'alias': None, 'name': 'name', 'id': 'id'}, True),\n        ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True),\n        ({'alias': None, 'name': None, 'id': None}, False),\n    ],\n)\ndef test_limit_kwargs_default(kwargs: dict[str, Any], *, should_raise: bool) -> None:\n    context = pytest.raises(ValueError, match=r'^Only one of .*') if should_raise else nullcontext()\n    with context:\n        raise_if_too_many_kwargs(**kwargs)\n\n\n@pytest.mark.parametrize(\n    ('kwargs', 'should_raise'),\n    [\n        ({'alias': 'alias', 'name': 'name', 'id': 'id'}, True),\n        ({'alias': 'alias', 'name': 'name', 'id': None}, False),\n    ],\n)\ndef test_limit_kwargs(kwargs: dict[str, Any], *, should_raise: bool) -> None:\n    context = pytest.raises(ValueError, match=r'^Only one of .*') if should_raise else nullcontext()\n    with context:\n        raise_if_too_many_kwargs(max_kwargs=2, **kwargs)\n"
  },
  {
    "path": "tests/unit/_utils/test_recurring_task.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom unittest.mock import AsyncMock\n\nimport pytest\n\nfrom crawlee._utils.recurring_task import RecurringTask\n\n\n@pytest.fixture\ndef function() -> AsyncMock:\n    mock_function = AsyncMock()\n    mock_function.__name__ = 'mocked_function'  # To avoid issues with the function name in RecurringTask\n    return mock_function\n\n\n@pytest.fixture\ndef delay() -> timedelta:\n    return timedelta(milliseconds=30)\n\n\nasync def test_init(function: AsyncMock, delay: timedelta) -> None:\n    rt = RecurringTask(function, delay)\n    assert rt.func == function\n    assert rt.delay == delay\n    assert rt.task is None\n\n\nasync def test_start_and_stop(function: AsyncMock, delay: timedelta) -> None:\n    rt = RecurringTask(function, delay)\n\n    rt.start()\n    await asyncio.sleep(0)  # Yield control to allow the task to start\n\n    assert isinstance(rt.task, asyncio.Task)\n    assert not rt.task.done()\n\n    await rt.stop()\n    assert rt.task.done()\n\n\n@pytest.mark.run_alone\nasync def test_execution(function: AsyncMock, delay: timedelta) -> None:\n    task = RecurringTask(function, delay)\n\n    task.start()\n    await asyncio.sleep(0.2)  # Wait enough for the task to execute a few times\n    await task.stop()\n\n    assert isinstance(task.func, AsyncMock)  # To let type checker know that the function is a mock\n    assert task.func.call_count >= 3\n\n    await task.stop()\n"
  },
  {
    "path": "tests/unit/_utils/test_requests.py",
    "content": "from __future__ import annotations\n\nimport pytest\n\nfrom crawlee._types import HttpHeaders\nfrom crawlee._utils.requests import compute_unique_key, normalize_url\n\n\n@pytest.mark.parametrize(\n    ('url', 'expected_output', 'keep_url_fragment'),\n    [\n        ('https://example.com/?utm_source=test&utm_medium=test&key=value', 'https://example.com/?key=value', False),\n        (\n            'http://example.com/?key=value&another_key=another_value',\n            'http://example.com/?another_key=another_value&key=value',\n            False,\n        ),\n        ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com/?key=value', False),\n        ('', '', False),\n        ('http://example.com/#fragment', 'http://example.com/#fragment', True),\n        ('http://example.com/#fragment', 'http://example.com', False),\n        ('  https://example.com/  ', 'https://example.com', False),\n        ('http://example.com/?b=2&a=1', 'http://example.com/?a=1&b=2', False),\n    ],\n    ids=[\n        'remove_utm_params',\n        'retain_sort_non_utm_params',\n        'convert_scheme_netloc_to_lowercase',\n        'handle_empty_url',\n        'retain_fragment',\n        'remove_fragment',\n        'trim_whitespace',\n        'sort_query_params',\n    ],\n)\ndef test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: bool) -> None:\n    output = normalize_url(url, keep_url_fragment=keep_url_fragment)\n    assert output == expected_output\n\n\ndef test_compute_unique_key_basic() -> None:\n    url = 'https://crawlee.dev'\n    uk_get = compute_unique_key(url, method='GET')\n    uk_post = compute_unique_key(url, method='POST')\n    assert url == uk_get == uk_post\n\n\ndef test_compute_unique_key_handles_fragments() -> None:\n    url = 'https://crawlee.dev/#fragment'\n    uk_with_fragment = compute_unique_key(url, keep_url_fragment=True)\n    assert uk_with_fragment == url\n\n    uk_without_fragment = compute_unique_key(url, 'GET', 
keep_url_fragment=False)\n    assert uk_without_fragment == 'https://crawlee.dev'\n\n\ndef test_compute_unique_key_handles_payload() -> None:\n    url = 'https://crawlee.dev'\n    payload = b'{\"key\": \"value\"}'\n\n    # Payload without extended unique key\n    uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=False)\n    assert uk == url\n\n    # Extended unique key and payload is None\n    uk = compute_unique_key(url, method='POST', payload=None, use_extended_unique_key=True)\n    assert uk == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'\n\n    # Extended unique key and payload is bytes\n    uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True)\n    assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'\n\n\ndef test_compute_unique_key_handles_headers() -> None:\n    url = 'https://crawlee.dev'\n    headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})\n    uk = compute_unique_key(url, headers=headers, use_extended_unique_key=False)\n    assert uk == url\n\n    extended_uk_expected = 'GET|4e1a2cf6|e3b0c442|https://crawlee.dev'\n\n    uk = compute_unique_key(url, headers=headers, use_extended_unique_key=True)\n    assert uk == extended_uk_expected\n\n    # Accept-Encoding header should not be included.\n    headers = HttpHeaders({'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Type': 'application/json'})\n    uk = compute_unique_key(url, headers=headers, use_extended_unique_key=True)\n    assert uk == extended_uk_expected\n\n\ndef test_compute_unique_key_complex() -> None:\n    url = 'https://crawlee.dev'\n    headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})\n    payload = b'{\"key\": \"value\"}'\n\n    uk = compute_unique_key(\n        url,\n        method='POST',\n        headers=headers,\n        payload=payload,\n        session_id='test_session',\n        use_extended_unique_key=False,\n    )\n    assert uk == url\n\n   
 extended_uk = compute_unique_key(\n        url,\n        method='POST',\n        headers=headers,\n        payload=payload,\n        session_id='test_session',\n        use_extended_unique_key=True,\n    )\n    assert extended_uk == 'POST|4e1a2cf6|9724c1e2|test_session|https://crawlee.dev'\n\n\ndef test_compute_unique_key_post_with_none_payload() -> None:\n    url = 'https://crawlee.dev'\n    expected_output = 'POST|e3b0c442|e3b0c442|https://crawlee.dev'\n    output = compute_unique_key(url, 'POST', payload=None, use_extended_unique_key=True)\n    assert output == expected_output\n\n\ndef test_compute_unique_key_with_whitespace_in_headers() -> None:\n    url = 'https://crawlee.dev'\n    headers = HttpHeaders({'Content-Type': 'application/json'})\n    headers_with_whitespaces = HttpHeaders({'Content-Type': ' application/json '})\n\n    expected_output = 'GET|60d83e70|e3b0c442|https://crawlee.dev'\n    uk_1 = compute_unique_key(url, headers=headers, use_extended_unique_key=True)\n    assert uk_1 == expected_output\n\n    uk_2 = compute_unique_key(url, headers=headers_with_whitespaces, use_extended_unique_key=True)\n    assert uk_2 == expected_output\n"
  },
  {
    "path": "tests/unit/_utils/test_robots.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom crawlee._utils.robots import RobotsTxtFile\n\nif TYPE_CHECKING:\n    from yarl import URL\n\n    from crawlee.http_clients._base import HttpClient\n\n\nasync def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None:\n    robots_file = await RobotsTxtFile.find(str(server_url), http_client)\n    assert len(robots_file.get_sitemaps()) > 0\n\n\nasync def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None:\n    robots = await RobotsTxtFile.find(str(server_url), http_client)\n    assert robots.is_allowed('https://crawlee.dev')\n    assert robots.is_allowed(str(server_url / 'something/page.html'))\n    assert robots.is_allowed(str(server_url / 'deny_googlebot/page.html'))\n    assert not robots.is_allowed(str(server_url / 'deny_all/page.html'))\n\n\nasync def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:\n    robots = await RobotsTxtFile.find(str(server_url), http_client)\n    assert len(robots.get_sitemaps()) == 2\n    assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'}\n\n\nasync def test_parse_from_content() -> None:\n    content = \"\"\"User-agent: *\n        Disallow: *deny_all/\n        crawl-delay: 10\n        User-agent: Googlebot\n        Disallow: *deny_googlebot/\"\"\"\n    robots = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', content)\n    assert robots.is_allowed('http://not-exists.com/something/page.html')\n    assert robots.is_allowed('http://not-exists.com/deny_googlebot/page.html')\n    assert not robots.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot')\n    assert not robots.is_allowed('http://not-exists.com/deny_all/page.html')\n\n\nasync def test_bind_robots_txt_url() -> None:\n    content = 'User-agent: *\\nDisallow: /'\n    robots = await 
RobotsTxtFile.from_content('http://check.com/robots.txt', content)\n    assert not robots.is_allowed('http://check.com/test.html')\n    assert robots.is_allowed('http://othercheck.com/robots.txt')\n"
  },
  {
    "path": "tests/unit/_utils/test_shared_timeout.py",
    "content": "import asyncio\nfrom datetime import timedelta\n\nimport pytest\n\nfrom crawlee._utils.time import SharedTimeout, measure_time\n\n\nasync def test_shared_timeout_tracks_elapsed_time() -> None:\n    timeout_duration = timedelta(seconds=1)\n    shared_timeout = SharedTimeout(timeout_duration)\n\n    # First usage\n    async with shared_timeout:\n        await asyncio.sleep(0.2)\n\n    # Second usage - should have less time remaining\n    async with shared_timeout as remaining:\n        assert remaining < timedelta(seconds=0.85)\n        assert remaining > timedelta(seconds=0)\n\n\nasync def test_shared_timeout_expires() -> None:\n    timeout_duration = timedelta(seconds=0.1)\n    shared_timeout = SharedTimeout(timeout_duration)\n\n    with measure_time() as elapsed, pytest.raises(asyncio.TimeoutError):\n        async with shared_timeout:\n            await asyncio.sleep(0.5)\n\n    assert elapsed.wall is not None\n    assert elapsed.wall < 0.3\n\n\nasync def test_shared_timeout_cannot_be_nested() -> None:\n    timeout_duration = timedelta(seconds=1)\n    shared_timeout = SharedTimeout(timeout_duration)\n\n    async with shared_timeout:\n        with pytest.raises(RuntimeError, match='cannot be entered twice'):\n            async with shared_timeout:\n                pass\n\n\nasync def test_shared_timeout_multiple_sequential_uses() -> None:\n    \"\"\"Test that SharedTimeout can be used multiple times sequentially.\"\"\"\n    timeout_duration = timedelta(seconds=1)\n    shared_timeout = SharedTimeout(timeout_duration)\n\n    for _ in range(5):\n        async with shared_timeout:\n            await asyncio.sleep(0.05)\n\n    # Should have consumed roughly 0.25 seconds\n    async with shared_timeout as remaining:\n        assert remaining < timedelta(seconds=0.8)\n        assert remaining > timedelta(seconds=0)\n"
  },
  {
    "path": "tests/unit/_utils/test_sitemap.py",
    "content": "import base64\nimport gzip\nfrom datetime import datetime\nfrom typing import Any\nfrom unittest.mock import AsyncMock, MagicMock\n\nfrom yarl import URL\n\nfrom crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap\nfrom crawlee.http_clients._base import HttpClient, HttpResponse\n\nBASIC_SITEMAP = \"\"\"\n<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<url>\n<loc>http://not-exists.com/</loc>\n<lastmod>2005-02-03</lastmod>\n<changefreq>monthly</changefreq>\n<priority>0.8</priority>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>\n<changefreq>weekly</changefreq>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>\n<lastmod>2004-12-23</lastmod>\n<changefreq>weekly</changefreq>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>\n<lastmod>2004-12-23T18:00:15+00:00</lastmod>\n<priority>0.3</priority>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>\n<lastmod>2004-11-23</lastmod>\n</url>\n</urlset>\n\"\"\".strip()\n\nBASIC_RESULTS = {\n    'http://not-exists.com/',\n    'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',\n    'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',\n    'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',\n    'http://not-exists.com/catalog?item=83&desc=vacation_usa',\n}\n\n\ndef _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:\n    async def send_request(url: str, **_kwargs: Any) -> HttpResponse:\n        status, body = 404, b''\n        for pattern, (s, b) in url_map.items():\n            if pattern in url:\n                status, body = s, b\n                break\n        response = MagicMock(spec=HttpResponse)\n        response.status_code = status\n        response.read = 
AsyncMock(return_value=body)\n        return response\n\n    client = AsyncMock(spec=HttpClient)\n    client.send_request.side_effect = send_request\n    return client\n\n\ndef compress_gzip(data: str) -> bytes:\n    \"\"\"Compress a string using gzip.\"\"\"\n    return gzip.compress(data.encode())\n\n\ndef encode_base64(data: bytes) -> str:\n    \"\"\"Encode bytes to a base64 string.\"\"\"\n    return base64.b64encode(data).decode('utf-8')\n\n\nasync def test_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a basic sitemap.\"\"\"\n    sitemap_url = (server_url / 'sitemap.xml').with_query(\n        base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'\n    )\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test extracting item metadata from a sitemap.\"\"\"\n    sitemap_url = (server_url / 'sitemap.xml').with_query(\n        base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'\n    )\n\n    items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)]\n    assert len(items) == 5\n    assert items[0] == SitemapUrl(\n        loc='http://not-exists.com/',\n        priority=0.8,\n        changefreq='monthly',\n        lastmod=datetime.fromisoformat('2005-02-03'),\n        origin_sitemap_url=str(sitemap_url),\n    )\n\n\nasync def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a gzipped sitemap with correct type and .xml.gz url.\"\"\"\n    gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))\n    sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip')\n    sitemap = await Sitemap.load(str(sitemap_url), 
http_client=http_client)\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a invalid gzipped sitemap with correct type and .xml.gz url.\"\"\"\n    compress_data = compress_gzip(BASIC_SITEMAP)\n    invalid_gzipped_data = encode_base64(compress_data[:30])\n    sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip')\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 0\n    assert sitemap.urls == []\n\n\nasync def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.\"\"\"\n    sitemap_url = (server_url / 'sitemap.xml.gz').with_query(\n        base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip'\n    )\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a gzipped sitemap with bad type and .xml.gz url.\"\"\"\n    gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))\n    sitemap_url = (server_url / 'sitemap.xml.gz').with_query(\n        base64=gzipped_data, c_type='application/xml; charset=utf-8'\n    )\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a gzipped sitemap with correct type and .xml url.\"\"\"\n    gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))\n    sitemap_url = 
(server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip')\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a parent sitemap that references child sitemaps.\"\"\"\n    parent_sitemap = \"\"\"\n<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<sitemap>\n<loc>{child_sitemap}</loc>\n<lastmod>2004-12-23</lastmod>\n</sitemap>\n<sitemap>\n<loc>{child_sitemap_2}</loc>\n<lastmod>2004-12-23</lastmod>\n</sitemap>\n</sitemapindex>\n\"\"\".strip()\n    child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP)))\n    parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2)\n    encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode())\n    parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content)\n\n    sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 10\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a URL that does not point to a sitemap.\"\"\"\n    sitemap = await Sitemap.load(str(server_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 0\n    assert sitemap.urls == []\n\n\nasync def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a sitemap with CDATA sections.\"\"\"\n    cdata_sitemap = \"\"\"\n<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<urlset 
xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<url>\n<loc><![CDATA[http://not-exists.com/catalog]]></loc>\n</url>\n</urlset>\n    \"\"\".strip()\n    sitemap_url = (server_url / 'sitemap.xml').with_query(\n        base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8'\n    )\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 1\n    assert sitemap.urls == ['http://not-exists.com/catalog']\n\n\nasync def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a plain text sitemap.\"\"\"\n    urls = [\n        'http://not-exists.com/catalog?item=78&desc=vacation_crete',\n        'http://not-exists.com/catalog?item=79&desc=vacation_somalia',\n    ]\n    txt_sitemap_content = '\\n'.join(urls)\n\n    sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode()))\n    sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)\n\n    assert len(sitemap.urls) == 2\n    assert set(sitemap.urls) == {\n        'http://not-exists.com/catalog?item=78&desc=vacation_crete',\n        'http://not-exists.com/catalog?item=79&desc=vacation_somalia',\n    }\n\n\nasync def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test loading a pretty-printed sitemap.\"\"\"\n    pretty_sitemap = \"\"\"\n<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<url>\n<loc>\n    http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey\n</loc>\n<lastmod>\n    2005-02-03\n</lastmod>\n<changefreq>\n\n    monthly\n</changefreq>\n<priority>\n    0.8\n</priority>\n</url>\n</urlset>\n\"\"\".strip()\n    sitemap_url = (server_url / 'sitemap.xml').with_query(\n        base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8'\n    )\n    sitemap = await Sitemap.load(str(sitemap_url), 
http_client=http_client)\n\n    assert len(sitemap.urls) == 1\n    assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey']\n\n\nasync def test_sitemap_from_string() -> None:\n    \"\"\"Test creating a Sitemap instance from an XML string.\"\"\"\n    sitemap = await Sitemap.from_xml_string(BASIC_SITEMAP)\n\n    assert len(sitemap.urls) == 5\n    assert set(sitemap.urls) == BASIC_RESULTS\n\n\nasync def test_discover_sitemap_from_robots_txt() -> None:\n    \"\"\"Sitemap URL found in robots.txt is yielded.\"\"\"\n    robots_content = b'User-agent: *\\nSitemap: http://example.com/custom-sitemap.xml'\n    http_client = _make_mock_client({'robots.txt': (200, robots_content)})\n\n    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]\n\n    assert urls == ['http://example.com/custom-sitemap.xml']\n\n\nasync def test_discover_sitemap_from_common_paths() -> None:\n    \"\"\"Sitemap is found at common paths when robots.txt has none.\"\"\"\n    http_client = _make_mock_client(\n        {'/sitemap.xml': (200, b''), '/sitemap.txt': (200, b''), '/sitemap_index.xml': (200, b'')}\n    )\n\n    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]\n\n    assert urls == [\n        'http://example.com/sitemap.xml',\n        'http://example.com/sitemap.txt',\n        'http://example.com/sitemap_index.xml',\n    ]\n\n\nasync def test_discover_sitemap_from_input_url() -> None:\n    \"\"\"Input URL that is already a sitemap is yielded directly without checking common paths.\"\"\"\n    http_client = _make_mock_client({'/sitemap.txt': (200, b'')})\n\n    urls = [url async for url in discover_valid_sitemaps(['http://example.com/sitemap.xml'], http_client=http_client)]\n\n    assert urls == ['http://example.com/sitemap.xml']\n\n\nasync def test_discover_sitemap_deduplication() -> None:\n    \"\"\"Sitemap URL found in robots.txt is not yielded again from 
common paths check.\"\"\"\n    robots_content = b'User-agent: *\\nSitemap: http://example.com/sitemap.xml'\n    http_client = _make_mock_client(\n        {\n            'robots.txt': (200, robots_content),\n            '/sitemap.xml': (200, b''),\n        }\n    )\n\n    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]\n\n    assert urls == ['http://example.com/sitemap.xml']\n\n\nasync def test_discover_sitemaps_multiple_domains() -> None:\n    \"\"\"Sitemaps from multiple domains are all discovered.\"\"\"\n    http_client = _make_mock_client(\n        {\n            'domain-a.com/sitemap.xml': (200, b''),\n            'domain-b.com/sitemap.xml': (200, b''),\n        }\n    )\n\n    urls = [\n        url\n        async for url in discover_valid_sitemaps(\n            ['http://domain-a.com/page', 'http://domain-b.com/page'],\n            http_client=http_client,\n        )\n    ]\n\n    assert set(urls) == {\n        'http://domain-a.com/sitemap.xml',\n        'http://domain-b.com/sitemap.xml',\n    }\n\n\nasync def test_discover_sitemap_url_without_host_skipped() -> None:\n    \"\"\"URLs without a host are skipped.\"\"\"\n    http_client = _make_mock_client({})\n\n    urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)]\n\n    assert urls == []\n"
  },
  {
    "path": "tests/unit/_utils/test_system.py",
    "content": "from __future__ import annotations\n\nimport sys\nfrom multiprocessing import get_context, synchronize\nfrom multiprocessing.shared_memory import SharedMemory\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee._utils.byte_size import ByteSize\nfrom crawlee._utils.system import get_cpu_info, get_memory_info\n\nif TYPE_CHECKING:\n    from collections.abc import Callable\n\n\ndef test_get_memory_info_returns_valid_values() -> None:\n    memory_info = get_memory_info()\n\n    assert ByteSize(0) < memory_info.total_size < ByteSize.from_tb(1)\n    assert memory_info.current_size < memory_info.total_size\n\n\ndef test_get_cpu_info_returns_valid_values() -> None:\n    cpu_info = get_cpu_info()\n    assert 0 <= cpu_info.used_ratio <= 1\n\n\n@pytest.mark.skipif(sys.platform != 'linux', reason='Improved estimation available only on Linux')\ndef test_memory_estimation_does_not_overestimate_due_to_shared_memory() -> None:\n    \"\"\"Test that memory usage estimation is not overestimating memory usage by counting shared memory multiple times.\n\n    In this test, the parent process is started and its memory usage is measured in situations where it is running\n    child processes without additional memory, with shared additional memory and with own unshared additional memory.\n    Child process without additional memory are used to estimate baseline memory usage of any child process.\n    The following estimation is asserted by the test:\n    additional_memory_size_estimate_per_shared_memory_child * number_of_sharing_children_processes is approximately\n    equal to additional_memory_size_estimate_per_unshared_memory_child where the additional shared memory is exactly\n    the same as the unshared memory.\n    \"\"\"\n\n    ctx = get_context('fork')\n    estimated_memory_expectation = ctx.Value('b', False)  # noqa: FBT003  # Common usage pattern for multiprocessing.Value\n\n    def parent_process() -> None:\n        extra_memory_size = 1024 * 1024 
* 100  # 100 MB\n        children_count = 4\n        # Memory calculation is not exact, so allow for some tolerance.\n        test_tolerance = 0.3\n\n        def no_extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None:\n            ready.wait()\n            measured.wait()\n\n        def extra_memory_child(ready: synchronize.Barrier, measured: synchronize.Barrier) -> None:\n            memory = SharedMemory(size=extra_memory_size, create=True)\n            assert memory.buf is not None\n            memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)])\n            print(f'Using the memory... {memory.buf[-1]}')\n            ready.wait()\n            measured.wait()\n            memory.close()\n            memory.unlink()\n\n        def shared_extra_memory_child(\n            ready: synchronize.Barrier, measured: synchronize.Barrier, memory: SharedMemory\n        ) -> None:\n            assert memory.buf is not None\n            print(f'Using the memory... 
{memory.buf[-1]}')\n            ready.wait()\n            measured.wait()\n\n        def get_additional_memory_estimation_while_running_processes(\n            *, target: Callable, count: int = 1, use_shared_memory: bool = False\n        ) -> float:\n            processes = []\n            ready = ctx.Barrier(parties=count + 1)\n            measured = ctx.Barrier(parties=count + 1)\n            shared_memory: None | SharedMemory = None\n            memory_before = get_memory_info().current_size\n\n            if use_shared_memory:\n                shared_memory = SharedMemory(size=extra_memory_size, create=True)\n                assert shared_memory.buf is not None\n                shared_memory.buf[:] = bytearray([255 for _ in range(extra_memory_size)])\n                extra_args = [shared_memory]\n            else:\n                extra_args = []\n\n            for _ in range(count):\n                p = ctx.Process(target=target, args=[ready, measured, *extra_args])\n                p.start()\n                processes.append(p)\n\n            ready.wait()\n            memory_during = get_memory_info().current_size\n            measured.wait()\n\n            for p in processes:\n                p.join()\n\n            if shared_memory:\n                shared_memory.close()\n                shared_memory.unlink()\n\n            return (memory_during - memory_before).to_mb() / count\n\n        additional_memory_simple_child = get_additional_memory_estimation_while_running_processes(\n            target=no_extra_memory_child, count=children_count\n        )\n        additional_memory_extra_memory_child = (\n            get_additional_memory_estimation_while_running_processes(target=extra_memory_child, count=children_count)\n            - additional_memory_simple_child\n        )\n        additional_memory_shared_extra_memory_child = (\n            get_additional_memory_estimation_while_running_processes(\n                target=shared_extra_memory_child, 
count=children_count, use_shared_memory=True\n            )\n            - additional_memory_simple_child\n        )\n\n        memory_estimation_difference_ratio = (\n            abs((additional_memory_shared_extra_memory_child * children_count) - additional_memory_extra_memory_child)\n            / additional_memory_extra_memory_child\n        )\n\n        estimated_memory_expectation.value = memory_estimation_difference_ratio < test_tolerance\n\n        if not estimated_memory_expectation.value:\n            print(\n                f'{additional_memory_shared_extra_memory_child=}\\n'\n                f'{children_count=}\\n'\n                f'{additional_memory_extra_memory_child=}\\n'\n                f'{memory_estimation_difference_ratio=}'\n            )\n\n    process = ctx.Process(target=parent_process)\n    process.start()\n    process.join()\n\n    assert estimated_memory_expectation.value, (\n        'Estimated memory usage for process with shared memory does not meet the expectation.'\n    )\n"
  },
  {
    "path": "tests/unit/_utils/test_timedelta_ms.py",
    "content": "from __future__ import annotations\n\nfrom datetime import timedelta\nfrom typing import Any\n\nimport pytest\nfrom pydantic import BaseModel\n\nfrom crawlee._utils.models import timedelta_ms\n\n\nclass _ModelWithTimedeltaMs(BaseModel):\n    time_delta: timedelta_ms | None = None\n\n\n@pytest.mark.parametrize(\n    ('time_delta_input', 'expected_time_delta', 'expected_model_dump_value'),\n    [\n        (1.0, timedelta(milliseconds=1), 1),\n        (1, timedelta(milliseconds=1), 1),\n        ('1', timedelta(milliseconds=1), 1),\n        (timedelta(milliseconds=1), timedelta(milliseconds=1), 1),\n        (3.01, timedelta(microseconds=3010), 3),\n        (3.5, timedelta(microseconds=3500), 4),\n        (3.99, timedelta(microseconds=3990), 4),\n        (None, None, None),\n        (float('inf'), timedelta(days=999999999, seconds=3600 * 24 - 1, microseconds=999999), float('inf')),\n    ],\n)\ndef test_model_with_timedelta_ms_input_types(\n    time_delta_input: float | timedelta | Any | None, expected_time_delta: timedelta, expected_model_dump_value: int\n) -> None:\n    model = _ModelWithTimedeltaMs(time_delta=time_delta_input)  # ty: ignore[invalid-argument-type]\n    assert model.time_delta == expected_time_delta\n    assert model.model_dump() == {'time_delta': expected_model_dump_value}\n"
  },
  {
    "path": "tests/unit/_utils/test_urls.py",
    "content": "from __future__ import annotations\n\nimport pytest\nfrom pydantic import ValidationError\n\nfrom crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, validate_http_url\n\n\ndef test_is_url_absolute() -> None:\n    assert is_url_absolute('http://example.com/path') is True\n    assert is_url_absolute('https://example.com/path') is True\n    assert is_url_absolute('ftp://example.com/path') is True\n    assert is_url_absolute('//example.com/path') is False\n    assert is_url_absolute('/path/to/resource') is False\n    assert is_url_absolute('relative/path/to/resource') is False\n    assert is_url_absolute('example.com/path') is False\n\n\ndef test_convert_to_absolute_url() -> None:\n    base_url = 'http://example.com'\n    relative_url = '/path/to/resource'\n    absolute_url = convert_to_absolute_url(base_url, relative_url)\n    assert absolute_url == 'http://example.com/path/to/resource'\n\n    base_url = 'http://example.com'\n    relative_url = '//example.com/path/to/resource'\n    absolute_url = convert_to_absolute_url(base_url, relative_url)\n    assert absolute_url == 'http://example.com/path/to/resource'\n\n    base_url = 'http://example.com/base/'\n    relative_url = '../path/to/resource'\n    absolute_url = convert_to_absolute_url(base_url, relative_url)\n    assert absolute_url == 'http://example.com/path/to/resource'\n\n\ndef test_validate_http_url() -> None:\n    assert validate_http_url(None) is None\n\n    valid_url = 'https://example.com'\n    assert validate_http_url(valid_url) == valid_url\n\n    invalid_url = 'htp://invalid-url'\n    with pytest.raises(ValidationError):\n        validate_http_url(invalid_url)\n"
  },
  {
    "path": "tests/unit/browsers/test_browser_pool.py",
    "content": "from __future__ import annotations\n\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\nfrom unittest.mock import AsyncMock\n\nimport pytest\n\nfrom crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin\nfrom crawlee.browsers._browser_controller import BrowserController\nfrom crawlee.browsers._types import CrawleePage\nfrom tests.unit.utils import run_alone_on_mac\n\nif TYPE_CHECKING:\n    from collections.abc import Mapping\n    from typing import Any\n\n    from yarl import URL\n\n    from crawlee.proxy_configuration import ProxyInfo\n\n\nasync def test_default_plugin_new_page_creation(server_url: URL) -> None:\n    async with BrowserPool() as browser_pool:\n        page_1 = await browser_pool.new_page()\n        await page_1.page.goto(str(server_url))\n        assert page_1.browser_type == 'chromium'\n        assert page_1.page.url == str(server_url)\n        assert '<html' in await page_1.page.content()  # there is some HTML content\n        assert browser_pool.total_pages_count == 1\n\n        page_2 = await browser_pool.new_page()\n        await page_2.page.goto(str(server_url / 'status/200'))\n        assert page_2.browser_type == 'chromium'\n        assert page_2.page.url == str(server_url / 'status/200')\n        assert '<html' in await page_1.page.content()  # there is some HTML content\n        assert browser_pool.total_pages_count == 2\n\n        await page_1.page.close()\n        await page_2.page.close()\n\n\nasync def test_multiple_plugins_new_page_creation(server_url: URL) -> None:\n    plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')\n    plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')\n\n    async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:\n        assert browser_pool.plugins == [plugin_chromium, plugin_firefox]\n\n        page_1 = await browser_pool.new_page()\n        await page_1.page.goto(str(server_url))\n        assert page_1.browser_type == 
'chromium'\n        assert page_1.page.url == str(server_url)\n        assert '<html' in await page_1.page.content()  # there is some HTML content\n\n        page_2 = await browser_pool.new_page()\n        await page_2.page.goto(str(server_url / 'headers'))\n        assert page_2.browser_type == 'firefox'\n        assert page_2.page.url == str(server_url / 'headers')\n        assert '<html' in await page_2.page.content()  # there is some HTML content\n\n        page_3 = await browser_pool.new_page()\n        await page_3.page.goto(str(server_url / 'user-agent'))\n        assert page_3.browser_type == 'chromium'\n        assert page_3.page.url == str(server_url / 'user-agent')\n        assert '<html' in await page_3.page.content()  # there is some HTML content\n\n        await page_1.page.close()\n        await page_2.page.close()\n        await page_3.page.close()\n\n        assert browser_pool.total_pages_count == 3\n\n\n@pytest.mark.flaky(\n    rerun=3,\n    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1660.',\n)\nasync def test_new_page_with_each_plugin(server_url: URL) -> None:\n    plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')\n    plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')\n\n    async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:\n        pages = await browser_pool.new_page_with_each_plugin()\n\n        assert len(pages) == 2\n\n        assert pages[0].browser_type == 'chromium'\n        assert pages[1].browser_type == 'firefox'\n\n        await pages[0].page.goto(str(server_url))\n        assert pages[0].page.url == str(server_url)\n        assert '<html' in await pages[0].page.content()  # there is some HTML content\n\n        await pages[1].page.goto(str(server_url / 'headers'))\n        assert pages[1].page.url == str(server_url / 'headers')\n        assert '<html' in await pages[1].page.content()\n\n        for page in pages:\n            
await page.page.close()\n\n        assert browser_pool.total_pages_count == 2\n\n\n@run_alone_on_mac\nasync def test_with_default_plugin_constructor(server_url: URL) -> None:\n    # Use a generous operation timeout so that Firefox has enough time to launch on slow Windows CI.\n    async with BrowserPool.with_default_plugin(\n        headless=True, browser_type='firefox', operation_timeout=timedelta(seconds=60)\n    ) as browser_pool:\n        assert len(browser_pool.plugins) == 1\n        assert isinstance(browser_pool.plugins[0], PlaywrightBrowserPlugin)\n\n        page = await browser_pool.new_page()\n        assert page.browser_type == 'firefox'\n\n        await page.page.goto(str(server_url))\n        assert page.page.url == str(server_url)\n        assert '<html' in await page.page.content()  # there is some HTML content\n\n        await page.page.close()\n        assert browser_pool.total_pages_count == 1\n\n\nasync def test_new_page_with_existing_id() -> None:\n    async with BrowserPool() as browser_pool:\n        page_1 = await browser_pool.new_page()\n        with pytest.raises(ValueError, match=r'Page with ID: .* already exists.'):\n            await browser_pool.new_page(page_id=page_1.id)\n\n\nasync def test_new_page_with_invalid_plugin() -> None:\n    plugin_1 = PlaywrightBrowserPlugin(browser_type='chromium')\n    plugin_2 = PlaywrightBrowserPlugin(browser_type='firefox')\n    async with BrowserPool([plugin_1]) as browser_pool:\n        with pytest.raises(ValueError, match=r'Provided browser_plugin is not one of the plugins used by BrowserPool.'):\n            await browser_pool.new_page(browser_plugin=plugin_2)\n\n\nasync def test_resource_management(server_url: URL) -> None:\n    playwright_plugin = PlaywrightBrowserPlugin(browser_type='chromium')\n\n    async with BrowserPool([playwright_plugin]) as browser_pool:\n        page = await browser_pool.new_page()\n        await page.page.goto(str(server_url))\n        assert page.page.url == 
str(server_url)\n        assert '<html' in await page.page.content()  # there is some HTML content\n        assert browser_pool.total_pages_count == 1\n\n    # All pages should be closed in __aexit__\n    assert page.page.is_closed()\n\n\nasync def test_methods_raise_error_when_not_active() -> None:\n    plugin = PlaywrightBrowserPlugin()\n    browser_pool = BrowserPool([plugin])\n\n    assert browser_pool.active is False\n\n    with pytest.raises(RuntimeError, match=r'BrowserPool is not active.'):\n        await browser_pool.new_page()\n\n    with pytest.raises(RuntimeError, match=r'BrowserPool is not active.'):\n        await browser_pool.new_page_with_each_plugin()\n\n    with pytest.raises(RuntimeError, match=r'BrowserPool is already active.'):\n        async with browser_pool, browser_pool:\n            pass\n\n    async with browser_pool:\n        assert browser_pool.active is True\n\n\nasync def test_with_plugin_contains_page_options(server_url: URL) -> None:\n    plugin = PlaywrightBrowserPlugin(browser_new_context_options={'user_agent': 'My Best User-Agent'})\n    async with BrowserPool(plugins=[plugin]) as browser_pool:\n        test_page = await browser_pool.new_page()\n        await test_page.page.goto(str(server_url / 'user-agent'))\n        assert 'My Best User-Agent' in await test_page.page.content()\n        await test_page.page.close()\n\n\n@pytest.mark.parametrize(\n    ('retire_after_page_count', 'expect_equal_browsers'),\n    [\n        pytest.param(2, True, id='Two pages opened in the same browser'),\n        pytest.param(1, False, id='Each page opened in a new browser.'),\n    ],\n)\nasync def test_browser_pool_retire_browser_after_page_count(\n    retire_after_page_count: int, *, expect_equal_browsers: bool\n) -> None:\n    async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:\n        test_page = await browser_pool.new_page()\n        first_browser = test_page.page.context\n        await 
test_page.page.close()\n\n        test_page = await browser_pool.new_page()\n        second_browser = test_page.page.context\n\n        await test_page.page.close()\n\n        if expect_equal_browsers:\n            assert first_browser is second_browser\n        else:\n            assert first_browser is not second_browser\n\n\nasync def test_pre_page_create_hook_is_called() -> None:\n    call_mock = AsyncMock()\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.pre_page_create_hook\n        async def hook(\n            page_id: str,\n            controller: BrowserController,\n            browser_new_context_options: dict[str, Any],\n            proxy_info: ProxyInfo | None,\n        ) -> None:\n            await call_mock(page_id, controller, browser_new_context_options, proxy_info)\n\n            browser_new_context_options['user_agent'] = 'Modified User-Agent'\n\n            assert len(controller.pages) == 0\n\n        test_page = await browser_pool.new_page()\n        user_agent = await test_page.page.evaluate('navigator.userAgent')\n\n        await test_page.page.close()\n\n    assert user_agent == 'Modified User-Agent'\n\n    call_mock.assert_awaited_once()\n    page_id, controller, _, proxy_info = call_mock.call_args[0]\n\n    assert isinstance(page_id, str)\n    assert test_page.id == page_id\n    assert isinstance(controller, BrowserController)\n    assert proxy_info is None\n\n\nasync def test_post_page_create_hook_is_called() -> None:\n    call_mock = AsyncMock()\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.post_page_create_hook\n        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:\n            await call_mock(crawlee_page, controller)\n            await crawlee_page.page.evaluate('window.__hook_applied = true')\n\n            assert isinstance(crawlee_page, CrawleePage)\n\n            assert len(controller.pages) == 1\n\n        test_page = await 
browser_pool.new_page()\n\n        js_result = await test_page.page.evaluate('window.__hook_applied')\n\n        await test_page.page.close()\n\n    assert js_result is True\n\n    call_mock.assert_awaited_once()\n    crawlee_page, controller = call_mock.call_args[0]\n\n    assert test_page is crawlee_page\n    assert isinstance(controller, BrowserController)\n\n\nasync def test_pre_page_close_hook() -> None:\n    call_mock = AsyncMock()\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.pre_page_close_hook\n        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:\n            await call_mock(crawlee_page, controller)\n\n            assert not crawlee_page.page.is_closed()\n            assert len(controller.pages) == 1\n\n        test_page = await browser_pool.new_page()\n        await test_page.page.close()\n\n    call_mock.assert_awaited_once()\n    assert test_page.page.is_closed()\n\n\nasync def test_post_page_close_hook() -> None:\n    call_mock = AsyncMock()\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.post_page_close_hook\n        async def hook(page_id: str, controller: BrowserController) -> None:\n            await call_mock(page_id, controller)\n\n            assert len(controller.pages) == 0\n\n        test_page = await browser_pool.new_page()\n        await test_page.page.close()\n\n    page_id, controller = call_mock.call_args[0]\n\n    call_mock.assert_awaited_once()\n    assert test_page.id == page_id\n    assert isinstance(controller, BrowserController)\n\n\nasync def test_page_hooks_execution_order() -> None:\n    call_order: list[str] = []\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.pre_page_create_hook\n        async def pre_create(\n            _page_id: str,\n            _controller: BrowserController,\n            _browser_new_context_options: Mapping[str, Any],\n            _proxy_info: ProxyInfo | None,\n        ) -> None:\n     
       call_order.append('pre_create')\n\n        @browser_pool.post_page_create_hook\n        async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:\n            call_order.append('post_create')\n\n        @browser_pool.pre_page_close_hook\n        async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:\n            call_order.append('pre_close')\n\n        @browser_pool.post_page_close_hook\n        async def post_close(_page_id: str, _controller: BrowserController) -> None:\n            call_order.append('post_close')\n\n        page = await browser_pool.new_page()\n        await page.page.close()\n\n    assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close']\n\n\nasync def test_multiple_hooks_all_called() -> None:\n    call_order: list[str] = []\n\n    async with BrowserPool() as browser_pool:\n\n        @browser_pool.post_page_create_hook\n        async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:\n            call_order.append('first')\n\n        @browser_pool.post_page_create_hook\n        async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:\n            call_order.append('second')\n\n        page = await browser_pool.new_page()\n        await page.page.close()\n\n    assert call_order == ['first', 'second']\n"
  },
  {
    "path": "tests/unit/browsers/test_playwright_browser.py",
    "content": "from __future__ import annotations\n\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom playwright.async_api import async_playwright\n\nfrom crawlee.browsers._playwright_browser import PlaywrightPersistentBrowser\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from playwright.async_api import Playwright\n\n\n@pytest.fixture\nasync def playwright() -> AsyncGenerator[Playwright, None]:\n    async with async_playwright() as playwright:\n        yield playwright\n\n\nasync def test_init(playwright: Playwright) -> None:\n    browser_type = playwright.chromium\n    persist_browser = PlaywrightPersistentBrowser(browser_type, user_data_dir=None, browser_launch_options={})\n    assert persist_browser._browser_type == browser_type\n    assert persist_browser.browser_type == browser_type\n    assert persist_browser._browser_launch_options == {}\n    assert persist_browser._temp_dir is None\n    assert persist_browser._user_data_dir is None\n    assert persist_browser._is_connected is True\n    assert persist_browser.is_connected() is True\n\n\nasync def test_delete_temp_folder_with_close_browser(playwright: Playwright) -> None:\n    persist_browser = PlaywrightPersistentBrowser(\n        playwright.chromium, user_data_dir=None, browser_launch_options={'headless': True}\n    )\n    await persist_browser.new_context()\n    assert isinstance(persist_browser._temp_dir, Path)\n    current_temp_dir = persist_browser._temp_dir\n    assert current_temp_dir.exists()\n    await persist_browser.close()\n    assert not current_temp_dir.exists()\n"
  },
  {
    "path": "tests/unit/browsers/test_playwright_browser_controller.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import datetime, timedelta, timezone\nfrom typing import TYPE_CHECKING, Any\nfrom unittest.mock import AsyncMock\n\nimport pytest\nfrom playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright\n\nfrom crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from yarl import URL\n\n\n@pytest.fixture\nasync def playwright() -> AsyncGenerator[Playwright, None]:\n    async with async_playwright() as playwright:\n        yield playwright\n\n\n@pytest.fixture\nasync def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:\n    browser = await playwright.chromium.launch()\n    yield browser\n    await browser.close()\n\n\n@pytest.fixture\nasync def controller(browser: Browser) -> AsyncGenerator[PlaywrightBrowserController, None]:\n    controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)\n    yield controller\n    await controller.close()\n\n\nasync def test_initial_state(browser: Browser) -> None:\n    controller = PlaywrightBrowserController(browser)\n\n    # Test initial state\n    assert controller.pages == []\n    assert controller.pages_count == 0\n    assert isinstance(controller.last_page_opened_at, datetime)\n    assert controller.idle_time < timedelta(seconds=1)\n    assert controller.has_free_capacity\n\n\n@pytest.mark.run_alone\nasync def test_open_and_close_page(controller: PlaywrightBrowserController, server_url: URL) -> None:\n    page = await controller.new_page()\n    await page.goto(str(server_url))\n\n    assert page in controller.pages\n    assert controller.pages_count == 1\n    assert controller.last_page_opened_at <= datetime.now(timezone.utc)\n\n    await page.close()\n\n    assert page not in controller.pages\n    assert controller.pages_count == 0\n\n\nasync def 
test_max_open_pages_limit(controller: PlaywrightBrowserController) -> None:\n    page1 = await controller.new_page()\n    assert controller.pages_count == 1\n\n    page2 = await controller.new_page()\n    assert controller.pages_count == 2\n\n    with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'):\n        await controller.new_page()\n\n    assert controller.pages_count == 2\n\n    await page1.close()\n    assert controller.pages_count == 1\n\n    page3 = await controller.new_page()\n    assert controller.pages_count == 2\n\n    await page2.close()\n    await page3.close()\n\n    assert controller.pages == []\n    assert controller.pages_count == 0\n\n\nasync def test_idle_time(controller: PlaywrightBrowserController) -> None:\n    idle_time_before = controller.idle_time\n    await asyncio.sleep(1)  # Simulate waiting\n    idle_time_after = controller.idle_time\n    assert idle_time_after > idle_time_before\n\n\nasync def test_close_browser_with_open_pages(browser: Browser) -> None:\n    controller = PlaywrightBrowserController(browser, max_open_pages_per_browser=2)\n    _ = await controller.new_page()\n\n    with pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'):\n        await controller.close()\n\n    assert controller.pages_count == 1\n    assert controller.is_browser_connected\n\n    await controller.close(force=True)\n\n    assert controller.pages_count == 0\n    assert not controller.is_browser_connected\n\n\nasync def test_memory_leak_on_concurrent_context_creation() -> None:\n    \"\"\"Test that only one browser context is created when multiple pages are opened concurrently.\"\"\"\n\n    # Prepare mocked browser with relevant methods and attributes\n    mocked_browser = AsyncMock()\n    mocked_context_launcher = AsyncMock()\n    mocked_context = AsyncMock(spec=BrowserContext)\n\n    mocked_context_launcher.return_value = mocked_context\n    mocked_context.new_page.return_value = 
AsyncMock(spec=Page)\n\n    async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any:\n        \"\"\"Ensure that both calls to create context overlap in time.\"\"\"\n        await asyncio.sleep(5)  # Simulate delay in creation to make sure race condition happens\n        return await mocked_context_launcher(*args, **kwargs)\n\n    mocked_browser.launch_persistent_context = delayed_launch_persistent_context\n\n    # Create minimal instance of PlaywrightBrowserController with mocked browser\n    controller = PlaywrightBrowserController(\n        PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None\n    )\n\n    # Both calls will try to create browser context at the same time, but only one context should be created.\n    await asyncio.gather(controller.new_page(), controller.new_page())\n\n    assert mocked_context_launcher.call_count == 1\n\n\nasync def test_max_open_pages_limit_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:\n    pages = await asyncio.gather(controller.new_page(), controller.new_page())\n\n    assert controller.pages_count == 2\n\n    for page in pages:\n        await page.close()\n\n\nasync def test_max_open_pages_limit_error_on_concurrent_creation(controller: PlaywrightBrowserController) -> None:\n    \"\"\"Test that max open pages limit is respected during concurrent page creation.\"\"\"\n    with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'):\n        await asyncio.gather(controller.new_page(), controller.new_page(), controller.new_page())\n\n\nasync def test_browser_with_pre_existing_context(tmp_path: Path) -> None:\n    \"\"\"Test that using `Browser` with pre-existing active context re-uses such context.\"\"\"\n    async with async_playwright() as pw:\n        persistent_context = await pw.firefox.launch_persistent_context(\n            user_data_dir=str(tmp_path),\n            headless=True,\n        )\n        
browser = persistent_context.browser\n        assert browser\n\n        controller = PlaywrightBrowserController(browser=browser)\n        page_1 = await controller.new_page()\n        page_2 = await controller.new_page()\n        assert page_1.context == page_2.context == persistent_context\n"
  },
  {
    "path": "tests/unit/browsers/test_playwright_browser_plugin.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.browsers import PlaywrightBrowserPlugin\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from yarl import URL\n\n\n@pytest.fixture\nasync def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:\n    async with PlaywrightBrowserPlugin() as plugin:\n        yield plugin\n\n\nasync def test_initial_state() -> None:\n    plugin = PlaywrightBrowserPlugin(\n        browser_type='chromium',\n        browser_launch_options={'headless': False},\n        browser_new_context_options={'viewport': {'width': 1920, 'height': 1080}},\n        max_open_pages_per_browser=10,\n    )\n\n    # Test initial state\n    assert plugin.browser_type == 'chromium'\n    assert 'headless' in plugin.browser_launch_options\n    assert plugin.browser_launch_options['headless'] is False\n    assert plugin.browser_new_context_options == {'viewport': {'width': 1920, 'height': 1080}}\n    assert plugin.max_open_pages_per_browser == 10\n\n\nasync def test_new_browser(plugin: PlaywrightBrowserPlugin, server_url: URL) -> None:\n    browser_controller = await plugin.new_browser()\n\n    assert browser_controller.is_browser_connected\n\n    page = await browser_controller.new_page()\n    await page.goto(str(server_url))\n\n    await page.close()\n    await browser_controller.close()\n\n    assert not browser_controller.is_browser_connected\n\n\nasync def test_multiple_new_browsers(plugin: PlaywrightBrowserPlugin) -> None:\n    browser_controller_1 = await plugin.new_browser()\n    browser_controller_2 = await plugin.new_browser()\n\n    assert browser_controller_1 is not browser_controller_2\n\n\nasync def test_methods_raise_error_when_not_active() -> None:\n    plugin = PlaywrightBrowserPlugin()\n\n    assert plugin.active is False\n\n    with pytest.raises(RuntimeError, match=r'Plugin is not active'):\n        await plugin.new_browser()\n\n    with 
pytest.raises(RuntimeError, match=r'Plugin is already active.'):\n        async with plugin, plugin:\n            pass\n\n    async with plugin:\n        assert plugin.active is True\n\n\nasync def raise_error_if_chrome_and_executable_path() -> None:\n    with pytest.raises(\n        ValueError, match=r'Cannot use `use_chrome` with `Configuration.default_browser_path` or `executable_path` set.'\n    ):\n        PlaywrightBrowserPlugin(\n            browser_type='chrome',\n            browser_launch_options={'executable_path': '/path/to/chrome'},\n        )\n"
  },
  {
    "path": "tests/unit/conftest.py",
    "content": "from __future__ import annotations\n\nimport logging\nimport os\nimport warnings\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport pytest\nfrom curl_cffi import CurlHttpVersion\nfrom fakeredis import FakeAsyncRedis\nfrom proxy import Proxy\nfrom uvicorn.config import Config\n\nfrom crawlee import service_locator\nfrom crawlee.crawlers import BasicCrawler\nfrom crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network\nfrom crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient\nfrom crawlee.proxy_configuration import ProxyInfo\nfrom crawlee.statistics import Statistics\nfrom crawlee.storages import KeyValueStore\nfrom tests.unit.server import TestServer, app, serve_in_thread\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, Callable, Iterator\n    from pathlib import Path\n\n    from yarl import URL\n\n    from crawlee.http_clients._base import HttpClient\n\n\n@pytest.fixture(autouse=True)\nasync def suppress_user_warning() -> AsyncGenerator[None, None]:\n    \"\"\"Suppress user warnings during tests.\n\n    Mostly to suppress warnings about the experimental status of the SqlStorageClient.\n    \"\"\"\n    with warnings.catch_warnings():\n        warnings.simplefilter('ignore', UserWarning)\n        yield\n\n\n@pytest.fixture\ndef prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]:\n    \"\"\"Prepare the testing environment by resetting the global state before each test.\n\n    This fixture ensures that the global state of the package is reset to a known baseline before each test runs.\n    It also configures a temporary storage directory for test isolation.\n\n    Args:\n        monkeypatch: Test utility provided by pytest for patching.\n        tmp_path: A unique temporary directory path provided by pytest for test isolation.\n\n    Returns:\n        A callable that prepares the test environment.\n    \"\"\"\n\n    def 
_prepare_test_env() -> None:\n        # Disable the browser sandbox by setting the environment variable. This is required for running\n        # Playwright tests in the CI environment, where the sandbox is not supported.\n        monkeypatch.setenv('CRAWLEE_DISABLE_BROWSER_SANDBOX', 'true')\n\n        # Set the environment variable for the local storage directory to the temporary path.\n        monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path))\n\n        # Reset the services in the service locator.\n        service_locator._configuration = None\n        service_locator._event_manager = None\n        service_locator._storage_client = None\n        service_locator.storage_instance_manager.clear_cache()\n\n        # Verify that the test environment was set up correctly.\n        assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path)\n\n        # Reset global class variables to ensure test isolation.\n        KeyValueStore._autosaved_values = {}\n        Statistics._Statistics__next_id = 0  # type:ignore[attr-defined] # Mangled attribute\n        BasicCrawler._BasicCrawler__next_id = 0  # type:ignore[attr-defined] # Mangled attribute\n\n    return _prepare_test_env\n\n\n@pytest.fixture(autouse=True)\ndef _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None:\n    \"\"\"Isolate the testing environment by resetting global state before and after each test.\n\n    This fixture ensures that each test starts with a clean slate and that any modifications during the test\n    do not affect subsequent tests. 
It runs automatically for all tests.\n\n    Args:\n        prepare_test_env: Fixture to prepare the environment before each test.\n    \"\"\"\n    prepare_test_env()\n\n\n@pytest.fixture(autouse=True)\ndef _set_crawler_log_level(pytestconfig: pytest.Config, monkeypatch: pytest.MonkeyPatch) -> None:\n    from crawlee import _log_config  # noqa: PLC0415\n\n    loglevel = cast('str | None', pytestconfig.getoption('--log-level'))\n    if loglevel is not None:\n        monkeypatch.setattr(_log_config, 'get_configured_log_level', lambda: getattr(logging, loglevel.upper()))\n\n\n@pytest.fixture\nasync def proxy_info(unused_tcp_port: int) -> ProxyInfo:\n    username = 'user'\n    password = 'pass'\n\n    return ProxyInfo(\n        url=f'http://{username}:{password}@127.0.0.1:{unused_tcp_port}',\n        scheme='http',\n        hostname='127.0.0.1',\n        port=unused_tcp_port,\n        username=username,\n        password=password,\n    )\n\n\n@pytest.fixture\nasync def proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:\n    with Proxy(\n        [\n            '--hostname',\n            proxy_info.hostname,\n            '--port',\n            str(proxy_info.port),\n            '--basic-auth',\n            f'{proxy_info.username}:{proxy_info.password}',\n        ]\n    ):\n        yield proxy_info\n\n\n@pytest.fixture\nasync def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:\n    with Proxy(\n        [\n            '--hostname',\n            proxy_info.hostname,\n            '--port',\n            str(proxy_info.port),\n            '--basic-auth',\n            f'{proxy_info.username}:{proxy_info.password}',\n            '--disable-http-proxy',\n        ]\n    ):\n        yield proxy_info\n\n\n@pytest.fixture(scope='session')\ndef header_network() -> dict:\n    return get_available_header_network()\n\n\n@pytest.fixture\nasync def key_value_store() -> AsyncGenerator[KeyValueStore, None]:\n    kvs = await KeyValueStore.open()\n    
yield kvs\n    await kvs.drop()\n\n\n@pytest.fixture(scope='session')\ndef http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]:\n    \"\"\"Create and start an HTTP test server.\"\"\"\n    config = Config(app=app, lifespan='off', loop='asyncio', port=unused_tcp_port_factory())\n    server = TestServer(config=config)\n    yield from serve_in_thread(server)\n\n\n@pytest.fixture(scope='session')\ndef server_url(http_server: TestServer) -> URL:\n    \"\"\"Provide the base URL of the test server.\"\"\"\n    return http_server.url\n\n\n# It is needed only in some tests, so we use the standard `scope=function`\n@pytest.fixture\ndef redirect_http_server(unused_tcp_port_factory: Callable[[], int]) -> Iterator[TestServer]:\n    \"\"\"Create and start an HTTP test server.\"\"\"\n    config = Config(\n        app=app,\n        lifespan='off',\n        loop='asyncio',\n        port=unused_tcp_port_factory(),\n        limit_max_requests=100,\n        timeout_graceful_shutdown=10,\n        log_level='error',\n        access_log=False,\n        ws='websockets-sansio',\n    )\n    server = TestServer(config=config)\n    yield from serve_in_thread(server)\n\n\n@pytest.fixture\ndef redirect_server_url(redirect_http_server: TestServer) -> URL:\n    \"\"\"Provide the base URL of the test server.\"\"\"\n    return redirect_http_server.url\n\n\n@pytest.fixture(\n    params=[\n        pytest.param('httpx', id='httpx'),\n        pytest.param('impit', id='impit'),\n        pytest.param('curl', id='curl'),\n    ]\n)\nasync def http_client(request: pytest.FixtureRequest) -> AsyncGenerator[HttpClient, None]:\n    class_client: type[HttpClient]\n    kwargs: dict[str, Any]\n    if request.param == 'curl':\n        class_client = CurlImpersonateHttpClient\n        kwargs = {'http_version': CurlHttpVersion.V1_1}\n    elif request.param == 'impit':\n        class_client = ImpitHttpClient\n        kwargs = {'http3': False}\n    else:\n        class_client = 
HttpxHttpClient\n        kwargs = {'http2': True}\n    async with class_client(**kwargs) as client:\n        yield client\n\n\n@pytest.fixture\ndef redis_client() -> FakeAsyncRedis:\n    return FakeAsyncRedis()\n"
  },
  {
    "path": "tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom dataclasses import dataclass\nfrom datetime import timedelta\nfrom itertools import cycle\nfrom typing import TYPE_CHECKING, cast\nfrom unittest.mock import Mock, call, patch\n\nimport pytest\nfrom bs4 import Tag\nfrom parsel import Selector\nfrom typing_extensions import override\n\nfrom crawlee import Request\nfrom crawlee.crawlers import (\n    AdaptivePlaywrightCrawler,\n    AdaptivePlaywrightCrawlingContext,\n    AdaptivePlaywrightPostNavCrawlingContext,\n    AdaptivePlaywrightPreNavCrawlingContext,\n    BasicCrawler,\n    RenderingType,\n    RenderingTypePrediction,\n    RenderingTypePredictor,\n)\nfrom crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import (\n    AdaptivePlaywrightCrawlerStatisticState,\n)\nfrom crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (\n    AdaptiveContextError,\n)\nfrom crawlee.sessions import SessionPool\nfrom crawlee.statistics import Statistics\nfrom crawlee.storage_clients import SqlStorageClient\nfrom crawlee.storages import KeyValueStore, RequestQueue\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator, Iterator\n    from pathlib import Path\n\n    from yarl import URL\n\n\n_H1_TEXT = 'Static'\n_H2_TEXT = 'Only in browser'\n_H3_CHANGED_TEXT = 'Changed by JS'\n_INJECTED_JS_DELAY_MS = 100\n_PAGE_CONTENT_STATIC = f\"\"\"\n<h1>{_H1_TEXT}</h1>\n<h3>Initial text</h3>\n<script>\n    setTimeout(function() {{\n    let h2 = document.createElement(\"h2\");\n    h2.innerText = \"{_H2_TEXT}\";\n    document.getElementsByTagName(\"body\")[0].append(h2);\n    document.getElementsByTagName(\"h3\")[0].textContent=\"{_H3_CHANGED_TEXT}\";\n    }}, {_INJECTED_JS_DELAY_MS});\n\n</script>\n\"\"\"\n\n\n@pytest.fixture\ndef test_urls(server_url: URL) -> list[str]:\n    \"\"\"Example pages used in the test are mocked for static requests.\"\"\"\n    return [\n        
str(server_url.with_path('echo_content').with_query(content=_PAGE_CONTENT_STATIC)),\n        str(server_url.with_path('echo_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)),\n    ]\n\n\n@pytest.fixture\nasync def key_value_store() -> AsyncGenerator[KeyValueStore, None]:\n    kvs = await KeyValueStore.open()\n    yield kvs\n    await kvs.drop()\n\n\nclass _SimpleRenderingTypePredictor(RenderingTypePredictor):\n    \"\"\"Simplified predictor for tests.\"\"\"\n\n    def __init__(\n        self,\n        rendering_types: Iterator[RenderingType] | None = None,\n        detection_probability_recommendation: None | Iterator[float] = None,\n    ) -> None:\n        super().__init__()\n\n        self._rendering_types = rendering_types or cycle(['static'])\n        self._detection_probability_recommendation = detection_probability_recommendation or cycle([1])\n\n    @override\n    def predict(self, request: Request) -> RenderingTypePrediction:\n        return RenderingTypePrediction(next(self._rendering_types), next(self._detection_probability_recommendation))\n\n    @override\n    def store_result(self, request: Request, rendering_type: RenderingType) -> None:\n        pass\n\n\n@dataclass(frozen=True)\nclass TestInput:\n    __test__ = False\n\n    expected_pw_count: int\n    expected_static_count: int\n    rendering_types: Iterator[RenderingType]\n    detection_probability_recommendation: Iterator[float]\n\n\n@pytest.mark.parametrize(\n    'test_input',\n    [\n        pytest.param(\n            TestInput(\n                expected_pw_count=0,\n                expected_static_count=2,\n                # Lack of ty support, see https://github.com/astral-sh/ty/issues/2348.\n                rendering_types=cycle(['static']),\n                detection_probability_recommendation=cycle([0]),\n            ),\n            id='Static only',\n        ),\n        pytest.param(\n            TestInput(\n                expected_pw_count=2,\n                
expected_static_count=0,\n                rendering_types=cycle(['client only']),\n                detection_probability_recommendation=cycle([0]),\n            ),\n            id='Client only',\n        ),\n        pytest.param(\n            TestInput(\n                expected_pw_count=1,\n                expected_static_count=1,\n                rendering_types=cycle(['static', 'client only']),\n                detection_probability_recommendation=cycle([0]),\n            ),\n            id='Mixed',\n        ),\n        pytest.param(\n            TestInput(\n                expected_pw_count=2,\n                expected_static_count=2,\n                rendering_types=cycle(['static', 'client only']),\n                detection_probability_recommendation=cycle([1]),\n            ),\n            id='Enforced rendering type detection',\n        ),\n    ],\n)\nasync def test_adaptive_crawling(\n    test_input: TestInput,\n    test_urls: list[str],\n) -> None:\n    \"\"\"Tests correct routing to pre-nav hooks and correct handling through proper handler.\"\"\"\n\n    predictor = _SimpleRenderingTypePredictor(\n        rendering_types=test_input.rendering_types,\n        detection_probability_recommendation=test_input.detection_probability_recommendation,\n    )\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=predictor,\n    )\n\n    pw_handler_count = 0\n    static_handler_count = 0\n\n    pw_hook_count = 0\n    static_hook_count = 0\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        nonlocal pw_handler_count\n        nonlocal static_handler_count\n\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            pw_handler_count += 1\n        except AdaptiveContextError:\n            static_handler_count += 1\n\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:  # Intentionally unused arg\n        nonlocal static_hook_count\n        nonlocal pw_hook_count\n\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". Can trigger exception.\n            pw_hook_count += 1\n        except AdaptiveContextError:\n            static_hook_count += 1\n\n    await crawler.run(test_urls)\n\n    assert pw_handler_count == test_input.expected_pw_count\n    assert pw_hook_count == test_input.expected_pw_count\n\n    assert static_handler_count == test_input.expected_static_count\n    assert static_hook_count == test_input.expected_static_count\n\n\nasync def test_adaptive_crawling_parsel(test_urls: list[str]) -> None:\n    \"\"\"Top level test for parsel. Only one argument combination. (The rest of code is tested with bs variant.)\"\"\"\n    predictor = _SimpleRenderingTypePredictor(\n        rendering_types=cycle(['static', 'client only']),\n        detection_probability_recommendation=cycle([0]),\n    )\n\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        rendering_type_predictor=predictor,\n    )\n\n    pw_handler_count = 0\n    static_handler_count = 0\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        nonlocal pw_handler_count\n        nonlocal static_handler_count\n\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            pw_handler_count += 1\n        except AdaptiveContextError:\n            static_handler_count += 1\n\n    await crawler.run(test_urls)\n\n    assert pw_handler_count == 1\n    assert static_handler_count == 1\n\n\nasync def test_adaptive_crawling_pre_nav_change_to_context(test_urls: list[str]) -> None:\n    \"\"\"Tests that context can be modified in pre-navigation hooks.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n    user_data_in_pre_nav_hook = []\n    user_data_in_handler = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        user_data_in_handler.append(context.request.user_data.get('data', None))\n\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        user_data_in_pre_nav_hook.append(context.request.user_data.get('data', None))\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            context.request.user_data['data'] = 'pw'\n        except AdaptiveContextError:\n            context.request.user_data['data'] = 'bs'\n\n    await crawler.run(test_urls[:1])\n    # Check that repeated pre nav hook invocations do not influence each other while probing\n    assert user_data_in_pre_nav_hook == [None, None]\n    # Check that the request handler sees changes to user data done by pre nav hooks\n    assert user_data_in_handler == ['pw', 'bs']\n\n\nasync def test_playwright_only_pre_navigation_hook(test_urls: list[str]) -> None:\n    \"\"\"Test that hook can be registered for playwright only sub crawler.\n\n    Create a situation where one page is crawled by both sub crawlers. One common pre navigation hook is registered and\n    one playwright only pre navigation hook is registered.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n    pre_nav_hook_common = Mock()\n    pre_nav_hook_playwright = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        pass\n\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        pre_nav_hook_common(context.request.url)\n\n    @crawler.pre_navigation_hook(playwright_only=True)\n    async def pre_nav_hook_pw_only(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:\n        pre_nav_hook_playwright(context.page.url)\n\n    await crawler.run(test_urls[:1])\n\n    # Default behavior. 
Hook is called every time, both static sub crawler and playwright sub crawler.\n    pre_nav_hook_common.assert_has_calls([call(test_urls[0]), call(test_urls[0])])\n    # Hook is called only by playwright sub crawler.\n    pre_nav_hook_playwright.assert_called_once_with('about:blank')\n\n\nasync def test_adaptive_crawling_post_nav_change_to_context(test_urls: list[str]) -> None:\n    \"\"\"Tests that context can be modified in post-navigation hooks.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n    user_data_in_post_nav_hook = []\n    user_data_in_handler = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        user_data_in_handler.append(context.request.user_data.get('data', None))\n\n    @crawler.post_navigation_hook\n    async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:\n        user_data_in_post_nav_hook.append(context.request.user_data.get('data', None))\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            context.request.user_data['data'] = 'pw'\n        except AdaptiveContextError:\n            context.request.user_data['data'] = 'bs'\n\n    await crawler.run(test_urls[:1])\n    # Check that repeated post nav hook invocations do not influence each other while probing\n    assert user_data_in_post_nav_hook == [None, None]\n    # Check that the request handler sees changes to user data done by post nav hooks\n    assert user_data_in_handler == ['pw', 'bs']\n\n\nasync def test_playwright_only_post_navigation_hook(test_urls: list[str]) -> None:\n    \"\"\"Test that hook can be registered for playwright only sub crawler.\n\n    Create a situation where one page is crawled by both sub crawlers. One common post navigation hook is registered and\n    one playwright only post navigation hook is registered.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n    post_nav_hook_common = Mock()\n    post_nav_hook_playwright = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        pass\n\n    @crawler.post_navigation_hook\n    async def post_nav_hook(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:\n        post_nav_hook_common(context.request.url)\n\n    @crawler.post_navigation_hook(playwright_only=True)\n    async def post_nav_hook_pw_only(context: AdaptivePlaywrightPostNavCrawlingContext) -> None:\n        post_nav_hook_playwright(context.page.url)\n\n    await crawler.run(test_urls[:1])\n\n    # Default behavior. 
Hook is called every time, both static sub crawler and playwright sub crawler.\n    post_nav_hook_common.assert_has_calls([call(test_urls[0]), call(test_urls[0])])\n    # Hook is called only by playwright sub crawler.\n    post_nav_hook_playwright.assert_called_once_with(test_urls[0])\n\n\nasync def test_adaptive_crawling_result(test_urls: list[str]) -> None:\n    \"\"\"Tests that result only from one sub crawler is saved.\n\n    Enforced rendering type detection to run both sub crawlers.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            await context.push_data({'handler': 'pw'})\n        except AdaptiveContextError:\n            await context.push_data({'handler': 'bs'})\n\n    await crawler.run(test_urls[:1])\n\n    # Enforced rendering type detection will trigger both sub crawlers, but only pw crawler result is saved.\n    assert (await crawler.get_data()).items == [{'handler': 'pw'}]\n\n\n@pytest.mark.parametrize(\n    ('pw_saved_data', 'static_saved_data', 'expected_result_rendering_type'),\n    [\n        pytest.param({'some': 'data'}, {'some': 'data'}, 'static', id='Same results from sub crawlers'),\n        pytest.param({'some': 'data'}, {'different': 'data'}, 'client only', id='Different results from sub crawlers'),\n    ],\n)\nasync def test_adaptive_crawling_predictor_calls(\n    pw_saved_data: dict[str, str],\n    static_saved_data: dict[str, str],\n    expected_result_rendering_type: RenderingType,\n    test_urls: list[str],\n) -> None:\n    \"\"\"Tests expected predictor calls. Same results.\"\"\"\n    some_label = 'bla'\n    some_url = test_urls[0]\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n    requests = [Request.from_url(url=some_url, label=some_label)]\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            await context.push_data(pw_saved_data)\n        except AdaptiveContextError:\n            await context.push_data(static_saved_data)\n\n    with (\n        patch.object(static_only_predictor_enforce_detection, 'store_result', Mock()) as mocked_store_result,\n        patch.object(\n            static_only_predictor_enforce_detection, 'predict', Mock(return_value=RenderingTypePrediction('static', 1))\n        ) as mocked_predict,\n    ):\n        await crawler.run(requests)\n\n    assert mocked_predict.call_count == 1\n    assert mocked_predict.call_args[0][0].url == requests[0].url\n\n    # If `static` and `client only` results are same, `store_result` should be called with `static`.\n    mocked_store_result.assert_called_once_with(mocked_predict.call_args[0][0], expected_result_rendering_type)\n\n\nasync def test_adaptive_crawling_result_use_state_isolation(\n    key_value_store: KeyValueStore, test_urls: list[str]\n) -> None:\n    \"\"\"Tests that global state accessed through `use_state` is changed only by one sub crawler.\n\n    Enforced rendering type detection to run both sub crawlers.\"\"\"\n    static_only_predictor_enforce_detection = _SimpleRenderingTypePredictor()\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_enforce_detection,\n    )\n    await key_value_store.set_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0', {'counter': 0})\n    request_handler_calls = 0\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        nonlocal request_handler_calls\n        state = cast('dict[str, int]', await context.use_state())\n        request_handler_calls += 1\n        state['counter'] += 1\n\n    await crawler.run(test_urls[:1])\n\n    await key_value_store.persist_autosaved_values()\n\n    # Request handler was called twice\n    assert request_handler_calls == 2\n    # 
Increment of global state happened only once\n    assert (await key_value_store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'))['counter'] == 1\n\n\nasync def test_adaptive_crawling_statistics(test_urls: list[str]) -> None:\n    \"\"\"Test adaptive crawler statistics.\n\n    Crawler set to static crawling, but due to result_checker returning False on static crawling result it\n    will do browser crawling instead as well. This increments all three adaptive crawling related stats.\"\"\"\n    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_predictor_no_detection,\n        result_checker=lambda result: False,  #  noqa: ARG005  # Intentionally unused argument.\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        pass\n\n    await crawler.run(test_urls[:1])\n\n    assert crawler.statistics.state.http_only_request_handler_runs == 1\n    assert crawler.statistics.state.browser_request_handler_runs == 1\n    assert crawler.statistics.state.rendering_type_mispredictions == 1\n\n    # Despite running both sub crawlers the top crawler statistics should count this as one request finished.\n    assert crawler.statistics.state.requests_finished == 1\n    assert crawler.statistics.state.requests_failed == 0\n\n\n@pytest.mark.parametrize(\n    'error_in_pw_crawler',\n    [\n        pytest.param(False, id='Error only in static sub crawler'),\n        pytest.param(True, id='Error in both sub crawlers'),\n    ],\n)\nasync def test_adaptive_crawler_exceptions_in_sub_crawlers(*, error_in_pw_crawler: bool, test_urls: list[str]) -> None:\n    \"\"\"Test that correct results are committed when exceptions are raised in sub crawlers.\n\n    Exception in bs sub crawler will be logged and pw sub crawler used 
instead.\n    Any result from bs sub crawler will be discarded, result from pw crawler will be saved instead.\n    (But global state modifications through `use_state` will not be reverted!!!)\n\n    Exception in pw sub crawler will prevent any result from being committed. Even if `push_data` was called before\n    the exception\n    \"\"\"\n    static_only_no_detection_predictor = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        rendering_type_predictor=static_only_no_detection_predictor,\n    )\n    saved_data = {'some': 'data'}\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". Can trigger exception.\n            await context.push_data(saved_data)\n            if error_in_pw_crawler:\n                raise RuntimeError('Some pw sub crawler related error')\n\n        except AdaptiveContextError:\n            await context.push_data({'this': 'data should not be saved'})\n            raise RuntimeError('Some bs sub crawler related error') from None\n\n    await crawler.run(test_urls[:1])\n\n    dataset = await crawler.get_dataset()\n    stored_results = [item async for item in dataset.iterate_items()]\n\n    if error_in_pw_crawler:\n        assert stored_results == []\n    else:\n        assert stored_results == [saved_data]\n\n\nasync def test_adaptive_playwright_crawler_statistics_in_init() -> None:\n    \"\"\"Tests that adaptive crawler uses created AdaptivePlaywrightCrawlerStatistics from inputted Statistics.\"\"\"\n    persistence_enabled = True\n    persist_state_kvs_name = 'some-name'\n    persist_state_key = 'come key'\n    log_message = 'some message'\n    periodic_message_logger = logging.getLogger('some 
logger')\n    log_interval = timedelta(minutes=2)\n    statistics = Statistics.with_default_state(\n        persistence_enabled=persistence_enabled,\n        persist_state_kvs_name=persist_state_kvs_name,\n        persist_state_key=persist_state_key,\n        log_message=log_message,\n        periodic_message_logger=periodic_message_logger,\n        log_interval=log_interval,\n    )\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(statistics=statistics)\n    await crawler.run([])  # ensure that statistics get initialized\n\n    assert type(crawler._statistics.state) is AdaptivePlaywrightCrawlerStatisticState\n\n    assert crawler._statistics._state._persistence_enabled == persistence_enabled\n    assert crawler._statistics._state._persist_state_key == persist_state_key\n\n    assert crawler._statistics._log_message == log_message\n    assert crawler._statistics._periodic_message_logger == periodic_message_logger\n\n\nasync def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: list[str]) -> None:\n    \"\"\"Tests that timeout in static sub crawler forces fall back to browser sub crawler.\n\n    Create situation where static sub crawler blocks(should time out), such error should start browser sub\n    crawler.\n    \"\"\"\n    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n    # Use a generous timeout so the static pipeline has enough time to reach the handler even on slow CI.\n    # The handler will block indefinitely, so the timeout will always fire during the handler's wait.\n    request_handler_timeout = timedelta(seconds=10)\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        max_request_retries=0,\n        rendering_type_predictor=static_only_predictor_no_detection,\n        request_handler_timeout=request_handler_timeout,\n    )\n    mocked_static_handler = Mock(name='static_handler')\n    mocked_browser_handler = 
Mock(name='browser_handler')\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". Can trigger exception.\n            mocked_browser_handler()\n        except AdaptiveContextError:\n            mocked_static_handler()\n            # Relax timeout for the fallback browser request to allow for slow browser startup on CI\n            crawler._request_handler_timeout = timedelta(seconds=120)\n            # Block indefinitely - will be cancelled when the request_handler_timeout fires.\n            await asyncio.Event().wait()\n\n    await crawler.run(test_urls[:1])\n\n    mocked_static_handler.assert_called_once_with()\n    # Browser handler was capable of running despite static handler blocking longer than the handler timeout.\n    mocked_browser_handler.assert_called_once_with()\n\n\nasync def test_adaptive_playwright_crawler_default_predictor(test_urls: list[str]) -> None:\n    \"\"\"Test default rendering type predictor integration into crawler.\"\"\"\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()\n    mocked_static_handler = Mock()\n    mocked_browser_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        try:\n            # page is available only if it was crawled by PlaywrightCrawler.\n            context.page  # noqa:B018 Intentionally \"useless expression\". 
Can trigger exception.\n            mocked_browser_handler()\n        except AdaptiveContextError:\n            mocked_static_handler()\n\n    await crawler.run(test_urls[:1])\n\n    # First prediction should trigger rendering type detection as the predictor does not have any data for prediction.\n    mocked_static_handler.assert_called_once_with()\n    mocked_browser_handler.assert_called_once_with()\n\n\nasync def test_adaptive_context_query_selector_beautiful_soup(test_urls: list[str]) -> None:\n    \"\"\"Test that `context.query_selector_one` works regardless of the crawl type for BeautifulSoup variant.\n\n    Handler tries to locate two elements h1 and h2.\n    h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html.\n    Create situation where page is crawled with static sub crawler first.\n    Static sub crawler should be able to locate only h1. It will try to wait for h2, trying to wait for h2 will trigger\n    `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. 
Playwright sub\n    crawler is able to wait for the h2 element.\"\"\"\n\n    # Get page with injected JS code that will add some element after timeout\n    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n\n    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n        max_request_retries=1,\n        rendering_type_predictor=static_only_predictor_no_detection,\n    )\n\n    mocked_h1_handler = Mock()\n    mocked_h2_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        h1 = await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2))\n        mocked_h1_handler(h1)\n        h2 = await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2))\n        mocked_h2_handler(h2)\n\n    await crawler.run(test_urls[:1])\n\n    expected_h1_tag = Tag(name='h1')\n    expected_h1_tag.append(_H1_TEXT)\n\n    expected_h2_tag = Tag(name='h2')\n    expected_h2_tag.append(_H2_TEXT)\n\n    # Called by both sub crawlers\n    mocked_h1_handler.assert_has_calls([call(expected_h1_tag), call(expected_h1_tag)])\n    # Called only by pw sub crawler\n    mocked_h2_handler.assert_has_calls([call(expected_h2_tag)])\n\n\n@pytest.mark.flaky(\n    rerun=3,\n    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1650.',\n)\nasync def test_adaptive_context_query_selector_parsel(test_urls: list[str]) -> None:\n    \"\"\"Test that `context.query_selector_one` works regardless of the crawl type for Parsel variant.\n\n    Handler tries to locate two elements h1 and h2.\n    h1 exists immediately, h2 is created dynamically by inline JS snippet embedded in the html.\n    Create situation where page is crawled with static sub crawler first.\n    Static sub crawler should be able to locate only h1. 
It will try to wait for h2, trying to wait for h2 will trigger\n    `AdaptiveContextError` which will force the adaptive crawler to try playwright sub crawler instead. Playwright sub\n    crawler is able to wait for the h2 element.\"\"\"\n\n    # Get page with injected JS code that will add some element after timeout\n    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n    expected_h1_tag = f'<h1>{_H1_TEXT}</h1>'\n    expected_h2_tag = f'<h2>{_H2_TEXT}</h2>'\n\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        max_request_retries=1,\n        rendering_type_predictor=static_only_predictor_no_detection,\n    )\n\n    mocked_h1_handler = Mock()\n    mocked_h2_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        if h1 := await context.query_selector_one('h1', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)):\n            mocked_h1_handler(type(h1), h1.get())\n        if h2 := await context.query_selector_one('h2', timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)):\n            mocked_h2_handler(type(h2), h2.get())\n\n    await crawler.run(test_urls[:1])\n\n    # Called by both sub crawlers\n    mocked_h1_handler.assert_has_calls([call(Selector, expected_h1_tag), call(Selector, expected_h1_tag)])\n    # Called only by pw sub crawler\n    mocked_h2_handler.assert_has_calls([call(Selector, expected_h2_tag)])\n\n\nasync def test_adaptive_context_parse_with_static_parser_parsel(test_urls: list[str]) -> None:\n    \"\"\"Test `context.parse_with_static_parser` works regardless of the crawl type for Parsel variant.\n\n    (Test covers also  `context.wait_for_selector`, which is called by `context.parse_with_static_parser`)\n    \"\"\"\n    static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0]))\n    expected_h2_tag = 
f'<h2>{_H2_TEXT}</h2>'\n\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        max_request_retries=1,\n        rendering_type_predictor=static_only_predictor_no_detection,\n    )\n\n    mocked_h2_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        h2_static = context.parsed_content.css('h2')  # Should not find anything\n        mocked_h2_handler(h2_static)\n\n        # Reparse whole page after h2 appears\n        parsed_content_after_h2_appeared = await context.parse_with_static_parser(\n            selector='h2', timeout=timedelta(milliseconds=_INJECTED_JS_DELAY_MS * 2)\n        )\n        mocked_h2_handler(parsed_content_after_h2_appeared.css('h2')[0].get())\n\n    await crawler.run(test_urls[:1])\n\n    mocked_h2_handler.assert_has_calls(\n        [\n            call([]),  # Static sub crawler tried and did not find h2.\n            call([]),  # Playwright sub crawler tried and did not find h2 without waiting.\n            call(expected_h2_tag),  # Playwright waited for h2 to appear.\n        ]\n    )\n\n\nasync def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]) -> None:\n    \"\"\"Test that context helpers work on latest version of the page.\n\n    Scenario where page is changed after a while. 
H2 element is added and text of H3 element is modified.\n    Test that context helpers automatically work on latest version of the page by reading H3 element and expecting it's\n    dynamically changed text instead of the original static text.\n    \"\"\"\n    browser_only_predictor_no_detection = _SimpleRenderingTypePredictor(\n        rendering_types=cycle(['client only']),\n        detection_probability_recommendation=cycle([0]),\n    )\n    expected_h3_tag = f'<h3>{_H3_CHANGED_TEXT}</h3>'\n\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        max_request_retries=1,\n        rendering_type_predictor=browser_only_predictor_no_detection,\n    )\n\n    mocked_h3_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        await context.query_selector_one('h2')  # Wait for change that is indicated by appearance of h2 element.\n        if h3 := await context.query_selector_one('h3'):\n            mocked_h3_handler(h3.get())  # Get updated h3 element.\n\n    await crawler.run(test_urls[:1])\n\n    mocked_h3_handler.assert_called_once_with(expected_h3_tag)\n\n\nasync def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None:\n    \"\"\"Test that querying non-existing selector returns `None`\"\"\"\n    browser_only_predictor_no_detection = _SimpleRenderingTypePredictor(\n        rendering_types=cycle(['client only']),\n        detection_probability_recommendation=cycle([0]),\n    )\n\n    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(\n        max_request_retries=1,\n        rendering_type_predictor=browser_only_predictor_no_detection,\n    )\n\n    mocked_h3_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n        mocked_h3_handler(await context.query_selector_one('non sense selector', timeout=timedelta(milliseconds=1)))\n\n    
await crawler.run(test_urls[:1])\n\n    mocked_h3_handler.assert_called_once_with(None)\n\n\n@pytest.mark.parametrize(\n    'test_input',\n    [\n        pytest.param(\n            TestInput(\n                expected_pw_count=0,\n                expected_static_count=2,\n                rendering_types=cycle(['static']),\n                detection_probability_recommendation=cycle([0]),\n            ),\n            id='Static only',\n        ),\n        pytest.param(\n            TestInput(\n                expected_pw_count=2,\n                expected_static_count=0,\n                rendering_types=cycle(['client only']),\n                detection_probability_recommendation=cycle([0]),\n            ),\n            id='Client only',\n        ),\n        pytest.param(\n            TestInput(\n                expected_pw_count=2,\n                expected_static_count=2,\n                rendering_types=cycle(['static', 'client only']),\n                detection_probability_recommendation=cycle([1]),\n            ),\n            id='Enforced rendering type detection',\n        ),\n    ],\n)\nasync def test_change_context_state_after_handling(test_input: TestInput, server_url: URL) -> None:\n    \"\"\"Test that context state is saved after handling the request.\"\"\"\n    predictor = _SimpleRenderingTypePredictor(\n        rendering_types=test_input.rendering_types,\n        detection_probability_recommendation=test_input.detection_probability_recommendation,\n    )\n\n    request_queue = await RequestQueue.open(name='state-test')\n    used_session_id = None\n\n    async with SessionPool() as session_pool:\n        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n            rendering_type_predictor=predictor,\n            session_pool=session_pool,\n            request_manager=request_queue,\n        )\n\n        @crawler.router.default_handler\n        async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n          
  nonlocal used_session_id\n\n            if context.session is not None:\n                used_session_id = context.session.id\n                context.session.user_data['session_state'] = True\n\n            if isinstance(context.request.user_data['request_state'], list):\n                context.request.user_data['request_state'].append('handler')\n\n        request = Request.from_url(str(server_url), user_data={'request_state': ['initial']})\n\n        await crawler.run([request])\n\n        assert used_session_id is not None\n\n        session = await session_pool.get_session_by_id(used_session_id)\n        check_request = await request_queue.get_request(request.unique_key)\n\n        assert session is not None\n        assert check_request is not None\n\n        assert session.user_data.get('session_state') is True\n        # Check that request user data was updated in the handler and only once.\n        assert check_request.user_data.get('request_state') == ['initial', 'handler']\n\n        await request_queue.drop()\n\n\nasync def test_adaptive_playwright_crawler_with_sql_storage(test_urls: list[str], tmp_path: Path) -> None:\n    \"\"\"Tests that AdaptivePlaywrightCrawler can be initialized with SqlStorageClient.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:\n        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(\n            storage_client=storage_client,\n        )\n\n        mocked_handler = Mock()\n\n        @crawler.router.default_handler\n        async def request_handler(_context: AdaptivePlaywrightCrawlingContext) -> None:\n            mocked_handler()\n\n        await crawler.run(test_urls[:1])\n\n        mocked_handler.assert_called()\n"
  },
  {
    "path": "tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py",
    "content": "from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import (\n    AdaptivePlaywrightCrawlerStatisticState,\n)\nfrom crawlee.statistics import Statistics\n\n\nasync def test_predictor_state_persistence() -> None:\n    \"\"\"Test that adaptive statistics can be correctly persisted and initialized from persisted values.\"\"\"\n\n    async with Statistics(\n        state_model=AdaptivePlaywrightCrawlerStatisticState, persistence_enabled=True\n    ) as adaptive_statistics:\n        adaptive_statistics.state.browser_request_handler_runs = 1\n        adaptive_statistics.state.rendering_type_mispredictions = 2\n        adaptive_statistics.state.http_only_request_handler_runs = 3\n\n        persistence_state_key = adaptive_statistics._state._persist_state_key\n    # adaptive_statistics are persisted after leaving the context\n\n    # new_adaptive_statistics are initialized from the persisted values.\n    async with Statistics(\n        state_model=AdaptivePlaywrightCrawlerStatisticState,\n        persistence_enabled=True,\n        persist_state_key=persistence_state_key,\n    ) as new_adaptive_statistics:\n        pass\n\n    assert new_adaptive_statistics.state.browser_request_handler_runs == 1\n    assert new_adaptive_statistics.state.rendering_type_mispredictions == 2\n    assert new_adaptive_statistics.state.http_only_request_handler_runs == 3\n"
  },
  {
    "path": "tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py",
    "content": ""
  },
  {
    "path": "tests/unit/crawlers/_adaptive_playwright/test_predictor.py",
    "content": "from __future__ import annotations\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee.crawlers._adaptive_playwright._rendering_type_predictor import (\n    DefaultRenderingTypePredictor,\n    RenderingType,\n    calculate_url_similarity,\n    get_url_components,\n)\nfrom crawlee.storages import KeyValueStore\n\n\n@pytest.mark.parametrize('label', ['some label', None])\n@pytest.mark.parametrize(\n    ('url', 'expected_prediction'),\n    [\n        ('http://www.aaa.com/some/stuff/extra', 'static'),\n        ('http://www.aab.com/some/otherstuff', 'static'),\n        ('http://www.aac.com/some', 'static'),\n        ('http://www.ddd.com/some/stuff/extra', 'client only'),\n        ('http://www.dde.com/some/otherstuff', 'client only'),\n        ('http://www.ddf.com/some', 'client only'),\n    ],\n)\nasync def test_predictor_same_label(url: str, expected_prediction: RenderingType, label: str | None) -> None:\n    async with DefaultRenderingTypePredictor() as predictor:\n        learning_inputs: tuple[tuple[str, RenderingType], ...] 
= (\n            ('http://www.aaa.com/some/stuff', 'static'),\n            ('http://www.aab.com/some/stuff', 'static'),\n            ('http://www.aac.com/some/stuff', 'static'),\n            ('http://www.ddd.com/some/stuff', 'client only'),\n            ('http://www.dde.com/some/stuff', 'client only'),\n            ('http://www.ddf.com/some/stuff', 'client only'),\n        )\n\n        # Learn from small set\n        for learned_url, rendering_type in learning_inputs:\n            predictor.store_result(Request.from_url(url=learned_url, label=label), rendering_type=rendering_type)\n\n        assert predictor.predict(Request.from_url(url=url, label=label)).rendering_type == expected_prediction\n\n\nasync def test_predictor_new_label_increased_detection_probability_recommendation() -> None:\n    \"\"\"Test that urls of uncommon labels have increased detection recommendation.\n\n    This increase should gradually drop as the predictor learns more data with this label.\"\"\"\n    detection_ratio = 0.01\n    label = 'some label'\n    async with DefaultRenderingTypePredictor(detection_ratio=detection_ratio) as predictor:\n        # Learn first prediction of this label\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label=label), rendering_type='static'\n        )\n        # Increased detection_probability_recommendation\n        prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label))\n        assert prediction.rendering_type == 'static'\n        assert prediction.detection_probability_recommendation == detection_ratio * 4\n\n        # Learn second prediction of this label\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuffe', label=label), rendering_type='static'\n        )\n        # Increased detection_probability_recommendation\n        prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', 
label=label))\n        assert prediction.rendering_type == 'static'\n        assert prediction.detection_probability_recommendation == detection_ratio * 3\n\n        # Learn third prediction of this label\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuffi', label=label), rendering_type='static'\n        )\n        # Increased detection_probability_recommendation\n        prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label))\n        assert prediction.rendering_type == 'static'\n        assert prediction.detection_probability_recommendation == detection_ratio * 2\n\n        # Learn fourth prediction of this label.\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuffo', label=label), rendering_type='static'\n        )\n        # Label considered stable now. There should be no increase of detection_probability_recommendation.\n        prediction = predictor.predict(Request.from_url(url='http://www.aaa.com/some/stuffa', label=label))\n        assert prediction.rendering_type == 'static'\n        assert prediction.detection_probability_recommendation == detection_ratio\n\n\nasync def test_unreliable_prediction() -> None:\n    \"\"\"Test that detection_probability_recommendation for unreliable predictions is 1.\n\n    Create situation where no learning data of new label is available for the predictor.\n    It's first prediction is not reliable as both options have 50% chance, so it should set maximum\n    detection_probability_recommendation.\"\"\"\n    learnt_label = 'some label'\n\n    async with DefaultRenderingTypePredictor() as predictor:\n        # Learn two predictions of some label. 
One of each to make predictor very uncertain.\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label=learnt_label), rendering_type='static'\n        )\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/otherstuff', label=learnt_label), rendering_type='client only'\n        )\n\n        # Predict for new label. Predictor does not have enough information to give any reliable guess and should make\n        # it clear by setting detection_probability_recommendation=1\n        probability = predictor.predict(\n            Request.from_url(url='http://www.unknown.com', label='new label')\n        ).detection_probability_recommendation\n        assert probability == 1\n\n\nasync def test_no_learning_data_prediction() -> None:\n    \"\"\"Test that predictor can predict even if it never learnt anything before.\n\n    It should give some prediction, but it has to set detection_probability_recommendation=1\"\"\"\n    async with DefaultRenderingTypePredictor() as predictor:\n        probability = predictor.predict(\n            Request.from_url(url='http://www.unknown.com', label='new label')\n        ).detection_probability_recommendation\n\n        assert probability == 1\n\n\nasync def test_persistent_no_learning_data_prediction() -> None:\n    \"\"\"Test that the model is saved after initialisation in KeyValueStore.\"\"\"\n    persist_key = 'test-no_learning-state'\n    async with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key) as _predictor:\n        pass\n\n    kvs = await KeyValueStore.open()\n\n    persisted_data = await kvs.get_value(persist_key)\n\n    assert persisted_data is not None\n    assert persisted_data['model']['is_fitted'] is False\n\n\nasync def test_persistent_prediction() -> None:\n    \"\"\"Test that the model and resources is saved after train in KeyValueStore.\"\"\"\n    persist_key = 'test-persistent-state'\n    async 
with DefaultRenderingTypePredictor(persistence_enabled=True, persist_state_key=persist_key) as predictor:\n        # Learn some data\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label='some label'), rendering_type='static'\n        )\n\n    kvs = await KeyValueStore.open()\n\n    persisted_data = await kvs.get_value(persist_key)\n\n    assert persisted_data is not None\n    assert persisted_data['model']['is_fitted'] is True\n\n\n@pytest.mark.parametrize(\n    ('persistence_enabled', 'same_result'),\n    [\n        pytest.param(True, True, id='with persistence'),\n        pytest.param(False, False, id='without persistence'),\n    ],\n)\nasync def test_persistent_prediction_recovery(*, persistence_enabled: bool, same_result: bool) -> None:\n    \"\"\"Test that the model and resources is recovered from KeyValueStore.\"\"\"\n    persist_key = 'test-persistent-state-recovery'\n\n    async with DefaultRenderingTypePredictor(\n        detection_ratio=0.01, persistence_enabled=persistence_enabled, persist_state_key=persist_key\n    ) as predictor:\n        # Learn some data\n        predictor.store_result(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label='some label'), rendering_type='static'\n        )\n        before_recover_prediction = predictor.predict(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label='some label')\n        )\n\n    # Recover predictor\n    async with DefaultRenderingTypePredictor(\n        detection_ratio=0.01, persistence_enabled=True, persist_state_key=persist_key\n    ) as recover_predictor:\n        after_recover_prediction = recover_predictor.predict(\n            Request.from_url(url='http://www.aaa.com/some/stuff', label='some label')\n        )\n\n    # If persistence is enabled, the predicted results must be the same.\n    if same_result:\n        assert (\n            before_recover_prediction.detection_probability_recommendation\n        
    == after_recover_prediction.detection_probability_recommendation\n        )\n    else:\n        assert (\n            before_recover_prediction.detection_probability_recommendation\n            != after_recover_prediction.detection_probability_recommendation\n        )\n\n\n@pytest.mark.parametrize(\n    ('url_1', 'url_2', 'expected_rounded_similarity'),\n    [\n        (\n            'https://docs.python.org/3/library/itertools.html#itertools.zip_longest',\n            'https://docs.python.org/3.7/library/itertools.html#itertools.zip_longest',\n            0.67,\n        ),\n        ('https://differente.com/same', 'https://differenta.com/same', 0),\n        ('https://same.com/almost_the_same', 'https://same.com/almost_the_sama', 1),\n        ('https://same.com/same/extra', 'https://same.com/same', 0.5),\n    ],\n)\ndef test_url_similarity(url_1: str, url_2: str, expected_rounded_similarity: float) -> None:\n    assert (\n        round(calculate_url_similarity(url_1=get_url_components(url_1), url_2=get_url_components(url_2)), 2)\n        == expected_rounded_similarity\n    )\n"
  },
  {
    "path": "tests/unit/crawlers/_basic/test_basic_crawler.py",
    "content": "# ruff: noqa: ARG001\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os\nimport re\nimport sys\nimport time\nfrom asyncio import Future\nfrom collections import Counter\nfrom concurrent.futures import ProcessPoolExecutor\nfrom dataclasses import dataclass\nfrom datetime import timedelta\nfrom itertools import product\nfrom typing import TYPE_CHECKING, Any, Literal, cast\nfrom unittest.mock import AsyncMock, Mock, call, patch\n\nimport pytest\n\nfrom crawlee import ConcurrencySettings, Glob, service_locator\nfrom crawlee._request import Request, RequestState\nfrom crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod\nfrom crawlee._utils.robots import RobotsTxtFile\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import BasicCrawler\nfrom crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError\nfrom crawlee.events import Event, EventCrawlerStatusData\nfrom crawlee.events._local_event_manager import LocalEventManager\nfrom crawlee.request_loaders import RequestList, RequestManagerTandem\nfrom crawlee.sessions import Session, SessionPool\nfrom crawlee.statistics import FinalStatistics\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\nfrom crawlee.storages import Dataset, KeyValueStore, RequestQueue\n\nif TYPE_CHECKING:\n    from collections.abc import Callable, Sequence\n    from pathlib import Path\n\n    from yarl import URL\n\n    from crawlee._types import JsonSerializable\n    from crawlee.statistics import StatisticsState\n\n\nasync def test_processes_requests_from_explicit_queue() -> None:\n    queue = await RequestQueue.open()\n    await queue.add_requests(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    crawler = BasicCrawler(request_manager=queue)\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: 
BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n\n    await crawler.run()\n\n    assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']\n\n\nasync def test_processes_requests_from_request_source_tandem() -> None:\n    request_queue = await RequestQueue.open()\n    await request_queue.add_requests(\n        ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']\n    )\n\n    request_list = RequestList(['https://a.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com'])\n\n    crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue))\n    calls = set[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.add(context.request.url)\n\n    await crawler.run()\n\n    assert calls == {\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n        'https://d.placeholder.com',\n        'https://e.placeholder.com',\n    }\n\n\nasync def test_processes_requests_from_run_args() -> None:\n    crawler = BasicCrawler()\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']\n\n\nasync def test_allows_multiple_run_calls() -> None:\n    crawler = BasicCrawler()\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n    await 
crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    assert calls == [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n    ]\n\n\nasync def test_retries_failed_requests() -> None:\n    crawler = BasicCrawler()\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n\n        if context.request.url == 'https://b.placeholder.com':\n            raise RuntimeError('Arbitrary crash for testing purposes')\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    assert calls == [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n        'https://b.placeholder.com',\n        'https://b.placeholder.com',\n        'https://b.placeholder.com',\n    ]\n\n\nasync def test_respects_no_retry() -> None:\n    crawler = BasicCrawler(max_request_retries=2)\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n        raise RuntimeError('Arbitrary crash for testing purposes')\n\n    await crawler.run(\n        [\n            'https://a.placeholder.com',\n            'https://b.placeholder.com',\n            Request.from_url(url='https://c.placeholder.com', no_retry=True),\n        ]\n    )\n\n    assert calls == [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n    ]\n\n\nasync def 
test_respects_request_specific_max_retries() -> None:\n    crawler = BasicCrawler(max_request_retries=0)\n    calls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        calls.append(context.request.url)\n        raise RuntimeError('Arbitrary crash for testing purposes')\n\n    await crawler.run(\n        [\n            'https://a.placeholder.com',\n            'https://b.placeholder.com',\n            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}),\n        ]\n    )\n\n    assert calls == [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n        'https://c.placeholder.com',\n    ]\n\n\nasync def test_calls_error_handler() -> None:\n    # Data structure to better track the calls to the error handler.\n    @dataclass(frozen=True)\n    class Call:\n        url: str\n        error: Exception\n\n    # List to store the information of calls to the error handler.\n    calls = list[Call]()\n\n    crawler = BasicCrawler(max_request_retries=2)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.request.url == 'https://b.placeholder.com':\n            raise RuntimeError('Arbitrary crash for testing purposes')\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request:\n        # Append the current call information.\n        calls.append(Call(context.request.url, error))\n        return context.request\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    # Verify that the error handler was called twice\n    assert len(calls) == 2\n\n    # Check calls\n    for error_call in calls:\n        assert error_call.url == 'https://b.placeholder.com'\n        assert isinstance(error_call.error, 
RuntimeError)\n\n\nasync def test_calls_error_handler_for_session_errors() -> None:\n    crawler = BasicCrawler(\n        max_session_rotations=1,\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        raise SessionError('Arbitrary session error for testing purposes')\n\n    error_handler_mock = AsyncMock()\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        await error_handler_mock(context, error)\n\n    await crawler.run(['https://crawlee.dev'])\n\n    assert error_handler_mock.call_count == 1\n\n\nasync def test_handles_error_in_error_handler() -> None:\n    crawler = BasicCrawler(max_request_retries=3)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.request.url == 'https://b.placeholder.com':\n            raise RuntimeError('Arbitrary crash for testing purposes')\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        raise RuntimeError('Crash in error handler')\n\n    with pytest.raises(UserDefinedErrorHandlerError):\n        await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n\nasync def test_calls_failed_request_handler() -> None:\n    crawler = BasicCrawler(max_request_retries=3)\n    calls = list[tuple[BasicCrawlingContext, Exception]]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.request.url == 'https://b.placeholder.com':\n            raise RuntimeError('Arbitrary crash for testing purposes')\n\n    @crawler.failed_request_handler\n    async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        calls.append((context, error))\n\n    await crawler.run(['https://a.placeholder.com', 
'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    assert len(calls) == 1\n    assert calls[0][0].request.url == 'https://b.placeholder.com'\n    assert isinstance(calls[0][1], RuntimeError)\n\n\n@pytest.mark.parametrize('handler', ['failed_request_handler', 'error_handler'])\nasync def test_handlers_use_context_helpers(tmp_path: Path, handler: str) -> None:\n    \"\"\"Test that context helpers used in `failed_request_handler` and in `error_handler` have effect.\"\"\"\n    # Prepare crawler\n    storage_client = FileSystemStorageClient()\n    crawler = BasicCrawler(\n        max_request_retries=1, storage_client=storage_client, configuration=Configuration(storage_dir=str(tmp_path))\n    )\n    # Test data\n    rq_alias = 'other'\n    test_data = {'some': 'data'}\n    test_key = 'key'\n    test_value = 'value'\n    test_request = Request.from_url('https://d.placeholder.com')\n\n    # Request handler with injected error\n    @crawler.router.default_handler\n    async def request_handler(context: BasicCrawlingContext) -> None:\n        raise RuntimeError('Arbitrary crash for testing purposes')\n\n    # Apply one of the handlers\n    @getattr(crawler, handler)\n    async def handler_implementation(context: BasicCrawlingContext, error: Exception) -> None:\n        await context.push_data(test_data)\n        await context.add_requests(requests=[test_request], rq_alias=rq_alias)\n        kvs = await context.get_key_value_store()\n        await kvs.set_value(test_key, test_value)\n\n    await crawler.run(['https://b.placeholder.com'])\n\n    # Verify that the context helpers used in handlers had effect on used storages\n    dataset = await Dataset.open(storage_client=storage_client)\n    kvs = await KeyValueStore.open(storage_client=storage_client)\n    rq = await RequestQueue.open(alias=rq_alias, storage_client=storage_client)\n\n    assert test_value == await kvs.get_value(test_key)\n    assert [test_data] == (await dataset.get_data()).items\n    assert 
test_request == await rq.fetch_next_request()\n\n\nasync def test_handles_error_in_failed_request_handler() -> None:\n    crawler = BasicCrawler(max_request_retries=3)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.request.url == 'https://b.placeholder.com':\n            raise RuntimeError('Arbitrary crash for testing purposes')\n\n    @crawler.failed_request_handler\n    async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        raise RuntimeError('Crash in failed request handler')\n\n    with pytest.raises(UserDefinedErrorHandlerError):\n        await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n\n@pytest.mark.parametrize(\n    ('method', 'path', 'payload'),\n    [\n        pytest.param('GET', 'get', None, id='get send_request'),\n        pytest.param('POST', 'post', b'Hello, world!', id='post send_request'),\n    ],\n)\nasync def test_send_request_works(server_url: URL, method: HttpMethod, path: str, payload: None | bytes) -> None:\n    response_data: dict[str, Any] = {}\n\n    crawler = BasicCrawler(max_request_retries=3)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        response = await context.send_request(str(server_url / path), method=method, payload=payload)\n\n        response_data['body'] = json.loads(await response.read())\n        response_data['headers'] = response.headers\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    response_body = response_data.get('body')\n    assert response_body is not None\n    assert response_body.get('data') == (payload.decode() if payload else None)\n\n    response_headers = response_data.get('headers')\n    assert response_headers is not None\n    content_type = response_headers.get('content-type')\n    assert 
content_type is not None\n    assert content_type == 'application/json'\n\n\n@dataclass\nclass AddRequestsTestInput:\n    start_url: str\n    loaded_url: str\n    requests: Sequence[str | Request]\n    expected_urls: Sequence[str]\n    kwargs: EnqueueLinksKwargs\n\n\nSTRATEGY_TEST_URLS = (\n    'https://someplace.com/',\n    'http://someplace.com/index.html',\n    'https://blog.someplace.com/index.html',\n    'https://redirect.someplace.com',\n    'https://other.place.com/index.html',\n    'https://someplace.jp/',\n)\n\nINCLUDE_TEST_URLS = (\n    'https://someplace.com/',\n    'https://someplace.com/blog/category/cats',\n    'https://someplace.com/blog/category/boots',\n    'https://someplace.com/blog/archive/index.html',\n    'https://someplace.com/blog/archive/cats',\n)\n\n\n@pytest.mark.parametrize(\n    'test_input',\n    argvalues=[\n        # Basic use case\n        pytest.param(\n            AddRequestsTestInput(\n                start_url='https://a.placeholder.com',\n                loaded_url='https://a.placeholder.com',\n                requests=[\n                    'https://a.placeholder.com',\n                    Request.from_url('https://b.placeholder.com'),\n                    'https://c.placeholder.com',\n                ],\n                kwargs={},\n                expected_urls=['https://b.placeholder.com', 'https://c.placeholder.com'],\n            ),\n            id='basic',\n        ),\n        # Enqueue strategy\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[0],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(),\n                expected_urls=STRATEGY_TEST_URLS[1:],\n            ),\n            id='enqueue_strategy_default',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[0],\n                
loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='all'),\n                expected_urls=STRATEGY_TEST_URLS[1:],\n            ),\n            id='enqueue_strategy_all',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[0],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-domain'),\n                expected_urls=STRATEGY_TEST_URLS[1:4],\n            ),\n            id='enqueue_strategy_same_domain',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[0],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-hostname'),\n                expected_urls=[STRATEGY_TEST_URLS[1]],\n            ),\n            id='enqueue_strategy_same_hostname',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[0],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-origin'),\n                expected_urls=[],\n            ),\n            id='enqueue_strategy_same_origin',\n        ),\n        # Enqueue strategy with redirect\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[3],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(),\n                expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],\n            ),\n            id='redirect_enqueue_strategy_default',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                
start_url=STRATEGY_TEST_URLS[3],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='all'),\n                expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],\n            ),\n            id='redirect_enqueue_strategy_all',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[3],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-domain'),\n                expected_urls=STRATEGY_TEST_URLS[:3],\n            ),\n            id='redirect_enqueue_strategy_same_domain',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[3],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-hostname'),\n                expected_urls=[],\n            ),\n            id='redirect_enqueue_strategy_same_hostname',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=STRATEGY_TEST_URLS[3],\n                loaded_url=STRATEGY_TEST_URLS[0],\n                requests=STRATEGY_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(strategy='same-origin'),\n                expected_urls=[],\n            ),\n            id='redirect_enqueue_strategy_same_origin',\n        ),\n        # Include/exclude\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=INCLUDE_TEST_URLS[0],\n                loaded_url=INCLUDE_TEST_URLS[0],\n                requests=INCLUDE_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(include=[Glob('https://someplace.com/**/cats')]),\n                expected_urls=[INCLUDE_TEST_URLS[1], INCLUDE_TEST_URLS[4]],\n            ),\n            
id='include_exclude_1',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=INCLUDE_TEST_URLS[0],\n                loaded_url=INCLUDE_TEST_URLS[0],\n                requests=INCLUDE_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(exclude=[Glob('https://someplace.com/**/cats')]),\n                expected_urls=[INCLUDE_TEST_URLS[2], INCLUDE_TEST_URLS[3]],\n            ),\n            id='include_exclude_2',\n        ),\n        pytest.param(\n            AddRequestsTestInput(\n                start_url=INCLUDE_TEST_URLS[0],\n                loaded_url=INCLUDE_TEST_URLS[0],\n                requests=INCLUDE_TEST_URLS,\n                kwargs=EnqueueLinksKwargs(\n                    include=[Glob('https://someplace.com/**/cats')], exclude=[Glob('https://**/archive/**')]\n                ),\n                expected_urls=[INCLUDE_TEST_URLS[1]],\n            ),\n            id='include_exclude_3',\n        ),\n    ],\n)\nasync def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None:\n    visit = Mock()\n\n    crawler = BasicCrawler()\n\n    @crawler.router.handler('start')\n    async def start_handler(context: BasicCrawlingContext) -> None:\n        # Assign test value to loaded_url - BasicCrawler does not do any navigation by itself\n        context.request.loaded_url = test_input.loaded_url\n        await context.add_requests(\n            test_input.requests,\n            **test_input.kwargs,\n        )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        visit(context.request.url)\n\n    await crawler.run([Request.from_url(test_input.start_url, label='start')])\n\n    visited = {call[0][0] for call in visit.call_args_list}\n    assert visited == set(test_input.expected_urls)\n\n\nasync def test_session_rotation(server_url: URL) -> None:\n    session_ids: list[str | None] = []\n\n    crawler = BasicCrawler(\n        max_session_rotations=7,\n       
 max_request_retries=1,\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        session_ids.append(context.session.id if context.session else None)\n        raise SessionError('Test error')\n\n    await crawler.run([str(server_url)])\n\n    # exactly 7 handler calls happened\n    assert len(session_ids) == 7\n\n    # all session ids are not None\n    assert None not in session_ids\n\n    # and each was a different session\n    assert len(set(session_ids)) == 7\n\n\nasync def test_final_statistics() -> None:\n    crawler = BasicCrawler(max_request_retries=2)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        id_param = context.request.get_query_param_from_url('id')\n        assert id_param is not None\n        id = int(id_param)\n\n        await asyncio.sleep(0.001)\n\n        if context.request.retry_count == 0 and id % 2 == 0:\n            raise RuntimeError('First crash')\n\n        if context.request.retry_count == 1 and id % 3 == 0:\n            raise RuntimeError('Second crash')\n\n        if context.request.retry_count == 2 and id % 4 == 0:\n            raise RuntimeError('Third crash')\n\n    final_statistics = await crawler.run(\n        [Request.from_url(f'https://someplace.com/?id={id}', label='start') for id in range(50)]\n    )\n\n    assert final_statistics.requests_total == 50\n    assert final_statistics.requests_finished == 45\n    assert final_statistics.requests_failed == 5\n\n    assert final_statistics.retry_histogram == [25, 16, 9]\n\n    assert final_statistics.request_avg_finished_duration is not None\n    assert final_statistics.request_avg_finished_duration > timedelta()\n\n    assert final_statistics.request_avg_failed_duration is not None\n    assert final_statistics.request_avg_failed_duration > timedelta()\n\n    assert final_statistics.request_total_duration > timedelta()\n\n    assert 
final_statistics.crawler_runtime > timedelta()\n\n    assert final_statistics.requests_finished_per_minute > 0\n    assert final_statistics.requests_failed_per_minute > 0\n\n\nasync def test_crawler_get_storages() -> None:\n    crawler = BasicCrawler()\n\n    rp = await crawler.get_request_manager()\n    assert isinstance(rp, RequestQueue)\n\n    dataset = await crawler.get_dataset()\n    assert isinstance(dataset, Dataset)\n\n    kvs = await crawler.get_key_value_store()\n    assert isinstance(kvs, KeyValueStore)\n\n\nasync def test_crawler_run_requests() -> None:\n    crawler = BasicCrawler()\n    seen_urls = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        seen_urls.append(context.request.url)\n\n    start_urls = [\n        'http://test.io/1',\n        'http://test.io/2',\n        'http://test.io/3',\n    ]\n    stats = await crawler.run(start_urls)\n\n    assert seen_urls == start_urls\n    assert stats.requests_total == 3\n    assert stats.requests_finished == 3\n\n\nasync def test_context_push_and_get_data() -> None:\n    crawler = BasicCrawler()\n    dataset = await Dataset.open()\n\n    await dataset.push_data({'a': 1})\n    assert (await crawler.get_data()).items == [{'a': 1}]\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.push_data({'b': 2})\n\n    await dataset.push_data({'c': 3})\n    assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}]\n\n    stats = await crawler.run(['http://test.io/1'])\n\n    assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}, {'b': 2}]\n    assert stats.requests_total == 1\n    assert stats.requests_finished == 1\n\n\nasync def test_context_push_and_get_data_handler_error() -> None:\n    crawler = BasicCrawler()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.push_data({'b': 2})\n    
    raise RuntimeError('Watch me crash')\n\n    stats = await crawler.run(['https://a.placeholder.com'])\n\n    assert (await crawler.get_data()).items == []\n    assert stats.requests_total == 1\n    assert stats.requests_finished == 0\n    assert stats.requests_failed == 1\n\n\nasync def test_crawler_push_and_export_data(tmp_path: Path) -> None:\n    crawler = BasicCrawler()\n    dataset = await Dataset.open()\n\n    await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])\n    await dataset.push_data({'id': 2, 'test': 'test'})\n\n    await crawler.export_data(path=tmp_path / 'dataset.json')\n    await crawler.export_data(path=tmp_path / 'dataset.csv')\n\n    assert json.load((tmp_path / 'dataset.json').open()) == [\n        {'id': 0, 'test': 'test'},\n        {'id': 1, 'test': 'test'},\n        {'id': 2, 'test': 'test'},\n    ]\n\n    # On Windows, text mode file writes convert \\n to \\r\\n, resulting in \\r\\n line endings.\n    # On Unix/Linux, \\n remains as \\n.\n    if sys.platform == 'win32':\n        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\\r\\n0,test\\r\\n1,test\\r\\n2,test\\r\\n'\n    else:\n        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\\n0,test\\n1,test\\n2,test\\n'\n\n\nasync def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None:\n    crawler = BasicCrawler()\n    dataset = await Dataset.open()\n\n    await dataset.push_data({'z': 1, 'a': 2})\n\n    json_path = tmp_path / 'dataset.json'\n    csv_path = tmp_path / 'dataset.csv'\n\n    await crawler.export_data(path=json_path, sort_keys=True, separators=(',', ':'))\n    await crawler.export_data(path=csv_path, delimiter=';', lineterminator='\\n')\n\n    assert json_path.read_text() == '[{\"a\":2,\"z\":1}]'\n    assert csv_path.read_text() == 'z;a\\n1;2\\n'\n\n\nasync def test_context_push_and_export_data(tmp_path: Path) -> None:\n    crawler = BasicCrawler()\n\n    @crawler.router.default_handler\n    async def 
handler(context: BasicCrawlingContext) -> None:\n        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])\n        await context.push_data({'id': 2, 'test': 'test'})\n\n    await crawler.run(['http://test.io/1'])\n\n    await crawler.export_data(path=tmp_path / 'dataset.json')\n    await crawler.export_data(path=tmp_path / 'dataset.csv')\n\n    assert json.load((tmp_path / 'dataset.json').open()) == [\n        {'id': 0, 'test': 'test'},\n        {'id': 1, 'test': 'test'},\n        {'id': 2, 'test': 'test'},\n    ]\n\n    # On Windows, text mode file writes convert \\n to \\r\\n, resulting in \\r\\n line endings.\n    # On Unix/Linux, \\n remains as \\n.\n    if sys.platform == 'win32':\n        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\\r\\n0,test\\r\\n1,test\\r\\n2,test\\r\\n'\n    else:\n        assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\\n0,test\\n1,test\\n2,test\\n'\n\n\nasync def test_context_update_kv_store() -> None:\n    crawler = BasicCrawler()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        store = await context.get_key_value_store()\n        await store.set_value('foo', 'bar')\n\n    await crawler.run(['https://hello.world'])\n\n    store = await crawler.get_key_value_store()\n    assert (await store.get_value('foo')) == 'bar'\n\n\nasync def test_context_use_state() -> None:\n    crawler = BasicCrawler()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.use_state({'hello': 'world'})\n\n    await crawler.run(['https://hello.world'])\n\n    kvs = await crawler.get_key_value_store()\n    value = await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')\n\n    assert value == {'hello': 'world'}\n\n\nasync def test_crawler_use_state() -> None:\n    crawler = BasicCrawler()\n\n    await crawler.use_state({'hello': 'world'})\n\n    
@crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        # The state set by the crawler must be available in the context of the request handler\n        state = await context.use_state()\n        assert state == {'hello': 'world'}\n\n    await crawler.run(['https://hello.world'])\n\n\nasync def test_context_use_state_crawlers_share_state() -> None:\n    async def handler(context: BasicCrawlingContext) -> None:\n        state = await context.use_state({'urls': []})\n        assert isinstance(state['urls'], list)\n        state['urls'].append(context.request.url)\n\n    crawler_1 = BasicCrawler(id=0, request_handler=handler)\n    crawler_2 = BasicCrawler(id=0, request_handler=handler)\n\n    await crawler_1.run(['https://a.com'])\n    await crawler_2.run(['https://b.com'])\n\n    kvs = await KeyValueStore.open()\n    assert crawler_1._id == crawler_2._id == 0\n    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_{crawler_1._id}') == {\n        'urls': ['https://a.com', 'https://b.com']\n    }\n\n\nasync def test_crawlers_share_stats() -> None:\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.use_state({'urls': []})\n\n    crawler_1 = BasicCrawler(id=0, request_handler=handler)\n    crawler_2 = BasicCrawler(id=0, request_handler=handler, statistics=crawler_1.statistics)\n\n    result1 = await crawler_1.run(['https://a.com'])\n    result2 = await crawler_2.run(['https://b.com'])\n\n    assert crawler_1.statistics == crawler_2.statistics\n    assert result1.requests_finished == 1\n    assert result2.requests_finished == 2\n\n\nasync def test_context_use_state_crawlers_own_state() -> None:\n    async def handler(context: BasicCrawlingContext) -> None:\n        state = await context.use_state({'urls': []})\n        assert isinstance(state['urls'], list)\n        state['urls'].append(context.request.url)\n\n    crawler_1 = BasicCrawler(request_handler=handler)\n    crawler_2 = 
BasicCrawler(request_handler=handler)\n\n    await crawler_1.run(['https://a.com'])\n    await crawler_2.run(['https://b.com'])\n\n    kvs = await KeyValueStore.open()\n    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') == {'urls': ['https://a.com']}\n    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1') == {'urls': ['https://b.com']}\n\n\nasync def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None:\n    state_in_handler_one: dict[str, JsonSerializable] = {}\n    state_in_handler_two: dict[str, JsonSerializable] = {}\n    state_in_handler_three: dict[str, JsonSerializable] = {}\n\n    crawler = BasicCrawler()\n\n    @crawler.router.handler('one')\n    async def handler_one(context: BasicCrawlingContext) -> None:\n        state = await context.use_state({'hello': 'world'})\n        state_in_handler_one.update(state)\n        state['hello'] = 'new_world'\n        await context.add_requests([Request.from_url('https://crawlee.dev/docs/quick-start', label='two')])\n\n    @crawler.router.handler('two')\n    async def handler_two(context: BasicCrawlingContext) -> None:\n        state = await context.use_state({'hello': 'world'})\n        state_in_handler_two.update(state)\n        state['hello'] = 'last_world'\n\n    @crawler.router.handler('three')\n    async def handler_three(context: BasicCrawlingContext) -> None:\n        state = await context.use_state({'hello': 'world'})\n        state_in_handler_three.update(state)\n\n    await crawler.run([Request.from_url('https://crawlee.dev/', label='one')])\n    await crawler.run([Request.from_url('https://crawlee.dev/docs/examples', label='three')])\n\n    # The state in handler_one must match the default state\n    assert state_in_handler_one == {'hello': 'world'}\n\n    # The state in handler_two must match the state updated in handler_one\n    assert state_in_handler_two == {'hello': 'new_world'}\n\n    # The state in handler_three must match the final state 
updated in previous run\n    assert state_in_handler_three == {'hello': 'last_world'}\n\n    store = await crawler.get_key_value_store()\n\n    # The state in the KVS must match with the last set state\n    assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')) == {'hello': 'last_world'}\n\n\n@pytest.mark.parametrize(\n    'use_failed_requests', [pytest.param(True, id='failed requests'), pytest.param(False, id='finished requests')]\n)\nasync def test_max_requests_per_crawl(*, use_failed_requests: bool) -> None:\n    start_urls = [\n        'http://test.io/1',\n        'http://test.io/2',\n        'http://test.io/3',\n        'http://test.io/4',\n        'http://test.io/5',\n    ]\n    processed_urls = []\n\n    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately\n    crawler = BasicCrawler(\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        max_requests_per_crawl=3,\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if use_failed_requests:\n            raise RuntimeError('Arbitrary crash for testing purposes')\n        processed_urls.append(context.request.url)\n\n    stats = await crawler.run(start_urls)\n\n    # Verify that only 3 out of the 5 provided URLs were made\n    if not use_failed_requests:\n        assert len(processed_urls) == 3\n        assert stats.requests_finished == 3\n    assert stats.requests_total == 3\n\n\nasync def test_max_crawl_depth() -> None:\n    processed_urls = []\n\n    # Set max_concurrency to 1 to ensure testing max_crawl_depth accurately\n    crawler = BasicCrawler(\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        max_crawl_depth=2,\n    )\n\n    @crawler.router.handler('start')\n    async def start_handler(context: BasicCrawlingContext) -> None:\n        processed_urls.append(context.request.url)\n        await 
context.add_requests(['https://someplace.com/too-deep'])\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        processed_urls.append(context.request.url)\n\n    start_request = Request.from_url('https://someplace.com/', label='start')\n    start_request.crawl_depth = 2\n\n    stats = await crawler.run([start_request])\n\n    assert len(processed_urls) == 1\n    assert stats.requests_total == 1\n    assert stats.requests_finished == 1\n\n\n@pytest.mark.parametrize(\n    ('total_requests', 'fail_at_request', 'expected_starts', 'expected_finished'),\n    [\n        (3, None, 3, 3),\n        (3, 2, 2, 1),\n    ],\n    ids=[\n        'all_requests_successful',\n        'abort_on_second_request',\n    ],\n)\nasync def test_abort_on_error(\n    total_requests: int, fail_at_request: int | None, expected_starts: int, expected_finished: int\n) -> None:\n    starts_urls = []\n\n    crawler = BasicCrawler(\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        abort_on_error=True,\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        starts_urls.append(context.request.url)\n\n        if context.request.user_data.get('n_request') == fail_at_request:\n            raise ValueError('Error request')\n\n    stats = await crawler.run(\n        [\n            Request.from_url('https://crawlee.dev', always_enqueue=True, user_data={'n_request': i + 1})\n            for i in range(total_requests)\n        ]\n    )\n\n    assert len(starts_urls) == expected_starts\n    assert stats.requests_finished == expected_finished\n\n\ndef test_crawler_log() -> None:\n    crawler = BasicCrawler()\n    assert isinstance(crawler.log, logging.Logger)\n    crawler.log.info('Test log message')\n\n\nasync def test_consecutive_runs_purge_request_queue() -> None:\n    crawler = BasicCrawler()\n    visit = Mock()\n\n    
@crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        visit(context.request.url)\n\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n    await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    counter = Counter(args[0][0] for args in visit.call_args_list)\n    assert counter == {\n        'https://a.placeholder.com': 3,\n        'https://b.placeholder.com': 3,\n        'https://c.placeholder.com': 3,\n    }\n\n\n@pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')\n@pytest.mark.parametrize(\n    ('statistics_log_format'),\n    [\n        pytest.param('table', id='With table for logs'),\n        pytest.param('inline', id='With inline logs'),\n    ],\n)\nasync def test_logs_final_statistics(\n    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline']\n) -> None:\n    # Set the log level to INFO to capture the final statistics log.\n    caplog.set_level(logging.INFO)\n\n    crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.push_data({'something': 'something'})\n\n    fake_statistics = FinalStatistics(\n        requests_finished=4,\n        requests_failed=33,\n        retry_histogram=[1, 4, 8],\n        request_avg_failed_duration=timedelta(seconds=99),\n        request_avg_finished_duration=timedelta(milliseconds=483),\n        requests_finished_per_minute=0.33,\n        requests_failed_per_minute=0.1,\n        request_total_duration=timedelta(minutes=12),\n        requests_total=37,\n        
crawler_runtime=timedelta(minutes=5),\n    )\n\n    monkeypatch.setattr(crawler._statistics, 'calculate', lambda: fake_statistics)\n\n    result = await crawler.run()\n    assert result is fake_statistics\n\n    final_statistics = next(\n        (record for record in caplog.records if record.msg.startswith('Final')),\n        None,\n    )\n\n    assert final_statistics is not None\n    if statistics_log_format == 'table':\n        assert final_statistics.msg.splitlines() == [\n            'Final request statistics:',\n            '┌───────────────────────────────┬────────────┐',\n            '│ requests_finished             │ 4          │',\n            '│ requests_failed               │ 33         │',\n            '│ retry_histogram               │ [1, 4, 8]  │',\n            '│ request_avg_failed_duration   │ 1min 39.0s │',\n            '│ request_avg_finished_duration │ 483.0ms    │',\n            '│ requests_finished_per_minute  │ 0.33       │',\n            '│ requests_failed_per_minute    │ 0.1        │',\n            '│ request_total_duration        │ 12min      │',\n            '│ requests_total                │ 37         │',\n            '│ crawler_runtime               │ 5min       │',\n            '└───────────────────────────────┴────────────┘',\n        ]\n    else:\n        assert final_statistics.msg == 'Final request statistics:'\n\n        # `extra` parameters are not defined on `LogRecord`, so we cast to `Any` to access them.\n        record = cast('Any', final_statistics)\n\n        assert record.requests_finished == 4\n        assert record.requests_failed == 33\n        assert record.retry_histogram == [1, 4, 8]\n        assert record.request_avg_failed_duration == 99.0\n        assert record.request_avg_finished_duration == 0.483\n        assert record.requests_finished_per_minute == 0.33\n        assert record.requests_failed_per_minute == 0.1\n        assert record.request_total_duration == 720.0\n        assert record.requests_total == 
37\n        assert record.crawler_runtime == 300.0\n\n\nasync def test_crawler_manual_stop() -> None:\n    \"\"\"Test that no new requests are handled after crawler.stop() is called.\"\"\"\n    start_urls = [\n        'http://test.io/1',\n        'http://test.io/2',\n        'http://test.io/3',\n    ]\n    processed_urls = []\n\n    # Set max_concurrency to 1 to ensure testing urls are visited one by one in order.\n    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1))\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        processed_urls.append(context.request.url)\n        if context.request.url == start_urls[1]:\n            crawler.stop()\n\n    stats = await crawler.run(start_urls)\n\n    # Verify that only 2 out of the 3 provided URLs were made\n    assert len(processed_urls) == 2\n    assert stats.requests_total == 2\n    assert stats.requests_finished == 2\n\n\n@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')\nasync def test_crawler_multiple_stops_in_parallel() -> None:\n    \"\"\"Test that no new requests are handled after crawler.stop() is called, but ongoing requests can still finish.\"\"\"\n\n    start_urls = [\n        'http://test.io/1',\n        'http://test.io/2',\n        'http://test.io/3',\n    ]\n    processed_urls = []\n\n    # Set concurrency to 2 to ensure two urls are being visited in parallel.\n    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2))\n\n    both_handlers_started = asyncio.Barrier(2)  # type:ignore[attr-defined]  # Test is skipped in older Python versions.\n    only_one_handler_at_a_time = asyncio.Semaphore(1)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await both_handlers_started.wait()  # Block until both handlers are started.\n\n       
 async with only_one_handler_at_a_time:\n            # Reliably create situation where one handler called `crawler.stop()`, while other handler is still running.\n            crawler.stop(reason=f'Stop called on {context.request.url}')\n            processed_urls.append(context.request.url)\n\n    stats = await crawler.run(start_urls)\n\n    # Verify that only 2 out of the 3 provided URLs were made\n    assert len(processed_urls) == 2\n    assert stats.requests_total == 2\n    assert stats.requests_finished == 2\n\n\nasync def test_services_no_side_effect_on_crawler_init() -> None:\n    custom_configuration = Configuration()\n    custom_event_manager = LocalEventManager.from_config(custom_configuration)\n    custom_storage_client = MemoryStorageClient()\n\n    _ = BasicCrawler(\n        configuration=custom_configuration,\n        event_manager=custom_event_manager,\n        storage_client=custom_storage_client,\n    )\n\n    assert service_locator.get_configuration() is not custom_configuration\n    assert service_locator.get_event_manager() is not custom_event_manager\n    assert service_locator.get_storage_client() is not custom_storage_client\n\n\nasync def test_crawler_uses_default_services() -> None:\n    custom_configuration = Configuration()\n    service_locator.set_configuration(custom_configuration)\n\n    custom_event_manager = LocalEventManager.from_config(custom_configuration)\n    service_locator.set_event_manager(custom_event_manager)\n\n    custom_storage_client = MemoryStorageClient()\n    service_locator.set_storage_client(custom_storage_client)\n\n    basic_crawler = BasicCrawler()\n\n    assert basic_crawler._service_locator.get_configuration() is custom_configuration\n    assert basic_crawler._service_locator.get_event_manager() is custom_event_manager\n    assert basic_crawler._service_locator.get_storage_client() is custom_storage_client\n\n\nasync def test_services_crawlers_can_use_different_services() -> None:\n    custom_configuration_1 = 
Configuration()\n    custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1)\n    custom_storage_client_1 = MemoryStorageClient()\n\n    custom_configuration_2 = Configuration()\n    custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2)\n    custom_storage_client_2 = MemoryStorageClient()\n\n    _ = BasicCrawler(\n        configuration=custom_configuration_1,\n        event_manager=custom_event_manager_1,\n        storage_client=custom_storage_client_1,\n    )\n\n    _ = BasicCrawler(\n        configuration=custom_configuration_2,\n        event_manager=custom_event_manager_2,\n        storage_client=custom_storage_client_2,\n    )\n\n\nasync def test_crawler_uses_default_storages(tmp_path: Path) -> None:\n    configuration = Configuration(\n        storage_dir=str(tmp_path),\n        purge_on_start=True,\n    )\n    service_locator.set_configuration(configuration)\n\n    dataset = await Dataset.open()\n    kvs = await KeyValueStore.open()\n    rq = await RequestQueue.open()\n\n    crawler = BasicCrawler()\n\n    assert dataset is await crawler.get_dataset()\n    assert kvs is await crawler.get_key_value_store()\n    assert rq is await crawler.get_request_manager()\n\n\nasync def test_crawler_can_use_other_storages(tmp_path: Path) -> None:\n    configuration = Configuration(\n        storage_dir=str(tmp_path),\n        purge_on_start=True,\n    )\n    service_locator.set_configuration(configuration)\n\n    dataset = await Dataset.open()\n    kvs = await KeyValueStore.open()\n    rq = await RequestQueue.open()\n\n    crawler = BasicCrawler(storage_client=MemoryStorageClient())\n\n    assert dataset is not await crawler.get_dataset()\n    assert kvs is not await crawler.get_key_value_store()\n    assert rq is not await crawler.get_request_manager()\n\n\nasync def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> None:\n    \"\"\"Test that crawler can use non-global storage of the same type as global 
storage without conflicts\"\"\"\n    a_path = tmp_path / 'a'\n    b_path = tmp_path / 'b'\n    a_path.mkdir()\n    b_path.mkdir()\n    expected_paths = {\n        path / storage\n        for path, storage in product({a_path, b_path}, {'datasets', 'key_value_stores', 'request_queues'})\n    }\n\n    configuration_a = Configuration(\n        storage_dir=str(a_path),\n        purge_on_start=True,\n    )\n    configuration_b = Configuration(\n        storage_dir=str(b_path),\n        purge_on_start=True,\n    )\n\n    # Set global configuration\n    service_locator.set_configuration(configuration_a)\n    service_locator.set_storage_client(FileSystemStorageClient())\n    # Create storages based on the global services\n    dataset = await Dataset.open()\n    kvs = await KeyValueStore.open()\n    rq = await RequestQueue.open()\n\n    # Set the crawler to use a different storage client\n    crawler = BasicCrawler(storage_client=FileSystemStorageClient(), configuration=configuration_b)\n\n    # Assert that the storages are different\n    assert dataset is not await crawler.get_dataset()\n    assert kvs is not await crawler.get_key_value_store()\n    assert rq is not await crawler.get_request_manager()\n\n    # Assert that all storages exist on the filesystem\n    for path in expected_paths:\n        assert path.is_dir()\n\n\nasync def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None:\n    custom_storage_client = MemoryStorageClient()\n\n    crawler = BasicCrawler(\n        storage_client=custom_storage_client,\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await context.push_data({'foo': 'bar'})\n\n    other_storage_client = MemoryStorageClient()\n    service_locator.set_storage_client(other_storage_client)\n\n    with monkeypatch.context() as monkey:\n        spy = Mock(wraps=service_locator.get_storage_client)\n        monkey.setattr(service_locator, 
'get_storage_client', spy)\n        await crawler.run(['https://does-not-matter.com'])\n        assert spy.call_count >= 1\n\n    dataset = await crawler.get_dataset()\n    data = await dataset.get_data()\n    assert data.items == [{'foo': 'bar'}]\n\n\n@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')\nasync def test_context_use_state_race_condition_in_handlers(key_value_store: KeyValueStore) -> None:\n    \"\"\"Two parallel handlers increment global variable obtained by `use_state` method.\n\n    Result should be incremented by 2.\n    Method `use_state` must be implemented in a way that prevents race conditions in such scenario.\"\"\"\n    # Test is skipped in older Python versions.\n    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415\n\n    crawler = BasicCrawler()\n    store = await crawler.get_key_value_store()\n    await store.set_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0', {'counter': 0})\n    handler_barrier = Barrier(2)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        state = cast('dict[str, int]', await context.use_state())\n        await handler_barrier.wait()  # Block until both handlers get the state.\n        state['counter'] += 1\n        await handler_barrier.wait()  # Block until both handlers increment the state.\n\n    await crawler.run(['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'])\n\n    store = await crawler.get_key_value_store()\n    # Ensure that local state is pushed back to kvs.\n    await store.persist_autosaved_values()\n    assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'))['counter'] == 2\n\n\n@pytest.mark.run_alone\n@pytest.mark.flaky(\n    reruns=3, reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1652.'\n)\n@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.timeout was introduced in 
Python 3.11.')\n@pytest.mark.parametrize(\n    'sleep_type',\n    [\n        pytest.param('async_sleep'),\n        pytest.param('sync_sleep', marks=pytest.mark.skip(reason='https://github.com/apify/crawlee-python/issues/908')),\n    ],\n)\nasync def test_timeout_in_handler(sleep_type: str) -> None:\n    \"\"\"Test that timeout from request handler is treated the same way as exception thrown in request handler.\n\n    Handler should be able to time out even if the code causing the timeout is blocking sync code.\n    Crawler should attempt to retry it.\n    This test creates situation where the request handler times out twice, on third retry it does not time out.\"\"\"\n    # Test is skipped in older Python versions.\n    from asyncio import timeout  # type:ignore[attr-defined] # noqa: PLC0415\n\n    non_realtime_system_coefficient = 10\n    handler_timeout = timedelta(seconds=1)\n    max_request_retries = 3\n    double_handler_timeout_s = handler_timeout.total_seconds() * 2\n    handler_sleep = iter([double_handler_timeout_s, double_handler_timeout_s, 0])\n\n    crawler = BasicCrawler(\n        request_handler_timeout=handler_timeout,\n        max_request_retries=max_request_retries,\n        storage_client=MemoryStorageClient(),\n    )\n\n    mocked_handler_before_sleep = Mock()\n    mocked_handler_after_sleep = Mock()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        mocked_handler_before_sleep()\n\n        if sleep_type == 'async_sleep':\n            await asyncio.sleep(next(handler_sleep))\n        else:\n            time.sleep(next(handler_sleep))  # noqa:ASYNC251  # Using blocking sleep in async function is the test.\n\n        # This will not execute if timeout happens.\n        mocked_handler_after_sleep()\n\n    # Timeout in pytest, because previous implementation would run crawler until following:\n    # \"The request queue seems to be stuck for 300.0s, resetting internal state.\"\n    async with 
timeout(max_request_retries * double_handler_timeout_s * non_realtime_system_coefficient):\n        await crawler.run(['https://a.placeholder.com'])\n\n    assert crawler.statistics.state.requests_finished == 1\n    assert mocked_handler_before_sleep.call_count == max_request_retries\n    assert mocked_handler_after_sleep.call_count == 1\n\n\n@pytest.mark.flaky(\n    reruns=3,\n    reason='Test is flaky on Windows and MacOS, see https://github.com/apify/crawlee-python/issues/1649.',\n)\n@pytest.mark.parametrize(\n    ('keep_alive', 'max_requests_per_crawl', 'expected_handled_requests_count'),\n    [\n        pytest.param(True, 2, 2, id='keep_alive, 2 requests'),\n        pytest.param(True, 1, 1, id='keep_alive, but max_requests_per_crawl achieved after 1 request'),\n        pytest.param(False, 2, 0, id='Crawler without keep_alive (default), crawler finished before adding requests'),\n    ],\n)\nasync def test_keep_alive(\n    *, keep_alive: bool, max_requests_per_crawl: int, expected_handled_requests_count: int\n) -> None:\n    \"\"\"Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`.\n\n    Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.\"\"\"\n    additional_urls = ['https://a.placeholder.com', 'https://b.placeholder.com']\n    expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]]\n\n    crawler = BasicCrawler(\n        keep_alive=keep_alive,\n        max_requests_per_crawl=max_requests_per_crawl,\n        # If more request can run in parallel, then max_requests_per_crawl is not deterministic.\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        storage_client=MemoryStorageClient(),\n    )\n    mocked_handler = Mock()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        mocked_handler(context.request.url)\n        if 
context.request == additional_urls[-1]:\n            crawler.stop()\n\n    crawler_run_task = asyncio.create_task(crawler.run())\n\n    # Give some time to crawler to finish(or be in keep_alive state) and add new request.\n    # TODO: Replace sleep time by waiting for specific crawler state.\n    # https://github.com/apify/crawlee-python/issues/925\n    await asyncio.sleep(1)\n    assert crawler_run_task.done() != keep_alive\n    add_request_task = asyncio.create_task(crawler.add_requests(additional_urls))\n\n    await asyncio.gather(crawler_run_task, add_request_task)\n\n    mocked_handler.assert_has_calls(expected_handler_calls)\n\n\n@pytest.mark.parametrize(\n    ('retire'),\n    [\n        pytest.param(False, id='without retire'),\n        pytest.param(True, id='with retire'),\n    ],\n)\nasync def test_session_retire_in_user_handler(*, retire: bool) -> None:\n    crawler = BasicCrawler(session_pool=SessionPool(max_pool_size=1))\n    sessions = list[str]()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.session:\n            sessions.append(context.session.id)\n\n            context.session.retire() if retire else None\n\n        await context.add_requests(['https://b.placeholder.com'])\n\n    await crawler.run(['https://a.placeholder.com'])\n\n    # The session should differ if `retire` was called and match otherwise since pool size == 1\n    if retire:\n        assert sessions[1] != sessions[0]\n    else:\n        assert sessions[1] == sessions[0]\n\n\nasync def test_bound_session_to_request() -> None:\n    async with SessionPool() as session_pool:\n        check_session: Session = await session_pool.get_session()\n        used_sessions = list[str]()\n        crawler = BasicCrawler(session_pool=session_pool)\n\n        @crawler.router.default_handler\n        async def handler(context: BasicCrawlingContext) -> None:\n            if context.session:\n                
used_sessions.append(context.session.id)\n\n        requests = [\n            Request.from_url('https://a.placeholder.com', session_id=check_session.id, always_enqueue=True)\n            for _ in range(10)\n        ]\n\n        await crawler.run(requests)\n\n        assert len(used_sessions) == 10\n        assert set(used_sessions) == {check_session.id}\n\n\nasync def test_bound_sessions_to_same_request() -> None:\n    # Use a custom function to avoid errors due to random Session retrieval\n    def create_session_function() -> Callable[[], Session]:\n        counter = -1\n\n        def create_session() -> Session:\n            nonlocal counter\n            counter += 1\n            return Session(id=str(counter))\n\n        return create_session\n\n    check_sessions = [str(session_id) for session_id in range(10)]\n    used_sessions = list[str]()\n    crawler = BasicCrawler(session_pool=SessionPool(create_session_function=create_session_function()))\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if context.session:\n            used_sessions.append(context.session.id)\n\n    requests = [\n        Request.from_url('https://a.placeholder.com', session_id=str(session_id), use_extended_unique_key=True)\n        for session_id in range(10)\n    ]\n\n    await crawler.run(requests)\n\n    assert len(used_sessions) == 10\n    assert set(used_sessions) == set(check_sessions)\n\n\nasync def test_error_bound_session_to_request() -> None:\n    crawler = BasicCrawler(request_handler=AsyncMock())\n\n    requests = [Request.from_url('https://a.placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)]\n\n    stats = await crawler.run(requests)\n\n    assert stats.requests_total == 10\n    assert stats.requests_failed == 10\n    assert stats.retry_histogram == [10]\n\n\nasync def test_handle_error_bound_session_to_request() -> None:\n    error_handler_mock = AsyncMock()\n    crawler = 
BasicCrawler(request_handler=AsyncMock())\n\n    @crawler.failed_request_handler\n    async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> None:\n        if isinstance(error, RequestCollisionError):\n            await error_handler_mock(context, error)\n\n    requests = [Request.from_url('https://a.placeholder.com', session_id='1')]\n\n    await crawler.run(requests)\n\n    assert error_handler_mock.call_count == 1\n\n\nasync def test_handles_session_error_in_failed_request_handler() -> None:\n    crawler = BasicCrawler(max_session_rotations=1)\n    handler_requests = set()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        raise SessionError('blocked')\n\n    @crawler.failed_request_handler\n    async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None:\n        handler_requests.add(context.request.url)\n\n    requests = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']\n\n    await crawler.run(requests)\n\n    assert set(requests) == handler_requests\n\n\nasync def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None:\n    crawler = BasicCrawler(respect_robots_txt_file=True)\n\n    with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy:\n        await asyncio.gather(\n            *[asyncio.create_task(crawler._get_robots_txt_file_for_url(str(server_url))) for _ in range(10)]\n        )\n\n        # Check that the lock was acquired only once\n        assert spy.call_count == 1\n\n\nasync def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None:\n    caplog.set_level(logging.INFO)\n    crawler = BasicCrawler(\n        configure_logging=False,\n        max_request_retries=1,\n        request_handler_timeout=timedelta(seconds=1),\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: 
BasicCrawlingContext) -> None:\n        # Intentionally add a delay longer than the timeout to trigger the timeout mechanism\n        await asyncio.sleep(10)  # INJECTED DELAY\n\n    # Capture all logs from the 'crawlee' logger at INFO level or higher\n    with caplog.at_level(logging.INFO, logger='crawlee'):\n        await crawler.run([Request.from_url('https://a.placeholder.com')])\n\n    # Check for the timeout message in any of the logs\n    found_timeout_message = False\n    for record in caplog.records:\n        if record.message and 'timed out after 1.0 seconds' in record.message:\n            full_message = (record.message or '') + (record.exc_text or '')\n            assert '\\n' not in full_message\n            assert '# INJECTED DELAY' in full_message\n            found_timeout_message = True\n            break\n\n    assert found_timeout_message, 'Expected log message about request handler error was not found.'\n\n\nasync def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:\n    crawler = BasicCrawler(configure_logging=False, max_request_retries=1)\n\n    @crawler.router.default_handler\n    async def default_handler(_: BasicCrawlingContext) -> None:\n        await asyncio.wait_for(Future(), timeout=1)\n\n    # Capture all logs from the 'crawlee' logger at INFO level or higher\n    with caplog.at_level(logging.INFO, logger='crawlee'):\n        await crawler.run([Request.from_url('https://a.placeholder.com')])\n\n    # Check for 1 line summary message\n    found_timeout_message = False\n    for record in caplog.records:\n        if re.match(\n            r'Retrying request to .* due to: Timeout raised by user defined handler\\. 
File .*, line .*,'\n            r' in default_handler,     await asyncio.wait_for\\(Future\\(\\), timeout=1\\)',\n            record.message,\n        ):\n            found_timeout_message = True\n            break\n\n    assert found_timeout_message, 'Expected log message about request handler error was not found.'\n\n\nasync def test_status_message_callback() -> None:\n    \"\"\"Test that status message callback is called with the correct message.\"\"\"\n    status_message_callback = AsyncMock()\n    states: list[dict[str, StatisticsState | None]] = []\n\n    async def status_callback(\n        state: StatisticsState, previous_state: StatisticsState | None, message: str\n    ) -> str | None:\n        await status_message_callback(message)\n        states.append({'state': state, 'previous_state': previous_state})\n        return message\n\n    crawler = BasicCrawler(\n        status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01)\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        await asyncio.sleep(0.1)  # Simulate some processing time\n\n    await crawler.run(['https://a.placeholder.com'])\n\n    assert status_message_callback.called\n\n    assert len(states) > 1\n\n    first_call = states[0]\n    second_call = states[1]\n\n    # For the first call, `previous_state` is None\n    assert first_call['state'] is not None\n    assert first_call['previous_state'] is None\n\n    # For second call, `previous_state` is the first state\n    assert second_call['state'] is not None\n    assert second_call['previous_state'] is not None\n    assert second_call['previous_state'] == first_call['state']\n\n\nasync def test_status_message_emit() -> None:\n    event_manager = service_locator.get_event_manager()\n\n    status_message_listener = Mock()\n\n    def listener(event_data: EventCrawlerStatusData) -> None:\n        status_message_listener(event_data)\n\n    
event_manager.on(event=Event.CRAWLER_STATUS, listener=listener)\n\n    crawler = BasicCrawler(request_handler=AsyncMock())\n\n    await crawler.run(['https://a.placeholder.com'])\n\n    event_manager.off(event=Event.CRAWLER_STATUS, listener=listener)\n\n    assert status_message_listener.called\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None, *, by_id: bool) -> None:\n    crawler = BasicCrawler()\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_id = rq.id\n        queue_name = None\n    else:\n        queue_id = None\n    visit_urls = set()\n\n    check_requests = [\n        Request.from_url('https://a.placeholder.com'),\n        Request.from_url('https://b.placeholder.com'),\n        Request.from_url('https://c.placeholder.com'),\n    ]\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.add_requests(check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run(['https://start.placeholder.com'])\n\n    requests_from_queue = []\n    while request := await rq.fetch_next_request():\n        requests_from_queue.append(request)\n\n    assert requests_from_queue == check_requests\n    assert visit_urls == {'https://start.placeholder.com'}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'queue_id'),\n    [\n        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),\n        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),\n        
pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),\n        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),\n    ],\n)\nasync def test_add_requests_error_with_multi_params(\n    queue_id: str | None, queue_name: str | None, queue_alias: str | None\n) -> None:\n    crawler = BasicCrawler()\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        with pytest.raises(ValueError, match='Only one of `rq_id`, `rq_name` or `rq_alias` can be set'):\n            await context.add_requests(\n                [Request.from_url('https://a.placeholder.com')],\n                rq_id=queue_id,\n                rq_name=queue_name,\n                rq_alias=queue_alias,\n            )\n\n    await crawler.run(['https://start.placeholder.com'])\n\n\nasync def test_crawler_purge_request_queue_uses_same_storage_client() -> None:\n    \"\"\"Make sure that purge on start does not replace the storage client in the underlying storage manager\"\"\"\n\n    # Set some different storage_client globally and different for Crawlee.\n    service_locator.set_storage_client(FileSystemStorageClient())\n    unrelated_rq = await RequestQueue.open()\n    unrelated_request = Request.from_url('https://x.placeholder.com')\n    await unrelated_rq.add_request(unrelated_request)\n\n    crawler = BasicCrawler(storage_client=MemoryStorageClient())\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        context.log.info(context.request.url)\n\n    for _ in (1, 2):\n        await crawler.run(requests=[Request.from_url('https://a.placeholder.com')], purge_request_queue=True)\n        assert crawler.statistics.state.requests_finished == 1\n\n    # Crawler should not fall back to the default storage after the purge\n    assert await unrelated_rq.fetch_next_request() == unrelated_request\n\n\nasync def _run_crawler(crawler_id: int | None, 
requests: list[str], storage_dir: str) -> StatisticsState:\n    \"\"\"Run crawler and return its statistics state.\n\n    Must be defined like this to be picklable for ProcessPoolExecutor.\"\"\"\n\n    async def request_handler(context: BasicCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n        # Add visited url to crawler state and use it to verify state persistence.\n        state = await context.use_state({'urls': []})\n        state['urls'] = state.get('urls')\n        assert isinstance(state['urls'], list)\n        state['urls'].append(context.request.url)\n        context.log.info(f'State {state}')\n\n    crawler = BasicCrawler(\n        id=crawler_id,\n        request_handler=request_handler,\n        concurrency_settings=ConcurrencySettings(max_concurrency=1, desired_concurrency=1),\n        configuration=Configuration(\n            storage_dir=storage_dir,\n            purge_on_start=False,\n        ),\n    )\n\n    await crawler.run(requests)\n    return crawler.statistics.state\n\n\n@dataclass\nclass _CrawlerInput:\n    requests: list[str]\n    id: None | int = None\n\n\ndef _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_dir: str) -> list[StatisticsState]:\n    return [\n        asyncio.run(_run_crawler(crawler_id=crawler_input.id, requests=crawler_input.requests, storage_dir=storage_dir))\n        for crawler_input in crawler_inputs\n    ]\n\n\nasync def test_crawler_state_persistence(tmp_path: Path) -> None:\n    \"\"\"Test that crawler statistics and state persist and are loaded correctly.\n\n    This test simulates starting the crawler process twice, and checks that the statistics include the first run.\"\"\"\n\n    state_kvs = await KeyValueStore.open(\n        storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path))\n    )\n\n    with ProcessPoolExecutor() as executor:\n        # Crawl 2 requests in the first run and automatically persist the 
state.\n        first_run_state = executor.submit(\n            _process_run_crawlers,\n            crawler_inputs=[_CrawlerInput(requests=['https://a.placeholder.com', 'https://b.placeholder.com'])],\n            storage_dir=str(tmp_path),\n        ).result()[0]\n        # Expected state after first crawler run\n        assert first_run_state.requests_finished == 2\n        state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')\n        assert state.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']\n\n    # Do not reuse the executor to simulate a fresh process to avoid modified class attributes.\n    with ProcessPoolExecutor() as executor:\n        # Crawl 1 additional request in the second run, but use the previously automatically persisted state.\n        second_run_state = executor.submit(\n            _process_run_crawlers,\n            crawler_inputs=[_CrawlerInput(requests=['https://c.placeholder.com'])],\n            storage_dir=str(tmp_path),\n        ).result()[0]\n\n        # Expected state after second crawler run\n        # 2 requests from first run and 1 request from second run.\n        assert second_run_state.requests_finished == 3\n\n        state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')\n        assert state.get('urls') == [\n            'https://a.placeholder.com',\n            'https://b.placeholder.com',\n            'https://c.placeholder.com',\n        ]\n\n    assert first_run_state.crawler_started_at == second_run_state.crawler_started_at\n    assert first_run_state.crawler_finished_at\n    assert second_run_state.crawler_finished_at\n\n    assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at\n    assert first_run_state.crawler_runtime < second_run_state.crawler_runtime\n\n\nasync def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Path) -> None:\n    \"\"\"Test that crawler statistics and state persist and are loaded 
correctly.\n\n    This test simulates starting the crawler process twice, and checks that the statistics include first run.\n    Each time two distinct crawlers are running, and they should keep using their own statistics and state.\"\"\"\n    state_kvs = await KeyValueStore.open(\n        storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path))\n    )\n\n    with ProcessPoolExecutor() as executor:\n        # Run 2 crawlers, each crawling 1 request, and automatically persist the state.\n        first_run_states = executor.submit(\n            _process_run_crawlers,\n            crawler_inputs=[\n                _CrawlerInput(requests=['https://a.placeholder.com']),\n                _CrawlerInput(requests=['https://c.placeholder.com']),\n            ],\n            storage_dir=str(tmp_path),\n        ).result()\n        # Expected state after first crawler run\n        assert first_run_states[0].requests_finished == 1\n        assert first_run_states[1].requests_finished == 1\n        state_0 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')\n        assert state_0.get('urls') == ['https://a.placeholder.com']\n        state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1')\n        assert state_1.get('urls') == ['https://c.placeholder.com']\n\n    with ProcessPoolExecutor() as executor:\n        # Run 2 crawlers, each crawling 1 request, and automatically persist the state.\n        second_run_states = executor.submit(\n            _process_run_crawlers,\n            crawler_inputs=[\n                _CrawlerInput(requests=['https://b.placeholder.com']),\n                _CrawlerInput(requests=['https://d.placeholder.com']),\n            ],\n            storage_dir=str(tmp_path),\n        ).result()\n        # Expected state after second crawler run\n        assert second_run_states[0].requests_finished == 2\n        assert second_run_states[1].requests_finished == 2\n        state_0 = await 
state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')\n        assert state_0.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']\n        state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1')\n        assert state_1.get('urls') == ['https://c.placeholder.com', 'https://d.placeholder.com']\n\n\nasync def test_crawler_intermediate_statistics() -> None:\n    \"\"\"Test that crawler statistics are correctly updating total runtime on every calculate call.\"\"\"\n    crawler = BasicCrawler()\n    check_time = timedelta(seconds=0.1)\n\n    async def wait_for_statistics_initialization() -> None:\n        while not crawler.statistics.active:  # noqa: ASYNC110 # It is ok for tests.\n            await asyncio.sleep(0.1)\n\n    @crawler.router.default_handler\n    async def handler(_: BasicCrawlingContext) -> None:\n        await asyncio.sleep(check_time.total_seconds() * 5)\n\n    # Start crawler and wait until statistics are initialized.\n    crawler_task = asyncio.create_task(crawler.run(['https://a.placeholder.com']))\n    await wait_for_statistics_initialization()\n\n    # Wait some time and check that runtime is updated.\n    await asyncio.sleep(check_time.total_seconds())\n    crawler.statistics.calculate()\n    assert crawler.statistics.state.crawler_runtime >= check_time\n\n    # Wait for crawler to finish\n    await crawler_task\n\n\nasync def test_protect_request_in_run_handlers() -> None:\n    \"\"\"Test that request in crawling context are protected in run handlers.\"\"\"\n    request_queue = await RequestQueue.open(name='state-test')\n\n    request = Request.from_url('https://test.url/', user_data={'request_state': ['initial']})\n\n    crawler = BasicCrawler(request_manager=request_queue, max_request_retries=0)\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if isinstance(context.request.user_data['request_state'], list):\n            
context.request.user_data['request_state'].append('modified')\n        raise ValueError('Simulated error after modifying request')\n\n    await crawler.run([request])\n\n    check_request = await request_queue.get_request(request.unique_key)\n    assert check_request is not None\n    assert check_request.user_data['request_state'] == ['initial']\n\n    await request_queue.drop()\n\n\nasync def test_new_request_error_handler() -> None:\n    \"\"\"Test that error in new_request_handler is handled properly.\"\"\"\n    queue = await RequestQueue.open()\n    crawler = BasicCrawler(\n        request_manager=queue,\n    )\n\n    request = Request.from_url('https://a.placeholder.com')\n\n    @crawler.router.default_handler\n    async def handler(context: BasicCrawlingContext) -> None:\n        if '|test' in context.request.unique_key:\n            return\n        raise ValueError('This error should not be handled by error handler')\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request | None:\n        return Request.from_url(\n            context.request.url,\n            unique_key=f'{context.request.unique_key}|test',\n        )\n\n    await crawler.run([request])\n\n    original_request = await queue.get_request(request.unique_key)\n    error_request = await queue.get_request(f'{request.unique_key}|test')\n\n    assert original_request is not None\n    assert original_request.state == RequestState.ERROR_HANDLER\n    assert original_request.was_already_handled\n\n    assert error_request is not None\n    assert error_request.state == RequestState.DONE\n    assert error_request.was_already_handled\n"
  },
  {
    "path": "tests/unit/crawlers/_basic/test_context_pipeline.py",
    "content": "from __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING\nfrom unittest.mock import AsyncMock\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee.crawlers import ContextPipeline\nfrom crawlee.errors import ContextPipelineFinalizationError, ContextPipelineInitializationError, RequestHandlerError\nfrom crawlee.sessions._session import Session\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n\n@dataclass(frozen=True)\nclass EnhancedCrawlingContext(BasicCrawlingContext):\n    foo: str\n\n\n@dataclass(frozen=True)\nclass MoreEnhancedCrawlingContext(EnhancedCrawlingContext):\n    bar: int\n\n\nasync def test_calls_consumer_without_middleware() -> None:\n    consumer = AsyncMock()\n\n    pipeline = ContextPipeline()\n    context = BasicCrawlingContext(\n        request=Request.from_url(url='https://test.io/'),\n        send_request=AsyncMock(),\n        add_requests=AsyncMock(),\n        session=Session(),\n        proxy_info=AsyncMock(),\n        push_data=AsyncMock(),\n        use_state=AsyncMock(),\n        get_key_value_store=AsyncMock(),\n        log=logging.getLogger(),\n    )\n\n    await pipeline(context, consumer)\n\n    consumer.assert_called_once_with(context)\n\n\nasync def test_calls_consumers_and_middlewares() -> None:\n    events = list[str]()\n\n    async def consumer(context: MoreEnhancedCrawlingContext) -> None:\n        events.append('consumer_called')\n        assert context.bar == 4\n\n    async def middleware_a(context: BasicCrawlingContext) -> AsyncGenerator[EnhancedCrawlingContext, None]:\n        events.append('middleware_a_in')\n        yield EnhancedCrawlingContext(\n            request=context.request,\n            foo='foo',\n            send_request=AsyncMock(),\n            add_requests=AsyncMock(),\n            session=context.session,\n            proxy_info=AsyncMock(),\n        
    push_data=AsyncMock(),\n            use_state=AsyncMock(),\n            get_key_value_store=AsyncMock(),\n            log=logging.getLogger(),\n        )\n        events.append('middleware_a_out')\n\n    async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreEnhancedCrawlingContext, None]:\n        events.append('middleware_b_in')\n        yield MoreEnhancedCrawlingContext(\n            request=context.request,\n            foo=context.foo,\n            bar=4,\n            send_request=AsyncMock(),\n            add_requests=AsyncMock(),\n            session=context.session,\n            proxy_info=AsyncMock(),\n            push_data=AsyncMock(),\n            use_state=AsyncMock(),\n            get_key_value_store=AsyncMock(),\n            log=logging.getLogger(),\n        )\n        events.append('middleware_b_out')\n\n    pipeline = ContextPipeline[BasicCrawlingContext]().compose(middleware_a).compose(middleware_b)\n\n    context = BasicCrawlingContext(\n        request=Request.from_url(url='https://test.io/'),\n        send_request=AsyncMock(),\n        add_requests=AsyncMock(),\n        session=Session(),\n        proxy_info=AsyncMock(),\n        push_data=AsyncMock(),\n        use_state=AsyncMock(),\n        get_key_value_store=AsyncMock(),\n        log=logging.getLogger(),\n    )\n    await pipeline(context, consumer)\n\n    assert events == [\n        'middleware_a_in',\n        'middleware_b_in',\n        'consumer_called',\n        'middleware_b_out',\n        'middleware_a_out',\n    ]\n\n\nasync def test_wraps_consumer_errors() -> None:\n    consumer = AsyncMock(side_effect=RuntimeError('Arbitrary crash for testing purposes'))\n\n    pipeline = ContextPipeline()\n    context = BasicCrawlingContext(\n        request=Request.from_url(url='https://test.io/'),\n        send_request=AsyncMock(),\n        add_requests=AsyncMock(),\n        session=Session(),\n        proxy_info=AsyncMock(),\n        push_data=AsyncMock(),\n        
use_state=AsyncMock(),\n        get_key_value_store=AsyncMock(),\n        log=logging.getLogger(),\n    )\n\n    with pytest.raises(RequestHandlerError):\n        await pipeline(context, consumer)\n\n\nasync def test_handles_exceptions_in_middleware_initialization() -> None:\n    consumer = AsyncMock()\n    cleanup = AsyncMock()\n\n    async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:\n        yield context\n        await cleanup()\n\n    async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:\n        raise RuntimeError('Crash during middleware initialization')\n        yield context\n\n    pipeline = ContextPipeline().compose(step_1).compose(step_2)\n    context = BasicCrawlingContext(\n        request=Request.from_url(url='https://test.io/'),\n        send_request=AsyncMock(),\n        add_requests=AsyncMock(),\n        session=Session(),\n        proxy_info=AsyncMock(),\n        push_data=AsyncMock(),\n        use_state=AsyncMock(),\n        get_key_value_store=AsyncMock(),\n        log=logging.getLogger(),\n    )\n\n    with pytest.raises(ContextPipelineInitializationError):\n        await pipeline(context, consumer)\n\n    assert not consumer.called\n    assert cleanup.called\n\n\nasync def test_handles_exceptions_in_middleware_finalization() -> None:\n    consumer = AsyncMock()\n    cleanup = AsyncMock()\n\n    async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:\n        yield context\n        await cleanup()\n\n    async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]:\n        yield context\n        raise RuntimeError('Crash during middleware finalization')\n\n    pipeline = ContextPipeline().compose(step_1).compose(step_2)\n    context = BasicCrawlingContext(\n        request=Request.from_url(url='https://test.io/'),\n        send_request=AsyncMock(),\n        add_requests=AsyncMock(),\n     
   session=Session(),\n        proxy_info=AsyncMock(),\n        push_data=AsyncMock(),\n        use_state=AsyncMock(),\n        get_key_value_store=AsyncMock(),\n        log=logging.getLogger(),\n    )\n\n    with pytest.raises(ContextPipelineFinalizationError):\n        await pipeline(context, consumer)\n\n    assert consumer.called\n    assert not cleanup.called\n"
  },
  {
    "path": "tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\nfrom unittest import mock\n\nimport pytest\n\nfrom crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason\nfrom crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext\nfrom crawlee.storages import RequestQueue\n\nif TYPE_CHECKING:\n    from yarl import URL\n\n    from crawlee._request import RequestOptions\n    from crawlee.http_clients._base import HttpClient\n\n\nasync def test_basic(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    handler = mock.AsyncMock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        links = context.soup.find_all('a')\n        await handler(links)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    assert handler.called\n\n    # The handler should find three links\n    assert len(handler.call_args[0][0]) == 3\n\n\nasync def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:\n    redirect_target = str(server_url / 'start_enqueue')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 
'page_3')),\n        mock.call(str(server_url / 'page_4')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    assert visit.mock_calls[0] == expected_visit_calls[0]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:\n    redirect_target = str(server_url / 'start_enqueue_non_href')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(selector='img', attribute='src')\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'base_subpath/image_1')),\n        mock.call(str(server_url / 'image_2')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(selector='a.foo')\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:\n    start_urls = 
[str(server_url / 'start_enqueue')]\n    processed_urls = []\n\n    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately\n    crawler = BeautifulSoupCrawler(\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        max_requests_per_crawl=3,\n        http_client=http_client,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        await context.enqueue_links()\n        processed_urls.append(context.request.url)\n\n    stats = await crawler.run(start_urls)\n\n    # Verify that only 3 out of the possible 5 requests were made\n    assert len(processed_urls) == 3\n    assert stats.requests_total == 3\n    assert stats.requests_finished == 3\n\n\nasync def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    visit = mock.Mock()\n    headers = []\n\n    def test_transform_request_function(\n        request_options: RequestOptions,\n    ) -> RequestOptions | RequestTransformAction:\n        if 'page_3' in request_options['url']:\n            return 'skip'\n\n        request_options['headers'] = HttpHeaders({'transform-header': 'my-header'})\n        return request_options\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        headers.append(context.request.headers)\n\n        await context.enqueue_links(transform_request_function=test_transform_request_function)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    # url /page_3 should not be visited\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        
mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'page_4')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # All urls added to `enqueue_links` must have a custom header\n    assert headers[1]['transform-header'] == 'my-header'\n    assert headers[2]['transform-header'] == 'my-header'\n    assert headers[3]['transform-header'] == 'my-header'\n\n\nasync def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(max_session_rotations=1, http_client=http_client)\n    stats = await crawler.run([str(server_url / 'incapsula')])\n    assert stats.requests_failed == 1\n\n\ndef test_default_logger() -> None:\n    assert BeautifulSoupCrawler().log.name == 'BeautifulSoupCrawler'\n\n\nasync def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.\"\"\"\n    visit = mock.Mock()\n    fail = mock.Mock()\n    crawler = BeautifulSoupCrawler(\n        
http_client=http_client,\n        respect_robots_txt_file=True,\n        max_request_retries=0,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(strategy='all')\n\n    @crawler.failed_request_handler\n    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        fail(context.request.url)\n\n    await crawler.run([str(server_url / 'problematic_links')])\n\n    # Email must be skipped\n    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.\n    expected_visit_calls = [\n        mock.call(str(server_url / 'problematic_links')),\n        mock.call('https://avatars.githubusercontent.com/apify'),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # The budplaceholder.com does not exist.\n    expected_fail_calls = [\n        mock.call('https://budplaceholder.com/'),\n    ]\n    fail.assert_has_calls(expected_fail_calls, any_order=True)\n\n\nasync def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)\n    skip = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        await context.enqueue_links()\n\n    @crawler.on_skipped_request\n    async def skipped_hook(url: str, _reason: SkippedReason) -> None:\n        skip(url)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_skip_calls = [\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 'page_3')),\n        mock.call(str(server_url / 'page_4')),\n    ]\n    skip.assert_has_calls(expected_skip_calls, any_order=True)\n\n\nasync def test_extract_links(server_url: URL, 
http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')])\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == str(server_url / 'page_1')\n\n\nasync def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        links = await context.extract_links(selector='li', attribute='data-href')\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'non_href_links')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == str(server_url / 'page_2')\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_with_rq_param(\n    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    @crawler.router.default_handler\n    async def handler(context: 
BeautifulSoupCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await rq.fetch_next_request():\n        requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_requests_with_rq_param(\n    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    check_requests: list[str] = [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n    ]\n\n    @crawler.router.default_handler\n    async def handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(\n            requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all'\n        )\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await rq.fetch_next_request():\n        
requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == set(check_requests)\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_id', 'queue_name', 'queue_alias'),\n    [\n        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),\n        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),\n        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),\n        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),\n    ],\n)\nasync def test_enqueue_links_error_with_multi_params(\n    server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None\n) -> None:\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n\n    @crawler.router.default_handler\n    async def handler(context: BeautifulSoupCrawlingContext) -> None:\n        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):\n            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n\nasync def test_navigation_timeout_on_slow_request(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test that navigation_timeout causes TimeoutError on slow HTTP requests.\"\"\"\n    crawler = BeautifulSoupCrawler(\n        http_client=http_client,\n        navigation_timeout=timedelta(seconds=1),\n        max_request_retries=0,\n    )\n\n    failed_request_handler = mock.AsyncMock()\n    crawler.failed_request_handler(failed_request_handler)\n\n    request_handler = mock.AsyncMock()\n    crawler.router.default_handler(request_handler)\n\n    # Request endpoint that delays 5 seconds - should timeout at 1 second\n    await crawler.run([str(server_url.with_path('/slow').with_query(delay=5))])\n\n    assert 
failed_request_handler.call_count == 1\n    assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)\n\n\nasync def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:\n    crawler = BeautifulSoupCrawler(\n        navigation_timeout=timedelta(seconds=1),\n        max_request_retries=0,\n    )\n\n    request_handler = mock.AsyncMock()\n    crawler.router.default_handler(request_handler)\n    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))\n\n    # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be handled\n    result = await crawler.run([str(server_url)])\n\n    assert result.requests_failed == 1\n    assert result.requests_finished == 0\n    assert request_handler.call_count == 0\n\n\nasync def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL, http_client: HttpClient) -> None:\n    crawler = BeautifulSoupCrawler(\n        http_client=http_client,\n        request_handler_timeout=timedelta(seconds=0.5),\n        max_request_retries=0,\n    )\n\n    request_handler = mock.AsyncMock()\n    crawler.router.default_handler(request_handler)\n\n    # Navigation takes 1 second (exceeds handler timeout), but should still succeed\n    result = await crawler.run([str(server_url.with_path('/slow').with_query(delay=1))])\n\n    assert result.requests_failed == 0\n    assert result.requests_finished == 1\n    assert request_handler.call_count == 1\n\n\nasync def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None:\n    crawler = BeautifulSoupCrawler()\n\n    handler_calls = mock.AsyncMock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        await handler_calls(context.request.url)\n\n        target_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url)))\n\n        await context.enqueue_links(requests=[Request.from_url(target_url)], 
strategy='same-origin')\n\n    await crawler.run([str(server_url)])\n\n    assert handler_calls.called\n    assert handler_calls.call_count == 1\n\n\nasync def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:\n    start_url = str(server_url / 'sub_index')\n    requests = [start_url]\n\n    crawler = BeautifulSoupCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(limit=1)\n\n    await crawler.run(requests)\n\n    # Only one link should be enqueued from sub_index due to the limit\n    expected_visit_calls = [\n        mock.call(start_url),\n        mock.call(str(server_url / 'page_3')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n"
  },
  {
    "path": "tests/unit/crawlers/_http/test_http_crawler.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom typing import TYPE_CHECKING\nfrom unittest.mock import AsyncMock, Mock\nfrom urllib.parse import parse_qs, urlencode\n\nimport pytest\n\nfrom crawlee import ConcurrencySettings, Request, RequestState\nfrom crawlee.crawlers import HttpCrawler\nfrom crawlee.sessions import SessionPool\nfrom crawlee.statistics import Statistics\nfrom crawlee.storages import RequestQueue\nfrom tests.unit.server_endpoints import HELLO_WORLD\n\nif TYPE_CHECKING:\n    from collections.abc import Awaitable, Callable\n\n    from yarl import URL\n\n    from crawlee._types import BasicCrawlingContext\n    from crawlee.crawlers import HttpCrawlingContext\n    from crawlee.http_clients._base import HttpClient\n\n# Payload, e.g. data for a form submission.\nPAYLOAD = {\n    'custname': 'John Doe',\n    'custtel': '1234567890',\n    'custemail': 'johndoe@example.com',\n    'size': 'large',\n    'topping': '[\"bacon\", \"cheese\", \"mushroom\"]',\n    'delivery': '13:00',\n    'comments': 'Please ring the doorbell upon arrival.',\n}\n\n\n@pytest.fixture\nasync def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock:\n    return AsyncMock()\n\n\n@pytest.fixture\nasync def crawler(\n    http_client: HttpClient, mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]]\n) -> HttpCrawler:\n    return HttpCrawler(http_client=http_client, request_handler=mock_request_handler)\n\n\n@pytest.fixture\nasync def crawler_without_retries(\n    mock_request_handler: Callable[[HttpCrawlingContext], Awaitable[None]],\n) -> HttpCrawler:\n    return HttpCrawler(\n        request_handler=mock_request_handler,\n        retry_on_blocked=False,\n        max_request_retries=0,\n    )\n\n\nasync def test_fetches_html(\n    crawler: HttpCrawler,\n    mock_request_handler: AsyncMock,\n    server_url: URL,\n) -> None:\n    await crawler.add_requests([str(server_url)])\n    await crawler.run()\n\n    
mock_request_handler.assert_called_once()\n    assert mock_request_handler.call_args[0][0].request.url == str(server_url)\n\n\nasync def test_handles_redirects(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None:\n    redirect_target = str(server_url)\n    redirect_url = str(server_url.with_path('redirect').with_query(url=redirect_target))\n    await crawler.add_requests([redirect_url])\n    await crawler.run()\n\n    mock_request_handler.assert_called_once()\n    assert mock_request_handler.call_args[0][0].request.loaded_url == redirect_target\n    assert mock_request_handler.call_args[0][0].request.url == redirect_url\n\n\n@pytest.mark.parametrize(\n    ('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'),\n    [\n        # error without retry for all 4xx statuses\n        pytest.param([], [], 1, id='default_behavior'),\n        # make retry for codes in `additional_http_error_status_codes` list\n        pytest.param([402], [], 3, id='additional_status_codes'),\n        # take as successful status codes from the `ignore_http_error_status_codes` list\n        pytest.param([], [402], 0, id='ignore_error_status_codes'),\n        # check precedence for `additional_http_error_status_codes`\n        pytest.param([402], [402], 3, id='additional_and_ignore'),\n    ],\n)\nasync def test_handles_client_errors(\n    additional_http_error_status_codes: list[int],\n    ignore_http_error_status_codes: list[int],\n    expected_number_error: int,\n    mock_request_handler: AsyncMock,\n    server_url: URL,\n) -> None:\n    crawler = HttpCrawler(\n        request_handler=mock_request_handler,\n        additional_http_error_status_codes=additional_http_error_status_codes,\n        ignore_http_error_status_codes=ignore_http_error_status_codes,\n        max_request_retries=2,\n    )\n\n    await crawler.add_requests([str(server_url / 'status/402')])\n    await crawler.run()\n\n    assert 
crawler.statistics.error_tracker.total == expected_number_error\n\n    # Request handler should not be called for error status codes.\n    if expected_number_error:\n        mock_request_handler.assert_not_called()\n    else:\n        mock_request_handler.assert_called()\n\n\n@pytest.mark.parametrize(\n    ('ignore_http_error_status_codes', 'use_session_pool', 'expected_session_rotate', 'expected_number_error'),\n    [\n        # change session and retry for no block 4xx statuses\n        pytest.param([], True, 4, 1, id='default_behavior'),\n        # error without retry for all 4xx statuses\n        pytest.param([], False, 0, 1, id='default_behavior_without_session_pool'),\n        # take as successful status codes from the `ignore_http_error_status_codes` list with Session Pool\n        pytest.param([403], True, 0, 0, id='ignore_error_status_codes'),\n        # take as successful status codes from the `ignore_http_error_status_codes` list without Session Pool\n        pytest.param([403], False, 0, 0, id='ignore_error_status_codes_without_session_pool'),\n    ],\n)\nasync def test_handles_session_block_errors(\n    *,\n    ignore_http_error_status_codes: list[int],\n    use_session_pool: bool,\n    expected_session_rotate: int,\n    expected_number_error: int,\n    mock_request_handler: AsyncMock,\n    server_url: URL,\n) -> None:\n    crawler = HttpCrawler(\n        request_handler=mock_request_handler,\n        ignore_http_error_status_codes=ignore_http_error_status_codes,\n        max_request_retries=3,\n        max_session_rotations=5,\n        use_session_pool=use_session_pool,\n    )\n\n    await crawler.add_requests([str(server_url / 'status/403')])\n    await crawler.run()\n\n    assert crawler.statistics.error_tracker.total == expected_number_error\n    assert crawler.statistics.error_tracker_retry.total == expected_session_rotate\n\n    # Request handler should not be called for error status codes.\n    if expected_number_error:\n        
mock_request_handler.assert_not_called()\n    else:\n        mock_request_handler.assert_called()\n\n\nasync def test_handles_server_error(crawler: HttpCrawler, mock_request_handler: AsyncMock, server_url: URL) -> None:\n    await crawler.add_requests([str(server_url / 'status/500')])\n    await crawler.run()\n\n    mock_request_handler.assert_not_called()\n\n\nasync def test_stores_cookies(http_client: HttpClient, server_url: URL) -> None:\n    visit = Mock()\n    track_session_usage = Mock()\n\n    async with SessionPool(max_pool_size=1) as session_pool:\n        crawler = HttpCrawler(\n            # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies\n            ignore_http_error_status_codes=[401],\n            session_pool=session_pool,\n            http_client=http_client,\n        )\n\n        @crawler.router.default_handler\n        async def handler(context: HttpCrawlingContext) -> None:\n            visit(context.request.url)\n            track_session_usage(context.session.id if context.session else None)\n\n        await crawler.run(\n            [\n                str(server_url.with_path('set_cookies').extend_query(a=1)),\n                str(server_url.with_path('set_cookies').extend_query(b=2)),\n                str(server_url.with_path('set_cookies').extend_query(c=3)),\n            ]\n        )\n\n        visited = {call[0][0] for call in visit.call_args_list}\n        assert len(visited) == 3\n\n        session_ids = {call[0][0] for call in track_session_usage.call_args_list}\n        assert len(session_ids) == 1\n\n        session = await session_pool.get_session_by_id(session_ids.pop())\n        assert session is not None\n        assert {cookie['name']: cookie['value'] for cookie in session.cookies.get_cookies_as_dicts()} == {\n            'a': '1',\n            'b': '2',\n            'c': '3',\n        }\n\n\nasync def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server_url: URL) 
-> None:\n    await crawler.add_requests([str(server_url / 'status/400')])\n    stats = await crawler.run()\n\n    # by default, client errors are not retried\n    assert stats.requests_failed == 1\n    assert stats.retry_histogram == [1]\n    assert stats.requests_total == 1\n\n\nasync def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) -> None:\n    await crawler.add_requests([str(server_url.with_path('status/500').with_query(id=i)) for i in range(10)])\n    await crawler.add_requests([str(server_url.with_path('status/402').with_query(id=i)) for i in range(10)])\n    await crawler.add_requests([str(server_url.with_path('status/403').with_query(id=i)) for i in range(10)])\n    await crawler.add_requests([str(server_url.with_query(id=i)) for i in range(10)])\n\n    await crawler.run()\n    assert crawler.statistics.state.requests_with_status_code == {\n        '200': 10,\n        '403': 100,  # block errors change session and retry\n        '402': 10,  # client errors are not retried by default\n        '500': 40,  # server errors are retried by default\n    }\n\n\nasync def test_sending_payload_as_raw_data(http_client: HttpClient, server_url: URL) -> None:\n    crawler = HttpCrawler(http_client=http_client)\n    responses = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        response = json.loads(await context.http_response.read())\n        # The post endpoint returns the provided payload in the response.\n        responses.append(response)\n\n    encoded_payload = urlencode(PAYLOAD).encode()\n    request = Request.from_url(\n        url=str(server_url / 'post'),\n        method='POST',\n        payload=encoded_payload,\n    )\n\n    await crawler.run([request])\n\n    assert len(responses) == 1, 'Request handler should be called exactly once.'\n    assert responses[0]['data'].encode() == encoded_payload, 'Response payload data does not match the sent payload.'\n\n    # The 
reconstructed payload data should match the original payload. We have to flatten the values, because\n    # parse_qs returns a list of values for each key.\n    response_data = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data']).items()}\n    assert response_data == PAYLOAD, 'The reconstructed payload data does not match the sent payload.'\n\n    assert responses[0]['json'] is None, 'Response JSON data should be empty when only raw data is sent.'\n    assert responses[0]['form'] == {}, 'Response form data should be empty when only raw data is sent.'\n\n\nasync def test_sending_payload_as_form_data(http_client: HttpClient, server_url: URL) -> None:\n    crawler = HttpCrawler(http_client=http_client)\n    responses = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        response = json.loads(await context.http_response.read())\n        # The /post endpoint returns the provided payload in the response.\n        responses.append(response)\n\n    request = Request.from_url(\n        url=str(server_url / 'post'),\n        method='POST',\n        headers={'content-type': 'application/x-www-form-urlencoded'},\n        payload=urlencode(PAYLOAD).encode(),\n    )\n\n    await crawler.run([request])\n\n    assert len(responses) == 1, 'Request handler should be called exactly once.'\n    assert responses[0]['form'] == PAYLOAD, 'Form data in response does not match the sent payload.'\n\n    assert responses[0]['json'] is None, 'Response JSON data should be empty when only form data is sent.'\n    assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.'\n\n\nasync def test_sending_payload_as_json(http_client: HttpClient, server_url: URL) -> None:\n    crawler = HttpCrawler(http_client=http_client)\n    responses = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        response = 
json.loads(await context.http_response.read())\n        # The /post endpoint returns the provided payload in the response.\n        responses.append(response)\n\n    json_payload = json.dumps(PAYLOAD).encode()\n    request = Request.from_url(\n        url=str(server_url / 'post'),\n        method='POST',\n        payload=json_payload,\n        headers={'content-type': 'application/json'},\n    )\n\n    await crawler.run([request])\n\n    assert len(responses) == 1, 'Request handler should be called exactly once.'\n    assert responses[0]['data'].encode() == json_payload, 'Response raw JSON data does not match the sent payload.'\n    assert responses[0]['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.'\n\n    assert responses[0]['form'] == {}, 'Response form data should be empty when only JSON data is sent.'\n\n\nasync def test_sending_url_query_params(http_client: HttpClient, server_url: URL) -> None:\n    crawler = HttpCrawler(http_client=http_client)\n    responses = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        response = json.loads(await context.http_response.read())\n        # The /get endpoint returns the provided query parameters in the response.\n        responses.append(response)\n\n    base_url = server_url / 'get'\n    query_params = {'param1': 'value1', 'param2': 'value2'}\n    request = Request.from_url(url=str(base_url.extend_query(query_params)))\n\n    await crawler.run([request])\n\n    assert len(responses) == 1, 'Request handler should be called exactly once.'\n\n    response_args = responses[0]['args']\n    assert response_args == query_params, 'Reconstructed query params must match the original query params.'\n\n\nasync def test_http_crawler_pre_navigation_hook_execution(server_url: URL) -> None:\n    \"\"\"Test that pre-navigation hooks are executed.\"\"\"\n    crawler = HttpCrawler(request_handler=AsyncMock())\n\n    call_mock = AsyncMock()\n\n  
  # Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(context: BasicCrawlingContext) -> None:\n        await call_mock(context.request.loaded_url)\n\n    await crawler.run([str(server_url)])\n\n    # `pre_navigation_hook` is called before the request is made, so the loaded URL should be None.\n    call_mock.assert_called_once_with(None)\n\n\nasync def test_http_crawler_post_navigation_hook_execution(server_url: URL) -> None:\n    \"\"\"Test that post-navigation hooks are executed.\"\"\"\n    crawler = HttpCrawler(request_handler=AsyncMock())\n\n    call_mock = AsyncMock()\n\n    # Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook(context: HttpCrawlingContext) -> None:\n        await call_mock(context.request.loaded_url)\n\n    await crawler.run([str(server_url)])\n\n    # `post_navigation_hook` is called after the request is made, so the loaded URL should be the result URL.\n    call_mock.assert_called_once_with(str(server_url))\n\n\nasync def test_http_crawler_navigation_hooks_order(server_url: URL) -> None:\n    \"\"\"Test that pre- and post-navigation hooks are executed in the correct order.\"\"\"\n    execution_order = []\n\n    crawler = HttpCrawler()\n\n    # Register final context handler.\n    @crawler.router.default_handler\n    async def default_request_handler(_context: HttpCrawlingContext) -> None:\n        execution_order.append('final handler')\n\n    # Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook_1(_context: BasicCrawlingContext) -> None:\n        execution_order.append('pre-navigation-hook 1')\n\n    # Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(_context: BasicCrawlingContext) -> None:\n        execution_order.append('pre-navigation-hook 2')\n\n    # Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook_1(_context: HttpCrawlingContext) -> 
None:\n        execution_order.append('post-navigation-hook 1')\n\n    # Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook_2(_context: HttpCrawlingContext) -> None:\n        execution_order.append('post-navigation-hook 2')\n\n    await crawler.run([str(server_url)])\n\n    assert execution_order == [\n        'pre-navigation-hook 1',\n        'pre-navigation-hook 2',\n        'post-navigation-hook 1',\n        'post-navigation-hook 2',\n        'final handler',\n    ]\n\n\nasync def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> None:\n    \"\"\"Test isolation cookies for Session with curl\"\"\"\n    sessions_ids: list[str] = []\n    sessions_cookies: dict[str, dict[str, str]] = {}\n    response_cookies: dict[str, dict[str, str]] = {}\n\n    crawler = HttpCrawler(\n        session_pool=SessionPool(\n            max_pool_size=1,\n            create_session_settings={\n                'max_error_score': 50,\n            },\n        ),\n        http_client=http_client,\n        max_request_retries=10,\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: HttpCrawlingContext) -> None:\n        if not context.session:\n            return\n\n        sessions_ids.append(context.session.id)\n\n        if context.request.unique_key not in {'1', '2'}:\n            return\n\n        sessions_cookies[context.session.id] = {\n            cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts()\n        }\n        response_data = json.loads(await context.http_response.read())\n        response_cookies[context.session.id] = response_data.get('cookies')\n\n        if context.request.user_data.get('retire_session'):\n            context.session.retire()\n\n    await crawler.run(\n        [\n            # The first request sets the cookie in the session\n            
str(server_url.with_path('set_cookies').extend_query(a=1)),\n            # With the second request, we check the cookies in the session and set retire\n            Request.from_url(str(server_url.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),\n            # The third request is made with a new session to make sure it does not use another session's cookies\n            Request.from_url(str(server_url.with_path('/cookies')), unique_key='2'),\n        ]\n    )\n\n    assert len(sessions_cookies) == 2\n    assert len(response_cookies) == 2\n\n    assert sessions_ids[0] == sessions_ids[1]\n\n    cookie_session_id = sessions_ids[0]\n    clean_session_id = sessions_ids[2]\n\n    assert cookie_session_id != clean_session_id\n\n    # The initiated cookies must match in both the response and the session store\n    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}\n\n    # For a clean session, the cookie should not be in the session store or in the response\n    # This way we can be sure that no cookies are being leaked through the http client\n    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}\n\n\nasync def test_store_complex_cookies(server_url: URL) -> None:\n    visit = Mock()\n    track_session_usage = Mock()\n    async with SessionPool(max_pool_size=1) as session_pool:\n        crawler = HttpCrawler(session_pool=session_pool)\n\n        @crawler.router.default_handler\n        async def handler(context: HttpCrawlingContext) -> None:\n            visit(context.request.url)\n            track_session_usage(context.session.id if context.session else None)\n\n        await crawler.run([str(server_url / 'set_complex_cookies')])\n\n        visited = {call[0][0] for call in visit.call_args_list}\n        assert len(visited) == 1\n\n        session_ids = {call[0][0] for call in track_session_usage.call_args_list}\n        assert len(session_ids) == 1\n\n        
session = await session_pool.get_session_by_id(session_ids.pop())\n        assert session is not None\n\n        session_cookies_dict = {cookie['name']: cookie for cookie in session.cookies.get_cookies_as_dicts()}\n\n        assert len(session_cookies_dict) == 6\n\n        # cookie string: 'basic=1; Path=/; HttpOnly; SameSite=Lax'\n        assert session_cookies_dict['basic'] == {\n            'name': 'basic',\n            'value': '1',\n            'domain': server_url.host,\n            'path': '/',\n            'secure': False,\n            'http_only': True,\n            'same_site': 'Lax',\n        }\n\n        # cookie string: 'withpath=2; Path=/html; SameSite=None'\n        assert session_cookies_dict['withpath'] == {\n            'name': 'withpath',\n            'value': '2',\n            'domain': server_url.host,\n            'path': '/html',\n            'secure': False,\n            'http_only': False,\n            'same_site': 'None',\n        }\n\n        # cookie string: 'strict=3; Path=/; SameSite=Strict'\n        assert session_cookies_dict['strict'] == {\n            'name': 'strict',\n            'value': '3',\n            'domain': server_url.host,\n            'path': '/',\n            'secure': False,\n            'http_only': False,\n            'same_site': 'Strict',\n        }\n\n        # cookie string: 'secure=4; Path=/; HttpOnly; Secure; SameSite=Strict'\n        assert session_cookies_dict['secure'] == {\n            'name': 'secure',\n            'value': '4',\n            'domain': server_url.host,\n            'path': '/',\n            'secure': True,\n            'http_only': True,\n            'same_site': 'Strict',\n        }\n\n        # cookie string: 'short=5; Path=/;'\n        assert session_cookies_dict['short'] == {\n            'name': 'short',\n            'value': '5',\n            'domain': server_url.host,\n            'path': '/',\n            'secure': False,\n            'http_only': False,\n        }\n\n        # 
Some clients may ignore `.` at the beginning of the domain\n        # https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3\n        assert session_cookies_dict['domain'] in (\n            {\n                'name': 'domain',\n                'value': '6',\n                'domain': server_url.host,\n                'path': '/',\n                'secure': False,\n                'http_only': False,\n            },\n            {\n                'name': 'domain',\n                'value': '6',\n                'domain': f'.{server_url.host}',\n                'path': '/',\n                'secure': False,\n                'http_only': False,\n            },\n        )\n\n\ndef test_default_logger() -> None:\n    assert HttpCrawler().log.name == 'HttpCrawler'\n\n\nasync def test_get_snapshot(server_url: URL) -> None:\n    crawler = HttpCrawler()\n\n    snapshot = None\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        nonlocal snapshot\n        snapshot = await context.get_snapshot()\n\n    await crawler.run([str(server_url)])\n\n    assert snapshot is not None\n    assert snapshot.html is not None\n    assert snapshot.html == HELLO_WORLD.decode('utf8')\n\n\nasync def test_error_snapshot_through_statistics(server_url: URL) -> None:\n    statistics = Statistics.with_default_state(save_error_snapshots=True)\n    crawler = HttpCrawler(statistics=statistics)\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        raise RuntimeError(rf'Exception /\\ with file name unfriendly symbols in {context.request.url}')\n\n    await crawler.run([str(server_url)])\n\n    kvs = await crawler.get_key_value_store()\n    kvs_content = {}\n    async for key_info in kvs.iterate_keys():\n        # Skip any non-error snapshot keys, e.g. 
__RQ_STATE_.\n        if 'ERROR_SNAPSHOT' not in key_info.key:\n            continue\n        kvs_content[key_info.key] = await kvs.get_value(key_info.key)\n\n    # One error, retried three times (four attempts in total).\n    content_key = next(iter(kvs_content))\n    assert crawler.statistics.error_tracker.total == 4\n    assert crawler.statistics.error_tracker.unique_error_count == 1\n    assert len(kvs_content) == 1\n    assert content_key.endswith('.html')\n    assert kvs_content[content_key] == HELLO_WORLD.decode('utf8')\n\n\nasync def test_request_state(server_url: URL) -> None:\n    queue = await RequestQueue.open(alias='http_request_state')\n    crawler = HttpCrawler(request_manager=queue)\n\n    success_request = Request.from_url(str(server_url))\n    assert success_request.state == RequestState.UNPROCESSED\n\n    error_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})\n\n    requests_states: dict[str, dict[str, RequestState]] = {success_request.unique_key: {}, error_request.unique_key: {}}\n\n    @crawler.pre_navigation_hook\n    async def pre_navigation_hook(context: BasicCrawlingContext) -> None:\n        requests_states[context.request.unique_key]['pre_navigation'] = context.request.state\n\n    @crawler.router.default_handler\n    async def request_handler(context: HttpCrawlingContext) -> None:\n        if context.request.user_data.get('cause_error'):\n            raise ValueError('Caused error as requested')\n        requests_states[context.request.unique_key]['request_handler'] = context.request.state\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        requests_states[context.request.unique_key]['error_handler'] = context.request.state\n\n    @crawler.failed_request_handler\n    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        requests_states[context.request.unique_key]['failed_request_handler'] = 
context.request.state\n\n    await crawler.run([success_request, error_request])\n\n    handled_success_request = await queue.get_request(success_request.unique_key)\n\n    assert handled_success_request is not None\n    assert handled_success_request.state == RequestState.DONE\n\n    assert requests_states[success_request.unique_key] == {\n        'pre_navigation': RequestState.BEFORE_NAV,\n        'request_handler': RequestState.REQUEST_HANDLER,\n    }\n\n    handled_error_request = await queue.get_request(error_request.unique_key)\n    assert handled_error_request is not None\n    assert handled_error_request.state == RequestState.ERROR\n\n    assert requests_states[error_request.unique_key] == {\n        'pre_navigation': RequestState.BEFORE_NAV,\n        'error_handler': RequestState.ERROR_HANDLER,\n        'failed_request_handler': RequestState.ERROR,\n    }\n\n    await queue.drop()\n"
  },
  {
    "path": "tests/unit/crawlers/_parsel/test_parsel_crawler.py",
    "content": "from __future__ import annotations\n\nimport sys\nfrom typing import TYPE_CHECKING\nfrom unittest import mock\n\nimport pytest\n\nfrom crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.storages import RequestQueue\n\nif TYPE_CHECKING:\n    from yarl import URL\n\n    from crawlee._request import RequestOptions\n    from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext\n    from crawlee.http_clients._base import HttpClient\n\n\nasync def test_basic(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    handler = mock.AsyncMock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        links = context.selector.css('a::attr(href)').getall()\n        await handler(links)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    assert handler.called\n\n    # The handler should find three links\n    assert len(handler.call_args[0][0]) == 3\n\n\nasync def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:\n    redirect_target = str(server_url / 'start_enqueue')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n\n    crawler = ParselCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        url = str(context.request.url)\n        visit(url)\n        await context.enqueue_links()\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 
'page_3')),\n        mock.call(str(server_url / 'page_4')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    assert visit.mock_calls[0] == expected_visit_calls[0]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:\n    redirect_target = str(server_url / 'start_enqueue_non_href')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n\n    crawler = ParselCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(selector='img', attribute='src')\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'base_subpath/image_1')),\n        mock.call(str(server_url / 'image_2')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:\n    \"\"\"Call `enqueue_links` with arguments that can't be used together.\"\"\"\n    crawler = ParselCrawler(max_request_retries=1)\n    exceptions = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        try:\n            # Testing runtime enforcement of the overloads.\n            await context.enqueue_links(requests=[Request.from_url(str(server_url / 'start_enqueue'))], selector='a')\n        except Exception as e:\n            exceptions.append(e)\n\n    await crawler.run([str(server_url)])\n\n    assert len(exceptions) == 1\n    assert type(exceptions[0]) is ValueError\n\n\nasync def 
test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(selector='a.foo')\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:\n    start_urls = [str(server_url / 'start_enqueue')]\n    processed_urls = []\n\n    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately\n    crawler = ParselCrawler(\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n        max_requests_per_crawl=3,\n        http_client=http_client,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        await context.enqueue_links()\n        processed_urls.append(context.request.url)\n\n    stats = await crawler.run(start_urls)\n\n    # Verify that only 3 out of the possible 5 requests were made\n    assert len(processed_urls) == 3\n    assert stats.requests_total == 3\n    assert stats.requests_finished == 3\n\n\nasync def test_enqueue_links_with_transform_request_function(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    visit = mock.Mock()\n    headers = []\n\n    def test_transform_request_function(\n        request_options: RequestOptions,\n    ) -> RequestOptions | RequestTransformAction:\n        if 'page_3' in request_options['url']:\n            return 'skip'\n\n        request_options['headers'] = 
HttpHeaders({'transform-header': 'my-header'})\n        return request_options\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        headers.append(context.request.headers)\n        await context.enqueue_links(transform_request_function=test_transform_request_function, label='test')\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    # url /page_3 should not be visited\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 'page_4')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # all urls added to `enqueue_links` must have a custom header\n    assert headers[1]['transform-header'] == 'my-header'\n    assert headers[2]['transform-header'] == 'my-header'\n    assert headers[3]['transform-header'] == 'my-header'\n\n\nasync def test_handle_blocked_request(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(max_session_rotations=1, http_client=http_client)\n\n    stats = await crawler.run([str(server_url / 'incapsula')])\n    assert stats.requests_failed == 1\n\n\nasync def test_handle_blocked_status_code(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(max_session_rotations=1, http_client=http_client)\n\n    # Patch internal calls and run crawler\n    with (\n        mock.patch.object(\n            crawler._statistics,\n            'record_request_processing_failure',\n            wraps=crawler._statistics.record_request_processing_failure,\n        ) as record_request_processing_failure,\n        mock.patch.object(\n            
crawler._statistics.error_tracker, 'add', wraps=crawler._statistics.error_tracker.add\n        ) as error_tracker_add,\n    ):\n        stats = await crawler.run([str(server_url / 'status/403')])\n\n    assert stats.requests_failed == 1\n    assert record_request_processing_failure.called\n    assert error_tracker_add.called\n    assert crawler._statistics.error_tracker.total == 1\n\n\n# TODO: Remove the skip mark when the test is fixed:\n# https://github.com/apify/crawlee-python/issues/838\n@pytest.mark.skip(reason='The test does not work with `crawlee._utils.try_import.ImportWrapper`.')\ndef test_import_error_handled() -> None:\n    # Simulate ImportError for parsel\n    with mock.patch.dict('sys.modules', {'parsel': None}):\n        # Invalidate ParselCrawler import\n        sys.modules.pop('crawlee.crawlers', None)\n        sys.modules.pop('crawlee.crawlers._parsel', None)\n        with pytest.raises(ImportError) as import_error:\n            from crawlee.crawlers import ParselCrawler  # noqa: F401 PLC0415\n\n    # Check if the raised ImportError contains the expected message\n    assert str(import_error.value) == (\n        \"To import this, you need to install the 'parsel' extra.\"\n        \"For example, if you use pip, run `pip install 'crawlee[parsel]'`.\"\n    )\n\n\nasync def test_json(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    handler = mock.AsyncMock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        result = context.selector.jmespath('hello').getall()\n        await handler(result)\n\n    await crawler.run([str(server_url / 'json')])\n\n    assert handler.called\n\n    assert handler.call_args[0][0] == ['world']\n\n\nasync def test_xml(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    handler = mock.AsyncMock()\n\n    @crawler.router.default_handler\n    
async def request_handler(context: ParselCrawlingContext) -> None:\n        result = context.selector.css('hello').getall()\n        await handler(result)\n\n    await crawler.run([str(server_url / 'xml')])\n\n    assert handler.called\n\n    assert handler.call_args[0][0] == ['<hello>world</hello>']\n\n\ndef test_default_logger() -> None:\n    assert ParselCrawler().log.name == 'ParselCrawler'\n\n\nasync def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:\n    \"\"\"Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.\"\"\"\n    visit = mock.Mock()\n    fail = mock.Mock()\n    crawler = ParselCrawler(\n        http_client=http_client,\n        respect_robots_txt_file=True,\n        max_request_retries=0,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(strategy='all')\n\n    @crawler.failed_request_handler\n    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        fail(context.request.url)\n\n    await crawler.run([str(server_url 
/ 'problematic_links')])\n\n    # Email must be skipped\n    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.\n    expected_visit_calls = [\n        mock.call(str(server_url / 'problematic_links')),\n        mock.call('https://avatars.githubusercontent.com/apify'),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # The budplaceholder.com does not exist.\n    expected_fail_calls = [\n        mock.call('https://budplaceholder.com/'),\n    ]\n    fail.assert_has_calls(expected_fail_calls, any_order=True)\n\n\nasync def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)\n    skip = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        await context.enqueue_links()\n\n    @crawler.on_skipped_request\n    async def skipped_hook(url: str, _reason: SkippedReason) -> None:\n        skip(url)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_skip_calls = [\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 'page_3')),\n        mock.call(str(server_url / 'page_4')),\n    ]\n    skip.assert_has_calls(expected_skip_calls, any_order=True)\n\n\nasync def test_extract_links(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')])\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == 
str(server_url / 'page_1')\n\n\nasync def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        links = await context.extract_links(selector='li', attribute='data-href')\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'non_href_links')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == str(server_url / 'page_2')\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_with_rq_param(\n    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await rq.fetch_next_request():\n        requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await 
rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_requests_with_rq_param(\n    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    check_requests: list[str] = [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n    ]\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(\n            requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all'\n        )\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await rq.fetch_next_request():\n        requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == set(check_requests)\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_id', 'queue_name', 'queue_alias'),\n    [\n        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),\n        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),\n        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),\n        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and 
rq_id'),\n    ],\n)\nasync def test_enqueue_links_error_with_multi_params(\n    server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None\n) -> None:\n    crawler = ParselCrawler(http_client=http_client)\n\n    @crawler.router.default_handler\n    async def handler(context: ParselCrawlingContext) -> None:\n        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):\n            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n\nasync def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:\n    start_url = str(server_url / 'sub_index')\n    requests = [start_url]\n\n    crawler = ParselCrawler(http_client=http_client)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: ParselCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(limit=1)\n\n    await crawler.run(requests)\n\n    # Only one link should be enqueued from sub_index due to the limit\n    expected_visit_calls = [\n        mock.call(start_url),\n        mock.call(str(server_url / 'page_3')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n"
  },
  {
    "path": "tests/unit/crawlers/_playwright/test_playwright_crawler.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING, Any, Literal\nfrom unittest import mock\nfrom unittest.mock import AsyncMock, Mock\n\nimport pytest\n\nfrom crawlee import (\n    ConcurrencySettings,\n    Glob,\n    HttpHeaders,\n    Request,\n    RequestState,\n    RequestTransformAction,\n    SkippedReason,\n    service_locator,\n)\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import PlaywrightCrawler\nfrom crawlee.fingerprint_suite import (\n    DefaultFingerprintGenerator,\n    FingerprintGenerator,\n    HeaderGeneratorOptions,\n    ScreenOptions,\n)\nfrom crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values\nfrom crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD\nfrom crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type\nfrom crawlee.http_clients import ImpitHttpClient\nfrom crawlee.proxy_configuration import ProxyConfiguration\nfrom crawlee.sessions import Session, SessionPool\nfrom crawlee.statistics import Statistics\nfrom crawlee.statistics._error_snapshotter import ErrorSnapshotter\nfrom crawlee.storages import RequestQueue\nfrom tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD\n\nif TYPE_CHECKING:\n    from pathlib import Path\n\n    from yarl import URL\n\n    from crawlee._request import RequestOptions\n    from crawlee._types import HttpMethod, HttpPayload\n    from crawlee.browsers._types import BrowserType\n    from crawlee.crawlers import (\n        BasicCrawlingContext,\n        PlaywrightCrawlingContext,\n        PlaywrightPostNavCrawlingContext,\n        PlaywrightPreNavCrawlingContext,\n    )\n\n\n@pytest.mark.parametrize(\n    ('method', 'path', 'payload'),\n    [\n        pytest.param('GET', 'get', None, id='get request'),\n        pytest.param('POST', 'post', None, id='post request'),\n  
      pytest.param('POST', 'post', b'Hello, world!', id='post request with payload'),\n    ],\n)\nasync def test_basic_request(method: HttpMethod, path: str, payload: HttpPayload, server_url: URL) -> None:\n    requests = [Request.from_url(str(server_url / path), method=method, payload=payload)]\n    crawler = PlaywrightCrawler()\n    result: dict = {}\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        assert context.page is not None\n        result['request_url'] = context.request.url\n        result['page_url'] = context.page.url\n        result['page_content'] = await context.page.content()\n\n    await crawler.run(requests)\n    assert result.get('request_url') == result.get('page_url') == requests[0].url\n    assert (payload.decode() if payload else '') in result.get('page_content', '')\n\n\nasync def test_enqueue_links(redirect_server_url: URL, server_url: URL) -> None:\n    redirect_target = str(server_url / 'start_enqueue')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n    crawler = PlaywrightCrawler()\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 'page_3')),\n        mock.call(str(server_url / 'page_4')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    assert visit.mock_calls[0] == expected_visit_calls[0]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync 
def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL) -> None:\n    redirect_target = str(server_url / 'start_enqueue_non_href')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n    requests = [redirect_url]\n    crawler = PlaywrightCrawler()\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(selector='img', attribute='src')\n\n    await crawler.run(requests)\n\n    expected_visit_calls = [\n        mock.call(redirect_url),\n        mock.call(str(server_url / 'base_subpath/image_1')),\n        mock.call(str(server_url / 'image_2')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_enqueue_links_with_incompatible_kwargs_raises_error(server_url: URL) -> None:\n    \"\"\"Call `enqueue_links` with arguments that can't be used together.\"\"\"\n    crawler = PlaywrightCrawler(max_request_retries=1)\n    exceptions = []\n\n    @crawler.pre_navigation_hook\n    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        await context.page.route('**/*', lambda route: route.fulfill(status=200))\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        try:\n            # Testing runtime enforcement of the overloads.\n            await context.enqueue_links(requests=[Request.from_url('https://www.whatever.com')], selector='a')\n        except Exception as e:\n            exceptions.append(e)\n\n    await crawler.run([str(server_url)])\n\n    assert len(exceptions) == 1\n    assert type(exceptions[0]) is ValueError\n\n\nasync def test_enqueue_links_with_transform_request_function(server_url: URL) -> None:\n    crawler = PlaywrightCrawler()\n    visit = mock.Mock()\n    headers = []\n\n    def 
test_transform_request_function(request: RequestOptions) -> RequestOptions | RequestTransformAction:\n        if request['url'] == str(server_url / 'sub_index'):\n            request['headers'] = HttpHeaders({'transform-header': 'my-header'})\n            return request\n        return 'skip'\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        headers.append(context.request.headers)\n        await context.enqueue_links(transform_request_function=test_transform_request_function)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # all urls added to `enqueue_links` must have a custom header\n    assert headers[1]['transform-header'] == 'my-header'\n\n\nasync def test_nonexistent_url_invokes_error_handler() -> None:\n    crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock())\n\n    error_handler = mock.AsyncMock(return_value=None)\n    crawler.error_handler(error_handler)\n\n    failed_handler = mock.AsyncMock(return_value=None)\n    crawler.failed_request_handler(failed_handler)\n\n    await crawler.run(['https://this-does-not-exist-22343434.com'])\n    assert error_handler.call_count == 3\n    assert failed_handler.call_count == 1\n\n\nasync def test_redirect_handling(server_url: URL, redirect_server_url: URL) -> None:\n    # Set up a dummy crawler that tracks visited URLs\n    crawler = PlaywrightCrawler()\n    handled_urls = set[str]()\n\n    redirect_target = str(server_url / 'start_enqueue')\n    redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> 
None:\n        handled_urls.add(context.request.loaded_url or '')\n\n    # Request with redirects\n    request = Request.from_url(url=redirect_url)\n\n    # Ensure that the request uses the same origin strategy - `redirect_target` will be considered out of scope\n    request.crawlee_data.enqueue_strategy = 'same-origin'\n\n    # No URLs should be visited in the run\n    await crawler.run([request])\n    assert handled_urls == set()\n\n\n@pytest.mark.parametrize(\n    'fingerprint_generator',\n    [\n        pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),\n        pytest.param(\n            DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])),\n            id='Explicitly passed fingerprint generator.',\n        ),\n        pytest.param('default', id='Default fingerprint generator.'),\n    ],\n)\nasync def test_chromium_headless_headers(\n    header_network: dict, fingerprint_generator: None | FingerprintGenerator | Literal['default'], server_url: URL\n) -> None:\n    browser_type: BrowserType = 'chromium'\n    crawler = PlaywrightCrawler(headless=True, browser_type=browser_type, fingerprint_generator=fingerprint_generator)\n    headers = dict[str, str]()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response = await context.response.text()\n        response_headers = json.loads(response)\n\n        for key, val in response_headers.items():\n            headers[key] = val\n\n    await crawler.run([str(server_url / 'headers')])\n\n    user_agent = headers.get('user-agent')\n    assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent\n    assert any(\n        keyword in user_agent\n        for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]\n    ), user_agent\n\n    assert headers.get('sec-ch-ua') in 
get_available_header_values(header_network, 'sec-ch-ua')\n    assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')\n    assert headers.get('sec-ch-ua-platform') in get_available_header_values(header_network, 'sec-ch-ua-platform')\n\n    assert 'headless' not in headers['sec-ch-ua'].lower()\n    assert 'headless' not in headers['user-agent'].lower()\n\n\n@pytest.mark.flaky(reruns=3, reason='Test is flaky.')\nasync def test_firefox_headless_headers(header_network: dict, server_url: URL) -> None:\n    browser_type: BrowserType = 'firefox'\n    crawler = PlaywrightCrawler(headless=True, browser_type=browser_type)\n    headers = dict[str, str]()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response = await context.response.text()\n        response_headers = json.loads(response)\n\n        for key, val in response_headers.items():\n            headers[key] = val\n\n    await crawler.run([str(server_url / 'headers')])\n\n    assert 'user-agent' in headers\n    assert 'sec-ch-ua' not in headers\n    assert 'sec-ch-ua-mobile' not in headers\n    assert 'sec-ch-ua-platform' not in headers\n\n    assert 'headless' not in headers['user-agent'].lower()\n\n    user_agent = headers.get('user-agent')\n    assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})\n    assert any(\n        keyword in user_agent\n        for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]\n    )\n\n\nasync def test_custom_headers(server_url: URL) -> None:\n    crawler = PlaywrightCrawler()\n    response_headers = dict[str, str]()\n    request_headers = {'Power-Header': 'ring', 'Library': 'storm', 'My-Test-Header': 'fuzz'}\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response = await 
context.response.text()\n        context_response_headers = json.loads(response)\n        for key, val in context_response_headers.items():\n            response_headers[key] = val\n\n    await crawler.run([Request.from_url(str(server_url / 'headers'), headers=request_headers)])\n\n    assert response_headers.get('power-header') == request_headers['Power-Header']\n    assert response_headers.get('library') == request_headers['Library']\n    assert response_headers.get('my-test-header') == request_headers['My-Test-Header']\n\n\nasync def test_pre_navigation_hook() -> None:\n    crawler = PlaywrightCrawler(request_handler=mock.AsyncMock())\n    visit = mock.Mock()\n\n    @crawler.pre_navigation_hook\n    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        visit()\n        await context.page.route('**/*', lambda route: route.fulfill(status=200))\n\n    await crawler.run(['https://test.com', 'https://test.io'])\n\n    assert visit.call_count == 2\n\n\nasync def test_proxy_set() -> None:\n    # Configure crawler with proxy settings\n    proxy_value = 'http://1111:1111'\n    crawler = PlaywrightCrawler(proxy_configuration=ProxyConfiguration(proxy_urls=[proxy_value]))\n\n    handler_data = {}\n\n    mock_handler = mock.AsyncMock(return_value=None)\n    crawler.router.default_handler(mock_handler)\n\n    # Use pre_navigation_hook to verify proxy and configure playwright route\n    @crawler.pre_navigation_hook\n    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        if context.proxy_info:\n            # Store information about the used proxy\n            handler_data['proxy'] = context.proxy_info.url\n\n        # Emulate server response to prevent Playwright from making real requests\n        await context.page.route('**/*', lambda route: route.fulfill(status=200))\n\n    await crawler.run(['https://test.com'])\n\n    assert handler_data.get('proxy') == 
proxy_value\n\n\n@pytest.mark.run_alone\n@pytest.mark.parametrize(\n    'use_incognito_pages',\n    [\n        pytest.param(False, id='without use_incognito_pages'),\n        pytest.param(True, id='with use_incognito_pages'),\n    ],\n)\nasync def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL) -> None:\n    sessions_ids: list[str] = []\n    sessions: dict[str, Session] = {}\n    sessions_cookies: dict[str, dict[str, str]] = {}\n    response_cookies: dict[str, dict[str, str]] = {}\n\n    crawler = PlaywrightCrawler(\n        session_pool=SessionPool(max_pool_size=1),\n        use_incognito_pages=use_incognito_pages,\n        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n    )\n\n    @crawler.router.default_handler\n    async def handler(context: PlaywrightCrawlingContext) -> None:\n        if not context.session:\n            return\n\n        sessions_ids.append(context.session.id)\n        sessions[context.session.id] = context.session\n\n        if context.request.unique_key == '1':\n            # With the second request, we check the cookies in the session and set retire\n            await context.add_requests(\n                [\n                    Request.from_url(\n                        str(server_url.with_path('/cookies')), unique_key='2', user_data={'retire_session': True}\n                    )\n                ]\n            )\n            return\n\n        response_data = json.loads(await context.response.text())\n        response_cookies[context.session.id] = response_data.get('cookies')\n\n        if context.request.user_data.get('retire_session'):\n            context.session.retire()\n\n        if context.request.unique_key == '2':\n            # The third request is made with a new session to make sure it does not use another session's cookies\n            await context.add_requests([Request.from_url(str(server_url.with_path('/cookies')), unique_key='3')])\n\n    await crawler.run(\n   
     [\n            # The first request sets the cookie in the session\n            Request.from_url(str(server_url.with_path('set_cookies').extend_query(a=1)), unique_key='1'),\n        ]\n    )\n\n    assert len(response_cookies) == 2\n    assert len(sessions) == 2\n\n    assert sessions_ids[0] == sessions_ids[1]\n\n    sessions_cookies = {\n        sessions_id: {\n            cookie['name']: cookie['value'] for cookie in sessions[sessions_id].cookies.get_cookies_as_dicts()\n        }\n        for sessions_id in sessions_ids\n    }\n\n    assert len(sessions_cookies) == 2\n\n    cookie_session_id = sessions_ids[0]\n    clean_session_id = sessions_ids[2]\n\n    assert cookie_session_id != clean_session_id\n\n    # When using `use_incognito_pages` there should be full cookie isolation\n    if use_incognito_pages:\n        # The initiated cookies must match in both the response and the session store\n        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}\n\n        # For a clean session, the cookie should not be in the session store or in the response\n        # This way we can be sure that no cookies are being leaked through the http client\n        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}\n    # Without `use_incognito_pages` we will have access to the session cookie,\n    # but there will be a cookie leak via PlaywrightContext\n    else:\n        # The initiated cookies must match in both the response and the session store\n        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}\n\n        # PlaywrightContext makes cookies shared by all sessions that work with it.\n        # So in this case a clean session contains the same cookies\n        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}\n\n\nasync def test_save_cookies_after_handler_processing(server_url: URL) -> None:\n    
\"\"\"Test that cookies are saved correctly.\"\"\"\n    async with SessionPool(max_pool_size=1) as session_pool:\n        crawler = PlaywrightCrawler(session_pool=session_pool)\n\n        session_ids = []\n\n        @crawler.router.default_handler\n        async def request_handler(context: PlaywrightCrawlingContext) -> None:\n            # Simulate cookies installed from an external source in the browser\n            await context.page.context.add_cookies([{'name': 'check', 'value': 'test', 'url': str(server_url)}])\n\n            if context.session:\n                session_ids.append(context.session.id)\n\n        await crawler.run([str(server_url)])\n\n        assert len(session_ids) == 1\n\n        check_session = await session_pool.get_session()\n\n        assert check_session.id == session_ids[0]\n        session_cookies = {cookie['name']: cookie['value'] for cookie in check_session.cookies.get_cookies_as_dicts()}\n\n        assert session_cookies == {'check': 'test'}\n\n\nasync def test_read_write_cookies(server_url: URL) -> None:\n    \"\"\"Test that cookies are reloaded correctly.\"\"\"\n    async with SessionPool(max_pool_size=1) as session_pool:\n        crawler = PlaywrightCrawler(session_pool=session_pool)\n\n        playwright_cookies = []\n        session_cookies = []\n\n        # Check that no errors occur when reading and writing cookies.\n        @crawler.router.default_handler\n        async def request_handler(context: PlaywrightCrawlingContext) -> None:\n            cookies = await context.page.context.cookies()\n            playwright_cookies.extend(cookies)\n\n            if context.session:\n                context.session.cookies.set_cookies_from_playwright_format(cookies)\n                session_cookies.extend(context.session.cookies.get_cookies_as_dicts())\n\n        await crawler.run([str(server_url / 'set_complex_cookies')])\n\n        # Check that the cookie was received with `partitionKey`\n        assert any('partitionKey' in 
cookie for cookie in playwright_cookies)\n\n        assert len(playwright_cookies) == len(session_cookies)\n\n\nasync def test_custom_fingerprint_uses_generator_options(server_url: URL) -> None:\n    min_width = 300\n    max_width = 600\n    min_height = 500\n    max_height = 1200\n\n    fingerprint_generator = DefaultFingerprintGenerator(\n        header_options=HeaderGeneratorOptions(browsers=['firefox'], operating_systems=['android']),\n        screen_options=ScreenOptions(\n            min_width=min_width, max_width=max_width, min_height=min_height, max_height=max_height\n        ),\n    )\n\n    crawler = PlaywrightCrawler(headless=True, fingerprint_generator=fingerprint_generator)\n\n    fingerprints = dict[str, Any]()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        for relevant_key in (\n            'window.navigator.userAgent',\n            'window.navigator.userAgentData',\n            'window.screen.height',\n            'window.screen.width',\n        ):\n            fingerprints[relevant_key] = await context.page.evaluate(f'()=>{relevant_key}')\n\n    await crawler.run([str(server_url)])\n\n    assert 'Firefox' in fingerprints['window.navigator.userAgent']\n    assert fingerprints['window.navigator.userAgentData']['platform'] == 'Android'\n    assert min_width <= int(fingerprints['window.screen.width']) <= max_width\n    assert min_height <= int(fingerprints['window.screen.height']) <= max_height\n\n\nasync def test_custom_fingerprint_matches_header_user_agent(server_url: URL) -> None:\n    \"\"\"Test that generated fingerprint and header have matching user agent.\"\"\"\n\n    crawler = PlaywrightCrawler(headless=True, fingerprint_generator=DefaultFingerprintGenerator())\n    response_headers = dict[str, str]()\n    fingerprints = dict[str, str]()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response 
= await context.response.text()\n        context_response_headers = dict(json.loads(response))\n\n        response_headers['User-Agent'] = context_response_headers['user-agent']\n        fingerprints['window.navigator.userAgent'] = await context.page.evaluate('()=>window.navigator.userAgent')\n\n    await crawler.run([str(server_url / 'headers')])\n\n    assert response_headers['User-Agent'] == fingerprints['window.navigator.userAgent']\n\n\nasync def test_ignore_http_error_status_codes(server_url: URL) -> None:\n    \"\"\"Test that error codes that would normally trigger session error can be ignored.\"\"\"\n    crawler = PlaywrightCrawler(ignore_http_error_status_codes={403})\n    target_url = str(server_url / 'status/403')\n    mocked_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        mocked_handler(context.request.url)\n\n    await crawler.run([target_url])\n\n    mocked_handler.assert_called_once_with(target_url)\n\n\nasync def test_additional_http_error_status_codes(server_url: URL) -> None:\n    \"\"\"Test that use of `additional_http_error_status_codes` can raise error on common status code.\"\"\"\n    crawler = PlaywrightCrawler(additional_http_error_status_codes={200})\n\n    mocked_handler = Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        mocked_handler(context.request.url)\n\n    await crawler.run([str(server_url)])\n\n    mocked_handler.assert_not_called()\n\n\nasync def test_launch_with_user_data_dir(tmp_path: Path, server_url: URL) -> None:\n    \"\"\"Check that the persist context is created in the specified folder in `user_data_dir`.\"\"\"\n    check_path = tmp_path / 'Default'\n    crawler = PlaywrightCrawler(\n        headless=True, user_data_dir=tmp_path, request_handler=mock.AsyncMock(return_value=None)\n    )\n\n    assert not check_path.exists()\n\n    await 
crawler.run([str(server_url)])\n\n    assert check_path.exists()\n\n\nasync def test_launch_with_user_data_dir_and_fingerprint(tmp_path: Path, server_url: URL) -> None:\n    \"\"\"Check that the persist context works with fingerprints.\"\"\"\n    check_path = tmp_path / 'Default'\n    fingerprints = dict[str, str]()\n\n    crawler = PlaywrightCrawler(\n        headless=True,\n        user_data_dir=tmp_path,\n        request_handler=mock.AsyncMock(return_value=None),\n        fingerprint_generator=DefaultFingerprintGenerator(),\n    )\n\n    @crawler.pre_navigation_hook\n    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        fingerprints['window.navigator.userAgent'] = await context.page.evaluate('()=>window.navigator.userAgent')\n\n    assert not check_path.exists()\n\n    await crawler.run([str(server_url)])\n\n    assert check_path.exists()\n\n    assert fingerprints['window.navigator.userAgent']\n    assert 'headless' not in fingerprints['window.navigator.userAgent'].lower()\n\n\nasync def test_get_snapshot(server_url: URL) -> None:\n    crawler = PlaywrightCrawler()\n\n    snapshot = None\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        nonlocal snapshot\n        snapshot = await context.get_snapshot()\n\n    await crawler.run([str(server_url)])\n\n    assert snapshot is not None\n    assert snapshot.html is not None\n    assert snapshot.screenshot is not None\n    # Check at least jpeg start and end expected bytes. 
Content is not relevant for the test.\n    assert snapshot.screenshot.startswith(b'\\xff\\xd8')\n    assert snapshot.screenshot.endswith(b'\\xff\\xd9')\n    assert snapshot.html == HELLO_WORLD.decode('utf-8')\n\n\nasync def test_error_snapshot_through_statistics(server_url: URL) -> None:\n    \"\"\"Test correct use of error snapshotter by the Playwright crawler.\n\n    In this test the crawler will visit 4 pages.\n    - 2 x page endpoints will return the same error\n    - homepage endpoint will return unique error\n    - headers endpoint will return no error\n    \"\"\"\n    max_retries = 2\n    crawler = PlaywrightCrawler(\n        statistics=Statistics.with_default_state(save_error_snapshots=True), max_request_retries=max_retries\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        if 'page' in context.request.url:\n            raise RuntimeError('page error')\n        if 'headers' in context.request.url:\n            return\n        raise RuntimeError('home error')\n\n    await crawler.run(\n        [str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')]\n    )\n\n    kvs = await crawler.get_key_value_store()\n    kvs_content = {}\n\n    async for key_info in kvs.iterate_keys():\n        # Skip any non-error snapshot keys, e.g. __RQ_STATE_.\n        if 'ERROR_SNAPSHOT' not in key_info.key:\n            continue\n        kvs_content[key_info.key] = await kvs.get_value(key_info.key)\n\n        assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS)\n        if key_info.key.endswith('.jpg'):\n            # Check at least jpeg start and end expected bytes. 
Content is not relevant for the test.\n            assert kvs_content[key_info.key].startswith(b'\\xff\\xd8')\n            assert kvs_content[key_info.key].endswith(b'\\xff\\xd9')\n        elif 'page' in key_info.key:\n            assert kvs_content[key_info.key] == GENERIC_RESPONSE.decode('utf-8')\n        else:\n            assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')\n\n    # Three requests fail (each retried twice) with only 2 unique errors -> 4 (2 x (html and jpg)) artifacts expected.\n    assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1)\n    assert crawler.statistics.error_tracker.unique_error_count == 2\n    assert len(list(kvs_content.keys())) == 4\n\n\nasync def test_respect_robots_txt(server_url: URL) -> None:\n    crawler = PlaywrightCrawler(respect_robots_txt_file=True)\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links()\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_visit_calls = [\n        mock.call(str(server_url / 'start_enqueue')),\n        mock.call(str(server_url / 'sub_index')),\n        mock.call(str(server_url / 'base_page')),\n        mock.call(str(server_url / 'base_subpath/page_5')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:\n    \"\"\"Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt.\"\"\"\n    visit = mock.Mock()\n    fail = mock.Mock()\n    crawler = PlaywrightCrawler(\n        respect_robots_txt_file=True,\n        max_request_retries=0,\n    )\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        await 
context.enqueue_links(strategy='all')\n\n    @crawler.failed_request_handler\n    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        fail(context.request.url)\n\n    await crawler.run([str(server_url / 'problematic_links')])\n\n    # Email must be skipped\n    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.\n    expected_visit_calls = [\n        mock.call(str(server_url / 'problematic_links')),\n        mock.call('https://avatars.githubusercontent.com/apify'),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n    # The budplaceholder.com does not exist.\n    expected_fail_calls = [\n        mock.call('https://budplaceholder.com/'),\n    ]\n    fail.assert_has_calls(expected_fail_calls, any_order=True)\n\n\nasync def test_on_skipped_request(server_url: URL) -> None:\n    crawler = PlaywrightCrawler(respect_robots_txt_file=True)\n    skip = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        await context.enqueue_links()\n\n    @crawler.on_skipped_request\n    async def skipped_hook(url: str, _reason: SkippedReason) -> None:\n        skip(url)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    expected_skip_calls = [\n        mock.call(str(server_url / 'page_1')),\n        mock.call(str(server_url / 'page_2')),\n        mock.call(str(server_url / 'page_3')),\n        mock.call(str(server_url / 'page_4')),\n    ]\n    skip.assert_has_calls(expected_skip_calls, any_order=True)\n\n\nasync def test_send_request(server_url: URL) -> None:\n    check_data: dict[str, Any] = {}\n\n    crawler = PlaywrightCrawler()\n\n    @crawler.pre_navigation_hook\n    async def pre_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        send_request_response = await context.send_request(str(server_url / 'user-agent'))\n        check_data['pre_send_request'] = 
dict(json.loads(await send_request_response.read()))\n\n    @crawler.post_navigation_hook\n    async def post_hook(context: PlaywrightPostNavCrawlingContext) -> None:\n        send_request_response = await context.send_request(str(server_url / 'user-agent'))\n        check_data['post_send_request'] = dict(json.loads(await send_request_response.read()))\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response = await context.response.text()\n        check_data['default'] = dict(json.loads(response))\n        send_request_response = await context.send_request(str(server_url / 'user-agent'))\n        check_data['send_request'] = dict(json.loads(await send_request_response.read()))\n\n    await crawler.run([str(server_url / 'user-agent')])\n\n    assert check_data['default'].get('user-agent') is not None\n    assert check_data['send_request'].get('user-agent') is not None\n\n    assert check_data['pre_send_request'] == check_data['send_request']\n    assert check_data['post_send_request'] == check_data['send_request']\n    assert check_data['default'] == check_data['send_request']\n\n\nasync def test_send_request_with_client(server_url: URL) -> None:\n    \"\"\"Check that `send_request` uses the custom HTTP client passed to the crawler.\"\"\"\n    check_data: dict[str, Any] = {}\n\n    crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        response = await context.response.text()\n        check_data['default'] = dict(json.loads(response))\n        send_request_response = await context.send_request(str(server_url / 'user-agent'))\n        check_data['send_request'] = dict(json.loads(await send_request_response.read()))\n\n    await crawler.run([str(server_url / 'user-agent')])\n\n    assert check_data['default'].get('user-agent') is not None\n    
assert check_data['send_request']['user-agent'] == 'My User-Agent'\n\n    assert check_data['default'] != check_data['send_request']\n\n\nasync def test_passing_configuration() -> None:\n    \"\"\"Check that the configuration is allowed to be passed to the PlaywrightCrawler.\"\"\"\n    service_locator.set_configuration(Configuration(log_level='INFO'))\n    configuration = Configuration(log_level='WARNING')\n\n    crawler = PlaywrightCrawler(configuration=configuration)\n\n    assert service_locator.get_configuration().log_level == 'INFO'\n    assert crawler._service_locator.get_configuration().log_level == 'WARNING'\n\n\nasync def test_extract_links(server_url: URL) -> None:\n    crawler = PlaywrightCrawler()\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        links = await context.extract_links(exclude=[Glob(f'{server_url}sub_index')])\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == str(server_url / 'page_1')\n\n\nasync def test_extract_non_href_links(server_url: URL) -> None:\n    crawler = PlaywrightCrawler()\n    extracted_links: list[str] = []\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        links = await context.extract_links(selector='li', attribute='data-href')\n        extracted_links.extend(request.url for request in links)\n\n    await crawler.run([str(server_url / 'non_href_links')])\n\n    assert len(extracted_links) == 1\n    assert extracted_links[0] == str(server_url / 'page_2')\n\n\nasync def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:\n    caplog.set_level(logging.INFO)\n    crawler = PlaywrightCrawler(configure_logging=False)\n    non_existent_page = 
'https://totally-non-existing-site.com/blablablba'\n\n    # Capture all logs from the 'crawlee' logger at INFO level or higher\n    with caplog.at_level(logging.INFO, logger='crawlee'):\n        await crawler.run([Request.from_url(non_existent_page)])\n\n    expected_summarized_log = (\n        f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}'\n    )\n\n    # Find the Playwright specific error message in the logs\n    found_playwright_message = False\n    for record in caplog.records:\n        if record.message and expected_summarized_log in record.message:\n            full_message = (record.message or '') + (record.exc_text or '')\n            assert '\\n' not in full_message\n            found_playwright_message = True\n            break\n\n    assert found_playwright_message, 'Expected log message about request handler error was not found.'\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_with_rq_param(\n    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = PlaywrightCrawler()\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    @crawler.router.default_handler\n    async def handler(context: PlaywrightCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await 
rq.fetch_next_request():\n        requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_name', 'queue_alias', 'by_id'),\n    [\n        pytest.param('named-queue', None, False, id='with rq_name'),\n        pytest.param(None, 'alias-queue', False, id='with rq_alias'),\n        pytest.param('id-queue', None, True, id='with rq_id'),\n    ],\n)\nasync def test_enqueue_links_requests_with_rq_param(\n    server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool\n) -> None:\n    crawler = PlaywrightCrawler()\n    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)\n    if by_id:\n        queue_name = None\n        queue_id = rq.id\n    else:\n        queue_id = None\n    visit_urls: set[str] = set()\n\n    check_requests: list[str] = [\n        'https://a.placeholder.com',\n        'https://b.placeholder.com',\n        'https://c.placeholder.com',\n    ]\n\n    @crawler.router.default_handler\n    async def handler(context: PlaywrightCrawlingContext) -> None:\n        visit_urls.add(context.request.url)\n        await context.enqueue_links(\n            requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all'\n        )\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n    requests_from_queue: list[str] = []\n    while request := await rq.fetch_next_request():\n        requests_from_queue.append(request.url)\n\n    assert set(requests_from_queue) == set(check_requests)\n    assert visit_urls == {str(server_url / 'start_enqueue')}\n\n    await rq.drop()\n\n\n@pytest.mark.parametrize(\n    ('queue_id', 'queue_name', 'queue_alias'),\n    [\n        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),\n        pytest.param('named-queue', None, 
'id-queue', id='rq_name and rq_id'),\n        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),\n        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),\n    ],\n)\nasync def test_enqueue_links_error_with_multi_params(\n    server_url: URL, queue_id: str | None, queue_name: str | None, queue_alias: str | None\n) -> None:\n    crawler = PlaywrightCrawler()\n\n    @crawler.router.default_handler\n    async def handler(context: PlaywrightCrawlingContext) -> None:\n        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):\n            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)\n\n    await crawler.run([str(server_url / 'start_enqueue')])\n\n\nasync def test_navigation_timeout_on_slow_page_load(server_url: URL) -> None:\n    crawler = PlaywrightCrawler(\n        navigation_timeout=timedelta(seconds=1),\n        max_request_retries=0,\n    )\n\n    request_handler = AsyncMock()\n    crawler.router.default_handler(request_handler)\n\n    failed_request_handler = AsyncMock()\n    crawler.failed_request_handler(failed_request_handler)\n\n    result = await crawler.run([str((server_url / 'slow').with_query(delay=2))])\n\n    assert result.requests_failed == 1\n    assert result.requests_finished == 0\n\n    assert request_handler.call_count == 0\n\n    assert failed_request_handler.call_count == 1\n    assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)\n\n\nasync def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:\n    crawler = PlaywrightCrawler(\n        navigation_timeout=timedelta(seconds=0.5),\n        max_request_retries=0,\n    )\n\n    request_handler = AsyncMock()\n    crawler.router.default_handler(request_handler)\n    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))\n\n    # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be 
handled\n    result = await crawler.run([str(server_url)])\n\n    assert result.requests_failed == 1\n    assert result.requests_finished == 0\n    assert request_handler.call_count == 0\n\n\nasync def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL) -> None:\n    crawler = PlaywrightCrawler(\n        request_handler_timeout=timedelta(seconds=0.5),\n        max_request_retries=0,\n    )\n\n    request_handler = AsyncMock()\n    crawler.router.default_handler(request_handler)\n\n    # Navigation takes 1 second (exceeds handler timeout), but should still succeed\n    result = await crawler.run([str((server_url / 'slow').with_query(delay=1))])\n\n    assert result.requests_failed == 0\n    assert result.requests_finished == 1\n    assert request_handler.call_count == 1\n\n\nasync def test_request_state(server_url: URL) -> None:\n    queue = await RequestQueue.open(alias='playwright_request_state')\n    crawler = PlaywrightCrawler(request_manager=queue)\n\n    success_request = Request.from_url(str(server_url))\n    assert success_request.state == RequestState.UNPROCESSED\n\n    error_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})\n\n    requests_states: dict[str, dict[str, RequestState]] = {success_request.unique_key: {}, error_request.unique_key: {}}\n\n    @crawler.pre_navigation_hook\n    async def pre_navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        requests_states[context.request.unique_key]['pre_navigation'] = context.request.state\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        if context.request.user_data.get('cause_error'):\n            raise ValueError('Caused error as requested')\n        requests_states[context.request.unique_key]['request_handler'] = context.request.state\n\n    @crawler.error_handler\n    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> 
None:\n        requests_states[context.request.unique_key]['error_handler'] = context.request.state\n\n    @crawler.failed_request_handler\n    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:\n        requests_states[context.request.unique_key]['failed_request_handler'] = context.request.state\n\n    await crawler.run([success_request, error_request])\n\n    handled_success_request = await queue.get_request(success_request.unique_key)\n\n    assert handled_success_request is not None\n    assert handled_success_request.state == RequestState.DONE\n\n    assert requests_states[success_request.unique_key] == {\n        'pre_navigation': RequestState.BEFORE_NAV,\n        'request_handler': RequestState.REQUEST_HANDLER,\n    }\n\n    handled_error_request = await queue.get_request(error_request.unique_key)\n    assert handled_error_request is not None\n    assert handled_error_request.state == RequestState.ERROR\n\n    assert requests_states[error_request.unique_key] == {\n        'pre_navigation': RequestState.BEFORE_NAV,\n        'error_handler': RequestState.ERROR_HANDLER,\n        'failed_request_handler': RequestState.ERROR,\n    }\n\n    await queue.drop()\n\n\nasync def test_enqueue_links_with_limit(server_url: URL) -> None:\n    start_url = str(server_url / 'sub_index')\n    requests = [start_url]\n\n    crawler = PlaywrightCrawler()\n    visit = mock.Mock()\n\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        visit(context.request.url)\n        await context.enqueue_links(limit=1)\n\n    await crawler.run(requests)\n\n    # Only one link should be enqueued from sub_index due to the limit\n    expected_visit_calls = [\n        mock.call(start_url),\n        mock.call(str(server_url / 'page_3')),\n    ]\n    visit.assert_has_calls(expected_visit_calls, any_order=True)\n\n\nasync def test_playwright_crawler_pre_navigation_hook_execution(server_url: 
URL) -> None:\n    \"\"\"Test that pre-navigation hooks are executed.\"\"\"\n    crawler = PlaywrightCrawler(request_handler=AsyncMock())\n\n    call_mock = AsyncMock()\n\n    # Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(context: PlaywrightPreNavCrawlingContext) -> None:\n        await call_mock(context.page.url)\n\n    await crawler.run([str(server_url)])\n\n    # `pre_navigation_hook` is called before the request is made, so the loaded URL should be 'about:blank'.\n    call_mock.assert_called_once_with('about:blank')\n\n\nasync def test_playwright_crawler_post_navigation_hook_execution(server_url: URL) -> None:\n    \"\"\"Test that post-navigation hooks are executed.\"\"\"\n    crawler = PlaywrightCrawler(request_handler=AsyncMock())\n\n    call_mock = AsyncMock()\n\n    # Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook(context: PlaywrightPostNavCrawlingContext) -> None:\n        await call_mock(context.page.url)\n\n    await crawler.run([str(server_url)])\n\n    # `post_navigation_hook` is called after the request is made, so the loaded URL should be the result URL.\n    call_mock.assert_called_once_with(str(server_url))\n\n\nasync def test_playwright_navigation_hooks_order(server_url: URL) -> None:\n    \"\"\"Test that post-navigation hooks are executed in correct order.\"\"\"\n    execution_order = []\n\n    crawler = PlaywrightCrawler()\n\n    #  Register final context handler.\n    @crawler.router.default_handler\n    async def default_request_handler(_context: PlaywrightCrawlingContext) -> None:\n        execution_order.append('final handler')\n\n    #  Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook_1(_context: PlaywrightPreNavCrawlingContext) -> None:\n        execution_order.append('pre-navigation-hook 1')\n\n    #  Register pre navigation hook.\n    @crawler.pre_navigation_hook\n    async def pre_nav_hook(_context: 
PlaywrightPreNavCrawlingContext) -> None:\n        execution_order.append('pre-navigation-hook 2')\n\n    #  Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook_1(_context: PlaywrightPostNavCrawlingContext) -> None:\n        execution_order.append('post-navigation-hook 1')\n\n    #  Register post navigation hook.\n    @crawler.post_navigation_hook\n    async def post_nav_hook_2(_context: PlaywrightPostNavCrawlingContext) -> None:\n        execution_order.append('post-navigation-hook 2')\n\n    await crawler.run([str(server_url)])\n\n    assert execution_order == [\n        'pre-navigation-hook 1',\n        'pre-navigation-hook 2',\n        'post-navigation-hook 1',\n        'post-navigation-hook 2',\n        'final handler',\n    ]\n"
  },
  {
    "path": "tests/unit/crawlers/_playwright/test_utils.py",
    "content": "from playwright.async_api import async_playwright\nfrom yarl import URL\n\nfrom crawlee.crawlers._playwright._utils import block_requests, infinite_scroll\n\n\nasync def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:\n    \"\"\"Checks that infinite_scroll loads all items on a page with infinite scrolling.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch(headless=True)\n        page = await browser.new_page()\n\n        target_url = str(server_url / 'infinite_scroll')\n\n        # Get data with manual scrolling\n        await page.goto(target_url)\n\n        manual_items = []\n        for _ in range(4):\n            items = await page.query_selector_all('.item')\n            manual_items = items\n            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')\n            await page.wait_for_timeout(1000)\n\n        # Reset page\n        await page.close()\n        page = await browser.new_page()\n        await page.goto(target_url)\n\n        # Get data with infinite_scroll utility\n        before_scroll = await page.query_selector_all('.item')\n        assert len(before_scroll) != len(manual_items)\n        assert len(before_scroll) == 10\n\n        await infinite_scroll(page)\n\n        after_scroll = await page.query_selector_all('.item')\n\n        assert len(before_scroll) < len(after_scroll)\n        assert len(manual_items) == len(after_scroll)\n\n        await browser.close()\n\n\nasync def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:\n    \"\"\"Checks that infinite_scroll does not call error on a page without infinite scrolling.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch(headless=True)\n        page = await browser.new_page()\n\n        await page.goto(str(server_url))\n\n        await infinite_scroll(page)\n\n        title = await page.title()\n\n        assert title == 'Hello, world!'\n\n       
 await browser.close()\n\n\nasync def test_double_call_infinite_scroll(server_url: URL) -> None:\n    \"\"\"Checks that calling infinite_scroll twice does not load more items the second time.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch(headless=True)\n        page = await browser.new_page()\n\n        await page.goto(str(server_url / 'infinite_scroll'))\n\n        await infinite_scroll(page)\n        first_count = len(await page.query_selector_all('.item'))\n\n        await infinite_scroll(page)\n        second_count = len(await page.query_selector_all('.item'))\n\n        assert first_count == second_count\n\n        await browser.close()\n\n\nasync def test_block_requests_default(server_url: URL) -> None:\n    \"\"\"Checks that block_requests blocks the correct resources by default.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch()\n\n        target_url = str(server_url / 'resource_loading_page')\n\n        # Default behavior, all resources load\n        page = await browser.new_page()\n        loaded_urls_no_block = []\n\n        page.on('requestfinished', lambda req: loaded_urls_no_block.append(req.url.rsplit('/', 1)[-1]))\n        await page.goto(target_url)\n        await page.wait_for_load_state('networkidle')\n        await page.close()\n\n        # With blocking — collect loaded resources\n        page = await browser.new_page()\n        loaded_urls_blocked = []\n\n        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))\n        await block_requests(page)\n        await page.goto(target_url)\n        await page.wait_for_load_state('networkidle')\n        await page.close()\n\n        await browser.close()\n\n    # Without blocking, both resources should load\n    assert set(loaded_urls_no_block) == {'resource_loading_page', 'test.js', 'test.png'}\n\n    # With blocking, only JS should load\n    assert 
set(loaded_urls_blocked) == {'resource_loading_page', 'test.js'}\n\n\nasync def test_block_requests_with_extra_patterns(server_url: URL) -> None:\n    \"\"\"Checks that block_requests blocks the correct resources with extra patterns.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch()\n\n        target_url = str(server_url / 'resource_loading_page')\n\n        page = await browser.new_page()\n        loaded_urls_blocked = []\n\n        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))\n        await block_requests(page, extra_url_patterns=['*.js'])\n        await page.goto(target_url)\n        await page.wait_for_load_state('networkidle')\n        await page.close()\n\n        await browser.close()\n\n        # With blocking, only HTML should load\n        assert set(loaded_urls_blocked) == {'resource_loading_page'}\n\n\nasync def test_block_requests_with_custom_patterns(server_url: URL) -> None:\n    \"\"\"Checks that block_requests blocks the correct resources with custom patterns.\"\"\"\n    async with async_playwright() as p:\n        browser = await p.chromium.launch()\n\n        target_url = str(server_url / 'resource_loading_page')\n\n        page = await browser.new_page()\n        loaded_urls_blocked = []\n\n        page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1]))\n        await block_requests(page, url_patterns=['*.js'])\n        await page.goto(target_url)\n        await page.wait_for_load_state('networkidle')\n        await page.close()\n\n        await browser.close()\n\n        # With blocking, only PNG should load\n        assert set(loaded_urls_blocked) == {'resource_loading_page', 'test.png'}\n"
  },
  {
    "path": "tests/unit/events/test_event_manager.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport logging\nfrom datetime import timedelta\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any\nfrom unittest import mock\nfrom unittest.mock import AsyncMock, MagicMock\n\nimport pytest\n\nfrom crawlee.events import Event, EventManager, EventSystemInfoData\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n\n@pytest.fixture\nasync def event_manager() -> AsyncGenerator[EventManager, None]:\n    async with EventManager() as event_manager:\n        yield event_manager\n\n\n@pytest.fixture\ndef event_system_info_data() -> EventSystemInfoData:\n    return MagicMock(spec=EventSystemInfoData)\n\n\n@pytest.fixture\ndef async_listener() -> AsyncMock:\n    async def async_listener(payload: Any) -> None:\n        pass\n\n    al = AsyncMock()\n    update_wrapper(al, async_listener)\n    return al\n\n\n@pytest.fixture\ndef sync_listener() -> MagicMock:\n    def sync_listener(payload: Any) -> None:\n        pass\n\n    sl = MagicMock()\n    update_wrapper(sl, sync_listener)\n    return sl\n\n\nasync def test_emit_invokes_registered_sync_listener(\n    sync_listener: MagicMock,\n    event_manager: EventManager,\n    event_system_info_data: EventSystemInfoData,\n) -> None:\n    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n\n    await asyncio.sleep(0.1)  # Allow some time for the event to be processed\n\n    assert sync_listener.call_count == 1\n    assert sync_listener.call_args[0] == (event_system_info_data,)\n\n\nasync def test_emit_invokes_both_sync_and_async_listeners(\n    sync_listener: MagicMock,\n    async_listener: AsyncMock,\n    event_manager: EventManager,\n    event_system_info_data: EventSystemInfoData,\n) -> None:\n    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)\n    event_manager.on(event=Event.SYSTEM_INFO, 
listener=async_listener)\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n\n    await asyncio.sleep(0.1)  # Allow some time for the event to be processed\n\n    assert async_listener.call_count == 1\n    assert async_listener.call_args[0] == (event_system_info_data,)\n\n    assert sync_listener.call_count == 1\n    assert sync_listener.call_args[0] == (event_system_info_data,)\n\n\nasync def test_emit_event_with_no_listeners(\n    event_manager: EventManager,\n    event_system_info_data: EventSystemInfoData,\n    async_listener: AsyncMock,\n) -> None:\n    # Register a listener for a different event\n    event_manager.on(event=Event.ABORTING, listener=async_listener)\n\n    # Attempt to emit an event for which no listeners are registered, it should not fail\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n    await asyncio.sleep(0.1)  # Allow some time for the event to be processed\n\n    # Ensure the listener for the other event was not called\n    assert async_listener.call_count == 0\n\n\nasync def test_emit_invokes_parameterless_listener(\n    event_manager: EventManager,\n    event_system_info_data: EventSystemInfoData,\n) -> None:\n    sync_mock = MagicMock()\n\n    def sync_listener() -> None:\n        sync_mock()\n\n    async_mock = MagicMock()\n\n    async def async_listener() -> None:\n        async_mock()\n\n    event_manager.on(event=Event.SYSTEM_INFO, listener=sync_listener)\n    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)\n\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n    await asyncio.sleep(0.1)  # Allow some time for the event to be processed\n\n    assert sync_mock.call_count == 1\n    assert async_mock.call_count == 1\n\n\nasync def test_remove_nonexistent_listener_does_not_fail(\n    async_listener: AsyncMock,\n    event_manager: EventManager,\n) -> None:\n    # Attempt to remove a specific listener that was never 
added.\n    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)\n    # Attempt to remove all listeners.\n    event_manager.off(event=Event.ABORTING)\n\n\nasync def test_removed_listener_not_invoked_on_emit(\n    async_listener: AsyncMock,\n    event_manager: EventManager,\n    event_system_info_data: EventSystemInfoData,\n) -> None:\n    event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)\n    event_manager.off(event=Event.SYSTEM_INFO, listener=async_listener)\n    event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n\n    await asyncio.sleep(0.1)  # Allow some time for the event to be processed\n    assert async_listener.call_count == 0\n\n\nasync def test_close_clears_listeners_and_tasks(async_listener: AsyncMock) -> None:\n    async with EventManager() as event_manager:\n        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)\n\n    assert async_listener.call_count == 0\n    assert len(event_manager._listener_tasks) == 0\n    assert len(event_manager._listeners_to_wrappers) == 0\n\n\nasync def test_close_after_emit_processes_event(\n    async_listener: AsyncMock,\n    event_system_info_data: EventSystemInfoData,\n) -> None:\n    async with EventManager() as event_manager:\n        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n\n    # Event should be processed before the event manager is closed\n    assert async_listener.call_count == 1\n    assert async_listener.call_args[0] == (event_system_info_data,)\n\n    assert len(event_manager._listener_tasks) == 0\n    assert len(event_manager._listeners_to_wrappers) == 0\n\n\nasync def test_wait_for_all_listeners_cancelled_error(\n    monkeypatch: pytest.MonkeyPatch,\n    caplog: pytest.LogCaptureFixture,\n) -> None:\n    # Simulate long-running listener tasks\n    async def long_running_listener() -> None:\n        await 
asyncio.sleep(10)\n\n    # Define a side effect function that raises CancelledError\n    async def mock_async_wait(*_: Any, **__: Any) -> None:\n        raise asyncio.CancelledError\n\n    with pytest.raises(asyncio.CancelledError), caplog.at_level(logging.WARNING):  # noqa: PT012\n        async with EventManager(close_timeout=timedelta(milliseconds=10)) as event_manager:\n            event_manager.on(event=Event.SYSTEM_INFO, listener=long_running_listener)\n\n            # Use monkeypatch to replace asyncio.wait with mock_async_wait\n            monkeypatch.setattr('asyncio.wait', mock_async_wait)\n\n\nasync def test_methods_raise_error_when_not_active(event_system_info_data: EventSystemInfoData) -> None:\n    event_manager = EventManager()\n\n    assert event_manager.active is False\n\n    with pytest.raises(RuntimeError, match=r'EventManager is not active.'):\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n\n    with pytest.raises(RuntimeError, match=r'EventManager is not active.'):\n        await event_manager.wait_for_all_listeners_to_complete()\n\n    with pytest.raises(RuntimeError, match=r'EventManager is already active.'):\n        async with event_manager, event_manager:\n            pass\n\n    async with event_manager:\n        event_manager.emit(event=Event.SYSTEM_INFO, event_data=event_system_info_data)\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        assert event_manager.active is True\n\n\nasync def test_event_manager_in_context_persistence() -> None:\n    \"\"\"Test that entering the `EventManager` context emits persist state event at least once.\"\"\"\n    event_manager = EventManager()\n\n    with mock.patch.object(event_manager, '_emit_persist_state_event', AsyncMock()) as mocked_emit_persist_state_event:\n        async with event_manager:\n            pass\n\n    assert mocked_emit_persist_state_event.call_count >= 1\n"
  },
  {
    "path": "tests/unit/events/test_local_event_manager.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom typing import Any\nfrom unittest.mock import AsyncMock\n\nfrom crawlee.events import LocalEventManager\nfrom crawlee.events._types import Event, EventSystemInfoData\n\n\nasync def test_emit_system_info_event() -> None:\n    mocked_listener = AsyncMock()\n\n    async def async_listener(payload: Any) -> None:\n        await mocked_listener(payload)\n\n    system_info_interval = timedelta(milliseconds=50)\n    test_tolerance_coefficient = 10\n    async with LocalEventManager(system_info_interval=system_info_interval) as event_manager:\n        event_manager.on(event=Event.SYSTEM_INFO, listener=async_listener)\n        await asyncio.sleep(system_info_interval.total_seconds() * test_tolerance_coefficient)\n\n    assert mocked_listener.call_count >= 1\n    assert isinstance(mocked_listener.call_args[0][0], EventSystemInfoData)\n"
  },
  {
    "path": "tests/unit/fingerprint_suite/test_adapters.py",
    "content": "from collections.abc import Iterable\n\nimport pytest\nfrom browserforge.headers import Browser\n\nfrom crawlee.fingerprint_suite import (\n    DefaultFingerprintGenerator,\n    HeaderGeneratorOptions,\n    ScreenOptions,\n)\nfrom crawlee.fingerprint_suite._browserforge_adapter import PatchedHeaderGenerator\nfrom crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD\n\n\ndef test_fingerprint_generator_has_default() -> None:\n    \"\"\"Test that header generator can work without any options.\"\"\"\n    assert DefaultFingerprintGenerator().generate()\n\n\ndef test_fingerprint_generator_some_options_stress_test() -> None:\n    \"\"\"Test that header generator can work consistently.\"\"\"\n    fingerprint_generator = DefaultFingerprintGenerator(\n        mock_web_rtc=True,\n        screen_options=ScreenOptions(min_width=500),\n        header_options=HeaderGeneratorOptions(strict=True),\n    )\n\n    for _ in range(20):\n        fingerprint = fingerprint_generator.generate()\n\n        assert fingerprint.mockWebRTC is True\n        assert fingerprint.screen.availWidth > 500\n\n\ndef test_fingerprint_generator_all_options() -> None:\n    \"\"\"Test that header generator can work with all the options. 
Some most basic checks of fingerprint.\n\n    Fingerprint generation option might have no effect if there is no fingerprint sample present in collected data.\n    \"\"\"\n    min_width = 600\n    max_width = 1800\n    min_height = 400\n    max_height = 1200\n\n    fingerprint = DefaultFingerprintGenerator(\n        mock_web_rtc=True,\n        slim=True,\n        screen_options=ScreenOptions(\n            min_width=min_width,\n            max_width=max_width,\n            min_height=min_height,\n            max_height=max_height,\n        ),\n        header_options=HeaderGeneratorOptions(\n            strict=True,\n            browsers=['firefox'],\n            operating_systems=['windows'],\n            devices=['mobile'],\n            locales=['en'],  #  Does not generate any other values than `en-US` regardless of the input in browserforge\n            http_version='2',  # Http1 does not work in browserforge\n        ),\n    ).generate()\n\n    assert fingerprint.screen.availWidth >= min_width\n    assert fingerprint.screen.availWidth <= max_width\n    assert fingerprint.screen.availHeight >= min_height\n    assert fingerprint.screen.availHeight <= max_height\n\n    assert fingerprint.mockWebRTC is True\n    assert fingerprint.slim is True\n    assert 'Firefox' in fingerprint.navigator.userAgent\n    assert 'Win' in fingerprint.navigator.oscpu\n    assert 'en-US' in fingerprint.navigator.languages\n\n\n@pytest.mark.parametrize(\n    'browser',\n    [\n        'firefox',\n        ['firefox'],\n        [Browser(name='firefox')],\n    ],\n)\ndef test_patched_header_generator_generate(browser: Iterable[str | Browser]) -> None:\n    \"\"\"Test that PatchedHeaderGenerator works with all the possible types correctly.\"\"\"\n    header = PatchedHeaderGenerator().generate(browser=browser)\n    assert any(keyword in header['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD['firefox'])\n"
  },
  {
    "path": "tests/unit/fingerprint_suite/test_header_generator.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.fingerprint_suite import HeaderGenerator\nfrom crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values\nfrom crawlee.fingerprint_suite._consts import (\n    BROWSER_TYPE_HEADER_KEYWORD,\n)\n\nif TYPE_CHECKING:\n    from crawlee.fingerprint_suite._types import SupportedBrowserType\n\n\ndef test_get_common_headers(header_network: dict) -> None:\n    header_generator = HeaderGenerator()\n    headers = header_generator.get_common_headers()\n\n    assert 'Accept' in headers\n    assert headers['Accept'] in get_available_header_values(header_network, {'Accept', 'accept'})\n    assert 'Accept-Language' in headers\n\n\ndef test_get_random_user_agent_header() -> None:\n    \"\"\"Test that a random User-Agent header is generated.\"\"\"\n    header_generator = HeaderGenerator()\n    headers = header_generator.get_random_user_agent_header()\n\n    assert 'User-Agent' in headers\n    assert headers['User-Agent']\n\n\n@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari'])\ndef test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None:\n    \"\"\"Test that the User-Agent header is consistently generated correctly.\n\n    (Very fast even when stress tested.)\"\"\"\n    for _ in range(100):\n        header_generator = HeaderGenerator()\n        headers = header_generator.get_user_agent_header(browser_type=browser_type)\n\n        assert 'User-Agent' in headers\n        assert any(keyword in headers['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type])\n        assert headers['User-Agent'] in get_available_header_values(header_network, {'user-agent', 'User-Agent'})\n\n\ndef test_get_user_agent_header_invalid_browser_type() -> None:\n    \"\"\"Test that an invalid browser type raises a ValueError.\"\"\"\n    header_generator = HeaderGenerator()\n\n  
  with pytest.raises(ValueError, match=r'Unsupported browser type'):\n        header_generator.get_user_agent_header(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]\n\n\ndef test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None:\n    \"\"\"Test that Sec-Ch-Ua headers are generated correctly for Chrome.\"\"\"\n    header_generator = HeaderGenerator()\n    headers = header_generator.get_sec_ch_ua_headers(browser_type='chrome')\n\n    assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')\n    assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')\n    assert headers.get('sec-ch-ua-platform') in get_available_header_values(header_network, 'sec-ch-ua-platform')\n\n\ndef test_get_sec_ch_ua_headers_firefox() -> None:\n    \"\"\"Test that sec-ch-ua headers are not generated for Firefox.\"\"\"\n    header_generator = HeaderGenerator()\n    headers = header_generator.get_sec_ch_ua_headers(browser_type='firefox')\n\n    assert not headers\n\n\ndef test_get_sec_ch_ua_headers_invalid_browser_type() -> None:\n    \"\"\"Test that an invalid browser type raises a ValueError for sec-ch-ua headers.\"\"\"\n    header_generator = HeaderGenerator()\n\n    with pytest.raises(ValueError, match=r'Unsupported browser type'):\n        header_generator.get_sec_ch_ua_headers(browser_type='invalid_browser')  # ty: ignore[invalid-argument-type]\n"
  },
  {
    "path": "tests/unit/http_clients/test_http_clients.py",
    "content": "from __future__ import annotations\n\nimport os\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom curl_cffi import CurlHttpVersion\n\nfrom crawlee import Request\nfrom crawlee.errors import ProxyError\nfrom crawlee.http_clients import CurlImpersonateHttpClient, HttpClient, HttpxHttpClient, ImpitHttpClient\nfrom crawlee.statistics import Statistics\nfrom tests.unit.server_endpoints import HELLO_WORLD\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from _pytest.fixtures import SubRequest\n    from yarl import URL\n\n    from crawlee.proxy_configuration import ProxyInfo\n\n\n@pytest.fixture\nasync def custom_http_client(request: SubRequest) -> AsyncGenerator[HttpClient]:\n    \"\"\"Helper fixture to reduce code duplication.\n\n    If clients are not initialized, create their default instances.\n    Return client in active context, leave the context after the test.\"\"\"\n\n    client = request.param if isinstance(request.param, HttpClient) else request.param()\n    async with client as _:\n        yield _\n\n\nasync def test_http_1(http_client: HttpClient, server_url: URL) -> None:\n    response = await http_client.send_request(str(server_url))\n    assert response.http_version == 'HTTP/1.1'\n\n\n@pytest.mark.parametrize(\n    'custom_http_client',\n    [\n        pytest.param(CurlImpersonateHttpClient(http_version=CurlHttpVersion.V2_0), id='curl'),\n        pytest.param(HttpxHttpClient(http1=False, http2=True), id='httpx'),\n        pytest.param(ImpitHttpClient(), id='impit'),\n    ],\n    indirect=['custom_http_client'],\n)\nasync def test_http_2(custom_http_client: HttpClient) -> None:\n    response = await custom_http_client.send_request('https://apify.com/')\n    assert response.http_version == 'HTTP/2'\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')\nasync def test_crawl_with_proxy(\n    http_client: HttpClient,\n    proxy: ProxyInfo,\n    server_url: URL,\n) -> None:\n    url = 
str(server_url / 'status/222')\n    request = Request.from_url(url)\n\n    async with Statistics.with_default_state() as statistics:\n        result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)\n\n    assert result.http_response.status_code == 222  # 222 - authentication successful\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')\nasync def test_crawl_with_proxy_disabled(\n    http_client: HttpClient,\n    disabled_proxy: ProxyInfo,\n) -> None:\n    url = 'https://apify.com/'\n    request = Request.from_url(url)\n\n    with pytest.raises(ProxyError):\n        async with Statistics.with_default_state() as statistics:\n            await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics)\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')\nasync def test_send_request_with_proxy(\n    http_client: HttpClient,\n    proxy: ProxyInfo,\n    server_url: URL,\n) -> None:\n    url = str(server_url / 'status/222')\n\n    response = await http_client.send_request(url, proxy_info=proxy)\n    assert response.status_code == 222  # 222 - authentication successful\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')\nasync def test_send_request_with_proxy_disabled(\n    http_client: HttpClient,\n    disabled_proxy: ProxyInfo,\n) -> None:\n    url = 'https://apify.com/'\n\n    with pytest.raises(ProxyError):\n        await http_client.send_request(url, proxy_info=disabled_proxy)\n\n\nasync def test_crawl_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None:\n    target_url = str(server_url / 'status/200')\n    redirect_url = str((server_url / 'redirect').update_query(url=target_url))\n    request = Request.from_url(redirect_url)\n    crawling_result = await http_client.crawl(request)\n\n    assert crawling_result.http_response.status_code == 200\n    assert request.loaded_url == target_url\n\n\n@pytest.mark.parametrize(\n    
'custom_http_client',\n    [\n        pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'),\n        pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'),\n        pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'),\n    ],\n    indirect=['custom_http_client'],\n)\nasync def test_crawl_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None:\n    target_url = str(server_url / 'status/200')\n    redirect_url = str((server_url / 'redirect').update_query(url=target_url))\n    request = Request.from_url(redirect_url)\n\n    crawling_result = await custom_http_client.crawl(request)\n\n    assert crawling_result.http_response.status_code == 302\n    assert crawling_result.http_response.headers['Location'] == target_url\n    assert request.loaded_url == redirect_url\n\n\nasync def test_send_request_allow_redirects_by_default(http_client: HttpClient, server_url: URL) -> None:\n    target_url = str(server_url / 'status/200')\n    redirect_url = str((server_url / 'redirect').update_query(url=target_url))\n\n    response = await http_client.send_request(redirect_url)\n\n    assert response.status_code == 200\n\n\n@pytest.mark.parametrize(\n    'custom_http_client',\n    [\n        pytest.param(CurlImpersonateHttpClient(allow_redirects=False), id='curl'),\n        pytest.param(HttpxHttpClient(follow_redirects=False), id='httpx'),\n        pytest.param(ImpitHttpClient(follow_redirects=False), id='impit'),\n    ],\n    indirect=['custom_http_client'],\n)\nasync def test_send_request_allow_redirects_false(custom_http_client: HttpClient, server_url: URL) -> None:\n    target_url = str(server_url / 'status/200')\n    redirect_url = str((server_url / 'redirect').update_query(url=target_url))\n\n    response = await custom_http_client.send_request(redirect_url)\n\n    assert response.status_code == 302\n    assert response.headers['Location'] == target_url\n\n\nasync def test_stream(http_client: HttpClient, 
server_url: URL) -> None:\n    content_body: bytes = b''\n\n    async with http_client.stream(str(server_url)) as response:\n        assert response.status_code == 200\n        async for chunk in response.read_stream():\n            content_body += chunk\n\n    assert content_body == HELLO_WORLD\n\n\nasync def test_stream_error_double_read_stream(http_client: HttpClient, server_url: URL) -> None:\n    async with http_client.stream(str(server_url)) as response:\n        assert response.status_code == 200\n        content_body_first: bytes = b''\n        async for chunk in response.read_stream():\n            content_body_first += chunk\n\n        with pytest.raises(RuntimeError):\n            [chunk async for chunk in response.read_stream()]\n\n    assert content_body_first == HELLO_WORLD\n\n\nasync def test_stream_error_for_read(http_client: HttpClient, server_url: URL) -> None:\n    async with http_client.stream(str(server_url)) as response:\n        assert response.status_code == 200\n\n        with pytest.raises(RuntimeError):\n            await response.read()\n\n\nasync def test_send_request_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None:\n    response = await http_client.send_request(str(server_url))\n\n    assert response.status_code == 200\n    with pytest.raises(RuntimeError):\n        [item async for item in response.read_stream()]\n\n\nasync def test_send_crawl_error_for_read_stream(http_client: HttpClient, server_url: URL) -> None:\n    response = await http_client.crawl(Request.from_url(str(server_url)))\n    http_response = response.http_response\n\n    assert http_response.status_code == 200\n    with pytest.raises(RuntimeError):\n        [item async for item in http_response.read_stream()]\n\n\n@pytest.mark.parametrize(\n    'custom_http_client',\n    [\n        pytest.param(CurlImpersonateHttpClient(), id='curl'),\n        pytest.param(HttpxHttpClient(), id='httpx'),\n        pytest.param(ImpitHttpClient(), id='impit'),\n   
 ],\n)\nasync def test_reuse_context_manager(custom_http_client: HttpClient, server_url: URL) -> None:\n    async with custom_http_client:\n        response = await custom_http_client.send_request(str(server_url))\n        assert response.status_code == 200\n\n    # Reusing the context manager should not raise an error\n    async with custom_http_client:\n        response = await custom_http_client.send_request(str(server_url))\n        assert response.status_code == 200\n\n\nasync def test_work_after_cleanup(http_client: HttpClient, server_url: URL) -> None:\n    response = await http_client.send_request(str(server_url))\n    assert response.status_code == 200\n\n    # Cleanup the client\n    await http_client.cleanup()\n\n    # After cleanup, the client should still work\n    response = await http_client.send_request(str(server_url))\n    assert response.status_code == 200\n\n\nasync def test_compressed_chunked_stream(http_client: HttpClient, server_url: URL) -> None:\n    content_body: bytes = b''\n\n    async with http_client.stream(str(server_url / 'get_compressed')) as response:\n        assert response.status_code == 200\n        async for chunk in response.read_stream():\n            content_body += chunk\n\n    assert content_body == HELLO_WORLD * 1000\n"
  },
  {
    "path": "tests/unit/http_clients/test_httpx.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values\nfrom crawlee.fingerprint_suite._consts import COMMON_ACCEPT_LANGUAGE\nfrom crawlee.http_clients import HttpxHttpClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from yarl import URL\n\n    from crawlee.http_clients import HttpClient\n\n\n@pytest.fixture\nasync def http_client() -> AsyncGenerator[HttpClient]:\n    async with HttpxHttpClient(http2=False) as client:\n        yield client\n\n\nasync def test_common_headers_and_user_agent(server_url: URL, header_network: dict) -> None:\n    \"\"\"Test that the relevant headers use header values from header generator instead of default Httpx headers.\n\n    Httpx uses own headers by default which is not desired as it could increase blocking chances.\n    \"\"\"\n    client = HttpxHttpClient()\n\n    response = await client.send_request(str(server_url / 'headers'))\n    response_headers = json.loads((await response.read()).decode())\n\n    assert 'accept' in response_headers\n    assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'})\n\n    assert 'accept-language' in response_headers\n    assert response_headers['accept-language'] == COMMON_ACCEPT_LANGUAGE\n\n    # By default, HTTPX uses its own User-Agent, which should be replaced by the one from the header generator.\n    assert 'user-agent' in response_headers\n    assert 'python-httpx' not in response_headers['user-agent']\n    assert response_headers['user-agent'] in get_available_header_values(header_network, {'User-Agent', 'user-agent'})\n"
  },
  {
    "path": "tests/unit/otel/test_crawler_instrumentor.py",
    "content": "import io\nimport json\nimport re\nfrom unittest import mock\n\nfrom opentelemetry.sdk.resources import Resource\nfrom opentelemetry.sdk.trace import TracerProvider\nfrom opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor\nfrom opentelemetry.trace import set_tracer_provider\nfrom yarl import URL\n\nfrom crawlee import ConcurrencySettings\nfrom crawlee.crawlers import ParselCrawler\nfrom crawlee.otel.crawler_instrumentor import CrawlerInstrumentor\nfrom crawlee.storages import Dataset\n\n\nasync def test_crawler_instrumentor_capability(server_url: URL) -> None:\n    \"\"\"Test OpenTelemetry instrumentation capability of the crawler.\n\n    Instrument the crawler and one additional class and check that telemetry data is generated correctly.\n    Telemetry data is redirected to an in-memory file for testing purposes.\"\"\"\n\n    resource = Resource.create(\n        {\n            'service.name': 'ExampleCrawler',\n            'service.version': '1.0.0',\n            'environment': 'development',\n        }\n    )\n    # Set up the OpenTelemetry tracer provider and exporter\n    provider = TracerProvider(resource=resource)\n    in_memory_sink_for_telemetry = io.StringIO(newline='\\n')\n    exporter = ConsoleSpanExporter(out=in_memory_sink_for_telemetry)\n    provider.add_span_processor(SimpleSpanProcessor(exporter))\n    set_tracer_provider(provider)\n    # Instrument the crawler with OpenTelemetry\n    instrumentor = CrawlerInstrumentor(instrument_classes=[Dataset])\n    instrumentor.instrument()\n\n    # Generate first telemetry data from `Dataset` public methods.\n    # `Dataset` is in `instrument_classes` argument, and thus its public methods are instrumented.\n    dataset = await Dataset.open(name='test-dataset')\n    await dataset.drop()\n\n    # Other traces will be from crawler run.\n    crawler = ParselCrawler(\n        max_requests_per_crawl=1,\n        request_handler=mock.AsyncMock(),\n        
concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),\n    )\n\n    # Run crawler and generate more telemetry data.\n    await crawler.run([str(server_url)])\n\n    # Telemetry jsons are packed together in one string. Unpack them and load as json objects.\n    telemetry_strings = in_memory_sink_for_telemetry.getvalue()\n    telemetry_data = [\n        json.loads(telemetry_string) for telemetry_string in re.split(r'(?<=\\})\\s*(?=\\{)', telemetry_strings)\n    ]\n\n    # Do some basic checks on the telemetry data.\n    # The point of this test is not to check completeness of the data, but telemetry capability.\n\n    # Extra `instrument_classes` telemetry - Dataset.open() is parent to Dataset.__init__() span.\n    assert telemetry_data[0]['name'] == '__init__'\n    assert telemetry_data[0]['attributes']['code.function.name'] == 'Dataset.__init__'\n    assert telemetry_data[0]['resource']['attributes'] == dict(resource.attributes)\n\n    assert telemetry_data[1]['name'] == 'open'\n    assert telemetry_data[1]['attributes']['code.function.name'] == 'Dataset.open'\n    assert telemetry_data[1]['resource']['attributes'] == dict(resource.attributes)\n\n    # Opening Dataset creates a new trace shared by the `__init__` and `open` spans.\n    assert telemetry_data[0]['context']['trace_id'] == telemetry_data[1]['context']['trace_id']\n\n    assert telemetry_data[2]['name'] == 'drop'\n    assert telemetry_data[2]['attributes']['code.function.name'] == 'Dataset.drop'\n    assert telemetry_data[2]['resource']['attributes'] == dict(resource.attributes)\n\n    # Dropping Dataset creates a new trace.\n    assert telemetry_data[2]['context']['trace_id'] != telemetry_data[1]['context']['trace_id']\n\n    # Crawler telemetry - all crawler spans will be in one trace as there is only one request in this test.\n    assert telemetry_data[3]['name'] == '_execute_pre_navigation_hooks, action'\n    assert telemetry_data[3]['attributes']['code.function.name'] == 
'AbstractHttpCrawler._execute_pre_navigation_hooks'\n    assert telemetry_data[3]['attributes']['url.full'] == str(server_url)\n    assert telemetry_data[3]['resource']['attributes'] == dict(resource.attributes)\n\n    assert telemetry_data[-1]['name'] == '__run_task_function'\n    assert telemetry_data[-1]['attributes']['code.function.name'] == 'BasicCrawler.__run_task_function'\n    assert telemetry_data[-1]['resource']['attributes'] == dict(resource.attributes)\n\n    # Processing of the request is in the same trace.\n    assert telemetry_data[3]['context']['trace_id'] == telemetry_data[-1]['context']['trace_id']\n\n    # Check that trace_ids of unrelated traces are not the same.\n    assert telemetry_data[0]['context']['trace_id'] != telemetry_data[-1]['context']['trace_id']\n"
  },
  {
    "path": "tests/unit/proxy_configuration/test_new_proxy_info.py",
    "content": "from __future__ import annotations\n\nfrom itertools import cycle\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def test_returns_proxy_info() -> None:\n    \"\"\"Test that proxy_urls can contain both string and None.\"\"\"\n    config = ProxyConfiguration(proxy_urls=[None, 'http://proxy.com:1111'])\n\n    proxy_info = await config.new_proxy_info(None, None, None)\n    assert proxy_info is None\n\n    proxy_info = await config.new_proxy_info(None, None, None)\n    assert proxy_info is not None\n    assert proxy_info.url == 'http://proxy.com:1111'\n    assert proxy_info.hostname == 'proxy.com'\n    assert proxy_info.username == ''\n    assert proxy_info.password == ''\n    assert proxy_info.port == 1111\n\n\nasync def test_throws_on_invalid_new_url_function() -> None:\n    config = ProxyConfiguration(\n        new_url_function=lambda session_id=None, request=None: 'http://proxy.com:1111*invalid_url'  # noqa: ARG005\n    )\n\n    with pytest.raises(ValueError):  # noqa: PT011\n        await config.new_proxy_info(None, None, None)\n\n\nasync def test_returns_proxy_info_with_new_url_function() -> None:\n    \"\"\"Test that new_url_function can return string and None.\"\"\"\n    proxy_iterator = cycle([None, 'http://proxy.com:1111'])\n\n    config = ProxyConfiguration(new_url_function=lambda session_id=None, request=None: next(proxy_iterator))  # noqa: ARG005\n\n    proxy_info = await config.new_proxy_info(None, None, None)\n    assert proxy_info is None\n\n    proxy_info = await config.new_proxy_info(None, None, None)\n    assert proxy_info is not None\n    assert proxy_info.url == 'http://proxy.com:1111'\n    assert proxy_info.hostname == 'proxy.com'\n    assert proxy_info.username == ''\n    assert proxy_info.password == ''\n    assert proxy_info.port == 1111\n\n\nasync def test_returns_proxy_info_with_new_url_function_async() -> None:\n    async def new_url(session_id: str 
| None = None, request: Request | None = None) -> str:  # noqa: ARG001\n        return 'http://proxy.com:1111'\n\n    config = ProxyConfiguration(new_url_function=new_url)\n\n    proxy_info = await config.new_proxy_info(None, None, None)\n\n    assert proxy_info is not None\n    assert proxy_info.url == 'http://proxy.com:1111'\n    assert proxy_info.hostname == 'proxy.com'\n    assert proxy_info.username == ''\n    assert proxy_info.password == ''\n    assert proxy_info.port == 1111\n\n\nasync def test_rotates_proxies() -> None:\n    proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']\n    config = ProxyConfiguration(proxy_urls=proxy_urls)\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[1]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[2]\n\n\nasync def test_rotates_proxies_with_sessions() -> None:\n    proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']\n    request = Request(url='http://some.domain/abc', unique_key='1')\n    sessions = [f'session_{i}' for i in range(6)]\n\n    config = ProxyConfiguration(proxy_urls=proxy_urls)\n\n    # A single session should always receive the same proxy\n    info = await config.new_proxy_info(sessions[0], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(sessions[0], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(sessions[0], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(sessions[0], request, None)\n    assert info is not None\n    assert info.url == 
proxy_urls[0]\n\n    info = await config.new_proxy_info(sessions[0], request, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    # Different sessions should get rotated proxies\n    info = await config.new_proxy_info(sessions[1], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[1]\n\n    info = await config.new_proxy_info(sessions[2], request, None)\n    assert info is not None\n    assert info.url == proxy_urls[2]\n\n    info = await config.new_proxy_info(sessions[3], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(sessions[4], None, None)\n    assert info is not None\n    assert info.url == proxy_urls[1]\n\n    info = await config.new_proxy_info(sessions[5], request, None)\n    assert info is not None\n    assert info.url == proxy_urls[2]\n\n    # Without sessions should get rotated proxies\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(None, request, None)\n    assert info is not None\n    assert info.url == proxy_urls[1]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[2]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == proxy_urls[0]\n\n    info = await config.new_proxy_info(None, request, None)\n    assert info is not None\n    assert info.url == proxy_urls[1]\n\n\n@pytest.mark.parametrize(\n    ('url', 'expected_port'),\n    [\n        # Default ports based on the URL scheme\n        ('http://proxy.com', 80),\n        ('https://proxy.com', 443),\n        # Explicit ports specified in the URL\n        ('http://proxy.com:80', 80),\n        ('http://proxy.com:1234', 1234),\n    ],\n)\nasync def test_sets_port(url: str, expected_port: int) -> None:\n    \"\"\"Test that the port 
property is set correctly.\n\n    The port is inferred from the URL scheme if it is not specified in the URL.\n    \"\"\"\n    config = ProxyConfiguration(proxy_urls=[url])\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.port == expected_port\n"
  },
  {
    "path": "tests/unit/proxy_configuration/test_tiers.py",
    "content": "from __future__ import annotations\n\nfrom crawlee import Request\nfrom crawlee.proxy_configuration import ProxyConfiguration\n\n\nasync def test_rotates_proxies_uniformly_with_no_request() -> None:\n    tiered_proxy_urls: list[list[str | None]] = [\n        ['http://proxy:1111', 'http://proxy:2222'],\n        ['http://proxy:3333', 'http://proxy:4444'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][1]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[1][0]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[1][1]\n\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n\nasync def test_retrying_request_makes_tier_go_up() -> None:\n    tiered_proxy_urls: list[list[str | None]] = [\n        ['http://proxy:1111'],\n        ['http://proxy:2222'],\n        ['http://proxy:3333'],\n        ['http://proxy:4444'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    # Calling `new_proxy_info` with the same request most probably means it's being retried\n    request_1 = Request(url='http://some.domain/abc', unique_key='1')\n\n    info = await config.new_proxy_info(None, request_1, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n    info = await config.new_proxy_info(None, request_1, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[1][0]\n\n    info = await config.new_proxy_info(None, request_1, None)\n    assert info is not 
None\n    assert info.url == tiered_proxy_urls[2][0]\n\n    # Subsequent requests with the same domain should use the same tier\n    request_2 = Request(url='http://some.domain/xyz', unique_key='2')\n\n    info = await config.new_proxy_info(None, request_2, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[2][0]\n\n\nasync def test_retrying_request_makes_tier_go_up_with_sessions() -> None:\n    tiered_proxy_urls: list[list[str | None]] = [\n        ['http://proxy:1111'],\n        ['http://proxy:2222'],\n        ['http://proxy:3333'],\n        ['http://proxy:4444'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    request = Request(url='http://some.domain/abc', unique_key='1')\n\n    # Calling `new_proxy_info` with the same request likely means that it is being retried.\n    # However, a single session should always receive the same proxy\n    info = await config.new_proxy_info('session_id', request, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n    info = await config.new_proxy_info('session_id', request, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n    info = await config.new_proxy_info('session_id', request, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n    # For a new session, we will get a proxy from the corresponding tier\n    info = await config.new_proxy_info('session_id2', request, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[3][0]\n\n    info = await config.new_proxy_info('session_id2', request, None)\n    assert info is not None\n    assert info.url == tiered_proxy_urls[3][0]\n\n\nasync def test_successful_request_makes_tier_go_down() -> None:\n    \"\"\"Repeatedly requesting a proxy for a single request will cause the proxy tier to go up -\n    ProxyConfiguration assumes those are retries. 
Then, requesting a proxy for different requests to the same domain\n    will cause the tier to drop back down.\"\"\"\n\n    tiered_proxy_urls: list[list[str | None]] = [\n        ['http://proxy:1111'],\n        ['http://proxy:2222'],\n        ['http://proxy:3333'],\n        ['http://proxy:4444'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    request_1 = Request(url='http://some.domain/abc', unique_key='1')\n\n    info = None\n    for tier in tiered_proxy_urls:\n        info = await config.new_proxy_info(None, request_1, None)\n        assert info is not None\n        assert info.url == tier[0]\n\n    for i in range(100):\n        new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i))\n        info = await config.new_proxy_info(None, new_request, None)\n\n    assert info is not None\n    assert info.url == tiered_proxy_urls[0][0]\n\n\nasync def test_none_proxy_retrying_request_makes_tier_go_up() -> None:\n    tiered_proxy_urls: list[list[str | None]] = [\n        [None],\n        ['http://proxy:1111'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    # Calling `new_proxy_info` with the same request most probably means it's being retried\n    request_1 = Request(url='http://some.domain/abc', unique_key='1')\n\n    # No proxy used.\n    info = await config.new_proxy_info(None, request_1, None)\n    assert info is None, 'First entry in tired_proxy_urls is None. 
config.new_proxy_info is expected to generate None.'\n\n    # Proxy should go up one tier for same request that was already sent before.\n    info = await config.new_proxy_info(None, request_1, None)\n    assert info is not None, (\n        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'\n    )\n    assert info.url == tiered_proxy_urls[1][0]\n\n\nasync def test_none_proxy_rotates_proxies_uniformly_with_no_request() -> None:\n    tiered_proxy_urls = [\n        [None, 'http://proxy:1111'],\n    ]\n\n    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)\n\n    # No proxy used.\n    info = await config.new_proxy_info(None, None, None)\n    assert info is None, 'First entry in tired_proxy_urls is None. config.new_proxy_info is expected to generate None.'\n\n    # Proxy should be rotated on the same proxy tier for a new request.\n    info = await config.new_proxy_info(None, None, None)\n    assert info is not None, (\n        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'\n    )\n    assert info.url == tiered_proxy_urls[0][1]\n\n    # Proxy rotation starts from the beginning of the proxy list after last proxy in tier was used. No proxy used again.\n    info = await config.new_proxy_info(None, None, None)\n    assert info is None, 'First entry in tired_proxy_urls is None. config.new_proxy_info is expected to generate None.'\n"
  },
  {
    "path": "tests/unit/request_loaders/test_request_list.py",
    "content": "from collections.abc import AsyncGenerator\n\nfrom crawlee.request_loaders._request_list import RequestList\nfrom crawlee.storages import KeyValueStore\n\n\nasync def test_sync_traversal() -> None:\n    request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    while not await request_list.is_finished():\n        item = await request_list.fetch_next_request()\n        assert item is not None\n\n        await request_list.mark_request_as_handled(item)\n\n    assert await request_list.is_empty()\n\n\nasync def test_async_traversal() -> None:\n    async def generator() -> AsyncGenerator[str]:\n        yield 'https://a.placeholder.com'\n        yield 'https://b.placeholder.com'\n        yield 'https://c.placeholder.com'\n\n    request_list = RequestList(generator())\n\n    while not await request_list.is_finished():\n        item = await request_list.fetch_next_request()\n        assert item is not None\n\n        await request_list.mark_request_as_handled(item)\n\n    assert await request_list.is_empty()\n\n\nasync def test_is_empty_does_not_depend_on_fetch_next_request() -> None:\n    request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])\n\n    item_1 = await request_list.fetch_next_request()\n    assert item_1 is not None\n    assert not await request_list.is_finished()\n\n    item_2 = await request_list.fetch_next_request()\n    assert item_2 is not None\n    assert not await request_list.is_finished()\n\n    item_3 = await request_list.fetch_next_request()\n    assert item_3 is not None\n    assert not await request_list.is_finished()\n\n    assert await request_list.is_empty()\n    assert not await request_list.is_finished()\n\n    await request_list.mark_request_as_handled(item_1)\n    await request_list.mark_request_as_handled(item_2)\n    await request_list.mark_request_as_handled(item_3)\n\n    assert await 
request_list.is_empty()\n    assert await request_list.is_finished()\n\n\nasync def test_persist_requests_key_with_sync_iterable() -> None:\n    \"\"\"Test that persist_requests_key persists request data from a sync iterable.\"\"\"\n    persist_key = 'test_requests_persist_sync'\n    urls = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']\n\n    # Create a request list with persistence enabled\n    request_list = RequestList(urls, persist_requests_key=persist_key)\n\n    # Fetch one request to trigger initialization\n    first_request = await request_list.fetch_next_request()\n    assert first_request is not None\n    assert first_request.url == 'https://a.placeholder.com'\n\n    # Check that the requests were persisted\n    kvs = await KeyValueStore.open()\n    persisted_data = await kvs.get_value(persist_key)\n    assert persisted_data is not None\n\n\nasync def test_persist_requests_key_with_empty_iterator() -> None:\n    \"\"\"Test behavior when persist_requests_key is provided but the iterator is empty.\"\"\"\n    persist_key = 'test_empty_iterator'\n\n    # Create request list with empty iterator\n    request_list = RequestList([], persist_requests_key=persist_key)\n\n    # Should be empty immediately\n    assert await request_list.is_empty()\n    assert await request_list.is_finished()\n\n    # Check that empty requests were persisted\n    kvs = await KeyValueStore.open()\n    persisted_data = await kvs.get_value(persist_key)\n    assert persisted_data is not None\n\n\nasync def test_requests_restoration_without_state() -> None:\n    \"\"\"Test that persisted request data is properly restored on subsequent RequestList creation.\"\"\"\n    persist_requests_key = 'test_requests_restoration'\n    urls = ['https://restore1.placeholder.com', 'https://restore2.placeholder.com']\n\n    # Create first request list and process one request\n    request_list_1 = RequestList(urls, persist_requests_key=persist_requests_key)\n    
first_request = await request_list_1.fetch_next_request()\n    assert first_request is not None\n    assert first_request.url == 'https://restore1.placeholder.com'\n    await request_list_1.mark_request_as_handled(first_request)\n\n    # Create second request list with same persist key (simulating restart)\n    # Since we don't have state persistence, it will start from the beginning of the persisted data\n    spy = iter(['1', '2', '3'])\n    request_list_2 = RequestList(spy, persist_requests_key=persist_requests_key)\n\n    # Should be able to fetch requests from persisted data, but starts from beginning\n    first_request_again = await request_list_2.fetch_next_request()\n    assert first_request_again is not None\n    assert first_request_again.url == 'https://restore1.placeholder.com'\n    await request_list_2.mark_request_as_handled(first_request_again)\n\n    # Make sure that the second instance did not consume the input iterator\n    assert len(list(spy)) == 3\n\n\nasync def test_state_restoration() -> None:\n    \"\"\"Test that persisted processing state is properly restored on subsequent RequestList creation.\"\"\"\n    persist_state_key = 'test_state_restoration'\n    urls = [\n        'https://restore1.placeholder.com',\n        'https://restore2.placeholder.com',\n        'https://restore3.placeholder.com',\n        'https://restore4.placeholder.com',\n    ]\n\n    # Create first request list and process one request\n    request_list_1 = RequestList(\n        urls,\n        persist_state_key=persist_state_key,\n    )\n\n    first_request = await request_list_1.fetch_next_request()\n    assert first_request is not None\n    assert first_request.url == 'https://restore1.placeholder.com'\n    await request_list_1.mark_request_as_handled(first_request)\n    await request_list_1._state.persist_state()\n\n    # Create second request list with same persist key (simulating restart)\n    request_list_2 = RequestList(\n        urls,\n        
persist_state_key=persist_state_key,\n    )\n\n    # Should be able to continue where the previous instance left off\n    next_request = await request_list_2.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://restore2.placeholder.com'\n    await request_list_2.mark_request_as_handled(next_request)\n\n    next_request = await request_list_2.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://restore3.placeholder.com'\n    await request_list_2.mark_request_as_handled(next_request)\n\n    next_request = await request_list_2.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://restore4.placeholder.com'\n    await request_list_2.mark_request_as_handled(next_request)\n\n\nasync def test_requests_and_state_restoration() -> None:\n    \"\"\"Test that persisted request data and processing state is properly restored on subsequent RequestList creation.\"\"\"\n    persist_requests_key = 'test_requests_restoration'\n    persist_state_key = 'test_state_restoration'\n    urls = [\n        'https://restore1.placeholder.com',\n        'https://restore2.placeholder.com',\n        'https://restore3.placeholder.com',\n    ]\n\n    # Create first request list and process one request\n    request_list_1 = RequestList(\n        urls,\n        persist_requests_key=persist_requests_key,\n        persist_state_key=persist_state_key,\n    )\n\n    first_request = await request_list_1.fetch_next_request()\n    assert first_request is not None\n    assert first_request.url == 'https://restore1.placeholder.com'\n    await request_list_1.mark_request_as_handled(first_request)\n    await request_list_1._state.persist_state()\n\n    # Create second request list with same persist key (simulating restart)\n    spy = iter(['1', '2', '3'])\n    request_list_2 = RequestList(\n        spy,\n        persist_requests_key=persist_requests_key,\n        
persist_state_key=persist_state_key,\n    )\n\n    # Should be able to fetch requests from persisted data and continue where the previous instance left off\n    next_request = await request_list_2.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://restore2.placeholder.com'\n    await request_list_2.mark_request_as_handled(next_request)\n\n    next_request = await request_list_2.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://restore3.placeholder.com'\n    await request_list_2.mark_request_as_handled(next_request)\n\n    # Make sure that the second instance did not consume the input iterator\n    assert len(list(spy)) == 3\n\n\nasync def test_persist_requests_key_only_persists_once() -> None:\n    \"\"\"Test that requests are only persisted once, even with multiple RequestList instances.\"\"\"\n    persist_key = 'test_requests_once'\n    urls = ['https://once1.placeholder.com', 'https://once2.placeholder.com']\n\n    # Create first request list\n    request_list_1 = RequestList(urls, persist_requests_key=persist_key)\n    await request_list_1.fetch_next_request()  # Trigger persistence\n\n    # Get initial persisted data\n    kvs = await KeyValueStore.open()\n    initial_data = await kvs.get_value(persist_key)\n    assert initial_data is not None\n\n    # Create second request list with different data\n    different_urls = ['https://different.placeholder.com']\n    request_list_2 = RequestList(different_urls, persist_requests_key=persist_key)\n    await request_list_2.fetch_next_request()  # Should use persisted data, not new data\n\n    # Verify the persisted data hasn't changed\n    current_data = await kvs.get_value(persist_key)\n    assert current_data == initial_data\n\n    # The request should come from the original persisted data, not the new iterator\n    fetched_request = await request_list_2.fetch_next_request()\n    assert fetched_request is not None\n    
assert fetched_request.url == 'https://once2.placeholder.com'  # From original data\n"
  },
  {
    "path": "tests/unit/request_loaders/test_sitemap_request_loader.py",
    "content": "import asyncio\nimport base64\nimport gzip\nfrom typing import TYPE_CHECKING\n\nfrom yarl import URL\n\nfrom crawlee import RequestOptions, RequestTransformAction\nfrom crawlee.http_clients._base import HttpClient\nfrom crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader\nfrom crawlee.storages import KeyValueStore\n\nif TYPE_CHECKING:\n    from crawlee._types import JsonSerializable\n\nBASIC_SITEMAP = \"\"\"\n<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<url>\n<loc>http://not-exists.com/</loc>\n<lastmod>2005-02-03</lastmod>\n<changefreq>monthly</changefreq>\n<priority>0.8</priority>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>\n<changefreq>weekly</changefreq>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>\n<lastmod>2004-12-23</lastmod>\n<changefreq>weekly</changefreq>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>\n<lastmod>2004-12-23T18:00:15+00:00</lastmod>\n<priority>0.3</priority>\n</url>\n<url>\n<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>\n<lastmod>2004-11-23</lastmod>\n</url>\n</urlset>\n\"\"\".strip()\n\n\ndef compress_gzip(data: str) -> bytes:\n    \"\"\"Compress a string using gzip.\"\"\"\n    return gzip.compress(data.encode())\n\n\ndef encode_base64(data: bytes) -> str:\n    \"\"\"Encode bytes to a base64 string.\"\"\"\n    return base64.b64encode(data).decode('utf-8')\n\n\nasync def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)\n\n    while not await sitemap_loader.is_finished():\n        item = await sitemap_loader.fetch_next_request()\n\n        if item:\n            
await sitemap_loader.mark_request_as_handled(item)\n\n    assert await sitemap_loader.is_empty()\n    assert await sitemap_loader.is_finished()\n    assert await sitemap_loader.get_total_count() == 5\n    assert await sitemap_loader.get_handled_count() == 5\n\n\nasync def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)\n\n    items = []\n\n    for _ in range(5):\n        item = await sitemap_loader.fetch_next_request()\n        assert item is not None\n        assert not await sitemap_loader.is_finished()\n        items.append(item)\n\n    assert await sitemap_loader.is_empty()\n    assert not await sitemap_loader.is_finished()\n\n    for item in items:\n        await sitemap_loader.mark_request_as_handled(item)\n\n    assert await sitemap_loader.is_empty()\n\n    await asyncio.sleep(0.1)\n\n    assert await sitemap_loader.is_finished()\n\n\nasync def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2, http_client=http_client)\n\n    item = await sitemap_loader.fetch_next_request()\n    assert item is not None\n    await sitemap_loader.mark_request_as_handled(item)\n\n    assert not await sitemap_loader.is_empty()\n    assert not await sitemap_loader.is_finished()\n\n    await sitemap_loader.abort_loading()\n\n    item = await sitemap_loader.fetch_next_request()\n    assert item is not None\n    await sitemap_loader.mark_request_as_handled(item)\n\n    assert await sitemap_loader.is_finished()\n\n\nasync def test_create_persist_state_for_sitemap_loading(\n    server_url: URL, http_client: HttpClient, 
key_value_store: KeyValueStore\n) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    persist_key = 'create_persist_state'\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key)\n    assert await sitemap_loader.is_finished() is False\n\n    await sitemap_loader.close()\n\n    state_data = await key_value_store.get_value(persist_key)\n\n    assert state_data is not None\n    assert state_data['handledCount'] == 0\n\n\nasync def test_data_persistence_for_sitemap_loading(\n    server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore\n) -> None:\n    async def wait_for_sitemap_loader_not_empty(sitemap_loader: SitemapRequestLoader) -> None:\n        while await sitemap_loader.is_empty() and not await sitemap_loader.is_finished():  # noqa: ASYNC110\n            await asyncio.sleep(0.1)\n\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    persist_key = 'data_persist_state'\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key)\n\n    # Give time to load\n    await asyncio.wait_for(wait_for_sitemap_loader_not_empty(sitemap_loader), timeout=2)\n\n    await sitemap_loader.close()\n\n    state_data = await key_value_store.get_value(persist_key)\n\n    assert state_data is not None\n    assert state_data['handledCount'] == 0\n    assert state_data['totalCount'] == 5\n    assert len(state_data['urlQueue']) == 5\n\n\nasync def test_recovery_data_persistence_for_sitemap_loading(\n    server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore\n) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n    persist_key = 'recovery_persist_state'\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, 
persist_state_key=persist_key)\n\n    item = await sitemap_loader.fetch_next_request()\n\n    assert item is not None\n    await sitemap_loader.mark_request_as_handled(item)\n\n    await sitemap_loader.close()\n\n    state_data = await key_value_store.get_value(persist_key)\n\n    assert state_data is not None\n    next_item_in_kvs = state_data['urlQueue'][0]\n\n    sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, persist_state_key=persist_key)\n\n    item = await sitemap_loader.fetch_next_request()\n\n    assert item is not None\n    assert item.url == next_item_in_kvs\n\n\nasync def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:\n    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))\n\n    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:\n        user_data: dict[str, JsonSerializable] = {'transformed': True}\n        request_options['user_data'] = user_data\n        return request_options\n\n    sitemap_loader = SitemapRequestLoader(\n        [str(sitemap_url)],\n        http_client=http_client,\n        transform_request_function=transform_request,\n    )\n\n    extracted_urls = set()\n\n    while not await sitemap_loader.is_finished():\n        request = await sitemap_loader.fetch_next_request()\n\n        if request:\n            assert request.user_data.get('transformed') is True\n\n            extracted_urls.add(request.url)\n\n            await sitemap_loader.mark_request_as_handled(request)\n\n    assert len(extracted_urls) == 5\n    assert extracted_urls == {\n        'http://not-exists.com/',\n        'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',\n        'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',\n        'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',\n        'http://not-exists.com/catalog?item=83&desc=vacation_usa',\n    }\n"
  },
  {
    "path": "tests/unit/server.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport base64\nimport gzip\nimport json\nimport sys\nimport threading\nimport time\nfrom collections.abc import Awaitable, Callable, Coroutine, Iterator\nfrom typing import TYPE_CHECKING, Any\nfrom urllib.parse import parse_qs\n\nfrom uvicorn.server import Server\nfrom yarl import URL\n\nfrom tests.unit.server_endpoints import (\n    BASE_INDEX,\n    GENERIC_RESPONSE,\n    HELLO_WORLD,\n    INCAPSULA,\n    INFINITE_SCROLL,\n    NON_HREF_LINKS,\n    PROBLEMATIC_LINKS,\n    RESOURCE_LOADING_PAGE,\n    ROBOTS_TXT,\n    SECONDARY_INDEX,\n    START_ENQUEUE,\n    START_ENQUEUE_NON_HREF,\n)\n\nif TYPE_CHECKING:\n    from socket import socket\n\nReceive = Callable[[], Awaitable[dict[str, Any]]]\nSend = Callable[[dict[str, Any]], Coroutine[None, None, None]]\nPathHandler = Callable[[dict[str, Any], Receive, Send], Coroutine[None, None, None]]\n\n\ndef get_headers_dict(scope: dict[str, Any]) -> dict[str, str]:\n    \"\"\"Extract request headers and return them as a dictionary.\"\"\"\n    headers = {}\n    for name, value in scope.get('headers', []):\n        headers[name.decode()] = value.decode()\n    return headers\n\n\ndef get_query_params(query_string: bytes) -> dict[str, str]:\n    \"\"\"Extract and parse query parameters from the request.\"\"\"\n    args = parse_qs(query_string.decode(), keep_blank_values=True)\n    result_args = {}\n\n    for key, values in args.items():\n        if values:\n            result_args[key] = values[0]\n\n    return result_args\n\n\ndef get_cookies_from_headers(headers: dict[str, Any]) -> dict[str, str]:\n    \"\"\"Extract cookies from request headers.\"\"\"\n    cookies = {}\n    cookie_header: str = headers.get('cookie', '')\n    if cookie_header:\n        for cookie in cookie_header.split(';'):\n            name, value = cookie.strip().split('=')\n            cookies[name] = value\n    return cookies\n\n\nasync def send_json_response(send: Send, data: Any, status: 
int = 200) -> None:\n    \"\"\"Send a JSON response to the client.\"\"\"\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': status,\n            'headers': [[b'content-type', b'application/json']],\n        }\n    )\n    await send({'type': 'http.response.body', 'body': json.dumps(data, indent=2).encode()})\n\n\nasync def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:\n    \"\"\"Send an HTML response to the client.\"\"\"\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': status,\n            'headers': [[b'content-type', b'text/html; charset=utf-8']],\n        }\n    )\n    await send({'type': 'http.response.body', 'body': html_content})\n\n\nasync def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:\n    \"\"\"Main ASGI application handler that routes requests to specific handlers.\n\n    Args:\n        scope: The ASGI connection scope.\n        receive: The ASGI receive function.\n        send: The ASGI send function.\n    \"\"\"\n    assert scope['type'] == 'http'\n    paths: dict[str, PathHandler] = {\n        'start_enqueue': start_enqueue_endpoint,\n        'start_enqueue_non_href': start_enqueue_non_href_endpoint,\n        'sub_index': secondary_index_endpoint,\n        'incapsula': incapsula_endpoint,\n        'page_1': generic_response_endpoint,\n        'page_2': generic_response_endpoint,\n        'page_3': generic_response_endpoint,\n        'base_page': base_index_endpoint,\n        'problematic_links': problematic_links_endpoint,\n        'non_href_links': non_href_links_endpoint,\n        'set_cookies': set_cookies,\n        'set_complex_cookies': set_complex_cookies,\n        'cookies': get_cookies,\n        'status': echo_status,\n        'headers': echo_headers,\n        'user-agent': echo_user_agent,\n        'echo_content': echo_content,\n        'sitemap.txt': echo_content,\n        'sitemap.xml': 
echo_content,\n        'sitemap.xml.gz': echo_content,\n        'get': get_echo,\n        'post': post_echo,\n        'redirect': redirect_to_url,\n        'json': hello_world_json,\n        'xml': hello_world_xml,\n        'robots.txt': robots_txt,\n        'get_compressed': get_compressed,\n        'slow': slow_response,\n        'infinite_scroll': infinite_scroll_endpoint,\n        'resource_loading_page': resource_loading_endpoint,\n    }\n    path = URL(scope['path']).parts[1]\n    # Route requests to appropriate handlers\n    if path in paths:\n        path_func = paths[path]\n        await path_func(scope, receive, send)\n    else:\n        await hello_world(scope, receive, send)\n\n\nasync def get_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests to retrieve cookies sent in the request.\"\"\"\n    headers = get_headers_dict(scope)\n    cookies = get_cookies_from_headers(headers)\n    await send_json_response(send, {'cookies': cookies})\n\n\nasync def set_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests to set cookies from query parameters and redirect.\"\"\"\n\n    query_params = get_query_params(scope.get('query_string', b''))\n\n    headers = [\n        [b'content-type', b'text/plain; charset=utf-8'],\n        [b'location', b'/cookies'],  # Redirect header\n    ]\n\n    for key, values in query_params.items():\n        if values:  # Only add if there's at least one value\n            cookie_value = f'{key}={values[0]}; Path=/'\n            headers.append([b'set-cookie', cookie_value.encode()])\n\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': 302,  # 302 Found for redirect\n            'headers': headers,\n        }\n    )\n    await send({'type': 'http.response.body', 'body': b'Redirecting to get_cookies...'})\n\n\nasync def hello_world(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    
\"\"\"Handle basic requests with a simple HTML response.\"\"\"\n    await send_html_response(\n        send,\n        HELLO_WORLD,\n    )\n\n\nasync def hello_world_json(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle basic requests with a simple JSON response.\"\"\"\n    await send_json_response(\n        send,\n        {'hello': 'world'},\n    )\n\n\nasync def hello_world_xml(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle basic requests with a simple XML response.\"\"\"\n    await send_html_response(\n        send,\n        b\"\"\"<?xml version=\"1.0\"?>\n            <hello>world</hello>\"\"\",\n    )\n\n\nasync def post_echo(scope: dict[str, Any], receive: Receive, send: Send) -> None:\n    \"\"\"Echo back POST request details similar to httpbin.org/post.\"\"\"\n    # Extract basic request info\n    path = scope.get('path', '')\n    query_string = scope.get('query_string', b'')\n    args = get_query_params(query_string)\n\n    # Extract headers and cookies\n    headers = get_headers_dict(scope)\n\n    # Read the request body\n    body = b''\n    form = {}\n    json_data = None\n    more_body = True\n\n    while more_body:\n        message = await receive()\n        if message['type'] == 'http.request':\n            body += message.get('body', b'')\n            more_body = message.get('more_body', False)\n\n    # Parse body based on content type\n    content_type = headers.get('content-type', '').lower()\n\n    if body and 'application/json' in content_type:\n        json_data = json.loads(body.decode())\n\n    if body and 'application/x-www-form-urlencoded' in content_type:\n        form_data = parse_qs(body.decode())\n        for key, values in form_data.items():\n            form[key] = values[0] if len(values) == 1 else values\n\n    body_text = '' if form else body.decode('utf-8', errors='replace')\n\n    # Prepare response\n    response = {\n        'args': args,\n        'data': 
body_text,\n        'files': {},  # Not handling multipart file uploads\n        'form': form,\n        'headers': headers,\n        'json': json_data,\n        'origin': headers.get('host', ''),\n        'url': f'http://{headers[\"host\"]}{path}',\n    }\n\n    await send_json_response(send, response)\n\n\nasync def echo_status(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Echo the status code from the URL path.\"\"\"\n    status_code = int(scope['path'].replace('/status/', ''))\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': status_code,\n            'headers': [[b'content-type', b'text/plain']],\n        }\n    )\n    await send({'type': 'http.response.body', 'body': b''})\n\n\nasync def echo_headers(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Echo back the request headers as JSON.\"\"\"\n    headers = get_headers_dict(scope)\n    await send_json_response(send, headers)\n\n\nasync def start_enqueue_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the main page with links.\"\"\"\n    await send_html_response(\n        send,\n        START_ENQUEUE,\n    )\n\n\nasync def secondary_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the secondary page with links.\"\"\"\n    await send_html_response(\n        send,\n        SECONDARY_INDEX,\n    )\n\n\nasync def incapsula_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for a page with an incapsula iframe.\"\"\"\n    await send_html_response(\n        send,\n        INCAPSULA,\n    )\n\n\nasync def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests with a generic HTML response.\"\"\"\n    await send_html_response(\n        send,\n        GENERIC_RESPONSE,\n    )\n\n\nasync def 
problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests with a page containing problematic links.\"\"\"\n    await send_html_response(\n        send,\n        PROBLEMATIC_LINKS,\n    )\n\n\nasync def non_href_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests with a page containing non-href links.\"\"\"\n    await send_html_response(\n        send,\n        NON_HREF_LINKS,\n    )\n\n\nasync def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests that should redirect to a specified full URL.\"\"\"\n    query_params = get_query_params(scope.get('query_string', b''))\n\n    target_url = query_params.get('url', 'http://example.com')\n    status_code = int(query_params.get('status', 302))\n\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': status_code,\n            'headers': [\n                [b'content-type', b'text/plain; charset=utf-8'],\n                [b'location', target_url.encode()],\n            ],\n        }\n    )\n    await send({'type': 'http.response.body', 'body': f'Redirecting to {target_url}...'.encode()})\n\n\nasync def echo_user_agent(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Echo back the user agent header as a response.\"\"\"\n    headers = get_headers_dict(scope)\n    user_agent = headers.get('user-agent', 'Not provided')\n    await send_json_response(send, {'user-agent': user_agent})\n\n\nasync def get_echo(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Echo back GET request details similar to httpbin.org/get.\"\"\"\n    path = scope.get('path', '')\n    query_string = scope.get('query_string', b'')\n    args = get_query_params(query_string)\n    headers = get_headers_dict(scope)\n\n    origin = scope.get('client', ('unknown', 0))[0]\n\n    host = headers.get('host', 
'localhost')\n    scheme = headers.get('x-forwarded-proto', 'http')\n    url = f'{scheme}://{host}{path}'\n    if query_string:\n        url += f'?{query_string}'\n\n    response = {\n        'args': args,\n        'headers': headers,\n        'origin': origin,\n        'url': url,\n    }\n\n    await send_json_response(send, response)\n\n\nasync def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests to set specific cookies with various attributes.\"\"\"\n\n    headers = [\n        [b'content-type', b'text/plain; charset=utf-8'],\n        [b'set-cookie', b'basic=1; Path=/; HttpOnly; SameSite=Lax'],\n        [b'set-cookie', b'withpath=2; Path=/html; SameSite=None'],\n        [b'set-cookie', b'strict=3; Path=/; SameSite=Strict'],\n        [b'set-cookie', b'secure=4; Path=/; HttpOnly; Secure; SameSite=Strict; Partitioned'],\n        [b'set-cookie', b'short=5; Path=/;'],\n        [b'set-cookie', b'domain=6; Path=/; Domain=.127.0.0.1;'],\n    ]\n\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': 200,\n            'headers': headers,\n        }\n    )\n    await send({'type': 'http.response.body', 'body': b'Cookies have been set!'})\n\n\nasync def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Echo back content (plain text or base64) with specified content-type.\"\"\"\n    query_params = get_query_params(scope.get('query_string', b''))\n\n    content = query_params.get('content', '')\n    base64_content = query_params.get('base64', '')\n    c_type = query_params.get('c_type', 'text/html; charset=utf-8')\n\n    out_content = base64.b64decode(base64_content) if base64_content else content.encode()\n\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': 200,\n            'headers': [[b'content-type', c_type.encode()]],\n        }\n    )\n\n    await send({'type': 'http.response.body', 
'body': out_content})\n\n\nasync def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the robots.txt file.\"\"\"\n    await send_html_response(send, ROBOTS_TXT)\n\n\nasync def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Return large gzip compressed content.\"\"\"\n\n    await send(\n        {\n            'type': 'http.response.start',\n            'status': 200,\n            'headers': [[b'content-encoding', b'gzip']],\n        }\n    )\n    await send({'type': 'http.response.body', 'body': gzip.compress(HELLO_WORLD * 1000)})\n\n\nasync def slow_response(scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests with a configurable delay to test timeouts.\"\"\"\n    query_params = get_query_params(scope.get('query_string', b''))\n    delay = float(query_params.get('delay', '5'))  # Default 5 second delay\n\n    await asyncio.sleep(delay)\n    await send_html_response(send, HELLO_WORLD)\n\n\nasync def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the infinite scroll page.\"\"\"\n    await send_html_response(\n        send,\n        INFINITE_SCROLL,\n    )\n\n\nasync def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the resource loading page.\"\"\"\n    await send_html_response(\n        send,\n        RESOURCE_LOADING_PAGE,\n    )\n\n\nasync def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:\n    \"\"\"Handle requests for the base index page.\"\"\"\n    host = f'http://{get_headers_dict(_scope).get(\"host\", \"localhost\")}'\n    content = BASE_INDEX.format(host=host).encode()\n    await send_html_response(\n        send,\n        content,\n    )\n\n\nasync def start_enqueue_non_href_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> 
None:\n    \"\"\"Handle requests for the base index page.\"\"\"\n    host = f'http://{get_headers_dict(_scope).get(\"host\", \"localhost\")}'\n    content = START_ENQUEUE_NON_HREF.format(host=host).encode()\n    await send_html_response(\n        send,\n        content,\n    )\n\n\nclass TestServer(Server):\n    \"\"\"A test HTTP server implementation based on Uvicorn Server.\"\"\"\n\n    @property\n    def url(self) -> URL:\n        \"\"\"Get the base URL of the server.\n\n        Returns:\n            A URL instance with the server's base URL.\n        \"\"\"\n        protocol = 'https' if self.config.is_ssl else 'http'\n        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')\n\n    async def serve(self, sockets: list[socket] | None = None) -> None:\n        \"\"\"Run the server and set up restart capability.\n\n        Args:\n            sockets: Optional list of sockets to bind to.\n        \"\"\"\n        self.restart_requested = asyncio.Event()\n\n        loop = asyncio.get_event_loop()\n        tasks = {\n            loop.create_task(super().serve(sockets=sockets)),\n            loop.create_task(self.watch_restarts()),\n        }\n        await asyncio.wait(tasks)\n\n    async def restart(self) -> None:\n        \"\"\"Request server restart and wait for it to complete.\n\n        This method can be called from a different thread than the one the server\n        is running on, and from a different async environment.\n        \"\"\"\n        self.started = False\n        self.restart_requested.set()\n        while not self.started:  # noqa: ASYNC110\n            await asyncio.sleep(0.2)\n\n    async def watch_restarts(self) -> None:\n        \"\"\"Watch for and handle restart requests.\"\"\"\n        while True:\n            if self.should_exit:\n                return\n\n            try:\n                await asyncio.wait_for(self.restart_requested.wait(), timeout=0.1)\n            except asyncio.TimeoutError:\n                continue\n\n 
           self.restart_requested.clear()\n            await self.shutdown()\n            await self.startup()\n\n    def run(self, sockets: list[socket] | None = None) -> None:\n        \"\"\"Run the server.\"\"\"\n        # Set the event loop policy in thread with server for Windows and Python 3.12+.\n        # This is necessary because there are problems with closing connections when using `ProactorEventLoop`\n        if sys.version_info >= (3, 12) and sys.platform == 'win32':\n            return asyncio.run(self.serve(sockets=sockets), loop_factory=asyncio.SelectorEventLoop)\n        super().run(sockets=sockets)\n        return None\n\n\ndef serve_in_thread(server: TestServer) -> Iterator[TestServer]:\n    \"\"\"Run a server in a background thread and yield it.\"\"\"\n    thread = threading.Thread(target=server.run)\n    thread.start()\n    try:\n        while not server.started:\n            time.sleep(1e-3)\n        yield server\n    finally:\n        server.should_exit = True\n        thread.join()\n"
  },
  {
    "path": "tests/unit/server_endpoints.py",
    "content": "# Test server response content for testing\n\nHELLO_WORLD = b\"\"\"\\\n<html><head>\n    <title>Hello, world!</title>\n</head>\n<body>\n</body></html>\"\"\"\n\nSTART_ENQUEUE = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"/sub_index\" class=\"foo\">Link 1</a>\n    <a href=\"/page_1\">Link 2</a>\n    <a href=\"mailto:test@test.com\">test@test.com</a>\n</body></html>\"\"\"\n\nSTART_ENQUEUE_NON_HREF = \"\"\"\\\n<html><head>\n    <base href=\"{host}/base_subpath/\">\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"/page_3\">Link A</a>\n    <a href=\"/page_2\">Link B</a>\n    <img src=\"image_1\"/>\n    <img src=\"/image_2\"/>\n</body></html>\"\"\"\n\nSECONDARY_INDEX = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"/page_3\">Link 3</a>\n    <a href=\"/page_2\">Link 4</a>\n    <a href=\"/base_page\">Base Page</a>\n</body></html>\"\"\"\n\nBASE_INDEX = \"\"\"\\\n<html><head>\n    <base href=\"{host}/base_subpath/\">\n    <base href=\"{host}/sub_index/\">\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"page_5\">Link 5</a>\n    <a href=\"/page_4\">Link 6</a>\n</body></html>\"\"\"\n\nINCAPSULA = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    <iframe src=Test_Incapsula_Resource>\n    </iframe>\n</body></html>\"\"\"\n\nPROBLEMATIC_LINKS = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"https://budplaceholder.com/\">Placeholder</a>\n    <a href=\"mailto:test@test.com\">test@test.com</a>\n    <a href=https://avatars.githubusercontent.com/apify>Apify avatar/a>\n</body></html>\"\"\"\n\nNON_HREF_LINKS = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    <a href=\"/page_1\"></a>\n    <li data-href=\"/page_2\"></li>\n</body></html>\n\"\"\"\n\nGENERIC_RESPONSE = b\"\"\"\\\n<html><head>\n    <title>Hello</title>\n</head>\n<body>\n    Insightful content\n</body></html>\"\"\"\n\n\nROBOTS_TXT = 
b\"\"\"\\\nUser-agent: *\nDisallow: *deny_all/\nDisallow: /page_\ncrawl-delay: 10\n\nUser-agent: Googlebot\nDisallow: *deny_googlebot/\ncrawl-delay: 1\n\nuser-agent: Mozilla\ncrawl-delay: 2\n\nsitemap: http://not-exists.com/sitemap_1.xml\nsitemap: http://not-exists.com/sitemap_2.xml\"\"\"\n\n\nINFINITE_SCROLL = b\"\"\"\\\n<!DOCTYPE html>\n<html>\n<body>\n    <div id=\"content\"></div>\n\n    <script>\n        let page = 0;\n        let loading = false;\n\n        for (let i = 0; i < 10; i++) {\n            const div = document.createElement('div');\n            div.className = 'item';\n            div.style.height = '200px';\n            div.textContent = 'Item ' + (i + 1);\n            document.getElementById('content').appendChild(div);\n        }\n\n        async function loadMore() {\n            if (loading || page >= 3) return;\n            loading = true;\n            page++;\n\n            await new Promise(resolve => setTimeout(resolve, 100));\n\n            for (let i = 0; i < 10; i++) {\n                const div = document.createElement('div');\n                div.className = 'item';\n                div.style.height = '200px';\n                div.textContent = 'Item ' + (page * 10 + i + 1);\n                document.getElementById('content').appendChild(div);\n            }\n\n            loading = false;\n        }\n\n        window.addEventListener('scroll', () => {\n            if (window.innerHeight + window.scrollY >= document.body.offsetHeight - 100) {\n                loadMore();\n            }\n        });\n    </script>\n</body>\n</html>\n\"\"\"\n\nRESOURCE_LOADING_PAGE = b\"\"\"\\\n<!DOCTYPE html>\n<html>\n  <head>\n    <script src=\"/server_static/test.js\"></script>\n  </head>\n  <body>\n    <img src=\"/server_static/test.png\" />\n  </body>\n</html>\n\"\"\"\n"
  },
  {
    "path": "tests/unit/server_static/test.js",
    "content": ""
  },
  {
    "path": "tests/unit/sessions/test_cookies.py",
    "content": "from __future__ import annotations\n\nimport pytest\n\nfrom crawlee.sessions._cookies import CookieParam, PlaywrightCookieParam, SessionCookies\n\n\n@pytest.fixture\ndef cookie_dict() -> CookieParam:\n    return CookieParam(\n        {\n            'name': 'test_cookie',\n            'value': 'test_value',\n            'domain': 'example.com',\n            'path': '/test',\n            'expires': 1735689600,\n            'http_only': True,\n            'secure': True,\n            'same_site': 'Strict',\n        }\n    )\n\n\n@pytest.fixture\ndef session_cookies(cookie_dict: CookieParam) -> SessionCookies:\n    session_cookies = SessionCookies()\n    session_cookies.set(**cookie_dict)\n    return session_cookies\n\n\ndef test_set_basic_cookie() -> None:\n    \"\"\"Test setting a basic cookie with minimal attributes.\"\"\"\n    session_cookies = SessionCookies()\n    session_cookies.set('test', 'value')\n    cookies = list(session_cookies.jar)\n\n    assert len(cookies) == 1\n    cookie = cookies[0]\n    assert cookie.name == 'test'\n    assert cookie.value == 'value'\n    assert cookie.path == '/'\n    assert not cookie.secure\n    assert not cookie.has_nonstandard_attr('HttpOnly')\n\n\ndef test_set_cookie_with_all_attributes(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:\n    \"\"\"Test setting a cookie with all available attributes.\"\"\"\n    cookies = list(session_cookies.jar)\n\n    assert len(cookies) == 1\n    cookie = cookies[0]\n\n    assert cookie.name == cookie_dict.get('name')\n    assert cookie.value == cookie_dict.get('value')\n    assert cookie.path == cookie_dict.get('path')\n    assert cookie.domain == cookie_dict.get('domain')\n    assert cookie.expires == cookie_dict.get('expires')\n    assert cookie.has_nonstandard_attr('HttpOnly')\n    assert cookie.secure\n    assert cookie.get_nonstandard_attr('SameSite') == 'Strict'\n\n\ndef test_convert_cookie_to_dict(session_cookies: SessionCookies, cookie_dict: 
CookieParam) -> None:\n    \"\"\"Test converting Cookie object to dictionary representation.\"\"\"\n    cookies = list(session_cookies.jar)\n\n    assert len(cookies) == 1\n    cookie = cookies[0]\n\n    converted_cookie_dict = session_cookies._convert_cookie_to_dict(cookie)\n    assert converted_cookie_dict == cookie_dict\n\n\ndef test_convert_dict_format(session_cookies: SessionCookies) -> None:\n    \"\"\"Test normalizing cookie attributes between internal and browser formats.\"\"\"\n    internal_format = CookieParam({'name': 'test', 'value': 'value', 'http_only': True, 'same_site': 'Lax'})\n\n    # Test internal to browser format\n    browser_format = session_cookies._to_playwright(internal_format)\n    assert 'httpOnly' in browser_format\n    assert 'sameSite' in browser_format\n    assert 'http_only' not in browser_format\n    assert 'same_site' not in browser_format\n\n    # Test browser to internal format\n    browser_format = PlaywrightCookieParam({'name': 'test', 'value': 'value', 'httpOnly': True, 'sameSite': 'Lax'})\n    internal_format = session_cookies._from_playwright(browser_format)\n    assert 'http_only' in internal_format\n    assert 'same_site' in internal_format\n    assert 'httpOnly' not in internal_format\n    assert 'sameSite' not in internal_format\n\n\ndef test_get_cookies_as_browser_format(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:\n    \"\"\"Test getting cookies in browser-compatible format.\"\"\"\n    browser_cookies = session_cookies.get_cookies_as_playwright_format()\n\n    assert len(browser_cookies) == 1\n    cookie = browser_cookies[0]\n    assert 'httpOnly' in cookie\n    assert 'sameSite' in cookie\n    assert cookie['httpOnly'] == cookie_dict.get('http_only')\n    assert cookie['sameSite'] == cookie_dict.get('same_site')\n\n\ndef test_get_cookies_as_dicts(session_cookies: SessionCookies, cookie_dict: CookieParam) -> None:\n    \"\"\"Test get list of dictionary from a SessionCookies.\"\"\"\n    
test_session_cookies = session_cookies.get_cookies_as_dicts()\n\n    assert [cookie_dict] == test_session_cookies\n\n\ndef test_store_cookie(session_cookies: SessionCookies) -> None:\n    \"\"\"Test storing a Cookie object directly.\"\"\"\n    test_session_cookies = SessionCookies()\n    cookies = list(session_cookies.jar)\n    test_session_cookies.store_cookie(cookies[0])\n\n    assert test_session_cookies == session_cookies\n\n\ndef test_store_multidomain_cookies() -> None:\n    \"\"\"Test storing cookies with the same name for different domains.\"\"\"\n    session_cookies = SessionCookies()\n    session_cookies.set(name='a', value='1', domain='test.io')\n    session_cookies.set(name='a', value='2', domain='notest.io')\n    check_cookies = {\n        item.get('domain'): (item['name'], item['value']) for item in session_cookies.get_cookies_as_dicts()\n    }\n\n    assert len(check_cookies) == 2\n\n    assert check_cookies['test.io'] == ('a', '1')\n    assert check_cookies['notest.io'] == ('a', '2')\n"
  },
  {
    "path": "tests/unit/sessions/test_models.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timedelta, timezone\n\nimport pytest\n\nfrom crawlee.sessions._cookies import CookieParam\nfrom crawlee.sessions._models import SessionModel\n\nSESSION_CREATED_AT = datetime.now(timezone.utc)\n\n\n@pytest.fixture\ndef session_direct() -> SessionModel:\n    \"\"\"Provide a SessionModel instance directly using fixed parameters.\"\"\"\n    return SessionModel(\n        id='test_session',\n        max_age=timedelta(minutes=30),\n        user_data={'user_key': 'user_value'},\n        max_error_score=3.0,\n        error_score_decrement=0.5,\n        created_at=SESSION_CREATED_AT,\n        usage_count=0,\n        max_usage_count=10,\n        error_score=0.0,\n        cookies=[CookieParam({'name': 'cookie_key', 'value': 'cookie_value'})],\n        blocked_status_codes=[401, 403, 429],\n    )\n\n\n@pytest.fixture\ndef session_args_camel() -> dict:\n    \"\"\"Provide session parameters as dictionary with camel case keys.\"\"\"\n    return {\n        'id': 'test_session',\n        'maxAge': '00:30:00',\n        'userData': {'user_key': 'user_value'},\n        'maxErrorScore': 3.0,\n        'errorScoreDecrement': 0.5,\n        'createdAt': SESSION_CREATED_AT,\n        'usageCount': 0,\n        'maxUsageCount': 10,\n        'errorScore': 0.0,\n        'cookies': [CookieParam({'name': 'cookie_key', 'value': 'cookie_value'})],\n        'blockedStatusCodes': [401, 403, 429],\n    }\n\n\n@pytest.fixture\ndef session_args_snake() -> dict:\n    \"\"\"Provide session parameters as dictionary with snake case keys.\"\"\"\n    return {\n        'id': 'test_session',\n        'max_age': '00:30:00',\n        'user_data': {'user_key': 'user_value'},\n        'max_error_score': 3.0,\n        'error_score_decrement': 0.5,\n        'created_at': SESSION_CREATED_AT,\n        'usage_count': 0,\n        'max_usage_count': 10,\n        'error_score': 0.0,\n        'cookies': [CookieParam({'name': 'cookie_key', 
'value': 'cookie_value'})],\n        'blocked_status_codes': [401, 403, 429],\n    }\n\n\ndef test_session_model(\n    session_direct: SessionModel,\n    session_args_camel: dict,\n    session_args_snake: dict,\n) -> None:\n    \"\"\"Test equivalence of SessionModel instances created directly and from camelCase, and snake_case kwargs.\"\"\"\n    session_camel = SessionModel(**session_args_camel)\n    session_snake = SessionModel(**session_args_snake)\n\n    assert session_direct == session_camel == session_snake\n    assert session_direct.id == session_camel.id == session_snake.id == 'test_session'\n\n    # Check that max_age is correctly parsed into a timedelta object\n    assert session_direct.max_age == session_camel.max_age == session_snake.max_age == timedelta(minutes=30)\n"
  },
  {
    "path": "tests/unit/sessions/test_session.py",
    "content": "from __future__ import annotations\n\nfrom datetime import datetime, timedelta, timezone\n\nimport pytest\n\nfrom crawlee.sessions._cookies import SessionCookies\nfrom crawlee.sessions._session import Session\n\n\n@pytest.fixture\ndef session() -> Session:\n    return Session(\n        id='test_session',\n        max_age=timedelta(minutes=30),\n        user_data={'user_key': 'user_value'},\n        max_error_score=3.0,\n        error_score_decrement=0.5,\n        created_at=datetime.now(timezone.utc),\n        usage_count=0,\n        max_usage_count=10,\n        error_score=0.0,\n        cookies={'cookie_key': 'cookie_value'},\n        blocked_status_codes=[401, 403, 429],\n    )\n\n\ndef test_session_init(session: Session) -> None:\n    \"\"\"Verify that the session initializes correctly with the expected properties.\"\"\"\n    assert session.id == 'test_session'\n    assert session.user_data == {'user_key': 'user_value'}\n    assert session.cookies == SessionCookies({'cookie_key': 'cookie_value'})\n    assert session.expires_at >= datetime.now(timezone.utc)\n    assert not session.is_blocked\n    assert not session.is_expired\n    assert not session.is_max_usage_count_reached\n    assert session.is_usable\n\n\ndef test_session_get_state(session: Session) -> None:\n    \"\"\"Check if the session state is correctly retrievable in both dict and model forms.\"\"\"\n    session_state_dict = session.get_state(as_dict=True)\n    assert session_state_dict['id'] == 'test_session'\n\n    session_state_model = session.get_state(as_dict=False)\n    assert session_state_model.id == 'test_session'\n\n    session_2 = Session.from_model(session_state_model)\n    assert session_2.id == 'test_session'\n\n\ndef test_mark_good(session: Session) -> None:\n    \"\"\"Test the mark_good method increases usage count and potentially decreases error score.\"\"\"\n    initial_usage_count = session.usage_count\n    session.mark_good()\n    assert session.usage_count == 
initial_usage_count + 1\n    assert session.error_score == 0\n\n\ndef test_mark_bad(session: Session) -> None:\n    \"\"\"Test the mark_bad method affects the session's error score and usage.\"\"\"\n    initial_error_score = session.error_score\n    session.mark_bad()\n    assert session.error_score == initial_error_score + 1\n\n\ndef test_multiple_marks(session: Session) -> None:\n    \"\"\"Test the mark_good and mark_bad methods in sequence.\"\"\"\n    initial_usage_count = session.usage_count\n    session.mark_bad()\n    session.mark_bad()\n    assert session.error_score == initial_usage_count + 2\n    session.mark_good()\n    session.mark_good()\n    assert session.error_score == initial_usage_count + 1\n    session.mark_bad()\n    session.mark_bad()\n    session.mark_good()\n    assert session.is_blocked\n    assert not session.is_usable\n\n\ndef test_retire_method(session: Session) -> None:\n    \"\"\"Test that retire method properly sets the session as unusable.\"\"\"\n    session.retire()\n    assert not session.is_usable\n    assert session.error_score == 3.0\n\n\ndef test_retire_on_blocked_status_code(session: Session) -> None:\n    \"\"\"Test retiring the session based on specific HTTP status codes.\"\"\"\n    status_code = 403\n    result = session.is_blocked_status_code(status_code=status_code)\n    assert result is True\n\n\ndef test_not_retire_on_not_block_status_code(session: Session) -> None:\n    \"\"\"Test that the session is not retired on a non-blocked status code.\"\"\"\n    status_code = 200\n    result = session.is_blocked_status_code(status_code=status_code)\n    assert result is False\n\n\ndef test_session_expiration() -> None:\n    \"\"\"Test the expiration logic of the session.\"\"\"\n    session = Session(created_at=datetime.now(timezone.utc) - timedelta(hours=1))\n    assert session.is_expired\n"
  },
  {
    "path": "tests/unit/sessions/test_session_pool.py",
    "content": "from __future__ import annotations\n\nimport logging\nfrom datetime import datetime, timezone\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.events import EventManager\nfrom crawlee.events._types import Event, EventPersistStateData\nfrom crawlee.sessions import Session, SessionPool\nfrom crawlee.sessions._models import SessionPoolModel\nfrom crawlee.storages import KeyValueStore\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\nMAX_POOL_SIZE = 3\nKVS_NAME = 'test-session-pool'\nPERSIST_STATE_KEY = 'crawlee_session_pool_state'\n\n\n@pytest.fixture\nasync def kvs() -> AsyncGenerator[KeyValueStore, None]:\n    kvs = await KeyValueStore.open(name=KVS_NAME)\n    yield kvs\n    await kvs.drop()\n\n\n@pytest.fixture\nasync def event_manager() -> AsyncGenerator[EventManager, None]:\n    async with EventManager() as em:\n        yield em\n\n\n@pytest.fixture\nasync def session_pool() -> AsyncGenerator[SessionPool, None]:\n    async with SessionPool(max_pool_size=MAX_POOL_SIZE, persistence_enabled=False) as sp:\n        yield sp\n\n\nasync def test_session_pool_init(session_pool: SessionPool) -> None:\n    \"\"\"Ensure that the session pool initializes correctly with predefined parameters.\"\"\"\n    assert session_pool.session_count == MAX_POOL_SIZE\n    assert session_pool.usable_session_count == MAX_POOL_SIZE\n    assert session_pool.retired_session_count == 0\n\n\nasync def test_add_session(session_pool: SessionPool) -> None:\n    \"\"\"Test adding sessions to the session pool increases session counts appropriately.\"\"\"\n    session_01 = Session(id='test_session_01')\n    session_02 = Session(id='test_session_02')\n    session_pool.add_session(session=session_01)\n    session_pool.add_session(session=session_02)\n    assert session_pool.session_count == MAX_POOL_SIZE + 2\n    assert session_pool.usable_session_count == MAX_POOL_SIZE + 2\n    assert 
session_pool.retired_session_count == 0\n\n\nasync def test_add_session_duplicate(caplog: pytest.LogCaptureFixture, session_pool: SessionPool) -> None:\n    \"\"\"Verify that adding a duplicate session logs a warning and does not increase count.\"\"\"\n    session_01 = Session(id='test_session_01')\n    session_02 = Session(id='test_session_01')\n\n    session_pool.add_session(session=session_01)\n    assert session_pool.session_count == MAX_POOL_SIZE + 1\n\n    with caplog.at_level(logging.WARNING):\n        session_pool.add_session(session=session_02)\n\n    assert session_pool.session_count == MAX_POOL_SIZE + 1\n\n\nasync def test_get_session(session_pool: SessionPool) -> None:\n    \"\"\"Check retrieval of a session from the pool and verify its properties.\"\"\"\n    session = await session_pool.get_session()\n    assert session is not None\n    assert session.expires_at >= datetime.now(timezone.utc)\n    assert not session.is_blocked\n    assert not session.is_expired\n    assert not session.is_max_usage_count_reached\n    assert session.is_usable\n\n\nasync def test_get_session_no_usable(caplog: pytest.LogCaptureFixture, session_pool: SessionPool) -> None:\n    \"\"\"Ensure that retrieval of a non-existent or retired session returns None and logs warning.\"\"\"\n    session = await session_pool.get_session_by_id('non_existent')\n    assert session is None\n\n    session = Session(id='test_session_not_usable')\n    session.retire()\n    assert not session.is_usable\n    session_pool.add_session(session=session)\n    assert session_pool.session_count == MAX_POOL_SIZE + 1\n\n    with caplog.at_level(logging.WARNING):\n        session = await session_pool.get_session_by_id('test_session_not_usable')\n        assert session is None\n\n\nasync def test_create_session_function() -> None:\n    \"\"\"Validate that a session created via a custom function works and has the expected fields set.\"\"\"\n    user_data = {'created_by': 'test_create_session_function'}\n    
async with SessionPool(\n        max_pool_size=MAX_POOL_SIZE,\n        persistence_enabled=False,\n        create_session_function=lambda: Session(user_data=user_data),\n    ) as sp:\n        session = await sp.get_session()\n        assert session is not None\n        assert session.user_data == user_data\n\n\n@pytest.mark.parametrize('kvs_name', [KVS_NAME, None])\nasync def test_session_pool_persist(event_manager: EventManager, kvs_name: str | None) -> None:\n    \"\"\"Test persistence of session pool state to KVS and validate stored data integrity.\"\"\"\n    service_locator.set_event_manager(event_manager)\n\n    async with SessionPool(\n        max_pool_size=MAX_POOL_SIZE,\n        persistence_enabled=True,\n        persist_state_kvs_name=kvs_name,\n        persist_state_key=PERSIST_STATE_KEY,\n    ) as sp:\n        # Emit persist state event and wait for the persistence to complete\n        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))\n        await event_manager.wait_for_all_listeners_to_complete()\n\n        # Get the persisted state from the key-value store\n        kvs = await KeyValueStore.open(name=kvs_name)\n        previous_state = await kvs.get_value(key=PERSIST_STATE_KEY)\n        assert isinstance(previous_state, dict)\n        sp_model = SessionPoolModel(**previous_state)\n\n        # Check if the state is correctly persisted\n        assert sp_model.session_count == sp.session_count\n        assert sp_model.usable_session_count == sp.usable_session_count\n        assert sp_model.retired_session_count == sp.retired_session_count\n\n        # Check if all the sessions are correctly persisted\n        for kvs_session in sp_model.sessions.values():\n            session = await sp.get_session_by_id(kvs_session.id)\n            assert kvs_session == session\n\n\nasync def test_session_pool_persist_and_restore(event_manager: EventManager, kvs: KeyValueStore) -> None:\n    \"\"\"Check session pool's 
ability to persist its state and then restore it accurately after reset.\"\"\"\n    service_locator.set_event_manager(event_manager)\n\n    async with SessionPool(\n        max_pool_size=MAX_POOL_SIZE,\n        persistence_enabled=True,\n        persist_state_kvs_name=KVS_NAME,\n        persist_state_key=PERSIST_STATE_KEY,\n    ):\n        # Emit persist state event and wait for the persistence to complete\n        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))\n        await event_manager.wait_for_all_listeners_to_complete()\n\n    async with SessionPool(\n        max_pool_size=MAX_POOL_SIZE,\n        persistence_enabled=True,\n        persist_state_kvs_name=KVS_NAME,\n        persist_state_key=PERSIST_STATE_KEY,\n    ) as sp:\n        # Now just reset the store and check it's empty\n        await sp.reset_store()\n        previous_state = await kvs.get_value(key=PERSIST_STATE_KEY)\n        assert previous_state is None\n\n\nasync def test_methods_raise_error_when_not_active() -> None:\n    session = Session()\n    session_pool = SessionPool()\n\n    assert session_pool.active is False\n\n    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):\n        session_pool.get_state(as_dict=True)\n\n    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):\n        session_pool.add_session(session)\n\n    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):\n        await session_pool.get_session()\n\n    with pytest.raises(RuntimeError, match=r'SessionPool is not active.'):\n        await session_pool.get_session_by_id(session.id)\n\n    await session_pool.reset_store()\n\n    with pytest.raises(RuntimeError, match=r'SessionPool is already active.'):\n        async with session_pool, session_pool:\n            pass\n\n    async with session_pool:\n        assert session_pool.active is True\n"
  },
  {
    "path": "tests/unit/storage_clients/_file_system/test_fs_dataset_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee._consts import METADATA_FILENAME\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from crawlee.storage_clients._file_system import FileSystemDatasetClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n@pytest.fixture\nasync def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]:\n    \"\"\"A fixture for a file system dataset client.\"\"\"\n    client = await FileSystemStorageClient().create_dataset_client(name='test-dataset', configuration=configuration)\n    yield client\n    await client.drop()\n\n\nasync def test_file_and_directory_creation(configuration: Configuration) -> None:\n    \"\"\"Test that file system dataset creates proper files and directories.\"\"\"\n    client = await FileSystemStorageClient().create_dataset_client(name='new-dataset', configuration=configuration)\n\n    # Verify files were created\n    assert client.path_to_dataset.exists()\n    assert client.path_to_metadata.exists()\n\n    # Verify metadata file structure\n    with client.path_to_metadata.open() as f:\n        metadata = json.load(f)\n        client_metadata = await client.get_metadata()\n        assert metadata['id'] == client_metadata.id\n        assert metadata['name'] == 'new-dataset'\n        assert metadata['item_count'] == 0\n\n    await client.drop()\n\n\nasync def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None:\n    \"\"\"Test that data is properly persisted to files with correct content.\"\"\"\n    item = {'key': 'value', 'number': 42}\n    await 
dataset_client.push_data(item)\n\n    # Verify files are created on disk\n    all_files = list(dataset_client.path_to_dataset.glob('*.json'))\n    assert len(all_files) == 2  # 1 data file + 1 metadata file\n\n    # Verify actual file content\n    data_files = [item for item in all_files if item.name != METADATA_FILENAME]\n    assert len(data_files) == 1\n\n    with Path(data_files[0]).open() as f:\n        saved_item = json.load(f)\n        assert saved_item == item\n\n    # Test multiple items file creation\n    items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]\n    await dataset_client.push_data(items)\n\n    all_files = list(dataset_client.path_to_dataset.glob('*.json'))\n    assert len(all_files) == 5  # 4 data files + 1 metadata file\n\n    data_files = [f for f in all_files if f.name != METADATA_FILENAME]\n    assert len(data_files) == 4  # Original item + 3 new items\n\n\nasync def test_drop_removes_files_from_disk(dataset_client: FileSystemDatasetClient) -> None:\n    \"\"\"Test that dropping a dataset removes the entire dataset directory from disk.\"\"\"\n    await dataset_client.push_data({'test': 'data'})\n\n    assert dataset_client.path_to_dataset.exists()\n\n    # Drop the dataset\n    await dataset_client.drop()\n\n    assert not dataset_client.path_to_dataset.exists()\n\n\nasync def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> None:\n    \"\"\"Test that metadata file is updated correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await dataset_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates accessed_at\n    await dataset_client.get_data()\n\n    # Verify timestamps\n    metadata = await 
dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_get = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates modified_at\n    await dataset_client.push_data({'new': 'item'})\n\n    # Verify timestamps again\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_get\n\n    # Verify metadata file is updated on disk\n    with dataset_client.path_to_metadata.open() as f:\n        metadata_json = json.load(f)\n        assert metadata_json['item_count'] == 1\n\n\nasync def test_data_persistence_across_reopens() -> None:\n    \"\"\"Test that data persists correctly when reopening the same dataset.\"\"\"\n    storage_client = FileSystemStorageClient()\n\n    # Create dataset and add data\n    original_client = await storage_client.create_dataset_client(name='persistence-test')\n\n    test_data = {'test_item': 'test_value', 'id': 123}\n    await original_client.push_data(test_data)\n\n    dataset_id = (await original_client.get_metadata()).id\n\n    # Reopen by ID and verify data persists\n    reopened_client = await storage_client.create_dataset_client(id=dataset_id)\n\n    data = await reopened_client.get_data()\n    assert len(data.items) == 1\n    assert data.items[0] == test_data\n\n    await reopened_client.drop()\n"
  },
  {
    "path": "tests/unit/storage_clients/_file_system/test_fs_kvs_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee._consts import METADATA_FILENAME\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n@pytest.fixture\nasync def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]:\n    \"\"\"A fixture for a file system key-value store client.\"\"\"\n    client = await FileSystemStorageClient().create_kvs_client(name='test-kvs', configuration=configuration)\n    yield client\n    await client.drop()\n\n\nasync def test_file_and_directory_creation(configuration: Configuration) -> None:\n    \"\"\"Test that file system KVS creates proper files and directories.\"\"\"\n    client = await FileSystemStorageClient().create_kvs_client(name='new-kvs', configuration=configuration)\n\n    # Verify files were created\n    assert client.path_to_kvs.exists()\n    assert client.path_to_metadata.exists()\n\n    # Verify metadata file structure\n    with client.path_to_metadata.open() as f:\n        metadata = json.load(f)\n        assert metadata['id'] == (await client.get_metadata()).id\n        assert metadata['name'] == 'new-kvs'\n\n    await client.drop()\n\n\nasync def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that values are properly persisted to files with correct content and metadata.\"\"\"\n    test_key = 'test-key'\n    test_value = 'Hello, world!'\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Check if the files were 
created\n    key_path = kvs_client.path_to_kvs / test_key\n    key_metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}'\n    assert key_path.exists()\n    assert key_metadata_path.exists()\n\n    # Check file content\n    content = key_path.read_text(encoding='utf-8')\n    assert content == test_value\n\n    # Check record metadata file\n    with key_metadata_path.open() as f:\n        metadata = json.load(f)\n        assert metadata['key'] == test_key\n        assert metadata['content_type'] == 'text/plain; charset=utf-8'\n        assert metadata['size'] == len(test_value.encode('utf-8'))\n\n\nasync def test_binary_data_persistence(kvs_client: FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that binary data is stored correctly without corruption.\"\"\"\n    test_key = 'test-binary'\n    test_value = b'\\x00\\x01\\x02\\x03\\x04'\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Verify binary file exists\n    key_path = kvs_client.path_to_kvs / test_key\n    assert key_path.exists()\n\n    # Verify binary content is preserved\n    content = key_path.read_bytes()\n    assert content == test_value\n\n    # Verify retrieval works correctly\n    record = await kvs_client.get_value(key=test_key)\n    assert record is not None\n    assert record.value == test_value\n    assert record.content_type == 'application/octet-stream'\n\n\nasync def test_json_serialization_to_file(kvs_client: FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that JSON objects are properly serialized to files.\"\"\"\n    test_key = 'test-json'\n    test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Check if file content is valid JSON\n    key_path = kvs_client.path_to_kvs / test_key\n    with key_path.open() as f:\n        file_content = json.load(f)\n        assert file_content == test_value\n\n\nasync def test_file_deletion_on_value_delete(kvs_client: 
FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that deleting a value removes its files from disk.\"\"\"\n    test_key = 'test-delete'\n    test_value = 'Delete me'\n\n    # Set a value\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Verify files exist\n    key_path = kvs_client.path_to_kvs / test_key\n    metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}'\n    assert key_path.exists()\n    assert metadata_path.exists()\n\n    # Delete the value\n    await kvs_client.delete_value(key=test_key)\n\n    # Verify files were deleted\n    assert not key_path.exists()\n    assert not metadata_path.exists()\n\n\nasync def test_drop_removes_directory(kvs_client: FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that drop removes the entire store directory from disk.\"\"\"\n    await kvs_client.set_value(key='test', value='test-value')\n\n    assert kvs_client.path_to_kvs.exists()\n\n    # Drop the store\n    await kvs_client.drop()\n\n    assert not kvs_client.path_to_kvs.exists()\n\n\nasync def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) -> None:\n    \"\"\"Test that read/write operations properly update metadata file timestamps.\"\"\"\n    # Record initial timestamps\n    metadata = await kvs_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await kvs_client.get_value(key='nonexistent')\n\n    # Verify accessed timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await 
asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await kvs_client.set_value(key='test', value='test-value')\n\n    # Verify modified timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n\n\nasync def test_data_persistence_across_reopens(configuration: Configuration) -> None:\n    \"\"\"Test that data persists correctly when reopening the same KVS.\"\"\"\n    storage_client = FileSystemStorageClient()\n\n    # Create KVS and add data\n    original_client = await storage_client.create_kvs_client(name='persistence-test', configuration=configuration)\n\n    test_key = 'persistent-key'\n    test_value = 'persistent-value'\n    await original_client.set_value(key=test_key, value=test_value)\n\n    kvs_id = (await original_client.get_metadata()).id\n\n    # Reopen by ID and verify data persists\n    reopened_client = await storage_client.create_kvs_client(\n        id=kvs_id,\n    )\n\n    record = await reopened_client.get_value(key=test_key)\n    assert record is not None\n    assert record.value == test_value\n\n    await reopened_client.drop()\n"
  },
  {
    "path": "tests/unit/storage_clients/_file_system/test_fs_rq_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import Request, service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from crawlee.storage_clients._file_system import FileSystemRequestQueueClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n@pytest.fixture\nasync def rq_client() -> AsyncGenerator[FileSystemRequestQueueClient, None]:\n    \"\"\"A fixture for a file system request queue client.\"\"\"\n    client = await FileSystemStorageClient().create_rq_client(\n        name='test-request-queue',\n    )\n    yield client\n    await client.drop()\n\n\nasync def test_file_and_directory_creation() -> None:\n    \"\"\"Test that file system RQ creates proper files and directories.\"\"\"\n    client = await FileSystemStorageClient().create_rq_client(name='new-request-queue')\n\n    # Verify files were created\n    assert client.path_to_rq.exists()\n    assert client.path_to_metadata.exists()\n\n    # Verify metadata file structure\n    with client.path_to_metadata.open() as f:\n        metadata = json.load(f)\n        assert metadata['id'] == (await client.get_metadata()).id\n        assert metadata['name'] == 'new-request-queue'\n\n    await client.drop()\n\n\nasync def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None:\n    \"\"\"Test that requests are properly persisted to files.\"\"\"\n    requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/2'),\n        Request.from_url('https://example.com/3'),\n    ]\n\n    await rq_client.add_batch_of_requests(requests)\n\n    # Verify request 
files are created\n    request_files = list(rq_client.path_to_rq.glob('*.json'))\n    # Should have 3 request files + 1 metadata file\n    assert len(request_files) == 4\n    assert rq_client.path_to_metadata in request_files\n\n    # Verify actual request file content\n    data_files = [f for f in request_files if f != rq_client.path_to_metadata]\n    assert len(data_files) == 3\n\n    for req_file in data_files:\n        with req_file.open() as f:\n            request_data = json.load(f)\n            assert 'url' in request_data\n            assert request_data['url'].startswith('https://example.com/')\n\n\nasync def test_opening_rq_does_not_have_side_effect_on_service_locator(configuration: Configuration) -> None:\n    \"\"\"Opening a request queue client should not set the storage client in the global service locator.\"\"\"\n    await FileSystemStorageClient().create_rq_client(name='test_request_queue', configuration=configuration)\n\n    # Set some specific storage client in the service locator. 
There should be no `ServiceConflictError`.\n    service_locator.set_storage_client(MemoryStorageClient())\n\n\nasync def test_drop_removes_directory(rq_client: FileSystemRequestQueueClient) -> None:\n    \"\"\"Test that drop removes the entire RQ directory from disk.\"\"\"\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    rq_path = rq_client.path_to_rq\n    assert rq_path.exists()\n\n    # Drop the request queue\n    await rq_client.drop()\n\n    assert not rq_path.exists()\n\n\nasync def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> None:\n    \"\"\"Test that metadata file is updated correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await rq_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await rq_client.is_empty()\n\n    # Verify accessed timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    # Verify modified timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n\n    # Verify metadata file is updated on disk\n    with rq_client.path_to_metadata.open() as f:\n        metadata_json = json.load(f)\n        assert 
metadata_json['total_request_count'] == 1\n\n\nasync def test_data_persistence_across_reopens() -> None:\n    \"\"\"Test that requests persist correctly when reopening the same RQ.\"\"\"\n    storage_client = FileSystemStorageClient()\n\n    # Create RQ and add requests\n    original_client = await storage_client.create_rq_client(\n        name='persistence-test',\n    )\n\n    test_requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/2'),\n    ]\n    await original_client.add_batch_of_requests(test_requests)\n\n    rq_id = (await original_client.get_metadata()).id\n\n    # Reopen by ID and verify requests persist\n    reopened_client = await storage_client.create_rq_client(\n        id=rq_id,\n    )\n\n    metadata = await reopened_client.get_metadata()\n    assert metadata.total_request_count == 2\n\n    # Fetch requests to verify they're still there\n    request1 = await reopened_client.fetch_next_request()\n    request2 = await reopened_client.fetch_next_request()\n\n    assert request1 is not None\n    assert request2 is not None\n    assert {request1.url, request2.url} == {'https://example.com/1', 'https://example.com/2'}\n\n    await reopened_client.drop()\n\n\nasync def test_get_request_does_not_mark_in_progress(rq_client: FileSystemRequestQueueClient) -> None:\n    \"\"\"Test that get_request does not block a request from being fetched.\"\"\"\n    request = Request.from_url('https://example.com/blocked')\n    await rq_client.add_batch_of_requests([request])\n\n    fetched = await rq_client.get_request(request.unique_key)\n    assert fetched is not None\n    assert fetched.unique_key == request.unique_key\n\n    next_request = await rq_client.fetch_next_request()\n    assert next_request is not None\n    assert next_request.unique_key == request.unique_key\n"
  },
  {
    "path": "tests/unit/storage_clients/_memory/test_memory_dataset_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.storage_clients import MemoryStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from crawlee.storage_clients._memory import MemoryDatasetClient\n\n\n@pytest.fixture\nasync def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]:\n    \"\"\"Fixture that provides a fresh memory dataset client for each test.\"\"\"\n    client = await MemoryStorageClient().create_dataset_client(name='test-dataset')\n    yield client\n    await client.drop()\n\n\nasync def test_memory_specific_purge_behavior() -> None:\n    \"\"\"Test memory-specific purge behavior and in-memory storage characteristics.\"\"\"\n    # Create dataset and add data\n    dataset_client1 = await MemoryStorageClient().create_dataset_client(\n        name='test-purge-dataset',\n    )\n    await dataset_client1.push_data({'item': 'initial data'})\n\n    # Verify data was added\n    items = await dataset_client1.get_data()\n    assert len(items.items) == 1\n\n    # Reopen with same storage client instance\n    dataset_client2 = await MemoryStorageClient().create_dataset_client(\n        name='test-purge-dataset',\n    )\n\n    # Verify data was purged (memory storage specific behavior)\n    items = await dataset_client2.get_data()\n    assert len(items.items) == 0\n\n\nasync def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> None:\n    \"\"\"Test that metadata timestamps are updated correctly in memory storage.\"\"\"\n    # Record initial timestamps\n    metadata = await dataset_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await dataset_client.get_data()\n\n    # Verify timestamps 
(memory-specific behavior)\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await dataset_client.push_data({'new': 'item'})\n\n    # Verify timestamps were updated\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n"
  },
  {
    "path": "tests/unit/storage_clients/_memory/test_memory_kvs_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.storage_clients import MemoryStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from crawlee.storage_clients._memory import MemoryKeyValueStoreClient\n\n\n@pytest.fixture\nasync def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]:\n    \"\"\"Fixture that provides a fresh memory key-value store client for each test.\"\"\"\n    client = await MemoryStorageClient().create_kvs_client(name='test-kvs')\n    yield client\n    await client.drop()\n\n\nasync def test_memory_specific_purge_behavior() -> None:\n    \"\"\"Test memory-specific purge behavior and in-memory storage characteristics.\"\"\"\n\n    # Create KVS and add data\n    kvs_client1 = await MemoryStorageClient().create_kvs_client(\n        name='test-purge-kvs',\n    )\n    await kvs_client1.set_value(key='test-key', value='initial value')\n\n    # Verify value was set\n    record = await kvs_client1.get_value(key='test-key')\n    assert record is not None\n    assert record.value == 'initial value'\n\n    # Reopen with same storage client instance\n    kvs_client2 = await MemoryStorageClient().create_kvs_client(\n        name='test-purge-kvs',\n    )\n\n    # Verify value was purged (memory storage specific behavior)\n    record = await kvs_client2.get_value(key='test-key')\n    assert record is None\n\n\nasync def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None:\n    \"\"\"Test that metadata timestamps are updated correctly in memory storage.\"\"\"\n    # Record initial timestamps\n    metadata = await kvs_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await 
kvs_client.get_value(key='nonexistent')\n\n    # Verify timestamps (memory-specific behavior)\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await kvs_client.set_value(key='test', value='test-value')\n\n    # Verify timestamps were updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n"
  },
  {
    "path": "tests/unit/storage_clients/_memory/test_memory_rq_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee.storage_clients import MemoryStorageClient\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from crawlee.storage_clients._memory import MemoryRequestQueueClient\n\n\n@pytest.fixture\nasync def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]:\n    \"\"\"Fixture that provides a fresh memory request queue client for each test.\"\"\"\n    client = await MemoryStorageClient().create_rq_client(name='test-rq')\n    yield client\n    await client.drop()\n\n\nasync def test_memory_specific_purge_behavior() -> None:\n    \"\"\"Test memory-specific purge behavior and in-memory storage characteristics.\"\"\"\n    # Create RQ and add data\n    rq_client1 = await MemoryStorageClient().create_rq_client(\n        name='test-purge-rq',\n    )\n    request = Request.from_url(url='https://example.com/initial')\n    await rq_client1.add_batch_of_requests([request])\n\n    # Verify request was added\n    assert await rq_client1.is_empty() is False\n\n    # Reopen with same storage client instance\n    rq_client2 = await MemoryStorageClient().create_rq_client(\n        name='test-purge-rq',\n    )\n\n    # Verify queue was purged (memory storage specific behavior)\n    assert await rq_client2.is_empty() is True\n\n\nasync def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None:\n    \"\"\"Test that metadata timestamps are updated correctly in memory storage.\"\"\"\n    # Record initial timestamps\n    metadata = await rq_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await rq_client.is_empty()\n\n    # Verify timestamps 
(memory-specific behavior)\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    # Verify timestamps were updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n"
  },
  {
    "path": "tests/unit/storage_clients/_redis/test_redis_dataset_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.storage_clients import RedisStorageClient\nfrom crawlee.storage_clients._redis._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from fakeredis import FakeAsyncRedis\n\n    from crawlee.storage_clients._redis import RedisDatasetClient\n\n\n@pytest.fixture\nasync def dataset_client(\n    redis_client: FakeAsyncRedis,\n    suppress_user_warning: None,  # noqa: ARG001\n) -> AsyncGenerator[RedisDatasetClient, None]:\n    \"\"\"A fixture for a Redis dataset client.\"\"\"\n    client = await RedisStorageClient(redis=redis_client).create_dataset_client(\n        name='test_dataset',\n    )\n    yield client\n    await client.drop()\n\n\nasync def test_base_keys_creation(dataset_client: RedisDatasetClient) -> None:\n    \"\"\"Test that Redis dataset client creates proper keys.\"\"\"\n    metadata = await dataset_client.get_metadata()\n    name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset'\n\n    dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))\n\n    assert dataset_id is not None\n    assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id\n\n    items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))\n    assert items is not None\n    assert len(items) == 0\n\n    metadata_data = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:metadata'))\n\n    assert isinstance(metadata_data, dict)\n    assert metadata_data['id'] == metadata.id\n\n\nasync def test_record_and_content_verification(dataset_client: RedisDatasetClient) -> None:\n    \"\"\"Test that 
data is properly persisted to Redis with correct content.\"\"\"\n    item = {'key': 'value', 'number': 42}\n    await dataset_client.push_data(item)\n\n    # Verify metadata record\n    metadata = await dataset_client.get_metadata()\n    assert metadata.item_count == 1\n    assert metadata.created_at is not None\n    assert metadata.modified_at is not None\n    assert metadata.accessed_at is not None\n\n    # Verify records in Redis\n    all_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))\n\n    assert all_items is not None\n    assert len(all_items) == 1\n\n    # Verify actual record content\n    assert all_items[0] == item\n\n    # Test multiple records\n    items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]\n    await dataset_client.push_data(items)\n\n    all_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))\n    assert all_items is not None\n    assert len(all_items) == 4\n\n\nasync def test_drop_removes_records(dataset_client: RedisDatasetClient) -> None:\n    \"\"\"Test that dropping a dataset removes all records from Redis.\"\"\"\n    await dataset_client.push_data({'test': 'data'})\n\n    metadata = await dataset_client.get_metadata()\n    name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))\n    dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))\n    items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset'\n    assert dataset_id is not None\n    assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id\n    assert items is not None\n    assert len(items) == 1\n\n    # Drop the dataset\n    await 
dataset_client.drop()\n\n    # Verify removal of all records\n    name_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id))\n    dataset_id_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset'))\n    items_after_drop = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$'))\n\n    assert name_after_drop is None\n    assert dataset_id_after_drop is None\n    assert items_after_drop is None\n\n\nasync def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> None:\n    \"\"\"Test that metadata record is updated correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await dataset_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates accessed_at\n    await dataset_client.get_data()\n\n    # Verify timestamps\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_get = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates modified_at\n    await dataset_client.push_data({'new': 'item'})\n\n    # Verify timestamps again\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_get\n"
  },
  {
    "path": "tests/unit/storage_clients/_redis/test_redis_kvs_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee.storage_clients import RedisStorageClient\nfrom crawlee.storage_clients._redis._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from fakeredis import FakeAsyncRedis\n\n    from crawlee.storage_clients._redis import RedisKeyValueStoreClient\n\n\n@pytest.fixture\nasync def kvs_client(\n    redis_client: FakeAsyncRedis,\n    suppress_user_warning: None,  # noqa: ARG001\n) -> AsyncGenerator[RedisKeyValueStoreClient, None]:\n    \"\"\"A fixture for a Redis KVS client.\"\"\"\n    client = await RedisStorageClient(redis=redis_client).create_kvs_client(\n        name='test_kvs',\n    )\n    yield client\n    await client.drop()\n\n\nasync def test_base_keys_creation(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that Redis KVS client creates proper keys.\"\"\"\n    metadata = await kvs_client.get_metadata()\n    name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs'\n\n    kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs'))\n\n    assert kvs_id is not None\n    assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id\n\n    metadata_data = await await_redis_response(kvs_client.redis.json().get('key_value_stores:test_kvs:metadata'))\n\n    assert isinstance(metadata_data, dict)\n    assert metadata_data['id'] == metadata.id\n\n\nasync def test_value_record_creation_and_content(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that values are properly persisted to records with correct content and metadata.\"\"\"\n    test_key = 'test-key'\n    test_value = 'Hello, world!'\n    await 
kvs_client.set_value(key=test_key, value=test_value)\n\n    # Check if the records were created\n    records_key = 'key_value_stores:test_kvs:items'\n    records_items_metadata = 'key_value_stores:test_kvs:metadata_items'\n    record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key))\n    metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key))\n    assert record_exists is True\n    assert metadata_exists is True\n\n    # Check record content\n    content = await await_redis_response(kvs_client.redis.hget(records_key, test_key))\n    content = content.decode() if isinstance(content, bytes) else content\n    assert content == test_value\n\n    # Fetch record metadata\n    record_metadata = await await_redis_response(kvs_client.redis.hget(records_items_metadata, test_key))\n    assert record_metadata is not None\n    assert isinstance(record_metadata, (str, bytes))\n    metadata = json.loads(record_metadata)\n\n    # Check record metadata fields\n    assert metadata['key'] == test_key\n    assert metadata['content_type'] == 'text/plain; charset=utf-8'\n    assert metadata['size'] == len(test_value.encode('utf-8'))\n\n    # Verify retrieval works correctly\n    check_value = await kvs_client.get_value(key=test_key)\n    assert check_value is not None\n    assert check_value.value == test_value\n\n\nasync def test_binary_data_persistence(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that binary data is stored correctly without corruption.\"\"\"\n    test_key = 'test-binary'\n    test_value = b'\\x00\\x01\\x02\\x03\\x04'\n    records_key = 'key_value_stores:test_kvs:items'\n    records_items_metadata = 'key_value_stores:test_kvs:metadata_items'\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Verify binary record exists\n    record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key))\n    metadata_exists = await 
await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key))\n    assert record_exists is True\n    assert metadata_exists is True\n\n    # Verify binary content is preserved\n    content = await await_redis_response(kvs_client.redis.hget(records_key, test_key))\n    assert content == test_value\n\n    # Verify retrieval works correctly\n    record = await kvs_client.get_value(key=test_key)\n    assert record is not None\n    assert record.value == test_value\n    assert record.content_type == 'application/octet-stream'\n\n\nasync def test_json_serialization_to_record(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that JSON objects are properly serialized to records.\"\"\"\n    test_key = 'test-json'\n    test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Check if record content is valid JSON\n    records_key = 'key_value_stores:test_kvs:items'\n    record = await await_redis_response(kvs_client.redis.hget(records_key, test_key))\n    assert record is not None\n    assert isinstance(record, (str, bytes))\n    assert json.loads(record) == test_value\n\n\nasync def test_records_deletion_on_value_delete(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that deleting a value removes its records from Redis.\"\"\"\n    test_key = 'test-delete'\n    test_value = 'Delete me'\n    records_key = 'key_value_stores:test_kvs:items'\n    records_items_metadata = 'key_value_stores:test_kvs:metadata_items'\n\n    # Set a value\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    # Verify records exist\n    record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key))\n    metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key))\n    assert record_exists is True\n    assert metadata_exists is True\n\n    # Delete the value\n    await 
kvs_client.delete_value(key=test_key)\n\n    # Verify records were deleted\n    record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key))\n    metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key))\n    assert record_exists is False\n    assert metadata_exists is False\n\n\nasync def test_drop_removes_keys(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that drop removes all keys of the store from Redis.\"\"\"\n    await kvs_client.set_value(key='test', value='test-value')\n\n    metadata = await kvs_client.get_metadata()\n    name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id))\n    kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs'))\n    items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items'))\n    metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items'))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs'\n    assert kvs_id is not None\n    assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id\n    assert items is not None\n    assert items != {}\n    assert metadata_items is not None\n    assert metadata_items != {}\n\n    # Drop the store\n    await kvs_client.drop()\n\n    name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id))\n    kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs'))\n    items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items'))\n    metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items'))\n    assert name is None\n    assert kvs_id is None\n    assert items == {}\n    assert 
metadata_items == {}\n\n\nasync def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) -> None:\n    \"\"\"Test that read/write operations properly update metadata record timestamps.\"\"\"\n    # Record initial timestamps\n    metadata = await kvs_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await kvs_client.get_value(key='nonexistent')\n\n    # Verify accessed timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await kvs_client.set_value(key='test', value='test-value')\n\n    # Verify modified timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n"
  },
  {
    "path": "tests/unit/storage_clients/_redis/test_redis_rq_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee.storage_clients import RedisStorageClient\nfrom crawlee.storage_clients._redis._utils import await_redis_response\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from fakeredis import FakeAsyncRedis\n\n    from crawlee.storage_clients._redis import RedisRequestQueueClient\n\n\n@pytest.fixture(params=['default', 'bloom'])\nasync def rq_client(\n    redis_client: FakeAsyncRedis,\n    request: pytest.FixtureRequest,\n    suppress_user_warning: None,  # noqa: ARG001\n) -> AsyncGenerator[RedisRequestQueueClient, None]:\n    \"\"\"A fixture for a Redis RQ client.\"\"\"\n    client = await RedisStorageClient(redis=redis_client, queue_dedup_strategy=request.param).create_rq_client(\n        name='test_request_queue'\n    )\n    yield client\n    await client.drop()\n\n\nasync def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that Redis RQ client creates proper keys.\"\"\"\n\n    metadata = await rq_client.get_metadata()\n    name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue'\n\n    kvs_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue'))\n\n    assert kvs_id is not None\n    assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id\n\n    if rq_client._dedup_strategy == 'bloom':\n        added_bf = await await_redis_response(\n            rq_client.redis.exists('request_queues:test_request_queue:added_bloom_filter')\n        )\n        assert added_bf == 1\n\n        handled_bf = await await_redis_response(\n            
rq_client.redis.exists('request_queues:test_request_queue:handled_bloom_filter')\n        )\n        assert handled_bf == 1\n\n    metadata_data = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata'))\n\n    assert isinstance(metadata_data, dict)\n    assert metadata_data['id'] == metadata.id\n\n\nasync def test_request_records_persistence(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that requests are properly persisted to Redis.\"\"\"\n    requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/2'),\n        Request.from_url('https://example.com/3'),\n    ]\n\n    await rq_client.add_batch_of_requests(requests)\n\n    # Verify request records are created\n    request_queue_response = await await_redis_response(\n        rq_client.redis.lmpop(1, 'request_queues:test_request_queue:queue', direction='left', count=10)\n    )\n    assert request_queue_response is not None\n    assert isinstance(request_queue_response, list)\n    request_keys = request_queue_response[1]\n    assert isinstance(request_keys, list)\n    assert len(request_keys) == 3\n\n    # Verify actual request record content\n    requests_records_data = await await_redis_response(\n        rq_client.redis.hgetall('request_queues:test_request_queue:data')\n    )\n    assert isinstance(requests_records_data, dict)\n\n    for key in request_keys:\n        request_data = json.loads(requests_records_data[key])  # ty: ignore[invalid-argument-type]\n        assert 'url' in request_data\n        assert request_data['url'].startswith('https://example.com/')\n\n\nasync def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that drop removes all request records from Redis.\"\"\"\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    rq_queue = 'request_queues:test_request_queue:queue'\n    rq_data = 
'request_queues:test_request_queue:data'\n    added_bf = 'request_queues:test_request_queue:added_bloom_filter'\n    handled_bf = 'request_queues:test_request_queue:handled_bloom_filter'\n    pending_set = 'request_queues:test_request_queue:pending_set'\n    handled_set = 'request_queues:test_request_queue:handled_set'\n    metadata_key = 'request_queues:test_request_queue:metadata'\n\n    metadata = await rq_client.get_metadata()\n    name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id))\n\n    assert name is not None\n    assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue'\n\n    rq_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue'))\n    assert rq_id is not None\n    assert rq_id.decode() if isinstance(rq_id, bytes) else rq_id\n\n    rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue))\n    rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data))\n    metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key))\n    assert rq_queue_exists == 1\n    assert rq_data_exists == 1\n    assert metadata_exists == 1\n\n    if rq_client._dedup_strategy == 'bloom':\n        added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf))\n        handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf))\n        assert added_bf_exists == 1\n        assert handled_bf_exists == 1\n    elif rq_client._dedup_strategy == 'default':\n        pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set))\n        handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set))\n        assert pending_set_exists == 1\n        # No requests marked as handled\n        assert handled_set_exists == 0\n\n    # Drop the request queue\n    await rq_client.drop()\n\n    # Verify removal of all records\n    
name_after_drop = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id))\n    rq_id_after_drop = await await_redis_response(\n        rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')\n    )\n    rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue))\n    rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data))\n    metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key))\n    assert name_after_drop is None\n    assert rq_id_after_drop is None\n    assert rq_queue_exists == 0\n    assert rq_data_exists == 0\n    assert metadata_exists == 0\n\n    if rq_client._dedup_strategy == 'bloom':\n        added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf))\n        handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf))\n        assert added_bf_exists == 0\n        assert handled_bf_exists == 0\n    elif rq_client._dedup_strategy == 'default':\n        pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set))\n        handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set))\n        assert pending_set_exists == 0\n        assert handled_set_exists == 0\n\n\nasync def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that metadata record is updated correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await rq_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await rq_client.is_empty()\n\n    # Verify accessed timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert 
metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    # Verify modified timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n\n\nasync def test_get_request(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that get_request works correctly.\"\"\"\n    requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/2'),\n        Request.from_url('https://example.com/3'),\n    ]\n\n    added_requests = await rq_client.add_batch_of_requests(requests)\n    assert len(added_requests.processed_requests) == 3\n\n    for req in requests:\n        fetched_request = await rq_client.get_request(req.unique_key)\n        assert fetched_request is not None\n        assert fetched_request.unique_key == req.unique_key\n        assert fetched_request.url == req.url\n\n    # Test fetching a non-existent request\n    non_existent = await rq_client.get_request('non-existent-id')\n    assert non_existent is None\n\n\nasync def test_deduplication(rq_client: RedisRequestQueueClient) -> None:\n    \"\"\"Test that request deduplication works correctly.\"\"\"\n    requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/3'),\n    ]\n\n    await rq_client.add_batch_of_requests(requests)\n\n    # Verify only unique requests are added\n    metadata = await rq_client.get_metadata()\n    assert metadata.pending_request_count == 2\n    assert 
metadata.total_request_count == 2\n\n    # Fetch requests and verify order\n    request1 = await rq_client.fetch_next_request()\n    assert request1 is not None\n    assert request1 == requests[0]\n\n    # Fetch the next request, which should skip the duplicate\n    request2 = await rq_client.fetch_next_request()\n    assert request2 is not None\n    assert request2 == requests[2]\n\n    # Verify no more requests are available\n    request3 = await rq_client.fetch_next_request()\n    assert request3 is None\n"
  },
  {
    "path": "tests/unit/storage_clients/_sql/test_sql_dataset_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom sqlalchemy import inspect, select\nfrom sqlalchemy.ext.asyncio import create_async_engine\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import SqlStorageClient\nfrom crawlee.storage_clients._sql._db_models import DatasetItemDb, DatasetMetadataDb\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from sqlalchemy import Connection\n\n    from crawlee.storage_clients._sql import SqlDatasetClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    \"\"\"Temporary configuration for tests.\"\"\"\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n# Helper function that allows you to use inspect with an asynchronous engine\ndef get_tables(sync_conn: Connection) -> list[str]:\n    inspector = inspect(sync_conn)\n    return inspector.get_table_names()\n\n\n@pytest.fixture\nasync def dataset_client(\n    configuration: Configuration,\n) -> AsyncGenerator[SqlDatasetClient, None]:\n    \"\"\"A fixture for a SQL dataset client.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_dataset_client(\n            name='test-dataset',\n            configuration=configuration,\n        )\n        yield client\n        await client.drop()\n\n\nasync def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL dataset client creates tables with a connection string.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:\n        await storage_client.create_dataset_client(\n            name='new-dataset',\n            configuration=configuration,\n        )\n\n        async with 
storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'dataset_records' in tables\n            assert 'datasets' in tables\n\n\nasync def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL dataset client creates tables with a pre-configured engine.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False)\n\n    async with SqlStorageClient(engine=engine) as storage_client:\n        await storage_client.create_dataset_client(\n            name='new-dataset',\n            configuration=configuration,\n        )\n\n        async with engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'dataset_records' in tables\n            assert 'datasets' in tables\n\n\nasync def test_tables_and_metadata_record(configuration: Configuration) -> None:\n    \"\"\"Test that SQL dataset creates proper tables and metadata records.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_dataset_client(\n            name='new-dataset',\n            configuration=configuration,\n        )\n\n        client_metadata = await client.get_metadata()\n\n        async with storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'dataset_records' in tables\n            assert 'datasets' in tables\n\n        async with client.get_session() as session:\n            stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new-dataset')\n            result = await session.execute(stmt)\n            orm_metadata = result.scalar_one_or_none()\n            assert orm_metadata is not None\n            assert orm_metadata.id == client_metadata.id\n            assert orm_metadata.name == 'new-dataset'\n            assert 
orm_metadata.item_count == 0\n\n        await client.drop()\n\n\nasync def test_record_and_content_verification(dataset_client: SqlDatasetClient) -> None:\n    \"\"\"Test that dataset client can push data and verify its content.\"\"\"\n    item = {'key': 'value', 'number': 42}\n    await dataset_client.push_data(item)\n\n    # Verify metadata record\n    metadata = await dataset_client.get_metadata()\n    assert metadata.item_count == 1\n    assert metadata.created_at is not None\n    assert metadata.modified_at is not None\n    assert metadata.accessed_at is not None\n\n    async with dataset_client.get_session() as session:\n        stmt = select(DatasetItemDb).where(DatasetItemDb.dataset_id == metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 1\n        saved_item = records[0].data\n        assert saved_item == item\n\n    # Test pushing multiple items and verify total count\n    items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}]\n    await dataset_client.push_data(items)\n\n    async with dataset_client.get_session() as session:\n        stmt = select(DatasetItemDb).where(DatasetItemDb.dataset_id == metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 4\n\n\nasync def test_drop_removes_records(dataset_client: SqlDatasetClient) -> None:\n    \"\"\"Test that dropping a dataset removes all records from the database.\"\"\"\n    await dataset_client.push_data({'test': 'data'})\n\n    client_metadata = await dataset_client.get_metadata()\n\n    async with dataset_client.get_session() as session:\n        stmt = select(DatasetItemDb).where(DatasetItemDb.dataset_id == client_metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 1\n\n    # Drop the dataset\n    await 
dataset_client.drop()\n\n    async with dataset_client.get_session() as session:\n        stmt = select(DatasetItemDb).where(DatasetItemDb.dataset_id == client_metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 0\n        metadata = await session.get(DatasetMetadataDb, client_metadata.id)\n        assert metadata is None\n\n\nasync def test_metadata_record_updates(dataset_client: SqlDatasetClient) -> None:\n    \"\"\"Test that metadata record is updated correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await dataset_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates accessed_at\n    await dataset_client.get_data()\n\n    # Verify timestamps\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_get = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform an operation that updates modified_at\n    await dataset_client.push_data({'new': 'item'})\n\n    # Verify timestamps again\n    metadata = await dataset_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_get\n\n    # Verify metadata record is updated in db\n    async with dataset_client.get_session() as session:\n        orm_metadata = await session.get(DatasetMetadataDb, metadata.id)\n        assert orm_metadata is not None\n        assert orm_metadata.item_count == 1\n        assert orm_metadata.created_at == 
initial_created\n        assert orm_metadata.accessed_at == metadata.accessed_at\n        assert orm_metadata.modified_at == metadata.modified_at\n\n\nasync def test_data_persistence_across_reopens(configuration: Configuration) -> None:\n    \"\"\"Test that data persists correctly when reopening the same dataset.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        original_client = await storage_client.create_dataset_client(\n            name='persistence-test',\n            configuration=configuration,\n        )\n\n        test_data = {'test_item': 'test_value', 'id': 123}\n        await original_client.push_data(test_data)\n\n        dataset_id = (await original_client.get_metadata()).id\n\n        reopened_client = await storage_client.create_dataset_client(\n            id=dataset_id,\n            configuration=configuration,\n        )\n\n        data = await reopened_client.get_data()\n        assert len(data.items) == 1\n        assert data.items[0] == test_data\n\n        await reopened_client.drop()\n"
  },
  {
    "path": "tests/unit/storage_clients/_sql/test_sql_kvs_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom sqlalchemy import inspect, select\nfrom sqlalchemy.ext.asyncio import create_async_engine\n\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import SqlStorageClient\nfrom crawlee.storage_clients._sql._db_models import KeyValueStoreMetadataDb, KeyValueStoreRecordDb\nfrom crawlee.storage_clients.models import KeyValueStoreMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from sqlalchemy import Connection\n\n    from crawlee.storage_clients._sql import SqlKeyValueStoreClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    \"\"\"Temporary configuration for tests.\"\"\"\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n@pytest.fixture\nasync def kvs_client(\n    configuration: Configuration,\n) -> AsyncGenerator[SqlKeyValueStoreClient, None]:\n    \"\"\"A fixture for a SQL key-value store client.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_kvs_client(\n            name='test-kvs',\n            configuration=configuration,\n        )\n        yield client\n        await client.drop()\n\n\n# Helper function that allows you to use inspect with an asynchronous engine\ndef get_tables(sync_conn: Connection) -> list[str]:\n    inspector = inspect(sync_conn)\n    return inspector.get_table_names()\n\n\nasync def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL key-value store client creates tables with a connection string.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:\n        await storage_client.create_kvs_client(\n            name='new-kvs',\n  
          configuration=configuration,\n        )\n\n        async with storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'key_value_stores' in tables\n            assert 'key_value_store_records' in tables\n\n\nasync def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL key-value store client creates tables with a pre-configured engine.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False)\n\n    async with SqlStorageClient(engine=engine) as storage_client:\n        await storage_client.create_kvs_client(\n            name='new-kvs',\n            configuration=configuration,\n        )\n\n        async with engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'key_value_stores' in tables\n            assert 'key_value_store_records' in tables\n\n\nasync def test_tables_and_metadata_record(configuration: Configuration) -> None:\n    \"\"\"Test that SQL key-value store creates proper tables and metadata records.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_kvs_client(\n            name='new-kvs',\n            configuration=configuration,\n        )\n\n        client_metadata = await client.get_metadata()\n\n        async with storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'key_value_stores' in tables\n            assert 'key_value_store_records' in tables\n\n        async with client.get_session() as session:\n            stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new-kvs')\n            result = await session.execute(stmt)\n            orm_metadata = result.scalar_one_or_none()\n            metadata = 
KeyValueStoreMetadata.model_validate(orm_metadata)\n            assert metadata.id == client_metadata.id\n            assert metadata.name == 'new-kvs'\n\n        await client.drop()\n\n\nasync def test_value_record_creation(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that SQL key-value store client can create a record.\"\"\"\n    test_key = 'test-key'\n    test_value = 'Hello, world!'\n    await kvs_client.set_value(key=test_key, value=test_value)\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key)\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is not None\n        assert record.key == test_key\n        assert record.content_type == 'text/plain; charset=utf-8'\n        assert record.size == len(test_value.encode('utf-8'))\n        assert record.value == test_value.encode('utf-8')\n\n\nasync def test_binary_data_persistence(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that binary data is stored correctly without corruption.\"\"\"\n    test_key = 'test-binary'\n    test_value = b'\\x00\\x01\\x02\\x03\\x04'\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key)\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is not None\n        assert record.key == test_key\n        assert record.content_type == 'application/octet-stream'\n        assert record.size == len(test_value)\n        assert record.value == test_value\n\n    verify_record = await kvs_client.get_value(key=test_key)\n    assert verify_record is not None\n    assert verify_record.value == test_value\n    assert verify_record.content_type == 'application/octet-stream'\n\n\nasync def 
test_json_serialization_to_record(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that JSON objects are properly serialized to records.\"\"\"\n    test_key = 'test-json'\n    test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]}\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key)\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is not None\n        assert record.key == test_key\n        assert json.loads(record.value.decode('utf-8')) == test_value\n\n\nasync def test_record_deletion_on_value_delete(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that deleting a value removes its record from the database.\"\"\"\n    test_key = 'test-delete'\n    test_value = 'Delete me'\n\n    # Set a value\n    await kvs_client.set_value(key=test_key, value=test_value)\n\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key)\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is not None\n        assert record.key == test_key\n        assert record.value == test_value.encode('utf-8')\n\n    # Delete the value\n    await kvs_client.delete_value(key=test_key)\n\n    # Verify record was deleted\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == test_key)\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is None\n\n\nasync def test_drop_removes_records(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that drop removes all records from the database.\"\"\"\n    await kvs_client.set_value(key='test', value='test-value')\n\n    
client_metadata = await kvs_client.get_metadata()\n\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test')\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is not None\n\n    # Drop the store\n    await kvs_client.drop()\n\n    async with kvs_client.get_session() as session:\n        stmt = select(KeyValueStoreRecordDb).where(KeyValueStoreRecordDb.key == 'test')\n        result = await session.execute(stmt)\n        record = result.scalar_one_or_none()\n        assert record is None\n        metadata = await session.get(KeyValueStoreMetadataDb, client_metadata.id)\n        assert metadata is None\n\n\nasync def test_metadata_record_updates(kvs_client: SqlKeyValueStoreClient) -> None:\n    \"\"\"Test that read/write operations properly update metadata record timestamps.\"\"\"\n    # Record initial timestamps\n    metadata = await kvs_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await kvs_client.get_value(key='nonexistent')\n\n    # Verify accessed timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await kvs_client.set_value(key='test', value='test-value')\n\n    # Verify modified timestamp was updated\n    metadata = await kvs_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > 
initial_modified\n    assert metadata.accessed_at > accessed_after_read\n\n    async with kvs_client.get_session() as session:\n        orm_metadata = await session.get(KeyValueStoreMetadataDb, metadata.id)\n        assert orm_metadata is not None\n        assert orm_metadata.created_at == metadata.created_at\n        assert orm_metadata.accessed_at == metadata.accessed_at\n        assert orm_metadata.modified_at == metadata.modified_at\n\n\nasync def test_data_persistence_across_reopens(configuration: Configuration) -> None:\n    \"\"\"Test that data persists correctly when reopening the same key-value store.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        original_client = await storage_client.create_kvs_client(\n            name='persistence-test',\n            configuration=configuration,\n        )\n\n        test_key = 'persistent-key'\n        test_value = 'persistent-value'\n        await original_client.set_value(key=test_key, value=test_value)\n\n        kvs_id = (await original_client.get_metadata()).id\n\n        # Reopen by ID and verify data persists\n        reopened_client = await storage_client.create_kvs_client(\n            id=kvs_id,\n            configuration=configuration,\n        )\n\n        record = await reopened_client.get_value(key=test_key)\n        assert record is not None\n        assert record.value == test_value\n\n        await reopened_client.drop()\n"
  },
  {
    "path": "tests/unit/storage_clients/_sql/test_sql_rq_client.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\nfrom sqlalchemy import inspect, select\nfrom sqlalchemy.ext.asyncio import create_async_engine\n\nfrom crawlee import Request\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import SqlStorageClient\nfrom crawlee.storage_clients._sql._db_models import RequestDb, RequestQueueMetadataDb\nfrom crawlee.storage_clients.models import RequestQueueMetadata\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n    from sqlalchemy import Connection\n\n    from crawlee.storage_clients._sql import SqlRequestQueueClient\n\n\n@pytest.fixture\ndef configuration(tmp_path: Path) -> Configuration:\n    \"\"\"Temporary configuration for tests.\"\"\"\n    return Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n\n@pytest.fixture\nasync def rq_client(\n    configuration: Configuration,\n) -> AsyncGenerator[SqlRequestQueueClient, None]:\n    \"\"\"A fixture for a SQL request queue client.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_rq_client(\n            name='test-request-queue',\n            configuration=configuration,\n        )\n        yield client\n        await client.drop()\n\n\n# Helper function that allows you to use inspect with an asynchronous engine\ndef get_tables(sync_conn: Connection) -> list[str]:\n    inspector = inspect(sync_conn)\n    return inspector.get_table_names()\n\n\nasync def test_create_tables_with_connection_string(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL request queue client creates tables with a connection string.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:\n        await storage_client.create_rq_client(\n            
name='test-request-queue',\n            configuration=configuration,\n        )\n\n        async with storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'request_queues' in tables\n            assert 'request_queue_records' in tables\n            assert 'request_queue_state' in tables\n\n\nasync def test_create_tables_with_engine(configuration: Configuration, tmp_path: Path) -> None:\n    \"\"\"Test that SQL request queue client creates tables with a pre-configured engine.\"\"\"\n    storage_dir = tmp_path / 'test_table.db'\n\n    engine = create_async_engine(f'sqlite+aiosqlite:///{storage_dir}', future=True, echo=False)\n\n    async with SqlStorageClient(engine=engine) as storage_client:\n        await storage_client.create_rq_client(\n            name='test-request-queue',\n            configuration=configuration,\n        )\n\n        async with engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'request_queues' in tables\n            assert 'request_queue_records' in tables\n            assert 'request_queue_state' in tables\n\n\nasync def test_tables_and_metadata_record(configuration: Configuration) -> None:\n    \"\"\"Test that SQL request queue creates proper tables and metadata records.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        client = await storage_client.create_rq_client(\n            name='test-request-queue',\n            configuration=configuration,\n        )\n\n        client_metadata = await client.get_metadata()\n\n        async with storage_client.engine.begin() as conn:\n            tables = await conn.run_sync(get_tables)\n            assert 'request_queues' in tables\n            assert 'request_queue_records' in tables\n            assert 'request_queue_state' in tables\n\n        async with client.get_session() as session:\n            stmt = select(RequestQueueMetadataDb).where(RequestQueueMetadataDb.name 
== 'test-request-queue')\n            result = await session.execute(stmt)\n            orm_metadata = result.scalar_one_or_none()\n            metadata = RequestQueueMetadata.model_validate(orm_metadata)\n            assert metadata.id == client_metadata.id\n            assert metadata.name == 'test-request-queue'\n\n        await client.drop()\n\n\nasync def test_request_records_persistence(rq_client: SqlRequestQueueClient) -> None:\n    \"\"\"Test that all added requests are persisted and can be retrieved from the database.\"\"\"\n    requests = [\n        Request.from_url('https://example.com/1'),\n        Request.from_url('https://example.com/2'),\n        Request.from_url('https://example.com/3'),\n    ]\n\n    await rq_client.add_batch_of_requests(requests)\n\n    metadata_client = await rq_client.get_metadata()\n\n    async with rq_client.get_session() as session:\n        stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata_client.id)\n        result = await session.execute(stmt)\n        db_requests = result.scalars().all()\n        assert len(db_requests) == 3\n    for db_request in db_requests:\n        request = json.loads(db_request.data)\n        assert request['url'] in ['https://example.com/1', 'https://example.com/2', 'https://example.com/3']\n\n\nasync def test_drop_removes_records(rq_client: SqlRequestQueueClient) -> None:\n    \"\"\"Test that drop removes all records from the database.\"\"\"\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n    metadata = await rq_client.get_metadata()\n    async with rq_client.get_session() as session:\n        stmt = select(RequestDb).where(RequestDb.request_queue_id == metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 1\n\n    await rq_client.drop()\n\n    async with rq_client.get_session() as session:\n        stmt = select(RequestDb).where(RequestDb.request_queue_id == 
metadata.id)\n        result = await session.execute(stmt)\n        records = result.scalars().all()\n        assert len(records) == 0\n        db_metadata = await session.get(RequestQueueMetadataDb, metadata.id)\n        assert db_metadata is None\n\n\nasync def test_metadata_record_updates(rq_client: SqlRequestQueueClient) -> None:\n    \"\"\"Test that metadata record updates correctly after operations.\"\"\"\n    # Record initial timestamps\n    metadata = await rq_client.get_metadata()\n    initial_created = metadata.created_at\n    initial_accessed = metadata.accessed_at\n    initial_modified = metadata.modified_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a read operation\n    await rq_client.is_empty()\n\n    # Verify accessed timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.accessed_at > initial_accessed\n    assert metadata.modified_at == initial_modified\n\n    accessed_after_read = metadata.accessed_at\n\n    # Wait a moment to ensure timestamps can change\n    await asyncio.sleep(0.01)\n\n    # Perform a write operation\n    await rq_client.add_batch_of_requests([Request.from_url('https://example.com')])\n\n    # Verify modified timestamp was updated\n    metadata = await rq_client.get_metadata()\n    assert metadata.created_at == initial_created\n    assert metadata.modified_at > initial_modified\n    assert metadata.accessed_at > accessed_after_read\n\n    async with rq_client.get_session() as session:\n        orm_metadata = await session.get(RequestQueueMetadataDb, metadata.id)\n        assert orm_metadata is not None\n        assert orm_metadata.created_at == metadata.created_at\n        assert orm_metadata.accessed_at == metadata.accessed_at\n        assert orm_metadata.modified_at == metadata.modified_at\n\n\nasync def test_data_persistence_across_reopens(configuration: Configuration) -> None:\n  
  \"\"\"Test that data persists correctly when reopening the same request queue.\"\"\"\n    async with SqlStorageClient() as storage_client:\n        original_client = await storage_client.create_rq_client(\n            name='persistence-test',\n            configuration=configuration,\n        )\n\n        test_requests = [\n            Request.from_url('https://example.com/1'),\n            Request.from_url('https://example.com/2'),\n        ]\n        await original_client.add_batch_of_requests(test_requests)\n\n        rq_id = (await original_client.get_metadata()).id\n\n        # Reopen by ID and verify data persists\n        reopened_client = await storage_client.create_rq_client(\n            id=rq_id,\n            configuration=configuration,\n        )\n\n        metadata = await reopened_client.get_metadata()\n        assert metadata.total_request_count == 2\n\n        # Fetch requests to verify they're still there\n        request1 = await reopened_client.fetch_next_request()\n        request2 = await reopened_client.fetch_next_request()\n\n        assert request1 is not None\n        assert request2 is not None\n        assert {request1.url, request2.url} == {'https://example.com/1', 'https://example.com/2'}\n\n        await reopened_client.drop()\n"
  },
  {
    "path": "tests/unit/storages/conftest.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.storage_clients import (\n    FileSystemStorageClient,\n    MemoryStorageClient,\n    RedisStorageClient,\n    SqlStorageClient,\n    StorageClient,\n)\n\nif TYPE_CHECKING:\n    from fakeredis import FakeAsyncRedis\n\n\n@pytest.fixture(params=['memory', 'file_system', 'sql', 'redis'])\ndef storage_client(\n    request: pytest.FixtureRequest,\n    redis_client: FakeAsyncRedis,\n) -> StorageClient:\n    \"\"\"Parameterized fixture to test with different storage clients.\"\"\"\n    storage_client: StorageClient\n\n    storage_type = request.param\n\n    if storage_type == 'memory':\n        storage_client = MemoryStorageClient()\n    elif storage_type == 'sql':\n        storage_client = SqlStorageClient()\n    elif storage_type == 'redis':\n        storage_client = RedisStorageClient(redis=redis_client)\n    else:\n        storage_client = FileSystemStorageClient()\n    service_locator.set_storage_client(storage_client)\n    return storage_client\n"
  },
  {
    "path": "tests/unit/storages/test_dataset.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\nfrom crawlee.storages import Dataset, KeyValueStore\nfrom crawlee.storages._storage_instance_manager import StorageInstanceManager\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n    from typing import Any\n\n    from crawlee.storage_clients import StorageClient\n\n\n@pytest.fixture\nasync def dataset(\n    storage_client: StorageClient,\n) -> AsyncGenerator[Dataset, None]:\n    \"\"\"Fixture that provides a dataset instance for each test.\"\"\"\n    dataset = await Dataset.open(\n        storage_client=storage_client,\n    )\n\n    yield dataset\n    await dataset.drop()\n\n\nasync def test_open_creates_new_dataset(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() creates a new dataset with proper metadata.\"\"\"\n    dataset = await Dataset.open(\n        name='new-dataset',\n        storage_client=storage_client,\n    )\n\n    # Verify dataset properties\n    assert dataset.id is not None\n    assert dataset.name == 'new-dataset'\n\n    metadata = await dataset.get_metadata()\n    assert metadata.item_count == 0\n\n    await dataset.drop()\n\n\nasync def test_reopen_default(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test reopening a dataset with default parameters.\"\"\"\n    # Create a first dataset instance with default parameters\n    dataset_1 = await Dataset.open(\n        storage_client=storage_client,\n    )\n\n    # Verify default properties\n    assert dataset_1.id is not None\n    metadata_1 = await dataset_1.get_metadata()\n    assert metadata_1.item_count == 0\n\n    # Add an item\n    await dataset_1.push_data({'key': 'value'})\n    metadata_1 = await 
dataset_1.get_metadata()\n    assert metadata_1.item_count == 1\n\n    # Reopen the same dataset\n    dataset_2 = await Dataset.open(\n        storage_client=storage_client,\n    )\n\n    # Verify both instances reference the same dataset\n    assert dataset_2.id == dataset_1.id\n    assert dataset_2.name == dataset_1.name\n    metadata_1 = await dataset_1.get_metadata()\n    metadata_2 = await dataset_2.get_metadata()\n    assert metadata_2.item_count == metadata_1.item_count == 1\n\n    # Verify they are the same object (cached)\n    assert id(dataset_1) == id(dataset_2)\n\n    # Clean up\n    await dataset_1.drop()\n\n\nasync def test_open_by_id(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening a dataset by its ID.\"\"\"\n    # First create a dataset by name\n    dataset1 = await Dataset.open(\n        name='dataset-by-id-test',\n        storage_client=storage_client,\n    )\n\n    # Add some data to identify it\n    test_item = {'test': 'opening_by_id', 'timestamp': 12345}\n    await dataset1.push_data(test_item)\n\n    # Open the dataset by ID\n    dataset2 = await Dataset.open(\n        id=dataset1.id,\n        storage_client=storage_client,\n    )\n\n    # Verify it's the same dataset\n    assert dataset2.id == dataset1.id\n    assert dataset2.name == 'dataset-by-id-test'\n\n    # Verify the data is still there\n    data = await dataset2.get_data()\n    assert data.count == 1\n    assert data.items[0]['test'] == 'opening_by_id'\n    assert data.items[0]['timestamp'] == 12345\n\n    # Clean up\n    await dataset2.drop()\n\n\nasync def test_open_existing_dataset(\n    dataset: Dataset,\n) -> None:\n    \"\"\"Test that open() loads an existing dataset correctly.\"\"\"\n    # Open the same dataset again\n    reopened_dataset = await Dataset.open(\n        name=dataset.name,\n    )\n\n    # Verify dataset properties\n    assert dataset.id == reopened_dataset.id\n    assert dataset.name == reopened_dataset.name\n    metadata = await 
dataset.get_metadata()\n    reopened_metadata = await reopened_dataset.get_metadata()\n    assert metadata.item_count == reopened_metadata.item_count\n\n    # Verify they are the same object (from cache)\n    assert id(dataset) == id(reopened_dataset)\n\n\nasync def test_open_with_id_and_name(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() raises an error when both id and name are provided.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"name\".',\n    ):\n        await Dataset.open(\n            id='some-id',\n            name='some-name',\n            storage_client=storage_client,\n        )\n\n\nasync def test_push_data_single_item(dataset: Dataset) -> None:\n    \"\"\"Test pushing a single item to the dataset.\"\"\"\n    item = {'key': 'value', 'number': 42}\n    await dataset.push_data(item)\n\n    # Verify item was stored\n    result = await dataset.get_data()\n    assert result.count == 1\n    assert result.items[0] == item\n\n\nasync def test_push_data_multiple_items(dataset: Dataset) -> None:\n    \"\"\"Test pushing multiple items to the dataset.\"\"\"\n    items = [\n        {'id': 1, 'name': 'Item 1'},\n        {'id': 2, 'name': 'Item 2'},\n        {'id': 3, 'name': 'Item 3'},\n    ]\n    await dataset.push_data(items)\n\n    # Verify items were stored\n    result = await dataset.get_data()\n    assert result.count == 3\n    assert result.items == items\n\n\nasync def test_get_data_empty_dataset(dataset: Dataset) -> None:\n    \"\"\"Test getting data from an empty dataset returns empty results.\"\"\"\n    result = await dataset.get_data()\n\n    assert result.count == 0\n    assert result.total == 0\n    assert result.items == []\n\n\nasync def test_get_data_with_pagination(dataset: Dataset) -> None:\n    \"\"\"Test getting data with offset and limit parameters for pagination.\"\"\"\n  
  # Add some items\n    items = [{'id': i} for i in range(1, 11)]  # 10 items\n    await dataset.push_data(items)\n\n    # Test offset\n    result = await dataset.get_data(offset=3)\n    assert result.count == 7\n    assert result.offset == 3\n    assert result.items[0]['id'] == 4\n\n    # Test limit\n    result = await dataset.get_data(limit=5)\n    assert result.count == 5\n    assert result.limit == 5\n    assert result.items[-1]['id'] == 5\n\n    # Test both offset and limit\n    result = await dataset.get_data(offset=2, limit=3)\n    assert result.count == 3\n    assert result.offset == 2\n    assert result.limit == 3\n    assert result.items[0]['id'] == 3\n    assert result.items[-1]['id'] == 5\n\n\nasync def test_get_data_descending_order(dataset: Dataset) -> None:\n    \"\"\"Test getting data in descending order reverses the item order.\"\"\"\n    # Add some items\n    items = [{'id': i} for i in range(1, 6)]  # 5 items\n    await dataset.push_data(items)\n\n    # Get items in descending order\n    result = await dataset.get_data(desc=True)\n\n    assert result.desc is True\n    assert result.items[0]['id'] == 5\n    assert result.items[-1]['id'] == 1\n\n\nasync def test_get_data_skip_empty(dataset: Dataset) -> None:\n    \"\"\"Test getting data with skip_empty option filters out empty items.\"\"\"\n    # Add some items including an empty one\n    items = [\n        {'id': 1, 'name': 'Item 1'},\n        {},  # Empty item\n        {'id': 3, 'name': 'Item 3'},\n    ]\n    await dataset.push_data(items)\n\n    # Get all items\n    result = await dataset.get_data()\n    assert result.count == 3\n\n    # Get non-empty items\n    result = await dataset.get_data(skip_empty=True)\n    assert result.count == 2\n    assert all(item != {} for item in result.items)\n\n\nasync def test_iterate_items(dataset: Dataset) -> None:\n    \"\"\"Test iterating over dataset items yields each item in the correct order.\"\"\"\n    # Add some items\n    items = [{'id': i} for i in 
range(1, 6)]  # 5 items\n    await dataset.push_data(items)\n\n    # Iterate over all items\n    collected_items = [item async for item in dataset.iterate_items()]\n\n    assert len(collected_items) == 5\n    assert collected_items[0]['id'] == 1\n    assert collected_items[-1]['id'] == 5\n\n\nasync def test_iterate_items_with_options(dataset: Dataset) -> None:\n    \"\"\"Test iterating with offset, limit and desc parameters.\"\"\"\n    # Add some items\n    items = [{'id': i} for i in range(1, 11)]  # 10 items\n    await dataset.push_data(items)\n\n    # Test with offset and limit\n    collected_items = [item async for item in dataset.iterate_items(offset=3, limit=3)]\n\n    assert len(collected_items) == 3\n    assert collected_items[0]['id'] == 4\n    assert collected_items[-1]['id'] == 6\n\n    # Test with descending order\n    collected_items = []\n    async for item in dataset.iterate_items(desc=True, limit=3):\n        collected_items.append(item)\n\n    assert len(collected_items) == 3\n    assert collected_items[0]['id'] == 10\n    assert collected_items[-1]['id'] == 8\n\n\nasync def test_list_items(dataset: Dataset) -> None:\n    \"\"\"Test that list_items returns all dataset items as a list.\"\"\"\n    # Add some items\n    items = [{'id': i} for i in range(1, 6)]  # 5 items\n    await dataset.push_data(items)\n\n    # Get all items as a list\n    collected_items = await dataset.list_items()\n\n    assert len(collected_items) == 5\n    assert collected_items[0]['id'] == 1\n    assert collected_items[-1]['id'] == 5\n\n\nasync def test_list_items_with_options(dataset: Dataset) -> None:\n    \"\"\"Test that list_items respects filtering options.\"\"\"\n    # Add some items\n    items: list[dict[str, Any]] = [\n        {'id': 1, 'name': 'Item 1'},\n        {'id': 2, 'name': 'Item 2'},\n        {'id': 3},  # Item with missing 'name' field\n        {},  # Empty item\n        {'id': 5, 'name': 'Item 5'},\n    ]\n    await dataset.push_data(items)\n\n    # Test 
with offset and limit\n    collected_items = await dataset.list_items(offset=1, limit=2)\n    assert len(collected_items) == 2\n    assert collected_items[0]['id'] == 2\n    assert collected_items[1]['id'] == 3\n\n    # Test with descending order - skip empty items to avoid KeyError\n    collected_items = await dataset.list_items(desc=True, skip_empty=True)\n\n    # Filter items that have an 'id' field\n    items_with_ids = [item for item in collected_items if 'id' in item]\n    id_values = [item['id'] for item in items_with_ids]\n\n    # Verify the list is sorted in descending order\n    assert sorted(id_values, reverse=True) == id_values, f'IDs should be in descending order. Got {id_values}'\n\n    # Verify key IDs are present and in the right order\n    if 5 in id_values and 3 in id_values:\n        assert id_values.index(5) < id_values.index(3), 'ID 5 should come before ID 3 in descending order'\n\n    # Test with skip_empty\n    collected_items = await dataset.list_items(skip_empty=True)\n    assert len(collected_items) == 4  # Should skip the empty item\n    assert all(item != {} for item in collected_items)\n\n    # Test with fields - manually filter since 'fields' parameter is not supported\n    # Get all items first\n    collected_items = await dataset.list_items()\n    assert len(collected_items) == 5\n\n    # Manually extract only the 'id' field from each item\n    filtered_items = [{key: item[key] for key in ['id'] if key in item} for item in collected_items]\n\n    # Verify 'name' field is not present in any item\n    assert all('name' not in item for item in filtered_items)\n\n    # Test clean functionality manually instead of using the clean parameter\n    # Get all items\n    collected_items = await dataset.list_items()\n\n    # Manually filter out empty items as 'clean' would do\n    clean_items = [item for item in collected_items if item != {}]\n\n    assert len(clean_items) == 4  # Should have 4 non-empty items\n    assert all(item != {} for item 
in clean_items)\n\n\nasync def test_drop(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test dropping a dataset removes it from cache and clears its data.\"\"\"\n    dataset = await Dataset.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    # Add some data\n    await dataset.push_data({'test': 'data'})\n\n    # Drop the dataset\n    await dataset.drop()\n\n    # Verify dataset is empty (by creating a new one with the same name)\n    new_dataset = await Dataset.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    result = await new_dataset.get_data()\n    assert result.count == 0\n    await new_dataset.drop()\n\n\nasync def test_export_to_json(\n    dataset: Dataset,\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test exporting dataset to JSON format.\"\"\"\n    # Create a key-value store for export\n    kvs = await KeyValueStore.open(\n        name='export-kvs',\n    )\n\n    # Add some items to the dataset\n    items = [\n        {'id': 1, 'name': 'Item 1'},\n        {'id': 2, 'name': 'Item 2'},\n        {'id': 3, 'name': 'Item 3'},\n    ]\n    await dataset.push_data(items)\n\n    # Export to JSON\n    await dataset.export_to(\n        key='dataset_export.json',\n        content_type='json',\n        to_kvs_name='export-kvs',\n        to_kvs_storage_client=storage_client,\n    )\n\n    # Retrieve the exported file\n    record = await kvs.get_value(key='dataset_export.json')\n    assert record is not None\n\n    # Verify content has all the items\n    assert '\"id\": 1' in record\n    assert '\"id\": 2' in record\n    assert '\"id\": 3' in record\n\n    await kvs.drop()\n\n\nasync def test_export_to_csv(\n    dataset: Dataset,\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test exporting dataset to CSV format.\"\"\"\n    # Create a key-value store for export\n    kvs = await KeyValueStore.open(\n        name='export-kvs',\n        
storage_client=storage_client,\n    )\n\n    # Add some items to the dataset\n    items = [\n        {'id': 1, 'name': 'Item 1'},\n        {'id': 2, 'name': 'Item 2'},\n        {'id': 3, 'name': 'Item 3'},\n    ]\n    await dataset.push_data(items)\n\n    # Export to CSV\n    await dataset.export_to(\n        key='dataset_export.csv',\n        content_type='csv',\n        to_kvs_name='export-kvs',\n        to_kvs_storage_client=storage_client,\n    )\n\n    # Retrieve the exported file\n    record = await kvs.get_value(key='dataset_export.csv')\n    assert record is not None\n\n    # Verify content has all the items\n    assert 'id,name' in record\n    assert '1,Item 1' in record\n    assert '2,Item 2' in record\n    assert '3,Item 3' in record\n\n    await kvs.drop()\n\n\nasync def test_export_to_invalid_content_type(dataset: Dataset) -> None:\n    \"\"\"Test exporting dataset with invalid content type raises error.\"\"\"\n    with pytest.raises(ValueError, match=r'Unsupported content type'):\n        await dataset.export_to(key='invalid_export', content_type='invalid')  # ty: ignore[no-matching-overload]\n\n\nasync def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) -> None:\n    \"\"\"Test exporting dataset using many optional arguments together.\"\"\"\n    target_kvs_name = 'some-kvs'\n    target_storage_client = FileSystemStorageClient()\n    export_key = 'exported_dataset'\n    data = {'some key': 'some data'}\n\n    # Prepare custom directory and configuration\n    custom_dir_name = 'some_dir'\n    custom_dir = tmp_path / custom_dir_name\n    custom_dir.mkdir()\n    target_configuration = Configuration(storage_dir=str(custom_dir))\n\n    # Set expected values\n    expected_exported_data = f'{json.dumps([{\"some key\": \"some data\"}])}'\n    expected_kvs_dir = custom_dir / 'key_value_stores' / target_kvs_name\n\n    # Populate dataset and export\n    await dataset.push_data(data)\n    await dataset.export_to(\n        key=export_key,\n     
   content_type='json',\n        to_kvs_name=target_kvs_name,\n        to_kvs_storage_client=target_storage_client,\n        to_kvs_configuration=target_configuration,\n    )\n\n    # Verify the directory was created\n    assert expected_kvs_dir.is_dir()\n    # Verify that kvs contains the exported data\n    kvs = await KeyValueStore.open(\n        name=target_kvs_name, storage_client=target_storage_client, configuration=target_configuration\n    )\n\n    assert await kvs.get_value(key=export_key) == expected_exported_data\n\n\nasync def test_large_dataset(dataset: Dataset) -> None:\n    \"\"\"Test handling a large dataset with many items.\"\"\"\n    items = [{'id': i, 'value': f'value-{i}'} for i in range(100)]\n    await dataset.push_data(items)\n\n    # Test that all items are retrieved\n    result = await dataset.get_data(limit=None)\n    assert result.count == 100\n    assert result.total == 100\n\n    # Test pagination with large datasets\n    result = await dataset.get_data(offset=50, limit=25)\n    assert result.count == 25\n    assert result.offset == 50\n    assert result.items[0]['id'] == 50\n    assert result.items[-1]['id'] == 74\n\n\nasync def test_purge(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test purging a dataset removes all data but keeps the dataset itself.\"\"\"\n    # First create a dataset\n    dataset = await Dataset.open(\n        name='purge-test-dataset',\n        storage_client=storage_client,\n    )\n\n    # Add some data\n    initial_items = [\n        {'id': 1, 'name': 'Item 1'},\n        {'id': 2, 'name': 'Item 2'},\n        {'id': 3, 'name': 'Item 3'},\n    ]\n    await dataset.push_data(initial_items)\n\n    # Verify data was added\n    data = await dataset.get_data()\n    assert data.count == 3\n    assert data.total == 3\n    metadata = await dataset.get_metadata()\n    assert metadata.item_count == 3\n\n    # Record the dataset ID\n    dataset_id = dataset.id\n\n    # Purge the dataset\n    await 
dataset.purge()\n\n    # Verify the dataset still exists but is empty\n    assert dataset.id == dataset_id  # Same ID preserved\n    assert dataset.name == 'purge-test-dataset'  # Same name preserved\n\n    # Dataset should be empty now\n    data = await dataset.get_data()\n    assert data.count == 0\n    assert data.total == 0\n    metadata = await dataset.get_metadata()\n    assert metadata.item_count == 0\n\n    # Verify we can add new data after purging\n    new_item = {'id': 4, 'name': 'New Item After Purge'}\n    await dataset.push_data(new_item)\n\n    data = await dataset.get_data()\n    assert data.count == 1\n    assert data.items[0]['name'] == 'New Item After Purge'\n\n    # Clean up\n    await dataset.drop()\n\n\nasync def test_open_with_alias(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening datasets with alias parameter for NDU functionality.\"\"\"\n    # Create datasets with different aliases\n    dataset_1 = await Dataset.open(\n        alias='test_alias_1',\n        storage_client=storage_client,\n    )\n    dataset_2 = await Dataset.open(\n        alias='test_alias_2',\n        storage_client=storage_client,\n    )\n\n    # Verify they have different IDs but no names (unnamed)\n    assert dataset_1.id != dataset_2.id\n    assert dataset_1.name is None\n    assert dataset_2.name is None\n\n    # Add different data to each\n    await dataset_1.push_data({'source': 'alias_1', 'value': 1})\n    await dataset_2.push_data({'source': 'alias_2', 'value': 2})\n\n    # Verify data isolation\n    data_1 = await dataset_1.get_data()\n    data_2 = await dataset_2.get_data()\n\n    assert data_1.count == 1\n    assert data_2.count == 1\n    assert data_1.items[0]['source'] == 'alias_1'\n    assert data_2.items[0]['source'] == 'alias_2'\n\n    # Clean up\n    await dataset_1.drop()\n    await dataset_2.drop()\n\n\nasync def test_alias_caching(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that datasets with same alias 
return same instance (cached).\"\"\"\n    # Open dataset with alias\n    dataset_1 = await Dataset.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Open again with same alias\n    dataset_2 = await Dataset.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Should be same instance\n    assert dataset_1 is dataset_2\n    assert dataset_1.id == dataset_2.id\n\n    # Clean up\n    await dataset_1.drop()\n\n\nasync def test_alias_with_id_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and id raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"alias\".',\n    ):\n        await Dataset.open(\n            id='some-id',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_name_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and name raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"name\", \"alias\".',\n    ):\n        await Dataset.open(\n            name='some-name',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_all_parameters_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing id, name, and alias raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"name\", \"alias\".',\n    ):\n        await Dataset.open(\n            id='some-id',\n            name='some-name',\n            
alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_special_characters(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test alias functionality with special characters.\"\"\"\n    special_aliases = [\n        'alias-with-dashes',\n        'alias_with_underscores',\n        'alias.with.dots',\n        'alias123with456numbers',\n        'CamelCaseAlias',\n    ]\n\n    datasets = []\n    for alias in special_aliases:\n        dataset = await Dataset.open(\n            alias=alias,\n            storage_client=storage_client,\n        )\n        datasets.append(dataset)\n\n        # Add data with the alias as identifier\n        await dataset.push_data({'alias_used': alias, 'test': 'special_chars'})\n\n    # Verify all work correctly\n    for i, dataset in enumerate(datasets):\n        data = await dataset.get_data()\n        assert data.count == 1\n        assert data.items[0]['alias_used'] == special_aliases[i]\n\n    # Clean up\n    for dataset in datasets:\n        await dataset.drop()\n\n\nasync def test_named_vs_alias_conflict_detection(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that conflicts between named and alias storages are detected.\"\"\"\n    # Test 1: Create named storage first, then try alias with same name\n    named_dataset = await Dataset.open(name='conflict-test', storage_client=storage_client)\n    assert named_dataset.name == 'conflict-test'\n\n    # Try to create alias with same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create alias storage \"conflict-test\".*already exists'):\n        await Dataset.open(alias='conflict-test', storage_client=storage_client)\n\n    # Clean up\n    await named_dataset.drop()\n\n    # Test 2: Create alias first, then try named with same name\n    alias_dataset = await Dataset.open(alias='conflict-test2', storage_client=storage_client)\n    assert alias_dataset.name is None  # Alias storages have 
no name\n\n    # Try to create named with same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create named storage \"conflict-test2\".*already exists'):\n        await Dataset.open(name='conflict-test2', storage_client=storage_client)\n\n    # Clean up\n    await alias_dataset.drop()\n\n\nasync def test_alias_parameter(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test dataset creation and operations with alias parameter.\"\"\"\n    # Create dataset with alias\n    alias_dataset = await Dataset.open(\n        alias='test_alias',\n        storage_client=storage_client,\n    )\n\n    # Verify alias dataset properties\n    assert alias_dataset.id is not None\n    assert alias_dataset.name is None  # Alias storages should be unnamed\n\n    # Test data operations\n    await alias_dataset.push_data({'type': 'alias', 'value': 1})\n    data = await alias_dataset.get_data()\n    assert data.count == 1\n    assert data.items[0]['type'] == 'alias'\n\n    await alias_dataset.drop()\n\n\nasync def test_alias_vs_named_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that alias and named datasets with same identifier are isolated.\"\"\"\n    # Create named dataset\n    named_dataset = await Dataset.open(\n        name='test-identifier',\n        storage_client=storage_client,\n    )\n\n    # Verify named dataset\n    assert named_dataset.name == 'test-identifier'\n    await named_dataset.push_data({'type': 'named'})\n\n    # Clean up named dataset first\n    await named_dataset.drop()\n\n    # Now create alias dataset with same identifier (should work after cleanup)\n    alias_dataset = await Dataset.open(\n        alias='test_identifier',\n        storage_client=storage_client,\n    )\n\n    # Should be different instance\n    assert alias_dataset.name is None\n    await alias_dataset.push_data({'type': 'alias'})\n\n    # Verify alias data\n    alias_data = await alias_dataset.get_data()\n    assert 
alias_data.items[0]['type'] == 'alias'\n\n    await alias_dataset.drop()\n\n\nasync def test_default_vs_alias_default_equivalence(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that default dataset and alias='default' are equivalent.\"\"\"\n    # Open default dataset\n    default_dataset = await Dataset.open(\n        storage_client=storage_client,\n    )\n\n    alias_default_dataset = await Dataset.open(\n        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,\n        storage_client=storage_client,\n    )\n\n    # Should be the same\n    assert default_dataset.id == alias_default_dataset.id\n    assert default_dataset.name is None\n    assert alias_default_dataset.name is None\n\n    # Data should be shared\n    await default_dataset.push_data({'source': 'default'})\n    data = await alias_default_dataset.get_data()\n    assert data.items[0]['source'] == 'default'\n\n    await default_dataset.drop()\n\n\nasync def test_multiple_alias_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that different aliases create separate datasets.\"\"\"\n    datasets = []\n\n    for i in range(3):\n        dataset = await Dataset.open(\n            alias=f'alias_{i}',\n            storage_client=storage_client,\n        )\n        await dataset.push_data({'alias': f'alias_{i}', 'index': i})\n        datasets.append(dataset)\n\n    # All should be different\n    for i in range(3):\n        for j in range(i + 1, 3):\n            assert datasets[i].id != datasets[j].id\n\n    # Verify data isolation\n    for i, dataset in enumerate(datasets):\n        data = await dataset.get_data()\n        assert data.items[0]['alias'] == f'alias_{i}'\n        await dataset.drop()\n\n\nasync def test_purge_on_start_enabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist 
data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=True)\n\n    # First, create all storage types with purge enabled and add data.\n    default_dataset = await Dataset.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_dataset = await Dataset.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_dataset = await Dataset.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_dataset.push_data({'type': 'default', 'data': 'should_be_purged'})\n    await alias_dataset.push_data({'type': 'alias', 'data': 'should_be_purged'})\n    await named_dataset.push_data({'type': 'named', 'data': 'should_persist'})\n\n    # Verify data was added\n    default_data = await default_dataset.get_data()\n    alias_data = await alias_dataset.get_data()\n    named_data = await named_dataset.get_data()\n\n    assert len(default_data.items) == 1\n    assert len(alias_data.items) == 1\n    assert len(named_data.items) == 1\n\n    # Verify that default and alias storages are unnamed\n    default_metadata = await default_dataset.get_metadata()\n    alias_metadata = await alias_dataset.get_metadata()\n    named_metadata = await named_dataset.get_metadata()\n\n    assert default_metadata.name is None\n    assert alias_metadata.name is None\n    assert named_metadata.name == 'purge-test-named'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_dataset_2 = await Dataset.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_dataset_2 = 
await Dataset.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_dataset_2 = await Dataset.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after purge\n    default_data_after = await default_dataset_2.get_data()\n    alias_data_after = await alias_dataset_2.get_data()\n    named_data_after = await named_dataset_2.get_data()\n\n    # Unnamed storages (alias and default) should be purged (data removed)\n    assert len(default_data_after.items) == 0\n    assert len(alias_data_after.items) == 0\n\n    # Named storage should retain data (not purged)\n    assert len(named_data_after.items) == 1\n\n    # Clean up\n    await named_dataset_2.drop()\n    await alias_dataset_2.drop()\n    await default_dataset_2.drop()\n\n\nasync def test_purge_on_start_disabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=False: all storages retain data regardless of type.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=False)\n\n    # First, create all storage types with purge disabled and add data.\n    default_dataset = await Dataset.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_dataset = await Dataset.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_dataset = await Dataset.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_dataset.push_data({'type': 'default', 'data': 
'should_persist'})\n    await alias_dataset.push_data({'type': 'alias', 'data': 'should_persist'})\n    await named_dataset.push_data({'type': 'named', 'data': 'should_persist'})\n\n    # Verify data was added\n    default_data = await default_dataset.get_data()\n    alias_data = await alias_dataset.get_data()\n    named_data = await named_dataset.get_data()\n\n    assert len(default_data.items) == 1\n    assert len(alias_data.items) == 1\n    assert len(named_data.items) == 1\n\n    # Verify that default and alias storages are unnamed\n    default_metadata = await default_dataset.get_metadata()\n    alias_metadata = await alias_dataset.get_metadata()\n    named_metadata = await named_dataset.get_metadata()\n\n    assert default_metadata.name is None\n    assert alias_metadata.name is None\n    assert named_metadata.name == 'purge-test-named'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_dataset_2 = await Dataset.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_dataset_2 = await Dataset.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_dataset_2 = await Dataset.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after purge\n    default_data_after = await default_dataset_2.get_data()\n    alias_data_after = await alias_dataset_2.get_data()\n    named_data_after = await named_dataset_2.get_data()\n\n    # All storages should retain data (not purged)\n    assert len(default_data_after.items) == 1\n    assert len(alias_data_after.items) == 1\n    assert len(named_data_after.items) == 1\n\n    assert default_data_after.items[0]['data'] == 'should_persist'\n    assert alias_data_after.items[0]['data'] == 
'should_persist'\n    assert named_data_after.items[0]['data'] == 'should_persist'\n\n    # Clean up\n    await default_dataset_2.drop()\n    await alias_dataset_2.drop()\n    await named_dataset_2.drop()\n\n\nasync def test_name_default_not_allowed(storage_client: StorageClient) -> None:\n    \"\"\"Test that storage can't have default alias as name, to prevent collisions with unnamed storage alias.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=f'Storage name cannot be \"{StorageInstanceManager._DEFAULT_STORAGE_ALIAS}\" as '\n        f'it is reserved for default alias.',\n    ):\n        await Dataset.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client)\n\n\n@pytest.mark.parametrize(\n    ('name', 'is_valid'),\n    [\n        pytest.param('F', True, id='single-char'),\n        pytest.param('7', True, id='single-digit'),\n        pytest.param('FtghdfseySds', True, id='mixed-case'),\n        pytest.param('125673450', True, id='all-digits'),\n        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),\n        pytest.param('name-with-dashes', True, id='dashes'),\n        pytest.param('1-value', True, id='number start'),\n        pytest.param('value-1', True, id='number end'),\n        pytest.param('test-1-value', True, id='number middle'),\n        pytest.param('test-------value', True, id='multiple-dashes'),\n        pytest.param('test-VALUES-test', True, id='multiple-cases'),\n        pytest.param('name_with_underscores', False, id='underscores'),\n        pytest.param('name with spaces', False, id='spaces'),\n        pytest.param('-test', False, id='dashes start'),\n        pytest.param('test-', False, id='dashes end'),\n    ],\n)\nasync def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:\n    \"\"\"Test name validation logic.\"\"\"\n    if is_valid:\n        # Should not raise\n        dataset = await Dataset.open(name=name, storage_client=storage_client)\n  
      assert dataset.name == name\n        await dataset.drop()\n    else:\n        with pytest.raises(ValueError, match=rf'Invalid storage name \"{name}\".*'):\n            await Dataset.open(name=name, storage_client=storage_client)\n\n\nasync def test_record_with_noascii_chars(dataset: Dataset) -> None:\n    \"\"\"Test handling record with non-ASCII characters.\"\"\"\n    init_value = {\n        'record_1': 'Supermaxi El Jardín',\n        'record_2': 'záznam dva',\n        'record_3': '記録三',\n    }\n\n    # Save the record to the dataset\n    await dataset.push_data(init_value)\n\n    # Get the record and verify\n    value = await dataset.get_data()\n    assert value is not None\n    assert value.items[0] == init_value\n"
  },
  {
    "path": "tests/unit/storages/test_key_value_store.py",
    "content": "from __future__ import annotations\n\nimport json\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, SqlStorageClient, StorageClient\nfrom crawlee.storages import KeyValueStore\nfrom crawlee.storages._storage_instance_manager import StorageInstanceManager\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n    from pathlib import Path\n\n\n@pytest.fixture\nasync def kvs(\n    storage_client: StorageClient,\n) -> AsyncGenerator[KeyValueStore, None]:\n    \"\"\"Fixture that provides a key-value store instance for each test.\"\"\"\n    kvs = await KeyValueStore.open(\n        storage_client=storage_client,\n    )\n\n    yield kvs\n    await kvs.drop()\n\n\nasync def test_open_creates_new_kvs(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() creates a new key-value store with proper metadata.\"\"\"\n    kvs = await KeyValueStore.open(\n        name='new-kvs',\n        storage_client=storage_client,\n    )\n\n    # Verify key-value store properties\n    assert kvs.id is not None\n    assert kvs.name == 'new-kvs'\n\n    await kvs.drop()\n\n\nasync def test_open_existing_kvs(\n    kvs: KeyValueStore,\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() loads an existing key-value store correctly.\"\"\"\n    # Open the same key-value store again\n    reopened_kvs = await KeyValueStore.open(\n        name=kvs.name,\n        storage_client=storage_client,\n    )\n\n    # Verify key-value store properties\n    assert kvs.id == reopened_kvs.id\n    assert kvs.name == reopened_kvs.name\n\n    # Verify they are the same object (from cache)\n    assert id(kvs) == id(reopened_kvs)\n\n\nasync def test_open_with_id_and_name(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() raises an error when both id and 
name are provided.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"name\".',\n    ):\n        await KeyValueStore.open(\n            id='some-id',\n            name='some-name',\n            storage_client=storage_client,\n        )\n\n\nasync def test_open_by_id(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening a key-value store by its ID.\"\"\"\n    # First create a key-value store by name\n    kvs1 = await KeyValueStore.open(\n        name='kvs-by-id-test',\n        storage_client=storage_client,\n    )\n\n    # Add some data to identify it\n    await kvs1.set_value('test_key', {'test': 'opening_by_id', 'timestamp': 12345})\n\n    # Open the key-value store by ID\n    kvs2 = await KeyValueStore.open(\n        id=kvs1.id,\n        storage_client=storage_client,\n    )\n\n    # Verify it's the same key-value store\n    assert kvs2.id == kvs1.id\n    assert kvs2.name == 'kvs-by-id-test'\n\n    # Verify the data is still there\n    value = await kvs2.get_value('test_key')\n    assert value is not None\n    assert value['test'] == 'opening_by_id'\n    assert value['timestamp'] == 12345\n\n    # Clean up\n    await kvs2.drop()\n\n\nasync def test_set_get_value(kvs: KeyValueStore) -> None:\n    \"\"\"Test setting and getting a value from the key-value store.\"\"\"\n    # Set a value\n    test_key = 'test-key'\n    test_value = {'data': 'value', 'number': 42}\n    await kvs.set_value(test_key, test_value)\n\n    # Get the value\n    result = await kvs.get_value(test_key)\n    assert result == test_value\n\n\nasync def test_set_get_none(kvs: KeyValueStore) -> None:\n    \"\"\"Test setting and getting None as a value.\"\"\"\n    test_key = 'none-key'\n    await kvs.set_value(test_key, None)\n    result = await kvs.get_value(test_key)\n    assert result is None\n\n\nasync def test_get_value_nonexistent(kvs: 
KeyValueStore) -> None:\n    \"\"\"Test getting a nonexistent value returns None.\"\"\"\n    result = await kvs.get_value('nonexistent-key')\n    assert result is None\n\n\nasync def test_get_value_with_default(kvs: KeyValueStore) -> None:\n    \"\"\"Test getting a nonexistent value with a default value.\"\"\"\n    default_value = {'default': True}\n    result = await kvs.get_value('nonexistent-key', default_value=default_value)\n    assert result == default_value\n\n\nasync def test_set_value_with_content_type(kvs: KeyValueStore) -> None:\n    \"\"\"Test setting a value with a specific content type.\"\"\"\n    test_key = 'test-json'\n    test_value = {'data': 'value', 'items': [1, 2, 3]}\n    await kvs.set_value(test_key, test_value, content_type='application/json')\n\n    # Verify the value is retrievable\n    result = await kvs.get_value(test_key)\n    assert result == test_value\n\n\nasync def test_delete_value(kvs: KeyValueStore) -> None:\n    \"\"\"Test deleting a value from the key-value store.\"\"\"\n    # Set a value first\n    test_key = 'delete-me'\n    test_value = 'value to delete'\n    await kvs.set_value(test_key, test_value)\n\n    # Verify value exists\n    assert await kvs.get_value(test_key) == test_value\n\n    # Delete the value\n    await kvs.delete_value(test_key)\n\n    # Verify value is gone\n    assert await kvs.get_value(test_key) is None\n\n\nasync def test_list_keys_empty_kvs(kvs: KeyValueStore) -> None:\n    \"\"\"Test listing keys from an empty key-value store.\"\"\"\n    keys = await kvs.list_keys()\n    assert len(keys) == 0\n\n\nasync def test_list_keys(kvs: KeyValueStore) -> None:\n    \"\"\"Test listing keys from a key-value store with items.\"\"\"\n    # Add some items\n    await kvs.set_value('key1', 'value1')\n    await kvs.set_value('key2', 'value2')\n    await kvs.set_value('key3', 'value3')\n\n    # List keys\n    keys = await kvs.list_keys()\n\n    # Verify keys\n    assert len(keys) == 3\n    key_names = [k.key for k in 
keys]\n    assert 'key1' in key_names\n    assert 'key2' in key_names\n    assert 'key3' in key_names\n\n\nasync def test_list_keys_with_limit(kvs: KeyValueStore) -> None:\n    \"\"\"Test listing keys with a limit parameter.\"\"\"\n    # Add some items\n    for i in range(10):\n        await kvs.set_value(f'key{i}', f'value{i}')\n\n    # List with limit\n    keys = await kvs.list_keys(limit=5)\n    assert len(keys) == 5\n\n\nasync def test_list_keys_with_exclusive_start_key(kvs: KeyValueStore) -> None:\n    \"\"\"Test listing keys with an exclusive start key.\"\"\"\n    # Add some items in a known order\n    await kvs.set_value('key1', 'value1')\n    await kvs.set_value('key2', 'value2')\n    await kvs.set_value('key3', 'value3')\n    await kvs.set_value('key4', 'value4')\n    await kvs.set_value('key5', 'value5')\n\n    # Get all keys first to determine their order\n    all_keys = await kvs.list_keys()\n    all_key_names = [k.key for k in all_keys]\n\n    if len(all_key_names) >= 3:\n        # Start from the second key\n        start_key = all_key_names[1]\n        keys = await kvs.list_keys(exclusive_start_key=start_key)\n\n        # We should get all keys after the start key\n        expected_count = len(all_key_names) - all_key_names.index(start_key) - 1\n        assert len(keys) == expected_count\n\n        # First key should be the one after start_key\n        first_returned_key = keys[0].key\n        assert first_returned_key != start_key\n        assert all_key_names.index(first_returned_key) > all_key_names.index(start_key)\n\n\nasync def test_iterate_keys(kvs: KeyValueStore) -> None:\n    \"\"\"Test iterating over keys in the key-value store.\"\"\"\n    # Add some items\n    await kvs.set_value('key1', 'value1')\n    await kvs.set_value('key2', 'value2')\n    await kvs.set_value('key3', 'value3')\n\n    collected_keys = [key async for key in kvs.iterate_keys()]\n\n    # Verify iteration result\n    assert len(collected_keys) == 3\n    key_names = [k.key 
for k in collected_keys]\n    assert 'key1' in key_names\n    assert 'key2' in key_names\n    assert 'key3' in key_names\n\n\nasync def test_iterate_keys_with_limit(kvs: KeyValueStore) -> None:\n    \"\"\"Test iterating over keys with a limit parameter.\"\"\"\n    # Add some items\n    for i in range(10):\n        await kvs.set_value(f'key{i}', f'value{i}')\n\n    collected_keys = [key async for key in kvs.iterate_keys(limit=5)]\n\n    # Verify iteration result\n    assert len(collected_keys) == 5\n\n\nasync def test_drop(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test dropping a key-value store removes it from cache and clears its data.\"\"\"\n    kvs = await KeyValueStore.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    # Add some data\n    await kvs.set_value('test', 'data')\n\n    # Drop the key-value store\n    await kvs.drop()\n\n    # Verify key-value store is empty (by creating a new one with the same name)\n    new_kvs = await KeyValueStore.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    # Attempt to get a previously stored value\n    result = await new_kvs.get_value('test')\n    assert result is None\n    await new_kvs.drop()\n\n\nasync def test_reopen_default(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test reopening the default key-value store.\"\"\"\n    # Open the default key-value store\n    kvs1 = await KeyValueStore.open(\n        storage_client=storage_client,\n    )\n\n    # Set a value\n    await kvs1.set_value('test_key', 'test_value')\n\n    # Open the default key-value store again\n    kvs2 = await KeyValueStore.open(\n        storage_client=storage_client,\n    )\n\n    # Verify they are the same store\n    assert kvs1.id == kvs2.id\n    assert kvs1.name == kvs2.name\n\n    # Verify the value is accessible\n    value1 = await kvs1.get_value('test_key')\n    value2 = await kvs2.get_value('test_key')\n    assert value1 == value2 == 
'test_value'\n\n    # Verify they are the same object\n    assert id(kvs1) == id(kvs2)\n\n\nasync def test_complex_data_types(kvs: KeyValueStore) -> None:\n    \"\"\"Test storing and retrieving complex data types.\"\"\"\n    # Test nested dictionaries\n    nested_dict = {\n        'level1': {\n            'level2': {\n                'level3': 'deep value',\n                'numbers': [1, 2, 3],\n            },\n        },\n        'array': [{'a': 1}, {'b': 2}],\n    }\n    await kvs.set_value('nested', nested_dict)\n    result = await kvs.get_value('nested')\n    assert result == nested_dict\n\n    # Test lists\n    test_list = [1, 'string', True, None, {'key': 'value'}]\n    await kvs.set_value('list', test_list)\n    result = await kvs.get_value('list')\n    assert result == test_list\n\n\nasync def test_string_data(kvs: KeyValueStore) -> None:\n    \"\"\"Test storing and retrieving string data.\"\"\"\n    # Plain string\n    await kvs.set_value('string', 'simple string')\n    result = await kvs.get_value('string')\n    assert result == 'simple string'\n\n    # JSON string\n    json_string = json.dumps({'key': 'value'})\n    await kvs.set_value('json_string', json_string)\n    result = await kvs.get_value('json_string')\n    assert result == json_string\n\n\nasync def test_key_with_special_characters(kvs: KeyValueStore) -> None:\n    \"\"\"Test storing and retrieving values with keys containing special characters.\"\"\"\n    # Key with spaces, slashes, and special characters\n    special_key = 'key with spaces/and/slashes!@#$%^&*()'\n    test_value = 'Special key value'\n\n    # Store the value with the special key\n    await kvs.set_value(key=special_key, value=test_value)\n\n    # Retrieve the value and verify it matches\n    result = await kvs.get_value(key=special_key)\n    assert result is not None\n    assert result == test_value\n\n    # Make sure the key is properly listed\n    keys = await kvs.list_keys()\n    key_names = [k.key for k in keys]\n    
assert special_key in key_names\n\n    # Test key deletion\n    await kvs.delete_value(key=special_key)\n    assert await kvs.get_value(key=special_key) is None\n\n\nasync def test_data_persistence_on_reopen() -> None:\n    \"\"\"Test that data persists when reopening a KeyValueStore.\"\"\"\n    kvs1 = await KeyValueStore.open()\n\n    await kvs1.set_value('key_123', 'value_123')\n\n    result1 = await kvs1.get_value('key_123')\n    assert result1 == 'value_123'\n\n    kvs2 = await KeyValueStore.open()\n\n    result2 = await kvs2.get_value('key_123')\n    assert result2 == 'value_123'\n    assert await kvs1.list_keys() == await kvs2.list_keys()\n\n    await kvs2.set_value('key_456', 'value_456')\n\n    result1 = await kvs1.get_value('key_456')\n    assert result1 == 'value_456'\n\n\nasync def test_purge(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test purging a key-value store removes all values but keeps the store itself.\"\"\"\n    # First create a key-value store\n    kvs = await KeyValueStore.open(\n        name='purge-test-kvs',\n        storage_client=storage_client,\n    )\n\n    # Add some values\n    await kvs.set_value('key1', 'value1')\n    await kvs.set_value('key2', 'value2')\n    await kvs.set_value('key3', {'complex': 'value', 'number': 42})\n\n    # Verify values were added\n    keys = await kvs.list_keys()\n    assert len(keys) == 3\n\n    # Record the store ID\n    kvs_id = kvs.id\n\n    # Purge the key-value store\n    await kvs.purge()\n\n    # Verify the store still exists but is empty\n    assert kvs.id == kvs_id  # Same ID preserved\n    assert kvs.name == 'purge-test-kvs'  # Same name preserved\n\n    # Store should be empty now\n    keys = await kvs.list_keys()\n    assert len(keys) == 0\n\n    # Values should no longer be accessible\n    assert await kvs.get_value('key1') is None\n    assert await kvs.get_value('key2') is None\n    assert await kvs.get_value('key3') is None\n\n    # Verify we can add new values after 
purging\n    await kvs.set_value('new_key', 'new value after purge')\n\n    value = await kvs.get_value('new_key')\n    assert value == 'new value after purge'\n\n    # Clean up\n    await kvs.drop()\n\n\nasync def test_record_exists_nonexistent(kvs: KeyValueStore) -> None:\n    \"\"\"Test that record_exists returns False for a nonexistent key.\"\"\"\n    result = await kvs.record_exists('nonexistent-key')\n    assert result is False\n\n\nasync def test_record_exists_after_set(kvs: KeyValueStore) -> None:\n    \"\"\"Test that record_exists returns True after setting a value.\"\"\"\n    test_key = 'exists-key'\n    test_value = {'data': 'test'}\n\n    # Initially should not exist\n    assert await kvs.record_exists(test_key) is False\n\n    # Set the value\n    await kvs.set_value(test_key, test_value)\n\n    # Now should exist\n    assert await kvs.record_exists(test_key) is True\n\n\nasync def test_record_exists_after_delete(kvs: KeyValueStore) -> None:\n    \"\"\"Test that record_exists returns False after deleting a value.\"\"\"\n    test_key = 'exists-then-delete-key'\n    test_value = 'will be deleted'\n\n    # Set a value\n    await kvs.set_value(test_key, test_value)\n    assert await kvs.record_exists(test_key) is True\n\n    # Delete the value\n    await kvs.delete_value(test_key)\n\n    # Should no longer exist\n    assert await kvs.record_exists(test_key) is False\n\n\nasync def test_record_exists_with_none_value(kvs: KeyValueStore) -> None:\n    \"\"\"Test that record_exists returns True even when value is None.\"\"\"\n    test_key = 'none-value-key'\n\n    # Set None as value\n    await kvs.set_value(test_key, None)\n\n    # Should still exist even though value is None\n    assert await kvs.record_exists(test_key) is True\n\n    # Verify we can distinguish between None value and nonexistent key\n    assert await kvs.get_value(test_key) is None\n    assert await kvs.record_exists(test_key) is True\n    assert await kvs.record_exists('truly-nonexistent') 
is False\n\n\nasync def test_record_exists_different_content_types(kvs: KeyValueStore) -> None:\n    \"\"\"Test record_exists with different content types.\"\"\"\n    test_cases = [\n        ('json-key', {'data': 'json'}, 'application/json'),\n        ('text-key', 'plain text', 'text/plain'),\n        ('binary-key', b'binary data', 'application/octet-stream'),\n    ]\n\n    for key, value, content_type in test_cases:\n        # Set value with specific content type\n        await kvs.set_value(key, value, content_type=content_type)\n\n        # Should exist regardless of content type\n        assert await kvs.record_exists(key) is True\n\n\nasync def test_record_exists_multiple_keys(kvs: KeyValueStore) -> None:\n    \"\"\"Test record_exists with multiple keys and batch operations.\"\"\"\n    keys_and_values = [\n        ('key1', 'value1'),\n        ('key2', {'nested': 'object'}),\n        ('key3', [1, 2, 3]),\n        ('key4', None),\n    ]\n\n    # Initially, none should exist\n    for key, _ in keys_and_values:\n        assert await kvs.record_exists(key) is False\n\n    # Set all values\n    for key, value in keys_and_values:\n        await kvs.set_value(key, value)\n\n    # All should exist now\n    for key, _ in keys_and_values:\n        assert await kvs.record_exists(key) is True\n\n    # Test some non-existent keys\n    assert await kvs.record_exists('nonexistent1') is False\n    assert await kvs.record_exists('nonexistent2') is False\n\n\nasync def test_record_exists_after_purge(kvs: KeyValueStore) -> None:\n    \"\"\"Test that record_exists returns False after purging the store.\"\"\"\n    # Set some values\n    await kvs.set_value('key1', 'value1')\n    await kvs.set_value('key2', 'value2')\n\n    # Verify they exist\n    assert await kvs.record_exists('key1') is True\n    assert await kvs.record_exists('key2') is True\n\n    # Purge the store\n    await kvs.purge()\n\n    # Should no longer exist\n    assert await kvs.record_exists('key1') is False\n    
assert await kvs.record_exists('key2') is False\n\n\nasync def test_open_with_alias(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening key-value stores with alias parameter for NDU functionality.\"\"\"\n    # Create key-value stores with different aliases\n    kvs_1 = await KeyValueStore.open(\n        alias='test_alias_1',\n        storage_client=storage_client,\n    )\n    kvs_2 = await KeyValueStore.open(\n        alias='test_alias_2',\n        storage_client=storage_client,\n    )\n\n    # Verify they have different IDs but no names (unnamed)\n    assert kvs_1.id != kvs_2.id\n    assert kvs_1.name is None\n    assert kvs_2.name is None\n\n    # Add different data to each\n    await kvs_1.set_value('source', 'alias_1')\n    await kvs_2.set_value('source', 'alias_2')\n\n    # Verify data isolation\n    value_1 = await kvs_1.get_value('source')\n    value_2 = await kvs_2.get_value('source')\n\n    assert value_1 == 'alias_1'\n    assert value_2 == 'alias_2'\n\n    # Clean up\n    await kvs_1.drop()\n    await kvs_2.drop()\n\n\nasync def test_alias_caching(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that key-value stores with same alias return same instance (cached).\"\"\"\n    # Open kvs with alias\n    kvs_1 = await KeyValueStore.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Open again with same alias\n    kvs_2 = await KeyValueStore.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Should be same instance\n    assert kvs_1 is kvs_2\n    assert kvs_1.id == kvs_2.id\n\n    # Clean up\n    await kvs_1.drop()\n\n\nasync def test_alias_with_id_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and id raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: 
\"id\", \"alias\".',\n    ):\n        await KeyValueStore.open(\n            id='some-id',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_name_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and name raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"name\", \"alias\".',\n    ):\n        await KeyValueStore.open(\n            name='some-name',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_special_characters(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test alias functionality with special characters.\"\"\"\n    special_aliases = [\n        'alias-with-dashes',\n        'alias_with_underscores',\n        'alias.with.dots',\n        'alias123with456numbers',\n        'CamelCaseAlias',\n    ]\n\n    stores = []\n    for alias in special_aliases:\n        kvs = await KeyValueStore.open(\n            alias=alias,\n            storage_client=storage_client,\n        )\n        stores.append(kvs)\n\n        # Add data with the alias as identifier\n        await kvs.set_value('alias_used', alias)\n        await kvs.set_value('test', 'special_chars')\n\n    # Verify all work correctly\n    for i, kvs in enumerate(stores):\n        assert await kvs.get_value('alias_used') == special_aliases[i]\n        assert await kvs.get_value('test') == 'special_chars'\n\n    # Clean up\n    for kvs in stores:\n        await kvs.drop()\n\n\nasync def test_alias_key_operations(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that key operations work correctly with alias stores.\"\"\"\n    kvs = await KeyValueStore.open(\n        alias='key_ops_test',\n        storage_client=storage_client,\n    )\n\n    # Test setting multiple 
keys\n    test_data = {\n        'key1': {'data': 'value1', 'number': 1},\n        'key2': 'simple string value',\n        'key3': [1, 2, 3, 4, 5],\n        'key4': None,\n    }\n\n    for key, value in test_data.items():\n        await kvs.set_value(key, value)\n\n    # Test getting all keys\n    keys = await kvs.list_keys()\n    key_names = [k.key for k in keys]\n    assert len(keys) == 4\n    for key in test_data:\n        assert key in key_names\n\n    # Test record_exists\n    for key in test_data:\n        assert await kvs.record_exists(key) is True\n    assert await kvs.record_exists('nonexistent') is False\n\n    # Test iteration\n    collected_keys = [key async for key in kvs.iterate_keys()]\n    assert len(collected_keys) == 4\n\n    # Test deletion\n    await kvs.delete_value('key2')\n    assert await kvs.record_exists('key2') is False\n    assert await kvs.get_value('key2') is None\n\n    # Verify other keys still exist\n    remaining_keys = await kvs.list_keys()\n    assert len(remaining_keys) == 3\n\n    # Clean up\n    await kvs.drop()\n\n\nasync def test_named_vs_alias_conflict_detection(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that conflicts between named and alias storages are detected.\"\"\"\n    # Test 1: Create named storage first, then try alias with same name\n    named_kvs = await KeyValueStore.open(name='conflict-test', storage_client=storage_client)\n    assert named_kvs.name == 'conflict-test'\n\n    # Try to create alias with same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create alias storage \"conflict-test\".*already exists'):\n        await KeyValueStore.open(alias='conflict-test', storage_client=storage_client)\n\n    # Clean up\n    await named_kvs.drop()\n\n    # Test 2: Create alias first, then try named with same name\n    alias_kvs = await KeyValueStore.open(alias='conflict-test2', storage_client=storage_client)\n    assert alias_kvs.name is None  # Alias storages have 
no name\n\n    # Try to create named with same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create named storage \"conflict-test2\".*already exists'):\n        await KeyValueStore.open(name='conflict-test2', storage_client=storage_client)\n\n    # Clean up\n    await alias_kvs.drop()\n\n    # Test 3: Different names should work fine\n    named_kvs_ok = await KeyValueStore.open(name='different-name', storage_client=storage_client)\n    alias_kvs_ok = await KeyValueStore.open(alias='different-alias', storage_client=storage_client)\n\n    assert named_kvs_ok.name == 'different-name'\n    assert alias_kvs_ok.name is None\n\n    # Clean up\n    await named_kvs_ok.drop()\n    await alias_kvs_ok.drop()\n\n\nasync def test_alias_parameter(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test key-value store creation and operations with alias parameter.\"\"\"\n    # Create kvs with alias\n    alias_kvs = await KeyValueStore.open(\n        alias='test_alias',\n        storage_client=storage_client,\n    )\n\n    # Verify alias kvs properties\n    assert alias_kvs.id is not None\n    assert alias_kvs.name is None  # Alias storages should be unnamed\n\n    # Test data operations\n    await alias_kvs.set_value('test_key', {'type': 'alias', 'value': 1})\n    value = await alias_kvs.get_value('test_key')\n    assert value['type'] == 'alias'\n\n    await alias_kvs.drop()\n\n\nasync def test_alias_vs_named_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that alias and named key-value stores with same identifier are isolated.\"\"\"\n    # Create named kvs\n    named_kvs = await KeyValueStore.open(\n        name='test-identifier',\n        storage_client=storage_client,\n    )\n\n    # Verify named kvs\n    assert named_kvs.name == 'test-identifier'\n    await named_kvs.set_value('type', 'named')\n\n    # Clean up named kvs first\n    await named_kvs.drop()\n\n    # Now create alias kvs with same identifier (should 
work after cleanup)\n    alias_kvs = await KeyValueStore.open(\n        alias='test_identifier',\n        storage_client=storage_client,\n    )\n\n    # Should be different instance\n    assert alias_kvs.name is None\n    await alias_kvs.set_value('type', 'alias')\n\n    # Verify alias data\n    alias_value = await alias_kvs.get_value('type')\n    assert alias_value == 'alias'\n\n    await alias_kvs.drop()\n\n\nasync def test_default_vs_alias_default_equivalence(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that default key-value store and alias='default' are equivalent.\"\"\"\n    # Open default kvs\n    default_kvs = await KeyValueStore.open(\n        storage_client=storage_client,\n    )\n\n    alias_default_kvs = await KeyValueStore.open(\n        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,\n        storage_client=storage_client,\n    )\n\n    # Should be the same\n    assert default_kvs.id == alias_default_kvs.id\n    assert default_kvs.name is None\n    assert alias_default_kvs.name is None\n\n    # Data should be shared\n    await default_kvs.set_value('source', 'default')\n    value = await alias_default_kvs.get_value('source')\n    assert value == 'default'\n\n    await default_kvs.drop()\n\n\nasync def test_multiple_alias_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that different aliases create separate key-value stores.\"\"\"\n    kvs_stores = []\n\n    for i in range(3):\n        kvs = await KeyValueStore.open(\n            alias=f'alias_{i}',\n            storage_client=storage_client,\n        )\n        await kvs.set_value('alias', f'alias_{i}')\n        await kvs.set_value('index', i)\n        kvs_stores.append(kvs)\n\n    # All should be different\n    for i in range(3):\n        for j in range(i + 1, 3):\n            assert kvs_stores[i].id != kvs_stores[j].id\n\n    # Verify data isolation\n    for i, kvs in enumerate(kvs_stores):\n        alias_value = await kvs.get_value('alias')\n      
  index_value = await kvs.get_value('index')\n        assert alias_value == f'alias_{i}'\n        # For memory storage, value is preserved as int; for filesystem it's converted to string\n        assert index_value == i or index_value == str(i)\n        await kvs.drop()\n\n\nasync def test_purge_on_start_enabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=True)\n\n    # First, create all storage types with purge enabled and add data.\n    default_kvs = await KeyValueStore.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_kvs = await KeyValueStore.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_kvs = await KeyValueStore.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_kvs.set_value(key='data', value='should_be_purged')\n    await alias_kvs.set_value(key='data', value='should_be_purged')\n    await named_kvs.set_value(key='data', value='should_persist')\n\n    # Verify data was added\n    default_data = await default_kvs.get_value(key='data')\n    alias_data = await alias_kvs.get_value(key='data')\n    named_data = await named_kvs.get_value(key='data')\n\n    assert default_data == 'should_be_purged'\n    assert alias_data == 'should_be_purged'\n    assert named_data == 'should_persist'\n\n    # Verify that default and alias storages are unnamed\n    default_metadata = await default_kvs.get_metadata()\n    alias_metadata = 
await alias_kvs.get_metadata()\n    named_metadata = await named_kvs.get_metadata()\n\n    assert default_metadata.name is None\n    assert alias_metadata.name is None\n    assert named_metadata.name == 'purge-test-named'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_kvs_2 = await KeyValueStore.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_kvs_2 = await KeyValueStore.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_kvs_2 = await KeyValueStore.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after purge\n    default_data_after = await default_kvs_2.get_value(key='data')\n    alias_data_after = await alias_kvs_2.get_value(key='data')\n    named_data_after = await named_kvs_2.get_value(key='data')\n\n    # Unnamed storages (alias and default) should be purged (data removed)\n    assert default_data_after is None\n    assert alias_data_after is None\n\n    # Named storage should retain data (not purged)\n    assert named_data_after == 'should_persist'\n\n    # Clean up\n    await named_kvs_2.drop()\n    await alias_kvs_2.drop()\n    await default_kvs_2.drop()\n\n\nasync def test_purge_on_start_disabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=False: all storages retain data regardless of type.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=False)\n\n    # First, create all storage types with purge disabled 
and add data.\n    default_kvs = await KeyValueStore.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_kvs = await KeyValueStore.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_kvs = await KeyValueStore.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_kvs.set_value('data', 'should_persist')\n    await alias_kvs.set_value('data', 'should_persist')\n    await named_kvs.set_value('data', 'should_persist')\n\n    # Verify data was added\n    default_data = await default_kvs.get_value('data')\n    alias_data = await alias_kvs.get_value('data')\n    named_data = await named_kvs.get_value('data')\n\n    assert default_data == 'should_persist'\n    assert alias_data == 'should_persist'\n    assert named_data == 'should_persist'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_kvs_2 = await KeyValueStore.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_kvs_2 = await KeyValueStore.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_kvs_2 = await KeyValueStore.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after reopen\n    default_data_after = await default_kvs_2.get_value('data')\n    alias_data_after = await alias_kvs_2.get_value('data')\n    named_data_after = await named_kvs_2.get_value('data')\n\n    # All storages should retain data when purge is disabled\n    assert default_data_after == 'should_persist'\n    assert alias_data_after == 'should_persist'\n    assert 
named_data_after == 'should_persist'\n\n    # Clean up\n    await named_kvs_2.drop()\n    await alias_kvs_2.drop()\n    await default_kvs_2.drop()\n\n\nasync def test_name_default_not_allowed(storage_client: StorageClient) -> None:\n    \"\"\"Test that storage can't have default alias as name, to prevent collisions with unnamed storage alias.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=f'Storage name cannot be \"{StorageInstanceManager._DEFAULT_STORAGE_ALIAS}\" as '\n        f'it is reserved for default alias.',\n    ):\n        await KeyValueStore.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client)\n\n\n@pytest.mark.parametrize(\n    ('name', 'is_valid'),\n    [\n        pytest.param('F', True, id='single-char'),\n        pytest.param('7', True, id='single-digit'),\n        pytest.param('FtghdfseySds', True, id='mixed-case'),\n        pytest.param('125673450', True, id='all-digits'),\n        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),\n        pytest.param('name-with-dashes', True, id='dashes'),\n        pytest.param('1-value', True, id='number start'),\n        pytest.param('value-1', True, id='number end'),\n        pytest.param('test-1-value', True, id='number middle'),\n        pytest.param('test-------value', True, id='multiple-dashes'),\n        pytest.param('test-VALUES-test', True, id='multiple-cases'),\n        pytest.param('name_with_underscores', False, id='underscores'),\n        pytest.param('name with spaces', False, id='spaces'),\n        pytest.param('-test', False, id='dashes start'),\n        pytest.param('test-', False, id='dashes end'),\n    ],\n)\nasync def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:\n    \"\"\"Test name validation logic.\"\"\"\n    if is_valid:\n        # Should not raise\n        dataset = await KeyValueStore.open(name=name, storage_client=storage_client)\n        assert dataset.name == name\n        
await dataset.drop()\n    else:\n        with pytest.raises(ValueError, match=rf'Invalid storage name \"{name}\".*'):\n            await KeyValueStore.open(name=name, storage_client=storage_client)\n\n\n@pytest.mark.parametrize(\n    'tested_storage_client_class',\n    [\n        pytest.param(MemoryStorageClient, id='tested=MemoryStorageClient'),\n        pytest.param(FileSystemStorageClient, id='tested=FileSystemStorageClient'),\n        pytest.param(SqlStorageClient, id='tested=SqlStorageClient'),\n    ],\n)\n@pytest.mark.parametrize(\n    'global_storage_client_class',\n    [\n        pytest.param(MemoryStorageClient, id='global=MemoryStorageClient'),\n        pytest.param(FileSystemStorageClient, id='global=FileSystemStorageClient'),\n        pytest.param(SqlStorageClient, id='global=SqlStorageClient'),\n    ],\n)\nasync def test_get_auto_saved_value_various_global_clients(\n    tmp_path: Path, tested_storage_client_class: type[StorageClient], global_storage_client_class: type[StorageClient]\n) -> None:\n    \"\"\"Ensure that persistence is working for all clients regardless of what is set in service locator.\"\"\"\n    tested_storage_client = tested_storage_client_class()\n    global_storage_client = global_storage_client_class()\n\n    service_locator.set_configuration(\n        Configuration(\n            storage_dir=str(tmp_path),\n            purge_on_start=True,\n        )\n    )\n    service_locator.set_storage_client(global_storage_client)\n\n    kvs = await KeyValueStore.open(storage_client=tested_storage_client)\n    values_kvs = {'key': 'some_value'}\n    test_key = 'test_key'\n\n    autosaved_value_kvs = await kvs.get_auto_saved_value(test_key)\n    assert autosaved_value_kvs == {}\n    autosaved_value_kvs.update(values_kvs)\n    await kvs.persist_autosaved_values()\n\n    assert await kvs.get_value(test_key) == autosaved_value_kvs\n\n\nasync def test_record_with_noascii_chars(kvs: KeyValueStore) -> None:\n    \"\"\"Test storing and retrieving a 
record with non-ASCII characters.\"\"\"\n    init_value = {\n        'record_1': 'Supermaxi El Jardín',\n        'record_2': 'záznam dva',\n        'record_3': '記録三',\n    }\n    key = 'non_ascii_key'\n\n    # Save the record in the key-value store\n    await kvs.set_value(key, init_value)\n\n    # Get the record and verify\n    value = await kvs.get_value(key)\n    assert value is not None\n    assert value == init_value\n"
  },
  {
    "path": "tests/unit/storages/test_request_manager_tandem.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass\nfrom unittest.mock import create_autospec\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee.request_loaders import RequestLoader, RequestManagerTandem\nfrom crawlee.storages import RequestQueue\n\n\n@dataclass\nclass TestInput:\n    __test__ = False\n\n    request_loader_items: list[str | Request | None]\n    request_manager_items: list[str | Request]\n    discovered_items: list[Request]\n    expected_result: set[str]\n\n\n@pytest.mark.parametrize(\n    argnames='test_input',\n    argvalues=[\n        pytest.param(\n            TestInput(\n                request_loader_items=['https://a.placeholder.com', 'https://b.placeholder.com'],\n                request_manager_items=[],\n                discovered_items=[Request.from_url('https://c.placeholder.com')],\n                expected_result={\n                    'https://a.placeholder.com',\n                    'https://b.placeholder.com',\n                    'https://c.placeholder.com',\n                },\n            ),\n            id='basic_usage',\n        ),\n        pytest.param(\n            TestInput(\n                request_loader_items=[\n                    Request.from_url('https://a.placeholder.com'),\n                    None,\n                    Request.from_url('https://c.placeholder.com'),\n                ],\n                request_manager_items=['https://b.placeholder.com', 'http://d.com'],\n                discovered_items=[],\n                expected_result={\n                    'https://a.placeholder.com',\n                    'https://b.placeholder.com',\n                    'https://c.placeholder.com',\n                    'http://d.com',\n                },\n            ),\n            id='wait_for_read_only_source',\n        ),\n    ],\n)\nasync def test_basic_functionality(test_input: TestInput) -> None:\n    request_queue = await RequestQueue.open()\n\n    if 
test_input.request_manager_items:\n        await request_queue.add_requests(test_input.request_manager_items)\n\n    mock_request_loader = create_autospec(RequestLoader, instance=True, spec_set=True)\n    mock_request_loader.fetch_next_request.side_effect = lambda: test_input.request_loader_items.pop(0)\n    mock_request_loader.is_finished.side_effect = lambda: len(test_input.request_loader_items) == 0\n\n    tandem = RequestManagerTandem(mock_request_loader, request_queue)\n    processed = set[str]()\n\n    while not await tandem.is_finished():\n        request = await tandem.fetch_next_request()\n        assert request is not None\n        processed.add(request.url)\n\n        for new_request in test_input.discovered_items:\n            await tandem.add_request(new_request)\n\n        await tandem.mark_request_as_handled(request)\n\n    assert processed == test_input.expected_result\n"
  },
  {
    "path": "tests/unit/storages/test_request_queue.py",
    "content": "from __future__ import annotations\n\nimport asyncio\nfrom datetime import timedelta\nfrom typing import TYPE_CHECKING\n\nimport pytest\n\nfrom crawlee import Request, service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import MemoryStorageClient, StorageClient\nfrom crawlee.storages import RequestQueue\nfrom crawlee.storages._storage_instance_manager import StorageInstanceManager\n\nif TYPE_CHECKING:\n    from collections.abc import AsyncGenerator\n\n    from crawlee.storage_clients import StorageClient\n\n\n@pytest.fixture\nasync def rq(\n    storage_client: StorageClient,\n) -> AsyncGenerator[RequestQueue, None]:\n    \"\"\"Fixture that provides a request queue instance for each test.\"\"\"\n    rq = await RequestQueue.open(\n        storage_client=storage_client,\n    )\n\n    yield rq\n    await rq.drop()\n\n\nasync def test_open_creates_new_rq(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() creates a new request queue with proper metadata.\"\"\"\n    rq = await RequestQueue.open(\n        name='new-request-queue',\n        storage_client=storage_client,\n    )\n\n    # Verify request queue properties\n    assert rq.id is not None\n    assert rq.name == 'new-request-queue'\n    metadata = await rq.get_metadata()\n    assert metadata.pending_request_count == 0\n    assert metadata.handled_request_count == 0\n    assert metadata.total_request_count == 0\n\n    await rq.drop()\n\n\nasync def test_open_existing_rq(\n    rq: RequestQueue,\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() loads an existing request queue correctly.\"\"\"\n    # Open the same request queue again\n    reopened_rq = await RequestQueue.open(\n        name=rq.name,\n        storage_client=storage_client,\n    )\n\n    # Verify request queue properties\n    assert rq.id == reopened_rq.id\n    assert rq.name == reopened_rq.name\n\n    # Verify they are the same object (from 
cache)\n    assert id(rq) == id(reopened_rq)\n\n\nasync def test_open_with_id_and_name(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that open() raises an error when both id and name are provided.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"name\".',\n    ):\n        await RequestQueue.open(\n            id='some-id',\n            name='some-name',\n            storage_client=storage_client,\n        )\n\n\nasync def test_open_by_id(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening a request queue by its ID.\"\"\"\n    # First create a request queue by name\n    rq1 = await RequestQueue.open(\n        name='rq-by-id-test',\n        storage_client=storage_client,\n    )\n\n    # Add a request to identify it\n    await rq1.add_request('https://example.com/open-by-id-test')\n\n    # Open the request queue by ID\n    rq2 = await RequestQueue.open(\n        id=rq1.id,\n        storage_client=storage_client,\n    )\n\n    # Verify it's the same request queue\n    assert rq2.id == rq1.id\n    assert rq2.name == 'rq-by-id-test'\n\n    # Verify the request is still there\n    request = await rq2.fetch_next_request()\n    assert request is not None\n    assert request.url == 'https://example.com/open-by-id-test'\n\n    # Clean up\n    await rq2.drop()\n\n\nasync def test_add_request_string_url(rq: RequestQueue) -> None:\n    \"\"\"Test adding a request with a string URL.\"\"\"\n    # Add a request with a string URL\n    url = 'https://example.com'\n    result = await rq.add_request(url)\n\n    # Verify request was added\n    assert result is not None\n    assert result.unique_key is not None\n    assert result.was_already_present is False\n    assert result.was_already_handled is False\n\n    # Verify the queue stats were updated\n    metadata = await rq.get_metadata()\n    assert 
metadata.total_request_count == 1\n    assert metadata.pending_request_count == 1\n\n\nasync def test_add_request_object(rq: RequestQueue) -> None:\n    \"\"\"Test adding a request object.\"\"\"\n    # Create and add a request object\n    request = Request.from_url(url='https://example.com', user_data={'key': 'value'})\n    result = await rq.add_request(request)\n\n    # Verify request was added\n    assert result is not None\n    assert result.unique_key is not None\n    assert result.was_already_present is False\n    assert result.was_already_handled is False\n\n    # Verify the queue stats were updated\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 1\n    assert metadata.pending_request_count == 1\n\n\nasync def test_add_duplicate_request(rq: RequestQueue) -> None:\n    \"\"\"Test adding a duplicate request to the queue.\"\"\"\n    # Add a request\n    url = 'https://example.com'\n    first_result = await rq.add_request(url)\n\n    assert first_result is not None\n\n    # Add the same request again\n    second_result = await rq.add_request(url)\n\n    # Verify the second request was detected as duplicate\n    assert second_result is not None\n    assert second_result.was_already_present is True\n    assert second_result.unique_key == first_result.unique_key\n\n    # Verify the queue stats weren't incremented twice\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 1\n    assert metadata.pending_request_count == 1\n\n\nasync def test_add_requests_batch(rq: RequestQueue) -> None:\n    \"\"\"Test adding multiple requests in a batch.\"\"\"\n    # Create a batch of requests\n    urls = [\n        'https://example.com/page1',\n        'https://example.com/page2',\n        'https://example.com/page3',\n    ]\n\n    # Add the requests\n    await rq.add_requests(urls)\n\n    # Wait for all background tasks to complete\n    await asyncio.sleep(0.1)\n\n    # Verify the queue stats\n    metadata = await 
rq.get_metadata()\n    assert metadata.total_request_count == 3\n    assert metadata.pending_request_count == 3\n\n\nasync def test_add_requests_batch_with_forefront(rq: RequestQueue) -> None:\n    \"\"\"Test adding multiple requests in a batch with forefront option.\"\"\"\n    # Add some initial requests\n    await rq.add_request('https://example.com/page1')\n    await rq.add_request('https://example.com/page2')\n\n    # Add a batch of priority requests at the forefront\n\n    await rq.add_requests(\n        [\n            'https://example.com/priority1',\n            'https://example.com/priority2',\n            'https://example.com/priority3',\n        ],\n        forefront=True,\n    )\n\n    # Wait for all background tasks to complete\n    await asyncio.sleep(0.1)\n\n    # Fetch requests - they should come out in priority order first\n    next_request1 = await rq.fetch_next_request()\n    assert next_request1 is not None\n    assert next_request1.url.startswith('https://example.com/priority')\n\n    next_request2 = await rq.fetch_next_request()\n    assert next_request2 is not None\n    assert next_request2.url.startswith('https://example.com/priority')\n\n    next_request3 = await rq.fetch_next_request()\n    assert next_request3 is not None\n    assert next_request3.url.startswith('https://example.com/priority')\n\n    # Now we should get the original requests\n    next_request4 = await rq.fetch_next_request()\n    assert next_request4 is not None\n    assert next_request4.url == 'https://example.com/page1'\n\n    next_request5 = await rq.fetch_next_request()\n    assert next_request5 is not None\n    assert next_request5.url == 'https://example.com/page2'\n\n    # Queue should be empty now\n    next_request6 = await rq.fetch_next_request()\n    assert next_request6 is None\n\n\nasync def test_add_requests_with_forefront(rq: RequestQueue) -> None:\n    \"\"\"Test adding requests to the front of the queue.\"\"\"\n    # Add some initial requests\n    await 
rq.add_request('https://example.com/page1')\n    await rq.add_request('https://example.com/page2')\n\n    # Add a priority request at the forefront\n    await rq.add_request('https://example.com/priority', forefront=True)\n\n    # Fetch the next request - should be the priority one\n    next_request = await rq.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://example.com/priority'\n\n\nasync def test_add_requests_mixed_forefront(rq: RequestQueue) -> None:\n    \"\"\"Test the ordering when adding requests with mixed forefront values.\"\"\"\n    # Add normal requests\n    await rq.add_request('https://example.com/normal1')\n    await rq.add_request('https://example.com/normal2')\n\n    # Add a batch with forefront=True\n    await rq.add_requests(\n        ['https://example.com/priority1', 'https://example.com/priority2'],\n        forefront=True,\n    )\n\n    # Add another normal request\n    await rq.add_request('https://example.com/normal3')\n\n    # Add another priority request\n    await rq.add_request('https://example.com/priority3', forefront=True)\n\n    # Wait for background tasks\n    await asyncio.sleep(0.1)\n\n    # The expected order should be:\n    # 1. priority3 (most recent forefront)\n    # 2. priority1 (from batch, forefront)\n    # 3. priority2 (from batch, forefront)\n    # 4. normal1 (oldest normal)\n    # 5. normal2\n    # 6. 
normal3 (newest normal)\n\n    requests = []\n    while True:\n        req = await rq.fetch_next_request()\n        if req is None:\n            break\n        requests.append(req)\n        await rq.mark_request_as_handled(req)\n\n    assert len(requests) == 6\n    assert requests[0].url == 'https://example.com/priority3'\n\n    # The next two should be from the forefront batch (exact order within batch may vary)\n    batch_urls = {requests[1].url, requests[2].url}\n    assert 'https://example.com/priority1' in batch_urls\n    assert 'https://example.com/priority2' in batch_urls\n\n    # Then the normal requests in order\n    assert requests[3].url == 'https://example.com/normal1'\n    assert requests[4].url == 'https://example.com/normal2'\n    assert requests[5].url == 'https://example.com/normal3'\n\n\nasync def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None:\n    \"\"\"Test fetching and marking requests as handled.\"\"\"\n    # Add some requests\n    await rq.add_request('https://example.com/page1')\n    await rq.add_request('https://example.com/page2')\n\n    # Fetch first request\n    request1 = await rq.fetch_next_request()\n    assert request1 is not None\n    assert request1.url == 'https://example.com/page1'\n\n    # Mark the request as handled\n    result = await rq.mark_request_as_handled(request1)\n    assert result is not None\n    assert result.was_already_handled is True\n\n    # Fetch next request\n    request2 = await rq.fetch_next_request()\n    assert request2 is not None\n    assert request2.url == 'https://example.com/page2'\n\n    # Mark the second request as handled\n    await rq.mark_request_as_handled(request2)\n\n    # Verify counts\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 2\n    assert metadata.handled_request_count == 2\n    assert metadata.pending_request_count == 0\n\n    # Verify queue is empty\n    empty_request: Request | None = await rq.fetch_next_request()\n    assert 
empty_request is None\n\n\nasync def test_get_request_by_id(rq: RequestQueue) -> None:\n    \"\"\"Test retrieving a request by its ID.\"\"\"\n    # Add a request\n    added_result = await rq.add_request('https://example.com')\n\n    assert added_result is not None\n\n    unique_key = added_result.unique_key\n\n    # Retrieve the request by ID\n    retrieved_request = await rq.get_request(unique_key)\n    assert retrieved_request is not None\n    assert retrieved_request.unique_key == unique_key\n    assert retrieved_request.url == 'https://example.com'\n\n\nasync def test_handled_request_records_persistence(rq: RequestQueue) -> None:\n    request = Request.from_url('https://example.com/1')\n    await rq.add_request(request)\n    fetched_request = await rq.fetch_next_request()\n    assert isinstance(fetched_request, Request)\n    await rq.mark_request_as_handled(fetched_request)\n    fetched_request = await rq.get_request(request.unique_key)\n    assert isinstance(fetched_request, Request)\n    assert fetched_request.unique_key == request.unique_key\n\n\nasync def test_get_non_existent_request(rq: RequestQueue) -> None:\n    \"\"\"Test retrieving a request that doesn't exist.\"\"\"\n    non_existent_request = await rq.get_request('non-existent-id')\n    assert non_existent_request is None\n\n\nasync def test_reclaim_request(rq: RequestQueue) -> None:\n    \"\"\"Test reclaiming a request that failed processing.\"\"\"\n    # Add a request\n    await rq.add_request('https://example.com')\n\n    # Fetch the request\n    request = await rq.fetch_next_request()\n    assert request is not None\n\n    # Reclaim the request\n    result = await rq.reclaim_request(request)\n    assert result is not None\n    assert result.was_already_handled is False\n\n    # Verify we can fetch it again\n    reclaimed_request = await rq.fetch_next_request()\n    assert reclaimed_request is not None\n    assert reclaimed_request.unique_key == request.unique_key\n    assert 
reclaimed_request.url == 'https://example.com'\n\n\nasync def test_reclaim_request_with_forefront(rq: RequestQueue) -> None:\n    \"\"\"Test reclaiming a request to the front of the queue.\"\"\"\n    # Add requests\n    await rq.add_request('https://example.com/first')\n    await rq.add_request('https://example.com/second')\n\n    # Fetch the first request\n    first_request = await rq.fetch_next_request()\n    assert first_request is not None\n    assert first_request.url == 'https://example.com/first'\n\n    # Reclaim it to the forefront\n    await rq.reclaim_request(first_request, forefront=True)\n\n    # The reclaimed request should be returned first (before the second request)\n    next_request = await rq.fetch_next_request()\n    assert next_request is not None\n    assert next_request.url == 'https://example.com/first'\n\n\nasync def test_is_empty(rq: RequestQueue) -> None:\n    \"\"\"Test checking if a request queue is empty.\"\"\"\n    # Initially the queue should be empty\n    assert await rq.is_empty() is True\n\n    # Add a request\n    await rq.add_request('https://example.com')\n    assert await rq.is_empty() is False\n\n    # Fetch and handle the request\n    request = await rq.fetch_next_request()\n\n    assert request is not None\n    await rq.mark_request_as_handled(request)\n\n    # Queue should be empty again\n    assert await rq.is_empty() is True\n\n\n@pytest.mark.parametrize(\n    ('wait_for_all'),\n    [\n        pytest.param(True, id='wait for all'),\n        pytest.param(False, id='do not wait for all'),\n    ],\n)\nasync def test_add_requests_wait_for_all(\n    rq: RequestQueue,\n    *,\n    wait_for_all: bool,\n) -> None:\n    \"\"\"Test adding requests with wait_for_all_requests_to_be_added option.\"\"\"\n    urls = [f'https://example.com/{i}' for i in range(15)]\n\n    # Add requests without waiting\n    await rq.add_requests(\n        urls,\n        batch_size=5,\n        wait_for_all_requests_to_be_added=wait_for_all,\n        
wait_time_between_batches=timedelta(milliseconds=50),\n    )\n\n    if not wait_for_all:\n        # Immediately after adding, the total count may be less than 15 due to background processing\n        assert await rq.get_total_count() <= 15\n\n        # Wait for background tasks to complete\n        while await rq.get_total_count() < 15:  # noqa: ASYNC110\n            await asyncio.sleep(0.1)\n\n    # Verify all requests were added\n    assert await rq.get_total_count() == 15\n\n\nasync def test_is_finished(rq: RequestQueue) -> None:\n    \"\"\"Test checking if a request queue is finished.\"\"\"\n    # Initially the queue should be finished (empty and no background tasks)\n    assert await rq.is_finished() is True\n\n    # Add a request\n    await rq.add_request('https://example.com')\n    assert await rq.is_finished() is False\n\n    # Add requests in the background\n    await rq.add_requests(\n        ['https://example.com/1', 'https://example.com/2'],\n        wait_for_all_requests_to_be_added=False,\n    )\n\n    # Queue shouldn't be finished while background tasks are running\n    assert await rq.is_finished() is False\n\n    # Wait for background tasks to finish\n    await asyncio.sleep(0.2)\n\n    # Process all requests\n    while True:\n        request = await rq.fetch_next_request()\n        if request is None:\n            break\n        await rq.mark_request_as_handled(request)\n\n    # Now queue should be finished\n    assert await rq.is_finished() is True\n\n\nasync def test_mark_non_existent_request_as_handled(rq: RequestQueue) -> None:\n    \"\"\"Test marking a non-existent request as handled.\"\"\"\n    # Create a request that hasn't been added to the queue\n    request = Request.from_url(url='https://example.com', id='non-existent-id')\n\n    # Attempt to mark it as handled\n    result = await rq.mark_request_as_handled(request)\n    assert result is None\n\n\nasync def test_reclaim_non_existent_request(rq: RequestQueue) -> None:\n    \"\"\"Test 
reclaiming a non-existent request.\"\"\"\n    # Create a request that hasn't been added to the queue\n    request = Request.from_url(url='https://example.com', id='non-existent-id')\n\n    # Attempt to reclaim it\n    result = await rq.reclaim_request(request)\n    assert result is None\n\n\nasync def test_drop(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test dropping a request queue removes it from cache and clears its data.\"\"\"\n    rq = await RequestQueue.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    # Add a request\n    await rq.add_request('https://example.com')\n\n    # Drop the request queue\n    await rq.drop()\n\n    # Verify request queue is empty (by creating a new one with the same name)\n    new_rq = await RequestQueue.open(\n        name='drop-test',\n        storage_client=storage_client,\n    )\n\n    # Verify the queue is empty\n    assert await new_rq.is_empty() is True\n    metadata = await new_rq.get_metadata()\n    assert metadata.total_request_count == 0\n    assert metadata.pending_request_count == 0\n    await new_rq.drop()\n\n\nasync def test_reopen_default(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test reopening the default request queue.\"\"\"\n    # First clean up any storage instance caches\n    storage_instance_manager = service_locator.storage_instance_manager\n    storage_instance_manager.clear_cache()\n\n    # Open the default request queue\n    rq1 = await RequestQueue.open(\n        storage_client=storage_client,\n    )\n\n    # If a request queue already exists (due to previous test run), purge it to start fresh\n    try:\n        await rq1.purge()\n    except Exception:\n        # If purge fails, try dropping and recreating\n        await rq1.drop()\n        rq1 = await RequestQueue.open(\n            storage_client=storage_client,\n        )\n\n    # Verify we're starting fresh\n    metadata1 = await rq1.get_metadata()\n    assert 
metadata1.pending_request_count == 0\n\n    # Add a request\n    await rq1.add_request('https://example.com/')\n\n    # Verify the request was added\n    metadata1 = await rq1.get_metadata()\n    assert metadata1.pending_request_count == 1\n\n    # Open the default request queue again\n    rq2 = await RequestQueue.open(\n        storage_client=storage_client,\n    )\n\n    # Verify they are the same queue\n    assert rq1.id == rq2.id\n    assert rq1.name == rq2.name\n    metadata1 = await rq1.get_metadata()\n    metadata2 = await rq2.get_metadata()\n    assert metadata1.total_request_count == metadata2.total_request_count\n    assert metadata1.pending_request_count == metadata2.pending_request_count\n    assert metadata1.handled_request_count == metadata2.handled_request_count\n\n    # Verify the request is accessible\n    request = await rq2.fetch_next_request()\n    assert request is not None\n    assert request.url == 'https://example.com/'\n\n    # Clean up after the test\n    await rq1.drop()\n\n\nasync def test_purge(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test purging a request queue removes all requests but keeps the queue itself.\"\"\"\n    # First create a request queue\n    rq = await RequestQueue.open(\n        name='purge-test-queue',\n        storage_client=storage_client,\n    )\n\n    # Add some requests\n    await rq.add_requests(\n        [\n            'https://example.com/page1',\n            'https://example.com/page2',\n            'https://example.com/page3',\n        ]\n    )\n\n    # Verify requests were added\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 3\n    assert metadata.pending_request_count == 3\n    assert metadata.handled_request_count == 0\n\n    # Record the queue ID\n    queue_id = rq.id\n\n    # Purge the queue\n    await rq.purge()\n\n    # Verify the queue still exists but is empty\n    assert rq.id == queue_id  # Same ID preserved\n    assert rq.name == 
'purge-test-queue'  # Same name preserved\n\n    # Queue should be empty now\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 0\n    assert metadata.pending_request_count == 0\n    assert metadata.handled_request_count == 0\n    assert await rq.is_empty() is True\n\n    # Verify we can add new requests after purging\n    await rq.add_request('https://example.com/new-after-purge')\n\n    request = await rq.fetch_next_request()\n    assert request is not None\n    assert request.url == 'https://example.com/new-after-purge'\n\n    # Clean up\n    await rq.drop()\n\n\nasync def test_open_with_alias(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test opening request queues with alias parameter for NDU functionality.\"\"\"\n    # Create request queues with different aliases\n    rq_1 = await RequestQueue.open(\n        alias='test_alias_1',\n        storage_client=storage_client,\n    )\n    rq_2 = await RequestQueue.open(\n        alias='test_alias_2',\n        storage_client=storage_client,\n    )\n\n    # Verify they have different IDs but no names (unnamed)\n    assert rq_1.id != rq_2.id\n    assert rq_1.name is None\n    assert rq_2.name is None\n\n    # Add different requests to each\n    await rq_1.add_request('https://example.com/1')\n    await rq_1.add_request('https://example.com/2')\n    await rq_2.add_request('https://example.com/3')\n\n    # Verify data isolation\n    request_1 = await rq_1.fetch_next_request()\n    request_2 = await rq_2.fetch_next_request()\n\n    assert request_1 is not None\n    assert request_2 is not None\n    assert request_1.url == 'https://example.com/1'\n    assert request_2.url == 'https://example.com/3'\n\n    # Clean up\n    await rq_1.drop()\n    await rq_2.drop()\n\n\nasync def test_alias_caching(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that request queues with same alias return same instance (cached).\"\"\"\n    # Open rq with alias\n    rq_1 = await 
RequestQueue.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Open again with same alias\n    rq_2 = await RequestQueue.open(\n        alias='cache_test',\n        storage_client=storage_client,\n    )\n\n    # Should be same instance\n    assert rq_1 is rq_2\n    assert rq_1.id == rq_2.id\n\n    # Clean up\n    await rq_1.drop()\n\n\nasync def test_alias_with_id_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and id raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"id\", \"alias\".',\n    ):\n        await RequestQueue.open(\n            id='some-id',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_name_error(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that providing both alias and name raises error.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=r'Only one of \"id\", \"name\", \"alias\" can be specified, but following arguments '\n        r'were specified: \"name\", \"alias\".',\n    ):\n        await RequestQueue.open(\n            name='some-name',\n            alias='some-alias',\n            storage_client=storage_client,\n        )\n\n\nasync def test_alias_with_special_characters(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test alias functionality with special characters.\"\"\"\n    special_aliases = [\n        'alias-with-dashes',\n        'alias_with_underscores',\n        'alias.with.dots',\n        'alias123with456numbers',\n        'CamelCaseAlias',\n    ]\n\n    queues = []\n    for alias in special_aliases:\n        rq = await RequestQueue.open(\n            alias=alias,\n            storage_client=storage_client,\n        )\n        queues.append(rq)\n\n        # Add request with the alias 
as identifier in URL\n        await rq.add_request(f'https://example.com/{alias}')\n\n    # Verify all work correctly\n    for i, rq in enumerate(queues):\n        request = await rq.fetch_next_request()\n        assert request is not None\n        assert f'/{special_aliases[i]}' in request.url\n\n    # Clean up\n    for rq in queues:\n        await rq.drop()\n\n\nasync def test_alias_request_operations(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that request operations work correctly with alias queues.\"\"\"\n    rq = await RequestQueue.open(\n        alias='request_ops_test',\n        storage_client=storage_client,\n    )\n\n    # Test adding multiple requests\n    urls = [\n        'https://example.com/page1',\n        'https://example.com/page2',\n        'https://example.com/page3',\n    ]\n\n    for url in urls:\n        result = await rq.add_request(url)\n        assert result is not None\n        assert result.was_already_present is False\n\n    # Test queue metadata\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 3\n    assert metadata.pending_request_count == 3\n    assert metadata.handled_request_count == 0\n\n    # Test fetching and handling requests\n    processed_urls = []\n    while not await rq.is_empty():\n        request = await rq.fetch_next_request()\n        if request:\n            processed_urls.append(request.url)\n            await rq.mark_request_as_handled(request)\n\n    # Verify all requests were processed\n    assert len(processed_urls) == 3\n    assert set(processed_urls) == set(urls)\n\n    # Verify final state\n    metadata = await rq.get_metadata()\n    assert metadata.pending_request_count == 0\n    assert metadata.handled_request_count == 3\n    assert await rq.is_empty() is True\n\n    # Clean up\n    await rq.drop()\n\n\nasync def test_alias_forefront_operations(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test forefront operations work correctly with alias 
queues.\"\"\"\n    rq = await RequestQueue.open(\n        alias='forefront_test',\n        storage_client=storage_client,\n    )\n\n    # Add normal requests\n    await rq.add_request('https://example.com/normal1')\n    await rq.add_request('https://example.com/normal2')\n\n    # Add priority request to forefront\n    await rq.add_request('https://example.com/priority', forefront=True)\n\n    # Priority request should come first\n    priority_request = await rq.fetch_next_request()\n    assert priority_request is not None\n    assert priority_request.url == 'https://example.com/priority'\n\n    # Then normal requests\n    normal_request = await rq.fetch_next_request()\n    assert normal_request is not None\n    assert normal_request.url == 'https://example.com/normal1'\n\n    # Clean up\n    await rq.drop()\n\n\nasync def test_alias_batch_operations(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test batch operations work correctly with alias queues.\"\"\"\n    rq = await RequestQueue.open(\n        alias='batch_test',\n        storage_client=storage_client,\n    )\n\n    # Test batch adding\n    batch_urls = [\n        'https://example.com/batch1',\n        'https://example.com/batch2',\n        'https://example.com/batch3',\n    ]\n\n    await rq.add_requests(batch_urls)\n\n    # Wait for background processing\n    await asyncio.sleep(0.1)\n\n    # Verify all requests were added\n    metadata = await rq.get_metadata()\n    assert metadata.total_request_count == 3\n\n    # Clean up\n    await rq.drop()\n\n\nasync def test_named_vs_alias_conflict_detection(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that conflicts between named and alias storages are detected.\"\"\"\n    # Test 1: Create named storage first, then try alias with same name\n    named_rq = await RequestQueue.open(\n        name='conflict-test',\n        storage_client=storage_client,\n    )\n    assert named_rq.name == 'conflict-test'\n\n    # Try to create alias with 
same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create alias storage \"conflict-test\".*already exists'):\n        await RequestQueue.open(alias='conflict-test', storage_client=storage_client)\n\n    # Clean up\n    await named_rq.drop()\n\n    # Test 2: Create alias first, then try named with same name\n    alias_rq = await RequestQueue.open(alias='conflict-test2', storage_client=storage_client)\n    assert alias_rq.name is None  # Alias storages have no name\n\n    # Try to create named with same name - should raise error\n    with pytest.raises(ValueError, match=r'Cannot create named storage \"conflict-test2\".*already exists'):\n        await RequestQueue.open(name='conflict-test2', storage_client=storage_client)\n\n    # Clean up\n    await alias_rq.drop()\n\n    # Test 3: Different names should work fine\n    named_rq_ok = await RequestQueue.open(name='different-name')\n    alias_rq_ok = await RequestQueue.open(alias='different-alias')\n\n    assert named_rq_ok.name == 'different-name'\n    assert alias_rq_ok.name is None\n\n    # Clean up\n    await named_rq_ok.drop()\n    await alias_rq_ok.drop()\n\n\nasync def test_alias_parameter(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test request queue creation and operations with alias parameter.\"\"\"\n    # Create request queue with alias\n    alias_rq = await RequestQueue.open(\n        alias='test_alias',\n        storage_client=storage_client,\n    )\n\n    # Verify alias request queue properties\n    assert alias_rq.id is not None\n    assert alias_rq.name is None  # Alias storages should be unnamed\n\n    # Test data operations\n    await alias_rq.add_request('https://example.com/alias')\n    metadata = await alias_rq.get_metadata()\n    assert metadata.pending_request_count == 1\n\n    await alias_rq.drop()\n\n\nasync def test_alias_vs_named_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that alias and named request queues with 
same identifier are isolated.\"\"\"\n    # Create named request queue\n    named_rq = await RequestQueue.open(\n        name='test-identifier',\n        storage_client=storage_client,\n    )\n\n    # Verify named request queue\n    assert named_rq.name == 'test-identifier'\n    await named_rq.add_request('https://named.example.com')\n\n    # Clean up named request queue first\n    await named_rq.drop()\n\n    # Now create alias request queue with same identifier (should work after cleanup)\n    alias_rq = await RequestQueue.open(\n        alias='test-identifier',\n        storage_client=storage_client,\n    )\n\n    # Should be different instance\n    assert alias_rq.name is None\n    await alias_rq.add_request('https://alias.example.com')\n\n    # Verify alias data\n    alias_request = await alias_rq.fetch_next_request()\n    assert alias_request is not None\n    assert alias_request.url == 'https://alias.example.com'\n\n    await alias_rq.drop()\n\n\nasync def test_default_vs_alias_default_equivalence(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that default request queue and alias='default' are equivalent.\"\"\"\n    # Open default request queue\n    default_rq = await RequestQueue.open(\n        storage_client=storage_client,\n    )\n\n    alias_default_rq = await RequestQueue.open(\n        alias=StorageInstanceManager._DEFAULT_STORAGE_ALIAS,\n        storage_client=storage_client,\n    )\n\n    # Should be the same\n    assert default_rq.id == alias_default_rq.id\n    assert default_rq.name is None\n    assert alias_default_rq.name is None\n\n    # Data should be shared\n    await default_rq.add_request('https://default.example.com')\n    metadata = await alias_default_rq.get_metadata()\n    assert metadata.pending_request_count == 1\n\n    await default_rq.drop()\n\n\nasync def test_multiple_alias_isolation(\n    storage_client: StorageClient,\n) -> None:\n    \"\"\"Test that different aliases create separate request queues.\"\"\"\n    
request_queues = []\n\n    for i in range(3):\n        rq = await RequestQueue.open(\n            alias=f'alias_{i}',\n            storage_client=storage_client,\n        )\n        await rq.add_request(f'https://example.com/alias_{i}')\n        request_queues.append(rq)\n\n    # All should be different\n    for i in range(3):\n        for j in range(i + 1, 3):\n            assert request_queues[i].id != request_queues[j].id\n\n    # Verify data isolation\n    for i, rq in enumerate(request_queues):\n        request = await rq.fetch_next_request()\n        assert request is not None\n        assert request.url == f'https://example.com/alias_{i}'\n        await rq.drop()\n\n\nasync def test_purge_on_start_enabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=True: named storages retain data, unnamed storages are purged.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=True)\n\n    # First, create all storage types with purge enabled and add data.\n    default_rq = await RequestQueue.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_rq = await RequestQueue.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_rq = await RequestQueue.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_rq.add_requests(\n        [\n            'https://default.example.com/1',\n            'https://default.example.com/2',\n            'https://default.example.com/3',\n        ]\n    )\n    await alias_rq.add_requests(\n        [\n            
'https://alias.example.com/1',\n            'https://alias.example.com/2',\n            'https://alias.example.com/3',\n        ]\n    )\n    await named_rq.add_requests(\n        [\n            'https://named.example.com/1',\n            'https://named.example.com/2',\n            'https://named.example.com/3',\n        ]\n    )\n\n    default_request = await default_rq.fetch_next_request()\n    alias_request = await alias_rq.fetch_next_request()\n    named_request = await named_rq.fetch_next_request()\n\n    assert default_request is not None\n    assert alias_request is not None\n    assert named_request is not None\n\n    await default_rq.mark_request_as_handled(default_request)\n    await alias_rq.mark_request_as_handled(alias_request)\n    await named_rq.mark_request_as_handled(named_request)\n\n    # Verify data was added\n    default_metadata = await default_rq.get_metadata()\n    alias_metadata = await alias_rq.get_metadata()\n    named_metadata = await named_rq.get_metadata()\n\n    assert default_metadata.pending_request_count == 2\n    assert alias_metadata.pending_request_count == 2\n    assert named_metadata.pending_request_count == 2\n\n    assert default_metadata.handled_request_count == 1\n    assert alias_metadata.handled_request_count == 1\n    assert named_metadata.handled_request_count == 1\n\n    assert default_metadata.total_request_count == 3\n    assert alias_metadata.total_request_count == 3\n    assert named_metadata.total_request_count == 3\n\n    # Verify that default and alias storages are unnamed\n    assert default_metadata.name is None\n    assert alias_metadata.name is None\n    assert named_metadata.name == 'purge-test-named'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_rq_2 = await RequestQueue.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_rq_2 = 
await RequestQueue.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_rq_2 = await RequestQueue.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after purge\n    default_metadata_after = await default_rq_2.get_metadata()\n    alias_metadata_after = await alias_rq_2.get_metadata()\n    named_metadata_after = await named_rq_2.get_metadata()\n\n    # Unnamed storages (alias and default) should be purged (data removed)\n    assert default_metadata_after.pending_request_count == 0\n    assert alias_metadata_after.pending_request_count == 0\n    assert named_metadata_after.pending_request_count == 2\n\n    assert default_metadata_after.handled_request_count == 0\n    assert alias_metadata_after.handled_request_count == 0\n    assert named_metadata_after.handled_request_count == 1\n\n    assert default_metadata_after.total_request_count == 0\n    assert alias_metadata_after.total_request_count == 0\n    assert named_metadata_after.total_request_count == 3\n\n    # Clean up\n    await named_rq_2.drop()\n    await alias_rq_2.drop()\n    await default_rq_2.drop()\n\n\nasync def test_purge_on_start_disabled(storage_client: StorageClient) -> None:\n    \"\"\"Test purge behavior when purge_on_start=False: all storages retain data regardless of type.\"\"\"\n\n    # Skip this test for memory storage since it doesn't persist data between client instances.\n    if isinstance(storage_client, MemoryStorageClient):\n        pytest.skip('Memory storage does not persist data between client instances.')\n\n    configuration = Configuration(purge_on_start=False)\n\n    # First, create all storage types with purge disabled and add data.\n    default_rq = await RequestQueue.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    alias_rq = await RequestQueue.open(\n  
      alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    named_rq = await RequestQueue.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    await default_rq.add_requests(\n        [\n            'https://default.example.com/1',\n            'https://default.example.com/2',\n            'https://default.example.com/3',\n        ]\n    )\n    await alias_rq.add_requests(\n        [\n            'https://alias.example.com/1',\n            'https://alias.example.com/2',\n            'https://alias.example.com/3',\n        ]\n    )\n    await named_rq.add_requests(\n        [\n            'https://named.example.com/1',\n            'https://named.example.com/2',\n            'https://named.example.com/3',\n        ]\n    )\n\n    default_request = await default_rq.fetch_next_request()\n    alias_request = await alias_rq.fetch_next_request()\n    named_request = await named_rq.fetch_next_request()\n\n    assert default_request is not None\n    assert alias_request is not None\n    assert named_request is not None\n\n    await default_rq.mark_request_as_handled(default_request)\n    await alias_rq.mark_request_as_handled(alias_request)\n    await named_rq.mark_request_as_handled(named_request)\n\n    # Verify data was added\n    default_metadata = await default_rq.get_metadata()\n    alias_metadata = await alias_rq.get_metadata()\n    named_metadata = await named_rq.get_metadata()\n\n    assert default_metadata.pending_request_count == 2\n    assert alias_metadata.pending_request_count == 2\n    assert named_metadata.pending_request_count == 2\n\n    assert default_metadata.handled_request_count == 1\n    assert alias_metadata.handled_request_count == 1\n    assert named_metadata.handled_request_count == 1\n\n    assert default_metadata.total_request_count == 3\n    assert alias_metadata.total_request_count == 3\n    assert 
named_metadata.total_request_count == 3\n\n    # Verify that default and alias storages are unnamed\n    assert default_metadata.name is None\n    assert alias_metadata.name is None\n    assert named_metadata.name == 'purge-test-named'\n\n    # Clear storage cache to simulate \"reopening\" storages\n    service_locator.storage_instance_manager.clear_cache()\n\n    # Now \"reopen\" all storages\n    default_rq_2 = await RequestQueue.open(\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    alias_rq_2 = await RequestQueue.open(\n        alias='purge-test-alias',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n    named_rq_2 = await RequestQueue.open(\n        name='purge-test-named',\n        storage_client=storage_client,\n        configuration=configuration,\n    )\n\n    # Check the data after reopening (no purge expected)\n    default_metadata_after = await default_rq_2.get_metadata()\n    alias_metadata_after = await alias_rq_2.get_metadata()\n    named_metadata_after = await named_rq_2.get_metadata()\n\n    # With purge disabled, all storages (default, alias and named) should retain their data\n    assert default_metadata_after.pending_request_count == 2\n    assert alias_metadata_after.pending_request_count == 2\n    assert named_metadata_after.pending_request_count == 2\n\n    assert default_metadata_after.handled_request_count == 1\n    assert alias_metadata_after.handled_request_count == 1\n    assert named_metadata_after.handled_request_count == 1\n\n    assert default_metadata_after.total_request_count == 3\n    assert alias_metadata_after.total_request_count == 3\n    assert named_metadata_after.total_request_count == 3\n\n    # Clean up\n    await named_rq_2.drop()\n    await alias_rq_2.drop()\n    await default_rq_2.drop()\n\n\nasync def test_name_default_not_allowed(storage_client: StorageClient) -> None:\n    \"\"\"Test that storage can't have default alias as name, to prevent collisions with unnamed storage 
alias.\"\"\"\n    with pytest.raises(\n        ValueError,\n        match=f'Storage name cannot be \"{StorageInstanceManager._DEFAULT_STORAGE_ALIAS}\" as '\n        f'it is reserved for default alias.',\n    ):\n        await RequestQueue.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client)\n\n\n@pytest.mark.parametrize(\n    ('name', 'is_valid'),\n    [\n        pytest.param('F', True, id='single-char'),\n        pytest.param('7', True, id='single-digit'),\n        pytest.param('FtghdfseySds', True, id='mixed-case'),\n        pytest.param('125673450', True, id='all-digits'),\n        pytest.param('Ft2134Sfe0O1hf', True, id='mixed-alphanumeric'),\n        pytest.param('name-with-dashes', True, id='dashes'),\n        pytest.param('1-value', True, id='number start'),\n        pytest.param('value-1', True, id='number end'),\n        pytest.param('test-1-value', True, id='number middle'),\n        pytest.param('test-------value', True, id='multiple-dashes'),\n        pytest.param('test-VALUES-test', True, id='multiple-cases'),\n        pytest.param('name_with_underscores', False, id='underscores'),\n        pytest.param('name with spaces', False, id='spaces'),\n        pytest.param('-test', False, id='dashes start'),\n        pytest.param('test-', False, id='dashes end'),\n    ],\n)\nasync def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:\n    \"\"\"Test name validation logic.\"\"\"\n    if is_valid:\n        # Should not raise\n        dataset = await RequestQueue.open(name=name, storage_client=storage_client)\n        assert dataset.name == name\n        await dataset.drop()\n    else:\n        with pytest.raises(ValueError, match=rf'Invalid storage name \"{name}\".*'):\n            await RequestQueue.open(name=name, storage_client=storage_client)\n\n\nasync def test_reclaim_request_with_change_state(rq: RequestQueue) -> None:\n    \"\"\"Test reclaiming a request and changing its 
state.\"\"\"\n    # Add a request\n    await rq.add_request(Request.from_url('https://example.com/original', user_data={'state': 'original'}))\n\n    # Fetch the request\n    request = await rq.fetch_next_request()\n    assert request is not None\n    assert request.url == 'https://example.com/original'\n    assert request.user_data['state'] == 'original'\n\n    # Reclaim the request with modified user data\n    request.user_data['state'] = 'modified'\n    result = await rq.reclaim_request(request)\n    assert result is not None\n    assert result.was_already_handled is False\n\n    # Fetch the reclaimed request\n    reclaimed_request = await rq.fetch_next_request()\n    assert reclaimed_request is not None\n    assert reclaimed_request.url == 'https://example.com/original'\n    assert reclaimed_request.user_data['state'] == 'modified'\n\n\nasync def test_request_with_noascii_chars(rq: RequestQueue) -> None:\n    \"\"\"Test handling requests with non-ASCII characters in user data.\"\"\"\n    data_with_special_chars = {\n        'record_1': 'Supermaxi El Jardín',\n        'record_2': 'záznam dva',\n        'record_3': '記録三',\n    }\n    init_request = Request.from_url('https://crawlee.dev', user_data=data_with_special_chars)\n\n    # Add a request with special user data\n    await rq.add_request(init_request)\n\n    # Get the request and verify\n    request = await rq.fetch_next_request()\n    assert request is not None\n    assert request.url == 'https://crawlee.dev'\n    assert request.user_data == init_request.user_data\n"
  },
  {
    "path": "tests/unit/storages/test_storage_instance_manager.py",
    "content": "import asyncio\nimport sys\nfrom pathlib import Path\nfrom typing import cast\nfrom unittest.mock import AsyncMock\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\nfrom crawlee.storages import Dataset, KeyValueStore, RequestQueue\nfrom crawlee.storages._base import Storage\n\n\n@pytest.fixture(autouse=True)\ndef clean_storage_instance_manager() -> None:\n    \"\"\"Helper function to clean the storage instance manager before each test.\"\"\"\n    service_locator.storage_instance_manager.clear_cache()\n\n\n@pytest.fixture(params=[KeyValueStore, Dataset, RequestQueue])\ndef storage_type(request: pytest.FixtureRequest) -> type[Storage]:\n    return cast('type[Storage]', request.param)\n\n\nasync def test_unique_storage_by_storage_client(tmp_path: Path, storage_type: type[Storage]) -> None:\n    config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))\n\n    storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)\n    storage_2 = await storage_type.open(storage_client=FileSystemStorageClient(), configuration=config)\n    assert storage_1 is not storage_2\n\n\nasync def test_same_storage_when_different_client(tmp_path: Path, storage_type: type[Storage]) -> None:\n    config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))\n\n    storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)\n    storage_2 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)\n    assert storage_1 is storage_2\n\n\nasync def test_unique_storage_by_storage_type(tmp_path: Path) -> None:\n    config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))\n    storage_client = MemoryStorageClient()\n\n    kvs = await KeyValueStore.open(storage_client=storage_client, configuration=config)\n    dataset 
= await Dataset.open(storage_client=storage_client, configuration=config)\n    assert kvs is not dataset\n\n\nasync def test_unique_storage_by_name(storage_type: type[Storage]) -> None:\n    \"\"\"Test that storages opened with different names are distinct instances.\"\"\"\n    storage_client = MemoryStorageClient()\n\n    storage_1 = await storage_type.open(storage_client=storage_client, name='kvs1')\n    storage_2 = await storage_type.open(storage_client=storage_client, name='kvs2')\n    assert storage_1 is not storage_2\n\n\nasync def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path, storage_type: type[Storage]) -> None:\n    \"\"\"Test that StorageInstanceManager supports unique cache key. Difference in storage_dir.\"\"\"\n    path_1 = tmp_path / 'dir1'\n    path_2 = tmp_path / 'dir2'\n    path_1.mkdir()\n    path_2.mkdir()\n\n    config_1 = Configuration(storage_dir=str(path_1))\n\n    config_2 = Configuration(storage_dir=str(path_2))\n\n    storage_client = FileSystemStorageClient()\n\n    storage_1 = await storage_type.open(storage_client=storage_client, configuration=config_1)\n    storage_2 = await storage_type.open(storage_client=storage_client, configuration=config_2)\n    assert storage_1 is not storage_2\n\n\nasync def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path, storage_type: type[Storage]) -> None:\n    \"\"\"Test that StorageInstanceManager supports unique cache key. 
Different configs with same storage_dir create same\n    storage.\"\"\"\n    config_1 = Configuration(storage_dir=str(tmp_path))\n\n    config_2 = Configuration(storage_dir=str(tmp_path))\n\n    storage_client = FileSystemStorageClient()\n\n    storage_1 = await storage_type.open(storage_client=storage_client, configuration=config_1)\n    storage_2 = await storage_type.open(storage_client=storage_client, configuration=config_2)\n    assert storage_1 is storage_2\n\n\nasync def test_identical_storage_default_config(storage_type: type[Storage]) -> None:\n    \"\"\"Test that StorageInstanceManager correctly caches storage based on the storage client.\"\"\"\n    storage_client = MemoryStorageClient()\n\n    storage_1 = await storage_type.open(storage_client=storage_client)\n    storage_2 = await storage_type.open(storage_client=storage_client)\n    assert storage_1 is storage_2\n\n\nasync def test_identical_storage_default_storage(storage_type: type[Storage]) -> None:\n    \"\"\"Test that StorageInstanceManager correctly caches storage based on the storage client.\"\"\"\n    storage_1 = await storage_type.open()\n    storage_2 = await storage_type.open()\n    assert storage_1 is storage_2\n\n\nasync def test_identical_storage_clear_cache(storage_type: type[Storage]) -> None:\n    storage_1 = await storage_type.open()\n    service_locator.storage_instance_manager.clear_cache()\n    storage_2 = await storage_type.open()\n    assert storage_1 is not storage_2\n\n\nasync def test_identical_storage_remove_from_cache(storage_type: type[Storage]) -> None:\n    storage_1 = await storage_type.open()\n    service_locator.storage_instance_manager.remove_from_cache(storage_1)\n    storage_2 = await storage_type.open()\n    assert storage_1 is not storage_2\n\n\nasync def test_preexisting_unnamed_storage_open_by_id(storage_type: type[Storage]) -> None:\n    \"\"\"Test that persisted pre-existing unnamed storage can be opened by ID.\"\"\"\n    storage_client = 
FileSystemStorageClient()\n    storage_1 = await storage_type.open(alias='custom_name', storage_client=storage_client)\n\n    # Make service_locator unaware of this storage\n    service_locator.storage_instance_manager.clear_cache()\n\n    storage_1_again = await storage_type.open(id=storage_1.id, storage_client=storage_client)\n\n    assert storage_1.id == storage_1_again.id\n\n\n@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')\nasync def test_concurrent_open_datasets() -> None:\n    \"\"\"Test that concurrent open datasets with the same name return the same instance.\"\"\"\n    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415\n\n    barrier = Barrier(2)\n\n    async def push_data(data: dict) -> None:\n        await barrier.wait()\n        dataset = await Dataset.open(name='concurrent-storage')\n        await dataset.push_data(data)\n\n    await asyncio.gather(\n        push_data({'test_1': '1'}),\n        push_data({'test_2': '2'}),\n    )\n\n    dataset = await Dataset.open(name='concurrent-storage')\n\n    items = await dataset.get_data()\n    assert len(items.items) == 2\n\n    await dataset.drop()\n\n\n@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')\nasync def test_concurrent_open_datasets_with_same_name_and_alias() -> None:\n    \"\"\"Test that concurrent open requests for the same storage return the same instance.\"\"\"\n    from asyncio import Barrier  # type:ignore[attr-defined] # noqa: PLC0415\n\n    valid_kwargs: dict[str, str | None] = {}\n\n    exception_calls = AsyncMock()\n\n    barrier = Barrier(2)\n\n    async def open_dataset(name: str | None, alias: str | None) -> None:\n        await barrier.wait()\n        try:\n            await Dataset.open(name=name, alias=alias)\n            valid_kwargs['name'] = name\n            valid_kwargs['alias'] = alias\n        except ValueError:\n            await 
exception_calls()\n\n    await asyncio.gather(\n        open_dataset(name=None, alias='concurrent-storage'),\n        open_dataset(name='concurrent-storage', alias=None),\n    )\n\n    # Ensure that a ValueError was raised due to name/alias conflict\n    exception_calls.assert_called_once()\n\n    dataset = await Dataset.open(name=valid_kwargs.get('name'), alias=valid_kwargs.get('alias'))\n\n    await dataset.drop()\n"
  },
  {
    "path": "tests/unit/test_cli.py",
    "content": "from __future__ import annotations\n\nimport os\nfrom unittest.mock import ANY, Mock\n\nimport pytest\nimport readchar\nfrom typer.testing import CliRunner\n\nimport crawlee._cli\n\nrunner = CliRunner()\n\n\n@pytest.fixture\ndef mock_cookiecutter(monkeypatch: pytest.MonkeyPatch) -> Mock:\n    mock_cookiecutter = Mock()\n    monkeypatch.setattr(target=crawlee._cli, name='cookiecutter', value=mock_cookiecutter)\n\n    return mock_cookiecutter\n\n\ndef test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch) -> None:\n    mock_input = iter(\n        [\n            *'my_project',\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n        ]\n    )\n    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))\n\n    result = runner.invoke(crawlee._cli.cli, ['create'])\n    assert 'Your project \"my_project\" was created.' 
in result.output\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'poetry',\n            'crawler_type': 'beautifulsoup',\n            'http_client': 'impit',\n            'enable_apify_integration': False,\n            'start_url': 'https://crawlee.dev',\n            'install_project': True,\n        },\n    )\n\n\ndef test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch) -> None:\n    mock_input = iter(\n        [\n            *'my_project',\n            readchar.key.ENTER,\n            readchar.key.DOWN,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n        ]\n    )\n    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))\n\n    result = runner.invoke(crawlee._cli.cli, ['create'])\n    assert 'Your project \"my_project\" was created.' 
in result.output\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'poetry',\n            'crawler_type': 'parsel',\n            'http_client': 'impit',\n            'enable_apify_integration': False,\n            'start_url': 'https://crawlee.dev',\n            'install_project': True,\n        },\n    )\n\n\ndef test_create_non_interactive(mock_cookiecutter: Mock) -> None:\n    runner.invoke(\n        crawlee._cli.cli,\n        [\n            'create',\n            'my_project',\n            '--crawler-type',\n            'playwright',\n            '--http-client',\n            'httpx',\n            '--package-manager',\n            'pip',\n            '--start-url',\n            'https://yr.no',\n            '--no-apify',\n            '--no-install',\n        ],\n    )\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'pip',\n            'crawler_type': 'playwright',\n            'http_client': 'httpx',\n            'start_url': 'https://yr.no',\n            'enable_apify_integration': False,\n            'install_project': False,\n        },\n    )\n\n\ndef test_create_existing_folder(\n    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory\n) -> None:\n    mock_input = iter(\n        [\n            *'my_project',\n            readchar.key.ENTER,\n        ]\n    )\n    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))\n\n    tmp = tmp_path_factory.mktemp('workdir')\n    os.chdir(tmp)\n    (tmp / 'existing_project').mkdir()\n\n    result = runner.invoke(\n        crawlee._cli.cli,\n        [\n            'create',\n            'existing_project',\n            '--crawler-type',\n            
'playwright',\n            '--http-client',\n            'httpx',\n            '--package-manager',\n            'pip',\n            '--start-url',\n            'https://yr.no',\n            '--no-apify',\n            '--install',\n        ],\n    )\n    assert 'existing_project already exists' in result.output\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'pip',\n            'crawler_type': 'playwright',\n            'http_client': 'httpx',\n            'start_url': 'https://yr.no',\n            'enable_apify_integration': False,\n            'install_project': True,\n        },\n    )\n\n\ndef test_create_existing_folder_interactive(\n    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory\n) -> None:\n    mock_input = iter(\n        [\n            *'existing_project',\n            readchar.key.ENTER,\n            *'my_project',\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n        ]\n    )\n    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))\n\n    tmp = tmp_path_factory.mktemp('workdir')\n    os.chdir(tmp)\n    (tmp / 'existing_project').mkdir()\n\n    result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright'])\n    assert 'existing_project already exists' in result.output\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'poetry',\n            'crawler_type': 'playwright',\n            'http_client': 'impit',\n            'start_url': 'https://crawlee.dev',\n            'enable_apify_integration': False,\n            
'install_project': True,\n        },\n    )\n\n\ndef test_create_existing_folder_interactive_multiple_attempts(\n    mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory\n) -> None:\n    mock_input = iter(\n        [\n            *'existing_project',\n            readchar.key.ENTER,\n            *'existing_project_2',\n            readchar.key.ENTER,\n            *'my_project',\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n            readchar.key.ENTER,\n        ]\n    )\n    monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))\n\n    tmp = tmp_path_factory.mktemp('workdir')\n    os.chdir(tmp)\n    (tmp / 'existing_project').mkdir()\n    (tmp / 'existing_project_2').mkdir()\n\n    result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright'])\n    assert 'existing_project already exists' in result.output\n\n    mock_cookiecutter.assert_called_with(\n        template=ANY,\n        no_input=True,\n        extra_context={\n            'project_name': 'my_project',\n            'package_manager': 'poetry',\n            'crawler_type': 'playwright',\n            'http_client': 'impit',\n            'start_url': 'https://crawlee.dev',\n            'enable_apify_integration': False,\n            'install_project': True,\n        },\n    )\n"
  },
  {
    "path": "tests/unit/test_configuration.py",
    "content": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING\n\nfrom anyio import Path as AnyioPath\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.crawlers import HttpCrawler, HttpCrawlingContext\nfrom crawlee.statistics import Statistics\nfrom crawlee.storage_clients import MemoryStorageClient\nfrom crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient\n\nif TYPE_CHECKING:\n    from pathlib import Path\n\n    from yarl import URL\n\n\ndef test_global_configuration_works() -> None:\n    assert (\n        Configuration.get_global_configuration()\n        is Configuration.get_global_configuration()\n        is service_locator.get_configuration()\n        is service_locator.get_configuration()\n    )\n\n\ndef test_global_configuration_works_reversed() -> None:\n    assert (\n        service_locator.get_configuration()\n        is service_locator.get_configuration()\n        is Configuration.get_global_configuration()\n        is Configuration.get_global_configuration()\n    )\n\n\nasync def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None:\n    \"\"\"Make the Crawler use MemoryStorageClient which can't persist state.\"\"\"\n    service_locator.set_configuration(Configuration(storage_dir=str(tmp_path)))\n    crawler = HttpCrawler(storage_client=MemoryStorageClient())\n\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        await context.push_data({'url': context.request.url})\n\n    await crawler.run([str(server_url)])\n\n    # Verify that no files were created in the storage directory.\n    content = [path async for path in AnyioPath(tmp_path).iterdir()]\n    assert content == [], 'Expected the storage directory to be empty, but it is not.'\n\n\nasync def test_storage_persisted_with_explicit_statistics_with_persistable_storage(\n    tmp_path: 
Path, server_url: URL\n) -> None:\n    \"\"\"Make the Crawler use MemoryStorageClient which can't persist state,\n    but pass explicit statistics to it which will use global FileSystemStorageClient() that can persist state.\"\"\"\n\n    configuration = Configuration(storage_dir=str(tmp_path))\n    service_locator.set_configuration(configuration)\n    service_locator.set_storage_client(FileSystemStorageClient())\n\n    crawler = HttpCrawler(\n        storage_client=MemoryStorageClient(), statistics=Statistics.with_default_state(persistence_enabled=True)\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        await context.push_data({'url': context.request.url})\n\n    await crawler.run([str(server_url)])\n\n    # Verify that files were created in the storage directory.\n    content = [path async for path in AnyioPath(tmp_path).iterdir()]\n    assert content != [], 'Expected the storage directory to contain files, but it does not.'\n\n\nasync def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None:\n    configuration = Configuration(\n        storage_dir=str(tmp_path),\n    )\n\n    storage_client = FileSystemStorageClient()\n\n    crawler = HttpCrawler(\n        configuration=configuration,\n        storage_client=storage_client,\n    )\n\n    @crawler.router.default_handler\n    async def default_handler(context: HttpCrawlingContext) -> None:\n        await context.push_data({'url': context.request.url})\n\n    await crawler.run([str(server_url)])\n\n    # Verify that files were created in the storage directory.\n    content = [path async for path in AnyioPath(tmp_path).iterdir()]\n    assert content != [], 'Expected the storage directory to contain files, but it does not.'\n"
  },
  {
    "path": "tests/unit/test_log_config.py",
    "content": "from __future__ import annotations\n\nimport logging\nimport sys\n\nimport pytest\n\nfrom crawlee._log_config import CrawleeLogFormatter\n\n\ndef get_log_record(level: int, msg: str, exc_info: logging._SysExcInfoType | None = None) -> logging.LogRecord:\n    return logging.LogRecord(\n        name='test',\n        level=level,\n        pathname=__file__,\n        lineno=0,\n        msg=msg,\n        args=(),\n        exc_info=exc_info,\n    )\n\n\n@pytest.mark.parametrize(\n    ('level', 'msg', 'expected'),\n    [\n        (logging.DEBUG, 'Debug message', '\\x1b[90m[test]\\x1b[0m \\x1b[34mDEBUG\\x1b[0m Debug message'),\n        (logging.INFO, 'Info message', '\\x1b[90m[test]\\x1b[0m \\x1b[32mINFO \\x1b[0m Info message'),\n        (logging.WARNING, 'Warning message', '\\x1b[90m[test]\\x1b[0m \\x1b[33mWARN \\x1b[0m Warning message'),\n        (logging.ERROR, 'Error message', '\\x1b[90m[test]\\x1b[0m \\x1b[31mERROR\\x1b[0m Error message'),\n    ],\n    ids=['debug', 'info', 'warning', 'error'],\n)\ndef test_formatted_message(level: int, msg: str, expected: str) -> None:\n    formatter = CrawleeLogFormatter()\n    record = get_log_record(level, msg)\n    formatted_message = formatter.format(record)\n    assert formatted_message == expected\n\n\ndef test_formatting_with_exception() -> None:\n    formatter = CrawleeLogFormatter()\n    try:\n        raise ValueError('This is a test exception')\n\n    except ValueError:\n        exc_info = sys.exc_info()\n        record = get_log_record(logging.ERROR, 'Exception occurred', exc_info=exc_info)\n        formatted_message = formatter.format(record)\n\n        assert '\\x1b[90m[test]\\x1b[0m \\x1b[31mERROR\\x1b[0m Exception occurred' in formatted_message\n        assert 'ValueError: This is a test exception' in formatted_message\n\n\ndef test_formatter_without_name() -> None:\n    formatter = CrawleeLogFormatter(include_logger_name=False)\n    record = get_log_record(logging.INFO, 'Info message without name')\n  
  formatted_message = formatter.format(record)\n    assert formatted_message == '\\x1b[32mINFO \\x1b[0m Info message without name'\n"
  },
  {
    "path": "tests/unit/test_router.py",
    "content": "from __future__ import annotations\n\nimport logging\nfrom unittest.mock import AsyncMock, Mock\n\nimport pytest\n\nfrom crawlee import Request\nfrom crawlee._types import BasicCrawlingContext\nfrom crawlee.router import Router\nfrom crawlee.sessions import Session\n\n\nclass MockContext(BasicCrawlingContext):\n    def __init__(self, *, label: str | None) -> None:\n        super().__init__(\n            request=Request.from_url(url='https://example.com/', user_data={'label': label}),\n            session=Session(),\n            send_request=AsyncMock(),\n            add_requests=AsyncMock(),\n            proxy_info=AsyncMock(),\n            push_data=AsyncMock(),\n            use_state=AsyncMock(),\n            get_key_value_store=AsyncMock(),\n            log=logging.getLogger(),\n        )\n\n\nasync def test_router_no_handlers() -> None:\n    router = Router[MockContext]()\n\n    with pytest.raises(RuntimeError):\n        await router(MockContext(label=None))\n\n\nasync def test_router_no_default_handler() -> None:\n    router = Router[MockContext]()\n    mock_handler = Mock()\n\n    @router.handler('A')\n    async def handler_a(_context: MockContext) -> None:\n        mock_handler()\n\n    with pytest.raises(RuntimeError):\n        await router(MockContext(label='B'))\n\n    mock_handler.assert_not_called()\n\n\nasync def test_router_default_handler_invoked() -> None:\n    router = Router[MockContext]()\n    mock_default_handler = Mock()\n    mock_handler_a = Mock()\n\n    @router.handler('A')\n    async def handler_a(_context: MockContext) -> None:\n        mock_handler_a()\n\n    @router.default_handler\n    async def default_handler(_context: MockContext) -> None:\n        mock_default_handler()\n\n    await router(MockContext(label='B'))\n\n    mock_default_handler.assert_called()\n    mock_handler_a.assert_not_called()\n\n\nasync def test_router_specific_handler_invoked() -> None:\n    router = Router[MockContext]()\n    
mock_default_handler = Mock()\n    mock_handler_a = Mock()\n    mock_handler_b = Mock()\n\n    @router.handler('A')\n    async def handler_a(_context: MockContext) -> None:\n        mock_handler_a()\n\n    @router.handler('B')\n    async def handler_b(_context: MockContext) -> None:\n        mock_handler_b()\n\n    @router.default_handler\n    async def default_handler(_context: MockContext) -> None:\n        mock_default_handler()\n\n    await router(MockContext(label='B'))\n\n    mock_default_handler.assert_not_called()\n    mock_handler_a.assert_not_called()\n    mock_handler_b.assert_called()\n\n\nasync def test_router_handler_not_nullified() -> None:\n    router = Router[MockContext]()\n\n    @router.handler('A')\n    async def handler_a(_context: MockContext) -> None:\n        pass\n\n    assert handler_a is not None\n\n\nasync def test_router_multi_labelled_handler() -> None:\n    router = Router[MockContext]()\n    mock_handler = Mock()\n\n    @router.handler('A')\n    @router.handler('B')\n    async def handler(_context: MockContext) -> None:\n        mock_handler(_context.request.label)\n\n    await router(MockContext(label='A'))\n    mock_handler.assert_called_with('A')\n    await router(MockContext(label='B'))\n    mock_handler.assert_called_with('B')\n    assert mock_handler.call_count == 2\n"
  },
  {
    "path": "tests/unit/test_service_locator.py",
    "content": "from __future__ import annotations\n\nimport pytest\n\nfrom crawlee import service_locator\nfrom crawlee.configuration import Configuration\nfrom crawlee.errors import ServiceConflictError\nfrom crawlee.events import LocalEventManager\nfrom crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient\n\n\ndef test_default_configuration() -> None:\n    default_config = Configuration()\n    config = service_locator.get_configuration()\n    assert config == default_config  # == because these are in fact different instances, which should be fine\n\n\ndef test_custom_configuration() -> None:\n    custom_config = Configuration(default_browser_path='custom_path')\n    service_locator.set_configuration(custom_config)\n    config = service_locator.get_configuration()\n    assert config is custom_config\n\n\ndef test_configuration_overwrite_not_possible() -> None:\n    default_config = Configuration()\n    service_locator.set_configuration(default_config)\n\n    custom_config = Configuration(default_browser_path='custom_path')\n    with pytest.raises(ServiceConflictError):\n        service_locator.set_configuration(custom_config)\n\n\ndef test_configuration_conflict() -> None:\n    service_locator.get_configuration()\n    custom_config = Configuration(default_browser_path='custom_path')\n\n    with pytest.raises(ServiceConflictError, match=r'Configuration is already in use.'):\n        service_locator.set_configuration(custom_config)\n\n\ndef test_default_event_manager() -> None:\n    default_event_manager = service_locator.get_event_manager()\n    assert isinstance(default_event_manager, LocalEventManager)\n\n\ndef test_custom_event_manager() -> None:\n    custom_event_manager = LocalEventManager()\n    service_locator.set_event_manager(custom_event_manager)\n    event_manager = service_locator.get_event_manager()\n    assert event_manager is custom_event_manager\n\n\ndef test_event_manager_overwrite_not_possible() -> None:\n    
custom_event_manager = LocalEventManager()\n    service_locator.set_event_manager(custom_event_manager)\n\n    another_custom_event_manager = LocalEventManager()\n    with pytest.raises(ServiceConflictError):\n        service_locator.set_event_manager(another_custom_event_manager)\n\n\ndef test_event_manager_conflict() -> None:\n    service_locator.get_event_manager()\n    custom_event_manager = LocalEventManager()\n\n    with pytest.raises(ServiceConflictError, match=r'EventManager is already in use.'):\n        service_locator.set_event_manager(custom_event_manager)\n\n\ndef test_default_storage_client() -> None:\n    default_storage_client = service_locator.get_storage_client()\n    assert isinstance(default_storage_client, FileSystemStorageClient)\n\n\ndef test_custom_storage_client() -> None:\n    custom_storage_client = MemoryStorageClient()\n    service_locator.set_storage_client(custom_storage_client)\n    storage_client = service_locator.get_storage_client()\n    assert storage_client is custom_storage_client\n\n\ndef test_storage_client_overwrite_not_possible() -> None:\n    custom_storage_client = MemoryStorageClient()\n    service_locator.set_storage_client(custom_storage_client)\n\n    another_custom_storage_client = MemoryStorageClient()\n    with pytest.raises(ServiceConflictError):\n        service_locator.set_storage_client(another_custom_storage_client)\n\n\ndef test_storage_client_conflict() -> None:\n    service_locator.get_storage_client()\n    custom_storage_client = MemoryStorageClient()\n\n    with pytest.raises(ServiceConflictError, match=r'StorageClient is already in use.'):\n        service_locator.set_storage_client(custom_storage_client)\n"
  },
  {
    "path": "tests/unit/utils.py",
    "content": "import sys\n\nimport pytest\n\nrun_alone_on_mac = pytest.mark.run_alone if sys.platform == 'darwin' else lambda x: x\n"
  },
  {
    "path": "typos.toml",
    "content": "# Configuration for typos spell checker\n# https://github.com/crate-ci/typos\n\n[default]\nextend-ignore-re = [\n    \"https?://[^\\\\s]+\", # Ignore URLs\n    \"'gASV[^']+\",       # Ignore base64-encoded pickle data\n]\n\n[files]\n# Extend the default exclude list\nextend-exclude = [\n    \"*.lock\",\n    \"*.min.js\",\n    \"*.min.css\",\n    \"CHANGELOG.md\",\n]\n\n[default.extend-identifiers]\n# Add project-specific identifiers that should not be treated as typos\nser_json_inf_nan = \"ser_json_inf_nan\" # Pydantic config parameter\nasend = \"asend\" # Python async generator method\n\n[default.extend-words]\n# Add project-specific words that should not be treated as typos\nmke = \"mke\" # Sennheiser MKE product name\nconsts = \"consts\"  # Common abbreviation for \"constants\"\n"
  },
  {
    "path": "website/.eslintrc.json",
    "content": "{\n    \"extends\": [\n        \"@apify/eslint-config-ts\",\n        \"plugin:react/recommended\",\n        \"plugin:react-hooks/recommended\"\n    ],\n    \"parserOptions\": {\n        \"project\": \"./tsconfig.eslint.json\",\n        \"ecmaFeatures\": {\n            \"jsx\": true\n        },\n        \"ecmaVersion\": 2020\n    },\n    \"env\": {\n        \"browser\": true\n    },\n    \"settings\": {\n        \"react\": {\n            \"version\": \"detect\"\n        }\n    },\n    \"rules\": {\n        \"quote-props\": [\"error\", \"consistent\"],\n        \"no-void\": 0\n    },\n    \"root\": true\n}\n"
  },
  {
    "path": "website/.yarnrc.yml",
    "content": "nodeLinker: node-modules\nenableGlobalCache: true\n"
  },
  {
    "path": "website/babel.config.js",
    "content": "module.exports = {\n    presets: [require.resolve('@docusaurus/core/lib/babel/preset')],\n};\n"
  },
  {
    "path": "website/build_api_reference.sh",
    "content": "#!/bin/bash\n\n# Generate import shortcuts from the modules\npython generate_module_shortcuts.py\n"
  },
  {
    "path": "website/docusaurus.config.js",
    "content": "/* eslint-disable global-require */\nconst path = require('path');\n\nconst { externalLinkProcessor } = require('./tools/utils/externalLink');\n\nconst GROUP_ORDER = [\n    'Autoscaling',\n    'Browser management',\n    'Configuration',\n    'Crawlers',\n    'Crawling contexts',\n    'Errors',\n    'Event data',\n    'Event managers',\n    'Functions',\n    'HTTP clients',\n    'HTTP parsers',\n    'Request loaders',\n    'Session management',\n    'Statistics',\n    'Storage clients',\n    'Storage data',\n    'Storages',\n    'Other',\n];\n\nconst groupSort = (g1, g2) => {\n    if (GROUP_ORDER.includes(g1) && GROUP_ORDER.includes(g2)) {\n        return GROUP_ORDER.indexOf(g1) - GROUP_ORDER.indexOf(g2);\n    }\n    return g1.localeCompare(g2);\n};\n\n/** @type {Partial<import('@docusaurus/types').DocusaurusConfig>} */\nmodule.exports = {\n    title: 'Crawlee for Python · Fast, reliable Python web crawlers.',\n    url: 'https://crawlee.dev',\n    baseUrl: '/python/',\n    trailingSlash: false,\n    organizationName: 'apify',\n    projectName: 'crawlee-python',\n    scripts: [\n        '/python/js/custom.js',\n        '/crawlee-python/js/custom.js',\n    ],\n    githubHost: 'github.com',\n    future: {\n        experimental_faster: true,\n        v4: {\n            removeLegacyPostBuildHeadAttribute: true,\n            useCssCascadeLayers: false, // this breaks styles on homepage and link colors everywhere\n        },\n    },\n    headTags: [\n        // Intercom messenger\n        {\n            tagName: 'script',\n            innerHTML: `window.intercomSettings={api_base:\"https://api-iam.intercom.io\",app_id:\"kod1r788\"};`,\n            attributes: {},\n        },\n        // Intercom messenger\n        {\n            tagName: 'script',\n            innerHTML: `(function(){var w=window;var ic=w.Intercom;if(typeof ic===\"function\"){ic('reattach_activator');ic('update',w.intercomSettings);}else{var d=document;var 
i=function(){i.c(arguments);};i.q=[];i.c=function(args){i.q.push(args);};w.Intercom=i;var l=function(){var s=d.createElement('script');s.type='text/javascript';s.async=true;s.src='https://widget.intercom.io/widget/kod1r788';var x=d.getElementsByTagName('script')[0];x.parentNode.insertBefore(s,x);};if(document.readyState==='complete'){l();}else if(w.attachEvent){w.attachEvent('onload',l);}else{w.addEventListener('load',l,false);}}})()`,\n            attributes: {},\n        },\n    ],\n    favicon: 'img/favicon.ico',\n    customFields: {\n        markdownOptions: {\n            html: true,\n        },\n        gaGtag: true,\n        repoUrl: 'https://github.com/apify/crawlee-python',\n    },\n    onBrokenLinks: 'throw',\n    markdown: {\n        mermaid: true,\n        hooks: {\n            onBrokenMarkdownLinks: 'throw',\n        },\n    },\n    themes: [\n        '@docusaurus/theme-mermaid',\n    ],\n    presets: /** @type {import('@docusaurus/types').PresetConfig[]} */ ([\n        [\n            '@docusaurus/preset-classic',\n            /** @type {import('@docusaurus/preset-classic').Options} */\n            ({\n                docs: {\n                    showLastUpdateAuthor: true,\n                    showLastUpdateTime: true,\n                    path: '../docs',\n                    sidebarPath: './sidebars.js',\n                    rehypePlugins: [externalLinkProcessor],\n                    // disableVersioning: true,\n                    editUrl: (doc) => {\n                        return `https://github.com/apify/crawlee-python/edit/master/website/${doc.versionDocsDirPath}/${doc.docPath}`;\n                    },\n                },\n                theme: {\n                    customCss: '/src/css/custom.css',\n                },\n            }),\n        ],\n    ]),\n    plugins: [\n        [\n            '@apify/docusaurus-plugin-typedoc-api',\n            {\n                projectRoot: '.',\n                changelogs: false,\n                
readmes: false,\n                packages: [{ path: '.' }],\n                typedocOptions: {\n                    excludeExternals: false,\n                },\n                sortSidebar: groupSort,\n                routeBasePath: 'api',\n                pythonOptions: {\n                    pythonModulePath: path.join(__dirname, '../src/crawlee'),\n                    moduleShortcutsPath: path.join(__dirname, 'module_shortcuts.json'),\n                },\n            },\n        ],\n        // [\n        //     '@docusaurus/plugin-client-redirects',\n        //     {\n        //         redirects: [\n        //             {\n        //                 from: '/docs',\n        //                 to: '/docs/quick-start',\n        //             },\n        //             {\n        //                 from: '/docs/next',\n        //                 to: '/docs/next/quick-start',\n        //             },\n        //             {\n        //                 from: '/docs/guides/environment-variables',\n        //                 to: '/docs/guides/configuration',\n        //             },\n        //             {\n        //                 from: '/docs/guides/getting-started',\n        //                 to: '/docs/introduction',\n        //             },\n        //             {\n        //                 from: '/docs/guides/apify-platform',\n        //                 to: '/docs/deployment/apify-platform',\n        //             },\n        //         ],\n        //         createRedirects(existingPath) {\n        //             if (!existingPath.endsWith('/')) {\n        //                 return `${existingPath}/`;\n        //             }\n        //\n        //             return undefined; // Return a falsy value: no redirect created\n        //         },\n        //     },\n        // ],\n        [\n            'docusaurus-gtm-plugin',\n            {\n                id: 'GTM-5P7MCS7',\n            },\n        ],\n        [\n            
'@signalwire/docusaurus-plugin-llms-txt',\n            {\n                enableDescriptions: false,\n                content: {\n                    includeVersionedDocs: false,\n                    enableLlmsFullTxt: true,\n                    relativePaths: false,\n                },\n            },\n        ],\n        async function runnableCodeBlock() {\n            return {\n                name: 'runnable-code-block',\n                configureWebpack() {\n                    return {\n                        resolveLoader: {\n                            alias: {\n                                'roa-loader': require.resolve(`${__dirname}/roa-loader/`),\n                            },\n                        },\n                    };\n                },\n            };\n        },\n        // skipping svgo for animated crawlee logo\n        async function doNotUseSVGO() {\n            return {\n                name: 'docusaurus-svgo',\n                configureWebpack(config) {\n                    // find the svg rule\n                    const svgRule = config.module.rules.find((r) => typeof r === 'object' && r.test.toString() === '/\\\\.svg$/i');\n\n                    // find the svgr loader\n                    const svgrLoader = svgRule?.oneOf?.[0];\n\n                    // make copy of svgr loader and disable svgo\n                    const svgrLoaderCopy = JSON.parse(JSON.stringify(svgrLoader));\n\n                    // include only animated logo\n                    svgrLoaderCopy.include = /animated-crawlee-logo/;\n\n                    // turn off svgo\n                    svgrLoaderCopy.use[0].options.svgo = false;\n\n                    // insert the copy after the original svgr loader\n                    svgRule.oneOf.splice(1, 0, svgrLoaderCopy);\n\n                    // exclude animated logo from the first svgr loader (with svgo enabled)\n                    svgrLoader.exclude = /animated-crawlee-logo/;\n\n                    return 
{\n                        mergeStrategy: {\n                            'module.rules': 'replace',\n                        },\n                        module: {\n                            rules: config.module.rules,\n                        },\n                    };\n                },\n            };\n        },\n        [\n            path.resolve(__dirname, 'src/plugins/docusaurus-plugin-segment'),\n            {\n                writeKey: process.env.SEGMENT_TOKEN,\n                allowedInDev: false,\n            },\n        ],\n    ],\n    themeConfig:\n    /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ ({\n        docs: {\n            versionPersistence: 'localStorage',\n            sidebar: {\n                hideable: true,\n            },\n        },\n        navbar: {\n            hideOnScroll: true,\n            logo: {\n                src: 'img/crawlee-python-light.svg',\n                srcDark: 'img/crawlee-python-dark.svg',\n            },\n            title: 'Crawlee for Python',\n            items: [\n                {\n                    type: 'doc',\n                    docId: 'quick-start/quick-start',\n                    label: 'Docs',\n                    position: 'left',\n                },\n                {\n                    type: 'doc',\n                    docId: '/examples',\n                    label: 'Examples',\n                    position: 'left',\n                },\n                {\n                    to: '/api',\n                    label: 'API',\n                    position: 'left',\n                    activeBaseRegex: 'api/(?!.*/changelog)',\n                },\n                {\n                    type: 'doc',\n                    label: 'Changelog',\n                    docId: 'changelog',\n                    className: 'changelog',\n                },\n                {\n                    href: 'https://crawlee.dev/blog',\n                    target: '_self',\n                    rel: 
'dofollow',\n                    label: 'Blog',\n                    position: 'left',\n                },\n            ],\n        },\n        colorMode: {\n            defaultMode: 'light',\n            disableSwitch: false,\n            respectPrefersColorScheme: true,\n        },\n        prism: {\n            defaultLanguage: 'typescript',\n            theme: require('prism-react-renderer').themes.github,\n            darkTheme: require('prism-react-renderer').themes.dracula,\n            additionalLanguages: ['docker', 'log', 'bash', 'diff', 'json'],\n        },\n        metadata: [\n            // eslint-disable-next-line max-len\n            { name: 'description', content: `Crawlee helps you build and maintain your Python crawlers. It's open source and modern, with type hints for Python to help you catch bugs early.` },\n            // eslint-disable-next-line max-len\n            { name: 'og:description', content: `Crawlee helps you build and maintain your Python crawlers. It's open source and modern, with type hints for Python to help you catch bugs early.` },\n        ],\n        image: 'img/crawlee-python-og.png',\n        footer: {\n            links: [\n                {\n                    title: 'Docs',\n                    items: [\n                        {\n                            label: 'Guides',\n                            to: 'docs/guides',\n                        },\n                        {\n                            label: 'Examples',\n                            to: 'docs/examples',\n                        },\n                        {\n                            label: 'API reference',\n                            to: 'api',\n                        },\n                        {\n                            label: 'Changelog',\n                            to: 'docs/changelog',\n                        },\n                    ],\n                },\n                {\n                    title: 'Product',\n                    
items: [\n                        {\n                            label: 'Discord',\n                            href: 'https://discord.com/invite/jyEM2PRvMU',\n                        },\n                        {\n                            label: 'Stack Overflow',\n                            href: 'https://stackoverflow.com/questions/tagged/crawlee-python',\n                        },\n                        {\n                            label: 'Twitter',\n                            href: 'https://twitter.com/apify',\n                        },\n                        {\n                            label: 'YouTube',\n                            href: 'https://www.youtube.com/apify',\n                        },\n                    ],\n                },\n                {\n                    title: 'More',\n                    items: [\n                        {\n                            label: 'Apify platform',\n                            href: 'https://apify.com',\n                        },\n                        {\n                            label: 'Docusaurus',\n                            href: 'https://docusaurus.io',\n                        },\n                        {\n                            label: 'GitHub',\n                            href: 'https://github.com/apify/crawlee-python',\n                        },\n                    ],\n                },\n            ],\n        },\n        algolia: {\n            appId: '5JC94MPMLY',\n            apiKey: '878493fcd7001e3c179b6db6796a999b', // search only (public) API key\n            indexName: 'crawlee_python',\n            placeholder: 'Search documentation',\n            algoliaOptions: {\n                facetFilters: ['version:VERSION'],\n            },\n            translations: {\n                button: {\n                    buttonText: 'Search documentation...',\n                },\n            },\n        },\n    }),\n};\n"
  },
  {
    "path": "website/generate_module_shortcuts.py",
    "content": "#!/usr/bin/env python3\n\nfrom __future__ import annotations\n\nimport importlib\nimport inspect\nimport json\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from types import ModuleType\n\n\ndef get_module_shortcuts(module: ModuleType, parent_classes: list | None = None) -> dict:\n    \"\"\"Traverse a module and its submodules to identify and register shortcuts for classes.\"\"\"\n    shortcuts = {}\n\n    if parent_classes is None:\n        parent_classes = []\n\n    parent_module_name = '.'.join(module.__name__.split('.')[:-1])\n    module_classes = []\n\n    for classname, cls in inspect.getmembers(module, inspect.isclass):\n        module_classes.append(cls)\n        if cls in parent_classes:\n            shortcuts[f'{module.__name__}.{classname}'] = f'{parent_module_name}.{classname}'\n\n    for _, submodule in inspect.getmembers(module, inspect.ismodule):\n        if submodule.__name__.startswith('apify'):\n            shortcuts.update(get_module_shortcuts(submodule, module_classes))\n\n    return shortcuts\n\n\ndef resolve_shortcuts(shortcuts: dict) -> None:\n    \"\"\"Resolve linked shortcuts.\n\n    For example, if there are shortcuts A -> B and B -> C, resolve them to A -> C.\n    \"\"\"\n    for source, target in shortcuts.items():\n        while target in shortcuts:\n            shortcuts[source] = shortcuts[target]\n            target = shortcuts[target]  # noqa: PLW2901\n\n\nshortcuts = {}\nfor module_name in ['crawlee']:\n    try:\n        module = importlib.import_module(module_name)\n        module_shortcuts = get_module_shortcuts(module)\n        shortcuts.update(module_shortcuts)\n    except ModuleNotFoundError:  # noqa: PERF203\n        pass\n\nresolve_shortcuts(shortcuts)\n\nwith Path('module_shortcuts.json').open('w', encoding='utf-8') as shortcuts_file:\n    json.dump(shortcuts, shortcuts_file, indent=4, sort_keys=True)\n"
  },
  {
    "path": "website/package.json",
    "content": "{\n    \"name\": \"crawlee\",\n    \"scripts\": {\n        \"examples\": \"docusaurus-examples\",\n        \"postinstall\": \"npx patch-package\",\n        \"start\": \"rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && docusaurus start\",\n        \"start:fast\": \"rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && CRAWLEE_DOCS_FAST=1 docusaurus start\",\n        \"build\": \"rimraf .docusaurus && cp ../CHANGELOG.md ../docs/changelog.md && node --max_old_space_size=16000 node_modules/@docusaurus/core/bin/docusaurus.mjs build\",\n        \"publish-gh-pages\": \"docusaurus-publish\",\n        \"write-translations\": \"docusaurus write-translations\",\n        \"version\": \"docusaurus version\",\n        \"rename-version\": \"docusaurus rename-version\",\n        \"prettify\": \"prettier --write --config ./tools/docs-prettier.config.js ../docs/guides/*.md\",\n        \"swizzle\": \"docusaurus swizzle\",\n        \"deploy\": \"rimraf .docusaurus && node --max_old_space_size=16000 node_modules/@docusaurus/core/bin/docusaurus.mjs deploy\",\n        \"docusaurus\": \"docusaurus\",\n        \"clean\": \"rimraf .docusaurus build\",\n        \"lint\": \"yarn lint:code\",\n        \"lint:fix\": \"yarn lint:code:fix\",\n        \"lint:code\": \"eslint .\",\n        \"lint:code:fix\": \"eslint . 
--fix\"\n    },\n    \"dependencies\": {\n        \"@apify/docusaurus-plugin-typedoc-api\": \"^5.1.0\",\n        \"@apify/utilities\": \"^2.8.0\",\n        \"@docusaurus/core\": \"^3.9.2\",\n        \"@docusaurus/faster\": \"^3.9.2\",\n        \"@docusaurus/mdx-loader\": \"^3.9.2\",\n        \"@docusaurus/plugin-client-redirects\": \"^3.9.2\",\n        \"@docusaurus/preset-classic\": \"^3.9.2\",\n        \"@docusaurus/theme-mermaid\": \"^3.9.2\",\n        \"@giscus/react\": \"^3.0.0\",\n        \"@mdx-js/react\": \"^3.0.1\",\n        \"@mermaid-js/layout-elk\": \"^0.2.0\",\n        \"@signalwire/docusaurus-plugin-llms-txt\": \"^1.2.1\",\n        \"axios\": \"^1.5.0\",\n        \"buffer\": \"^6.0.3\",\n        \"clsx\": \"^2.0.0\",\n        \"crypto-browserify\": \"^3.12.0\",\n        \"docusaurus-gtm-plugin\": \"^0.0.2\",\n        \"prism-react-renderer\": \"^2.1.0\",\n        \"process\": \"^0.11.10\",\n        \"prop-types\": \"^15.8.1\",\n        \"raw-loader\": \"^4.0.2\",\n        \"react\": \"^19.0.0\",\n        \"react-dom\": \"^19.0.0\",\n        \"react-github-btn\": \"^1.4.0\",\n        \"react-lite-youtube-embed\": \"^3.0.0\",\n        \"stream-browserify\": \"^3.0.0\",\n        \"unist-util-visit\": \"^5.0.0\"\n    },\n    \"devDependencies\": {\n        \"@apify/eslint-config-ts\": \"^0.4.0\",\n        \"@apify/tsconfig\": \"^0.1.0\",\n        \"@apify/ui-icons\": \"^1.23.0\",\n        \"@docusaurus/module-type-aliases\": \"^3.9.2\",\n        \"@docusaurus/types\": \"^3.9.2\",\n        \"@types/react\": \"^19.0.0\",\n        \"@typescript-eslint/eslint-plugin\": \"^8.46.0\",\n        \"@typescript-eslint/parser\": \"^8.46.0\",\n        \"eslint\": \"^10.0.0\",\n        \"eslint-plugin-react\": \"^7.37.5\",\n        \"eslint-plugin-react-hooks\": \"^7.0.0\",\n        \"fs-extra\": \"^11.1.0\",\n        \"patch-package\": \"^8.0.0\",\n        \"path-browserify\": \"^1.0.1\",\n        \"prettier\": \"^3.0.0\",\n        \"rimraf\": \"^6.0.0\",\n        
\"typescript\": \"^5.9.3\"\n    },\n    \"packageManager\": \"yarn@4.13.0\"\n}\n"
  },
  {
    "path": "website/patches/@docusaurus+core+3.4.0.patch",
    "content": "diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\nindex 903f8dc..b6b60bf 100644\n--- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\n+++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\n@@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) {\n         window.scrollTo(0, 0);\n     }\n     else {\n-        const id = decodeURIComponent(hash.substring(1));\n-        const element = document.getElementById(id);\n-        element?.scrollIntoView();\n+        setTimeout(() => {\n+            const id = decodeURIComponent(hash.substring(1));\n+            const element = document.getElementById(id);\n+            element?.scrollIntoView();\n+        }, 100);\n     }\n }\n function ClientLifecyclesDispatcher({ children, location, previousLocation, }) {\n"
  },
  {
    "path": "website/patches/@docusaurus+core+3.5.2.patch",
    "content": "diff --git a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\nindex 903f8dc..b6b60bf 100644\n--- a/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\n+++ b/node_modules/@docusaurus/core/lib/client/ClientLifecyclesDispatcher.js\n@@ -30,9 +30,11 @@ function scrollAfterNavigation({ location, previousLocation, }) {\n         window.scrollTo(0, 0);\n     }\n     else {\n-        const id = decodeURIComponent(hash.substring(1));\n-        const element = document.getElementById(id);\n-        element?.scrollIntoView();\n+        setTimeout(() => {\n+            const id = decodeURIComponent(hash.substring(1));\n+            const element = document.getElementById(id);\n+            element?.scrollIntoView();\n+        }, 100);\n     }\n }\n function ClientLifecyclesDispatcher({ children, location, previousLocation, }) {\n"
  },
  {
    "path": "website/roa-loader/index.js",
    "content": "const { createHash } = require('node:crypto');\nconst { inspect } = require('node:util');\n\nconst { urlToRequest } = require('loader-utils');\n\nconst signingUrl = new URL('https://api.apify.com/v2/tools/encode-and-sign');\nsigningUrl.searchParams.set('token', process.env.APIFY_SIGNING_TOKEN);\nconst queue = [];\nconst cache = {};\nlet working = false;\n\nfunction hash(source) {\n    return createHash('sha1').update(source).digest('hex');\n}\n\nasync function getHash(source) {\n    const cacheKey = hash(source);\n\n    if (cache[cacheKey]) {\n        return cache[cacheKey];\n    }\n\n    const memory = source.match(/playwright|puppeteer/i) ? 4096 : 1024;\n    const res = await (await fetch(signingUrl, {\n        method: 'POST',\n        body: JSON.stringify({\n            input: JSON.stringify({ code: source }),\n            options: {\n                build: 'latest',\n                contentType: 'application/json; charset=utf-8',\n                memory,\n                timeout: 180,\n            },\n        }),\n        headers: {\n            'Content-Type': 'application/json; charset=utf-8',\n        },\n    }));\n\n    if (!res.ok) {\n        console.error(`Signing failed: ${res.status} ${res.statusText}`, await res.text());\n        return 'invalid-token';\n    }\n\n    const body = await res.json();\n\n    if (!body.data || !body.data.encoded) {\n        console.error(`Signing failed:' ${inspect(body.error) || 'Unknown error'}`, body);\n        return 'invalid-token';\n    }\n\n    cache[cacheKey] = body.data.encoded;\n    await new Promise((resolve) => setTimeout(resolve, 100));\n\n    return body.data.encoded;\n}\n\nasync function encodeAndSign(source) {\n    if (!process.env.APIFY_SIGNING_TOKEN) {\n        return 'invalid-token';\n    }\n\n    if (working) {\n        return new Promise((resolve, reject) => {\n            queue.push(() => {\n                return getHash(source).then(resolve, reject);\n            });\n        });\n    
}\n\n    let res;\n\n    try {\n        working = true;\n        res = await getHash(source);\n\n        while (queue.length) {\n            await queue.shift()();\n        }\n    } finally {\n        working = false;\n    }\n\n    return res;\n}\n\nmodule.exports = async function (code) {\n    if (process.env.CRAWLEE_DOCS_FAST) {\n        return { code, hash: 'fast' };\n    }\n\n    console.log(`Signing ${urlToRequest(this.resourcePath)}...`, { working, queue: queue.length });\n    const codeHash = await encodeAndSign(code);\n    return { code, hash: codeHash };\n};\n"
  },
  {
    "path": "website/roa-loader/package.json",
    "content": "{\n  \"name\": \"roa-loader\",\n  \"version\": \"1.0.0\",\n  \"description\": \"\",\n  \"main\": \"index.js\",\n  \"scripts\": {\n    \"test\": \"echo \\\"Error: no test specified\\\" && exit 1\"\n  },\n  \"keywords\": [],\n  \"author\": \"\",\n  \"license\": \"ISC\",\n  \"dependencies\": {\n    \"loader-utils\": \"^3.2.1\"\n  }\n}\n"
  },
  {
    "path": "website/sidebars.js",
    "content": "module.exports = {\n    docs: [\n        'quick-start/quick-start',\n        {\n            type: 'category',\n            label: 'Introduction',\n            collapsed: false,\n            link: {\n                type: 'doc',\n                id: 'introduction/introduction',\n            },\n            items: [\n                'introduction/setting-up',\n                'introduction/first-crawler',\n                'introduction/adding-more-urls',\n                'introduction/real-world-project',\n                'introduction/crawling',\n                'introduction/scraping',\n                'introduction/saving-data',\n                'introduction/refactoring',\n                'introduction/deployment',\n            ],\n        },\n        {\n            type: 'category',\n            label: 'Guides',\n            collapsed: true,\n            link: {\n                type: 'generated-index',\n                title: 'Guides',\n                slug: '/guides',\n                keywords: ['guides'],\n            },\n            items: [\n                {\n                    type: 'autogenerated',\n                    dirName: 'guides',\n                },\n            ],\n        },\n        {\n            type: 'category',\n            label: 'Deployment',\n            collapsed: true,\n            link: {\n                type: 'generated-index',\n                title: 'Deployment guides',\n                description: 'Here you can find guides on how to deploy your crawlers to various cloud providers.',\n                slug: '/deployment',\n            },\n            items: [\n                {\n                    type: 'doc',\n                    id: 'deployment/apify-platform',\n                    label: 'Deploy on Apify',\n                },\n                {\n                    type: 'doc',\n                    id: 'deployment/aws-lambda',\n                    label: 'Deploy on AWS Lambda'\n                },\n            
    {\n                    type: 'category',\n                    label: 'Deploy to Google Cloud',\n                    items: [\n                        'deployment/gcp-cloud-run-functions',\n                        'deployment/gcp-cloud-run',\n                    ],\n                },\n            ],\n        },\n        {\n            type: 'category',\n            label: 'Examples',\n            collapsed: true,\n            link: {\n                type: 'generated-index',\n                title: 'Examples',\n                slug: '/examples',\n                keywords: ['examples'],\n            },\n            items: [\n                {\n                    type: 'autogenerated',\n                    dirName: 'examples',\n                },\n            ],\n        },\n        // {\n        //     type: 'category',\n        //     label: 'Experiments',\n        //     link: {\n        //         type: 'generated-index',\n        //         title: 'Experiments',\n        //         slug: '/experiments',\n        //         keywords: ['experiments', 'experimental-features'],\n        //     },\n        //     items: [\n        //         {\n        //             type: 'autogenerated',\n        //             dirName: 'experiments',\n        //         },\n        //     ],\n        // },\n        {\n            type: 'category',\n            label: 'Upgrading',\n            collapsed: true,\n            link: {\n                type: 'generated-index',\n                title: 'Upgrading',\n                slug: '/upgrading',\n                keywords: ['upgrading'],\n            },\n            items: [\n                {\n                    type: 'autogenerated',\n                    dirName: 'upgrading',\n                },\n            ],\n        },\n        {\n            type: 'doc',\n            label: 'Changelog',\n            id: 'changelog',\n        },\n    ],\n};\n"
  },
  {
    "path": "website/src/components/ApiLink.jsx",
    "content": "import React from 'react';\nimport Link from '@docusaurus/Link';\n// eslint-disable-next-line import/no-extraneous-dependencies\nimport { useDocsVersion } from '@docusaurus/theme-common/internal';\nimport useDocusaurusContext from '@docusaurus/useDocusaurusContext';\n\n// const pkg = require('../../../packages/crawlee/package.json');\n//\n// const [v1, v2] = pkg.version.split('.');\n// const stable = [v1, v2].join('.');\n\nconst ApiLink = ({ to, children }) => {\n    return (\n        <Link to={`/api/${to}`}>{children}</Link>\n    );\n\n    // const version = useDocsVersion();\n    // const { siteConfig } = useDocusaurusContext();\n    //\n    // // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) {\n    // if (siteConfig.presets[0][1].docs.disableVersioning) {\n    //     return (\n    //         <Link to={`/api/${to}`}>{children}</Link>\n    //     );\n    // }\n    //\n    // return (\n    //     <Link to={`/api/${version.version === 'current' ? 'next' : version.version}/${to}`}>{children}</Link>\n    // );\n};\n\nexport default ApiLink;\n"
  },
  {
    "path": "website/src/components/Button.jsx",
    "content": "import Link from '@docusaurus/Link';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './Button.module.css';\nimport CrawleeSvg from '../../static/img/crawlee-logo-monocolor.svg';\n\nexport default function Button({ children, to, withIcon, type = 'primary', className, isBig }) {\n    return (\n        <Link to={to} target=\"_self\" rel=\"dofollow\">\n            <span className={clsx(\n                className,\n                styles.button,\n                type === 'primary' && styles.buttonPrimary,\n                type === 'secondary' && styles.buttonSecondary,\n                isBig && styles.big,\n            )}>\n                {withIcon && <CrawleeSvg />}\n                {children}\n            </span>\n        </Link>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Button.module.css",
    "content": ".button {\n    display: inline-flex;\n    align-items: center;\n    text-align: center;\n    padding: 8px 16px;\n    border-radius: 8px;\n    font-family: var(--ifm-font-family-base);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 500;\n    line-height: 24px;\n    cursor: pointer;\n    transition: background-color 0.2s;\n\n    svg {\n        margin-right: 8px;\n    }\n}\n\n.buttonPrimary {\n    background-color: var(--color-black-action);\n    color: var(--color-text-on-primary);\n    border: none;\n\n    path {\n        stroke: var(--color-text-on-primary);\n        &:first-child {\n            fill: var(--color-text-on-primary);\n        }\n    }\n}\n\n.buttonPrimary:hover {\n    background-color: var(--color-primary-action-hover);\n}\n\n.buttonSecondary {\n    background-color: var(--color-background);\n    color: var(--color-text);\n    border: 1px solid var(--color-border);\n\n    path {\n        stroke: var(--color-black-action);\n        &:first-child {\n            fill: var(--color-black-action);\n        }\n    }\n}\n\n.buttonSecondary:hover {\n    border: 1px solid var(--color-text);\n}\n\n.big {\n    padding: 12px 24px;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .button {\n        width: auto;\n    }\n}\n"
  },
  {
    "path": "website/src/components/CopyButton.jsx",
    "content": "/* eslint-disable max-len */\nimport clsx from 'clsx';\nimport React, { useState } from 'react';\n\nimport styles from './CopyButton.module.css';\n\nexport default function CopyButton({ copyText, compact = false, className }) {\n    const [copied, setCopied] = useState(false);\n    const copy = async () => {\n        await navigator.clipboard.writeText(copyText);\n        setCopied(true);\n        setTimeout(() => setCopied(false), 2000);\n    };\n    return <button\n        type=\"button\"\n        aria-label=\"Copy code to clipboard\"\n        title=\"Copy\"\n        onClick={copy}\n        className={clsx(className, styles.copyButton, compact ? styles.copyButtonCompact : styles.copyButtonDefault)}\n    >\n        {copied\n            ? <svg width=\"20\" height=\"20\" viewBox=\"0 0 20 20\" xmlns=\"http://www.w3.org/2000/svg\">\n                <path fillRule=\"evenodd\" clipRule=\"evenodd\" d=\"M18.0303 5.09467C18.3232 5.38756 18.3232 5.86244 18.0303 6.15533L8.03033 16.1553C7.73744 16.4482 7.26256 16.4482 6.96967 16.1553L2.59467 11.7803C2.30178 11.4874 2.30178 11.0126 2.59467 10.7197C2.88756 10.4268 3.36244 10.4268 3.65533 10.7197L7.5 14.5643L16.9697 5.09467C17.2626 4.80178 17.7374 4.80178 18.0303 5.09467Z\" />\n            </svg>\n\n            : <svg width=\"20\" height=\"20\" viewBox=\"0 0 20 20\" xmlns=\"http://www.w3.org/2000/svg\">\n                <path\n                    fillRule=\"evenodd\"\n                    clipRule=\"evenodd\"\n                    d=\"M8.375 2.375C7.13236 2.375 6.125 3.38236 6.125 4.625V6.125H4.625C3.38236 6.125 2.375 7.13236 2.375 8.375V15.375C2.375 16.6176 3.38236 17.625 4.625 17.625H11.625C12.8676 17.625 13.875 16.6176 13.875 15.375V13.875H15.375C16.6176 13.875 17.625 12.8676 17.625 11.625V4.625C17.625 3.38236 16.6176 2.375 15.375 2.375H8.375ZM13.875 12.375H15.375C15.7892 12.375 16.125 12.0392 16.125 11.625V4.625C16.125 4.21079 15.7892 3.875 15.375 3.875H8.375C7.96079 3.875 7.625 4.21079 7.625 
4.625V6.125H11.625C12.8676 6.125 13.875 7.13236 13.875 8.375V12.375ZM4.625 7.625C4.21079 7.625 3.875 7.96079 3.875 8.375V15.375C3.875 15.7892 4.21079 16.125 4.625 16.125H11.625C12.0392 16.125 12.375 15.7892 12.375 15.375V8.375C12.375 7.96079 12.0392 7.625 11.625 7.625H4.625Z\" />\n            </svg>\n        }\n    </button>;\n}\n"
  },
  {
    "path": "website/src/components/CopyButton.module.css",
    "content": ".copyButton {\n  all: unset;\n  display: inline-flex;\n  align-items: center;\n  justify-content: center;\n  box-sizing: border-box;\n  cursor: pointer;\n  fill: var(--color-icon);\n\n  svg {\n    flex-shrink: 0;\n  }\n}\n\n.copyButtonDefault {\n  width: 28px;\n  height: 28px;\n  background-color: var(--color-background-muted);\n  border: 1px solid var(--color-border);\n  border-radius: 6px;\n  transition: background-color 0.12s ease-out;\n\n  &:hover {\n      background-color: var(--color-hover);\n  }\n\n  svg {\n    padding: 1px;\n  }\n}\n\n.copyButtonCompact {\n  svg {\n    width: 16px;\n    height: 16px;\n  }\n}"
  },
  {
    "path": "website/src/components/Gradients.jsx",
    "content": "import React from 'react';\n\nexport default function Gradients() {\n    return (\n        <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"0\" height=\"0\" viewBox=\"0 0 0 0\" fill=\"none\">\n            <defs>\n                <linearGradient id=\"gradient-1\" x1=\"26.6667\" y1=\"12\" x2=\"14.2802\" y2=\"34.5208\"\n                                gradientUnits=\"userSpaceOnUse\">\n                    <stop offset=\"0%\" stopColor=\"#9dceff\"/>\n                    <stop offset=\"70%\" stopColor=\"#4584b6\"/>\n                    <stop offset=\"100%\" stopColor=\"#4584b6\"/>\n                </linearGradient>\n                <linearGradient id=\"gradient-2\" x1=\"29.6667\" y1=\"0\" x2=\"-1.80874\" y2=\"26.2295\"\n                                gradientUnits=\"userSpaceOnUse\">\n                <stop offset=\"0%\" stopColor=\"#4584b6\"/>\n                </linearGradient>\n            </defs>\n        </svg>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Highlights.jsx",
    "content": "import React from 'react';\nimport clsx from 'clsx';\nimport styles from './Highlights.module.css';\nimport Gradients from './Gradients';\n\nconst FeatureList = [\n    {\n        title: 'Python with type hints',\n        Svg: require('../../static/img/features/runs-on-py.svg').default,\n        description: (\n            <>\n                Crawlee for Python is written in a modern way using type hints, providing code completion in your IDE\n                and helping you catch bugs early on build time.\n            </>\n        ),\n    },\n    // {\n    //     title: 'HTTP scraping',\n    //     Svg: require('../../static/img/features/fingerprints.svg').default,\n    //     description: (\n    //         <>\n    //             Crawlee makes HTTP requests that <a href=\"https://crawlee.dev/docs/guides/avoid-blocking\"><b>mimic browser headers and TLS fingerprints</b></a>.\n    //             It also rotates them automatically based on data about real-world traffic. Popular HTML\n    //             parsers <b><a href=\"https://crawlee.dev/docs/guides/cheerio-crawler-guide\">Cheerio</a>&nbsp;\n    //             and <a href=\"https://crawlee.dev/docs/guides/jsdom-crawler-guide\">JSDOM</a></b> are included.\n    //         </>\n    //     ),\n    // },\n    {\n        title: 'Headless browsers',\n        Svg: require('../../static/img/features/works-everywhere.svg').default,\n        description: (\n            <>\n                Switch your crawlers from HTTP to a <a href=\"https://crawlee.dev/python/api/class/PlaywrightCrawler\">headless browser</a> in 3 lines of code.\n                Crawlee builds on top of <b>Playwright</b> and adds its own features. Chrome, Firefox and more.\n            </>\n        ),\n\n        // TODO: this is not true yet\n        // Crawlee builds on top of <b>Playwright</b> and adds its own <b>anti-blocking features and human-like fingerprints</b>. 
Chrome, Firefox and more.\n    },\n    {\n        title: 'Automatic scaling and proxy management',\n        Svg: require('../../static/img/features/auto-scaling.svg').default,\n        description: (\n            <>\n                Crawlee automatically manages concurrency based on <a href=\"https://crawlee.dev/python/api/class/AutoscaledPool\">available system resources</a> and&nbsp;\n                <a href=\"https://crawlee.dev/python/api/class/ProxyConfiguration\">smartly rotates proxies</a>.\n                Proxies that often time-out, return network errors or bad HTTP codes like 401 or 403 are discarded.\n            </>\n        ),\n    },\n    // {\n    //     title: 'Queue and Storage',\n    //     Svg: require('../../static/img/features/storage.svg').default,\n    //     description: (\n    //         <>\n    //             You can <a href=\"https://crawlee.dev/docs/guides/result-storage\">save files, screenshots and JSON results</a> to disk with one line of code\n    //             or plug an adapter for your DB. Your URLs are <a href=\"https://crawlee.dev/docs/guides/request-storage\">kept in a queue</a> that ensures their\n    //             uniqueness and that you don't lose progress when something fails.\n    //         </>\n    //     ),\n    // },\n    // {\n    //     title: 'Helpful utils and configurability',\n    //     Svg: require('../../static/img/features/node-requests.svg').default,\n    //     description: (\n    //         <>\n    //             Crawlee includes tools for <a href=\"https://crawlee.dev/api/utils/namespace/social\">extracting social handles</a> or phone numbers, infinite scrolling, blocking\n    //             unwanted assets <a href=\"https://crawlee.dev/api/utils\">and many more</a>. 
It works great out of the box, but also provides&nbsp;\n    //             <a href=\"https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions\">rich configuration options</a>.\n    //         </>\n    //     ),\n    // },\n];\n\nfunction Feature({ Svg, title, description }) {\n    return (\n        <div className={clsx('col col--4')}>\n            <div className=\"padding-horiz--md padding-bottom--md\">\n                <div className={styles.featureIcon}>\n                    {Svg ? <Svg alt={title}/> : null}\n                </div>\n                <h3>{title}</h3>\n                <p>{description}</p>\n            </div>\n        </div>\n    );\n}\n\nexport default function Highlights() {\n    return (\n        <section className={styles.features}>\n            <Gradients />\n            <div className=\"container\">\n                <div className=\"row\">\n                    {FeatureList.map((props, idx) => (\n                        <Feature key={idx} {...props} />\n                    ))}\n                </div>\n            </div>\n        </section>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Highlights.module.css",
    "content": ".features {\n    display: flex;\n    align-items: center;\n    width: 100%;\n    font-size: 18px;\n    line-height: 32px;\n    color: #41465d;\n}\n\nhtml[data-theme=\"dark\"] .features {\n    color: #b3b8d2;\n}\n\n.feature svg {\n    height: 60px;\n    width: 60px;\n}\n\n.features svg path:nth-child(1) {\n    fill: url(#gradient-1) !important;\n}\n\n.features svg path:nth-child(n + 1) {\n    fill: url(#gradient-2) !important;\n}\n\nhtml[data-theme=\"dark\"] .featureIcon {\n    background: #272c3d;\n}\n\n.featureIcon {\n    display: flex;\n    justify-content: center;\n    align-items: center;\n    margin-bottom: 24px;\n    border-radius: 8px;\n    background-color: #f2f3fb;\n    width: 48px;\n    height: 48px;\n}\n\n.features h3 {\n    font-weight: 700;\n    font-size: 18px;\n    line-height: 32px;\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageCliExample.jsx",
    "content": "import React from 'react';\n\nimport CopyButton from '../CopyButton';\nimport styles from './HomepageCliExample.module.css';\n\nconst cliCommand = `uvx 'crawlee[cli]' create my-crawler`;\n\nexport default function CliExample() {\n    return (\n        <section className={styles.cliExampleSection}>\n            <div className={styles.cliExampleTitle}>\n                Or start with a template from our CLI\n            </div>\n            <code className={styles.cliExampleCodeBlock}>\n                <pre>\n                    <span className={styles.cliCommandPrefix}>$</span>\n                    {cliCommand}\n                    <CopyButton copyText={cliCommand} />\n                </pre>\n            </code>\n            <div className={styles.cliExampleSubtitle}>\n                Built with 🤍 by Apify. Forever free and open-source.\n            </div>\n        </section>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageCliExample.module.css",
    "content": ".cliExampleSection {\n    display: flex;\n    flex-direction: column;\n    justify-content: center;\n    align-items: center;\n    text-align: center;\n    padding: 16px;\n}\n\n.cliExampleTitle {\n    color: var(--color-text-muted);\n    font-size: 18px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 28px;\n    margin-bottom: 16px;\n}\n\n.cliExampleCodeBlock {\n    width: fit-content;\n    height: fit-content;\n    padding: 0;\n    border: 0;\n    margin-bottom: 18px;\n    width: 100%;\n\n    pre {\n        margin: 0;\n        width: 100%;\n        padding: 8px 16px;\n        background-color: var(--color-background-muted);\n        border: 1px solid var(--color-border);\n        display: flex;\n        align-items: center;\n        gap: 16px;\n        font-size: 14px;\n        line-height: 20px;\n\n        button {\n            margin-left: auto;\n        }\n    }\n\n    .cliCommandPrefix {\n        color: var(--color-text-muted);\n        user-select: none;\n    }\n\n    /* TABLET */\n    @media (min-width: 768px) {\n        max-width: 526px;\n    }\n}\n\n.cliExampleSubtitle {\n    color: var(--color-text-subtle);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 24px;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .cliExampleSection {\n        padding: 64px 0;\n    }\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageCtaSection.jsx",
    "content": "import { useColorMode } from '@docusaurus/theme-common';\nimport React from 'react';\n\nimport AnimatedLogoDark from './animated-crawlee-logo-dark.svg';\nimport AnimatedLogoLight from './animated-crawlee-logo-light.svg';\nimport styles from './HomepageCtaSection.module.css';\nimport homepageStyles from '../../pages/index.module.css';\nimport Button from '../Button';\n\nexport default function HomepageCtaSection() {\n    const { colorMode } = useColorMode();\n    return (\n        <section className={styles.ctaSection}>\n            <h2 className={styles.ctaTitle}>Get started now!</h2>\n            <div className={styles.ctaDescription}>\n                Crawlee won’t fix broken selectors for you (yet), but it makes\n                building and maintaining reliable crawlers faster and easier—so\n                you can focus on what matters most.\n            </div>\n            <div className={styles.ctaButtonContainer}>\n                <Button to=\"/docs/quick-start\" withIcon type=\"primary\" isBig>\n                    Get started\n                </Button>\n            </div>\n\n            <div\n                className={homepageStyles.fadedOutSeparator}\n                id={styles.ctaFadedOutSeparator}\n            />\n            <div\n                className={homepageStyles.fadedOutSeparatorVertical}\n                id={styles.fadedOutSeparatorVerticalLeft}\n            />\n            <div\n                className={homepageStyles.fadedOutSeparatorVertical}\n                id={styles.fadedOutSeparatorVerticalRight}\n            />\n            <div\n                className={homepageStyles.dashedDecorativeCircle}\n                id={styles.ctaDashedCircleRight}\n            />\n\n            {colorMode === 'dark' ? (\n                <AnimatedLogoDark className={styles.ctaImage} />\n            ) : (\n                <AnimatedLogoLight className={styles.ctaImage} />\n            )}\n        </section>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageCtaSection.module.css",
    "content": ".ctaSection {\n    position: relative;\n    display: flex;\n    flex-direction: column;\n    justify-content: center;\n    align-items: center;\n    text-align: center;\n    padding: 16px;\n    padding-bottom: 0;\n    gap: 24px;\n    overflow: clip;\n}\n\n.ctaTitle {\n    color: var(--color-text);\n    font-family: 'Lota Grotesque';\n    font-size: 36px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 46px;\n    margin: 0;\n}\n\n.ctaDescription {\n    color: var(--color-text-muted);\n    font-size: 18px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 28px;\n    max-width: 780px;\n}\n\n.ctaButtonContainer {\n    display: flex;\n    flex-direction: column;\n    justify-content: center;\n    align-items: center;\n    text-align: center;\n    gap: 16px;\n    width: 100%;\n}\n\n.ctaImage {\n    z-index: -1;\n    margin-top: -90px;\n    margin-bottom: -30px;\n    min-height: 400px;\n}\n\n#ctaFadedOutSeparator {\n    position: absolute;\n    top: 370px;\n    width: 100%;\n    z-index: -2;\n}\n\n#fadedOutSeparatorVerticalLeft {\n    position: absolute;\n    left: 190px;\n    bottom: 0;\n    height: 100%;\n    z-index: -2;\n}\n\n#fadedOutSeparatorVerticalRight {\n    position: absolute;\n    right: 190px;\n    bottom: 0;\n    height: 100%;\n    z-index: -2;\n}\n\n#ctaDashedCircleRight {\n    position: absolute;\n    right: -120px;\n    top: 370px;\n    z-index: -2;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .ctaSection {\n        padding-top: 80px;\n    }\n\n    .ctaTitle {\n        font-size: 48px;\n        line-height: 56px;\n    }\n\n    .ctaButtonContainer {\n        flex-direction: row;\n    }\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageHeroSection.jsx",
    "content": "import React from 'react';\n\nimport styles from './HomepageHeroSection.module.css';\nimport homepageStyles from '../../pages/index.module.css';\n\nexport default function HomepageHeroSection() {\n    return (\n        <section className={styles.hero}>\n            <h1 className={styles.heroTitle}>\n                Build reliable web scrapers. Fast.\n            </h1>\n            <div\n                className={homepageStyles.dashedSeparator}\n                id={styles.separatorHeroHeader}\n            />\n            <p className={styles.heroSubtitle}>\n                Crawlee is a web scraping library for JavaScript and Python. It\n                handles blocking, crawling, proxies, and browsers for you.\n            </p>\n            <div\n                className={homepageStyles.dashedSeparator}\n                id={styles.separatorHeroHeader2}\n            >\n                <div\n                    className={homepageStyles.dashedDecorativeCircle}\n                    id={styles.heroDecorativeCircle}\n                />\n            </div>\n        </section>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/HomepageHeroSection.module.css",
    "content": ".hero {\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n    justify-content: center;\n    padding: 32px 0;\n    h1 {\n        padding-inline: 12px;\n    }\n}\n\n.heroTitle {\n    color: var(--color-text);\n    font-size: 52px;\n    line-height: 60px;\n    font-weight: 400;\n    text-align: center;\n    margin: 0 0 16px 0;\n}\n\n.heroSubtitle {\n    color: var(--color-text-muted);\n    font-size: 18px;\n    line-height: 28px;\n    font-weight: 400;\n    text-align: center;\n    margin: 0 16px;\n    max-width: 792px;\n}\n\n#separatorHeroHeader {\n    display: none;\n}\n\n#separatorHeroHeader2 {\n    display: none;\n}\n\n#heroDecorativeCircle {\n    width: 60px;\n    height: 60px;\n    right: -60px;\n    top: 0px;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .hero {\n        padding: 64px 0 0 0;\n        h1 {\n            padding-inline: 24px;\n        }\n    }\n    .heroTitle {\n        font-size: 54px;\n        line-height: 64px;\n        margin: 0 16px 24px 16px;\n    }\n    .heroSubtitle {\n        margin: 0 16px 30px 16px;\n    }\n    #separatorHeroHeader {\n        display: none;\n    }\n    #separatorHeroHeader2 {\n        display: block;\n    }\n}\n\n/* DESKTOP */\n@media (min-width: 1024px) {\n    .hero {\n        padding: 120px 0 0 0;\n    }\n    .heroSubtitle {\n        margin: 30px 16px;\n    }\n    #separatorHeroHeader {\n        display: block;\n    }\n}\n"
  },
  {
    "path": "website/src/components/Homepage/LanguageInfoWidget.jsx",
    "content": "import { useColorMode } from '@docusaurus/theme-common';\nimport ThemedImage from '@theme/ThemedImage';\nimport clsx from 'clsx';\nimport React from 'react';\nimport GitHubButton from 'react-github-btn';\n\nimport Button from '../Button';\nimport CopyButton from '../CopyButton';\nimport styles from './LanguageInfoWidget.module.css';\n\nexport default function LanguageInfoWidget({\n    language,\n    command,\n    to,\n    githubUrl,\n}) {\n    const { colorMode } = useColorMode();\n    return (\n        <div className={styles.languageGetStartedContainer}>\n            {language === 'JavaScript' && (\n                <ThemedImage\n                    sources={{\n                        light: 'img/crawlee-javascript-light.svg',\n                        dark: 'img/crawlee-javascript-dark.svg',\n                    }}\n                    alt=\"Crawlee JavaScript\"\n                />\n            )}\n            {language === 'Python' && (\n                <ThemedImage\n                    sources={{\n                        light: 'img/crawlee-python-light.svg',\n                        dark: 'img/crawlee-python-dark.svg',\n                    }}\n                    alt=\"Crawlee Python\"\n                />\n            )}\n            <div className={clsx(styles.buttonContainer)}>\n                <Button to={to}>\n                    {command ? 
'Learn more' : 'Get started'}\n                </Button>\n                <GitHubButton\n                    href={githubUrl}\n                    data-color-scheme={colorMode}\n                    data-show-count=\"true\"\n                    aria-label=\"Star crawlee on GitHub\"\n                    data-size=\"large\"\n                    style={{ minHeight: '28px' }}\n                >\n                    Star\n                </GitHubButton>\n            </div>\n            {command && (\n                <code className={styles.commandContainer}>\n                    {command} <CopyButton copyText={command} compact />\n                </code>\n            )}\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/LanguageInfoWidget.module.css",
    "content": ".languageGetStartedContainer {\n    margin: 0;\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n    padding-inline: 12px;\n}\n\n.languageGetStartedContainer img {\n    height: 40px;\n    margin-bottom: 16px;\n}\n\n.buttonContainer {\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n    gap: 16px;\n    & > span {\n        line-height: 0;\n        min-height: 28px;\n    }\n    a,\n    a span {\n        min-width: 190px;\n        text-align: center;\n        justify-content: center;\n    }\n}\n\n.buttonContainer:has(+ code) {\n    margin-bottom: 16px;\n    gap: 12px;\n}\n\n.commandContainer {\n    margin: 0;\n    padding: 0;\n    color: var(--color-text);\n    font-size: 12px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 16px;\n    background-color: transparent;\n    border: 0;\n    display: flex;\n    align-items: center;\n}\n\n.commandContainer button {\n    opacity: 0;\n    transition: opacity var(--ifm-transition-fast) ease-in;\n}\n\n.commandContainer:hover button,\n.commandContainer button:hover {\n    opacity: 1;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .languageGetStartedContainer {\n        margin: 24px 0 40px 0;\n    }\n    .buttonContainer:has(+ code) {\n        flex-direction: row;\n    }\n    .buttonContainer:has(+ code) {\n        a,\n        a span {\n            min-width: 0;\n        }\n    }\n}\n"
  },
  {
    "path": "website/src/components/Homepage/LanguageSwitch.jsx",
    "content": "import React, { useCallback, useEffect, useRef, useState } from 'react';\nimport styles from './LanguageSwitch.module.css';\nimport clsx from 'clsx';\n\nexport default function LanguageSwitch({\n    options = ['JavaScript', 'Python'],\n    defaultOption = 'JavaScript',\n    onChange,\n}) {\n    const [activeOption, setActiveOption] = useState(defaultOption)\n    const [backgroundStyle, setBackgroundStyle] = useState({})\n    const optionRefs = useRef < (HTMLButtonElement | null)[] > ([])\n\n    const updateBackgroundStyle = useCallback(() => {\n        const activeIndex = options.indexOf(activeOption)\n        const activeElement = optionRefs.current[activeIndex]\n        if (activeElement) {\n            const { offsetLeft, offsetWidth } = activeElement\n            setBackgroundStyle({\n                transform: `translateX(${offsetLeft}px)`,\n                width: `${offsetWidth}px`,\n            })\n        }\n    }, [activeOption, options])\n\n    useEffect(() => {\n        updateBackgroundStyle()\n    }, [updateBackgroundStyle])\n\n    const handleOptionClick = (option) => {\n        setActiveOption(option)\n        onChange?.(option)\n    }\n\n    return (\n        <div className={styles.languageSwitch}>\n            {options.map((option, index) => (\n                <button\n                    key={option}\n                    ref={(el) => (optionRefs.current[index] = el)}\n                    className={clsx(styles.switchOption, option === activeOption && styles.active)}\n                    onClick={() => handleOptionClick(option)}\n                >\n                    {option}\n                </button>\n            ))}\n            <div className={styles.switchBackground} style={backgroundStyle} />\n        </div>\n    )\n}\n"
  },
  {
    "path": "website/src/components/Homepage/LanguageSwitch.module.css",
    "content": ".languageSwitch {\n    z-index: 1;\n    display: inline-flex;\n    position: relative;\n    background-color: var(--color-background-subtle);\n    border-radius: 6px;\n    padding: 4px;\n}\n\n.switchOption {\n    position: relative;\n    z-index: 1;\n    padding: 6px 16px;\n    font-size: 14px;\n    font-weight: 500;\n    color: var(--color-text-muted);\n    background: none;\n    border: none;\n    cursor: pointer;\n    transition: color 0.3s ease;\n}\n\n.switchOption:hover {\n    color: var(--color-text);\n}\n\n.switchOption.active {\n    color: var(--color-text);\n}\n\n.switchBackground {\n    position: absolute;\n    top: 4px;\n    bottom: 4px;\n    left: 0;\n    border-radius: 6px;\n    background-color: var(--color-background);\n    transition:\n        transform 0.3s ease,\n        width 0.3s ease;\n}\n"
  },
  {
    "path": "website/src/components/Homepage/RiverSection.jsx",
    "content": "import Link from '@docusaurus/Link';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './RiverSection.module.css';\n\nexport default function RiverSection({ title, description, content, reversed, to }) {\n    return (\n        <div className={styles.riverWrapper}>\n            <div className={clsx(styles.riverContainer, { [styles.riverReversed]: reversed })}>\n                <div className={clsx(styles.riverSection, styles.riverText)}>\n                    <h3 className={styles.riverTitle}>{title}</h3>\n                    <p className={styles.riverDescription}>{description}</p>\n                    <Link className={styles.riverButton} to={to}>\n                        Learn more\n                    </Link>\n                </div>\n                <div className={clsx(styles.riverSection, styles.riverContent)}>{content}</div>\n            </div>\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/RiverSection.module.css",
    "content": "/* Base styles */\n.riverWrapper {\n    width: 100%;\n    border-top: 1px solid var(--color-separator);\n    border-bottom: 1px solid var(--color-separator);\n}\n\n.riverContainer {\n    max-width: 1200px;\n    margin: 0 auto;\n    display: flex;\n    flex-direction: column;\n\n    /* Tablet layout */\n    @media (min-width: 768px) {\n        flex-direction: row;\n\n        &.riverReversed {\n            flex-direction: row-reverse;\n        }\n    }\n}\n\n.riverSection {\n    width: 100%;\n\n    /* Tablet layout */\n    @media (min-width: 768px) {\n        min-width: 0;\n        flex-basis: 50%;\n        flex-grow: 0;\n    }\n}\n\n.riverText {\n    padding: 24px 16px;\n\n    /* Tablet layout */\n    @media (min-width: 768px) {\n        padding: 40px 32px;\n    }\n\n    /* Desktop layout */\n    @media (min-width: 1024px) {\n        padding: 48px 80px;\n    }\n}\n\n/* Text styles */\n.riverTitle {\n    flex: 1;\n    margin-top: 0;\n    margin-bottom: 12px;\n    font-size: 32px;\n    font-weight: 400;\n    line-height: 40px;\n\n    /* Desktop layout */\n    @media (min-width: 1024px) {\n        max-width: 440px;\n    }\n}\n\n.riverDescription {\n    margin-bottom: 24px;\n    color: var(--color-text-muted);\n    font-size: 16px;\n    line-height: 24px;\n\n    /* Desktop layout */\n    @media (min-width: 1024px) {\n        max-width: 440px;\n    }\n}\n\n.riverButton {\n    cursor: pointer;\n    padding: 8px 12px;\n    background-color: transparent;\n    border: 1px solid var(--color-border);\n    border-radius: 12px;\n    display: flex;\n    align-items: center;\n    justify-content: center;\n    font-size: 16px;\n    line-height: 24px;\n    transition: background-color 0.12s ease-out;\n    width: fit-content;\n    color: var(--color-text);\n\n    &:hover {\n        background-color: var(--color-hover);\n        color: var(--color-text);\n    }\n\n    path {\n        stroke: var(--color-icon);\n    }\n}\n\n.riverButton::after {\n    content: '→';\n    
margin-inline: 4px;\n    transition: margin 0.3s ease;\n}\n\n.riverButton:hover {\n    color: var(--color-text);\n    &::after {\n        margin: 0 0 0 8px;\n    }\n}\n\n.riverContent {\n    min-height: 180px;\n    background-color: var(--color-background-muted);\n    border-top: 1px solid var(--color-separator);\n    display: flex;\n    flex-direction: column;\n    overflow: hidden;\n\n    img {\n        max-height: 284px;\n        object-fit: cover;\n        height: 100%;\n        width: 100%;\n        margin-block: auto;\n    }\n\n    :global(.code-block) {\n        flex-grow: 1;\n        margin-bottom: 0;\n        border-radius: 0;\n        box-shadow: none;\n\n        :global(div[class*=\"codeBlockContent\"]) {\n            height: 100%;\n\n            pre {\n                height: 100%;\n                display: flex;\n                align-items: center;\n                background: var(--color-background-muted) !important;\n            }\n            code {\n                height: auto;\n                font-size: 14px;\n                background: var(--color-background-muted);\n                min-width: initial;\n                padding: 16px 8px 16px 4px;\n\n                span::before {\n                    margin-right: 16px;\n                    left: unset !important;\n                    color: var(--color-text-subtle);\n                    opacity: 1;\n                }\n            }\n        }\n    }\n\n    /* Tablet layout */\n    @media (min-width: 768px) {\n        border-top: none;\n        border-left: 1px solid var(--color-separator);\n    }\n\n    .riverReversed & {\n        /* Tablet layout */\n        @media (min-width: 768px) {\n            border-left: none;\n            border-right: 1px solid var(--color-separator);\n        }\n    }\n}\n"
  },
  {
    "path": "website/src/components/Homepage/ThreeCardsWithIcon.jsx",
    "content": "import Link from '@docusaurus/Link';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './ThreeCardsWithIcon.module.css';\n\nexport default function ThreeCardsWithIcon({ cards }) {\n    return (\n        <div className={styles.cardsWrapper}>\n            {cards?.map((card, index) => {\n                const content = (\n                    <>\n                        <div className={styles.cardIcon}>{card.icon}</div>\n                        <h3 className={styles.cardTitle}>{card.title}</h3>\n                        <p className={styles.cardDescription}>\n                            {card.description}\n                        </p>\n                        {card.actionLink && (\n                            <Link\n                                to={card.actionLink.href}\n                                className={styles.cardAction}\n                            >\n                                {card.actionLink.text}\n                            </Link>\n                        )}\n                    </>\n                );\n\n                if (card.to) {\n                    return (\n                        <Link\n                            className={clsx(\n                                styles.cardItem,\n                                styles.cardItemLink,\n                            )}\n                            to={card.to}\n                            key={index}\n                        >\n                            {content}\n                        </Link>\n                    );\n                }\n\n                return (\n                    <div className={styles.cardItem} key={index}>\n                        {content}\n                    </div>\n                );\n            })}\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/components/Homepage/ThreeCardsWithIcon.module.css",
    "content": ".cardsWrapper {\n    display: flex;\n    flex-direction: column;\n    border-block: 1px solid var(--color-separator);\n\n    @media (min-width: 768px) {\n        flex-direction: row;\n    }\n}\n\n/* Card styles */\n.cardItem {\n    display: flex;\n    flex: 1;\n    flex-direction: column;\n    padding: 40px 24px;\n    background: var(--color-card-background);\n    transition: background 0.1s ease;\n\n    border-bottom: 1px solid var(--color-separator);\n    &:last-child {\n        border-bottom: 0;\n    }\n\n    @media (min-width: 768px) {\n        border-bottom: 0;\n        border-right: 1px solid var(--color-separator);\n        &:last-child {\n            border-right: 0;\n        }\n    }\n}\n\na.cardItem:hover {\n    background: var(--color-card-background-hover);\n}\n\n.cardItem:has(:local(.cardAction)) {\n    padding: 24px;\n}\n\n.cardIcon {\n    margin-bottom: 16px;\n    display: flex;\n    align-items: center;\n    justify-content: center;\n\n    width: 72px;\n    height: 72px;\n\n    border-radius: 6px;\n    border: 1px solid var(--color-separator);\n    background: var(--color-background);\n}\n\n.cardIcon img {\n    width: 50px;\n}\n\n.cardTitle {\n    margin: 0;\n    margin-bottom: 8px;\n    color: var(--color-text);\n    font-size: 26px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 34px;\n}\n\n.cardDescription {\n    color: var(--color-text-muted);\n    font-family: var(--ifm-font-family-base);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 24px;\n    margin: 0;\n    margin-bottom: 12px;\n}\n\n.cardAction {\n    color: var(--color-text-muted);\n    font-family: var(--ifm-font-family-base);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 650;\n    line-height: 24px;\n    width: fit-content;\n    margin-top: auto;\n}\n\n.cardAction::after {\n    content: \"→\";\n    margin-left: 4px;\n    transition: margin 0.3s ease;\n}\n\n.cardAction:hover {\n    color: 
var(--color-text);\n    &::after {\n        margin-left: 8px;\n    }\n}\n"
  },
  {
    "path": "website/src/components/LLMButtons.jsx",
    "content": "import {\n    AnthropicIcon,\n    ChatGptIcon,\n    CheckIcon,\n    ChevronDownIcon,\n    CopyIcon,\n    ExternalLinkIcon,\n    LoaderIcon,\n    MarkdownIcon,\n    PerplexityIcon,\n} from '@apify/ui-icons';\nimport { useLocation } from '@docusaurus/router';\nimport clsx from 'clsx';\nimport React, {\n    useCallback,\n    useEffect,\n    useMemo,\n    useRef,\n    useState,\n} from 'react';\n\nimport styles from './LLMButtons.module.css';\n\nconst DROPDOWN_OPTIONS = [\n    {\n        label: 'Copy for LLM',\n        description: 'Copy page as Markdown for LLMs',\n        showExternalIcon: false,\n        icon: CopyIcon,\n        value: 'copyForLLM',\n        analytics: {\n            buttonText: 'Copy for LLM',\n            element: 'llm-buttons.copyForLLM',\n        },\n    },\n    {\n        label: 'View as Markdown',\n        description: 'View this page as plain text',\n        icon: MarkdownIcon,\n        value: 'viewAsMarkdown',\n        showExternalIcon: true,\n        analytics: {\n            buttonText: 'View as Markdown',\n            element: 'llm-buttons.viewAsMarkdown',\n        },\n    },\n    {\n        label: 'Open in ChatGPT',\n        description: 'Ask questions about this page',\n        icon: ChatGptIcon,\n        value: 'openInChatGPT',\n        showExternalIcon: true,\n        analytics: {\n            buttonText: 'Open in ChatGPT',\n            element: 'llm-buttons.openInChatGPT',\n        },\n    },\n    {\n        label: 'Open in Claude',\n        description: 'Ask questions about this page',\n        icon: AnthropicIcon,\n        value: 'openInClaude',\n        showExternalIcon: true,\n        analytics: {\n            buttonText: 'Open in Claude',\n            element: 'llm-buttons.openInClaude',\n        },\n    },\n    {\n        label: 'Open in Perplexity',\n        description: 'Ask questions about this page',\n        icon: PerplexityIcon,\n        value: 'openInPerplexity',\n        showExternalIcon: true,\n        
analytics: {\n            buttonText: 'Open in Perplexity',\n            element: 'llm-buttons.openInPerplexity',\n        },\n    },\n];\n\nconst CHAT_GPT_BASE = 'https://chatgpt.com/?hints=search&q=';\nconst CLAUDE_BASE = 'https://claude.ai/new?q=';\nconst PERPLEXITY_BASE = 'https://www.perplexity.ai/search/new?q=';\n\nconst getPrompt = (currentUrl) => `Read from ${currentUrl} so I can ask questions about it.`;\nconst getMarkdownUrl = (currentUrl) => {\n    const url = new URL(currentUrl);\n    url.pathname = `${url.pathname.replace(/\\/$/, '')}.md`;\n    return url.toString();\n};\n\nconst trackClick = (buttonText, element) => {\n    if (typeof window !== 'undefined' && window.analytics) {\n        window.analytics.track('Clicked', {\n            app: 'crawlee',\n            button_text: buttonText,\n            element,\n        });\n    }\n};\n\nconst getOptionHref = (value, currentUrl) => {\n    if (!currentUrl) {\n        return undefined;\n    }\n\n    switch (value) {\n        case 'viewAsMarkdown':\n            return getMarkdownUrl(currentUrl);\n        case 'openInChatGPT':\n            return `${CHAT_GPT_BASE}${encodeURIComponent(getPrompt(currentUrl))}`;\n        case 'openInClaude':\n            return `${CLAUDE_BASE}${encodeURIComponent(getPrompt(currentUrl))}`;\n        case 'openInPerplexity':\n            return `${PERPLEXITY_BASE}${encodeURIComponent(getPrompt(currentUrl))}`;\n        default:\n            return undefined;\n    }\n};\n\nconst Menu = ({\n    className,\n    components = {},\n    onMenuOpen,\n    onSelect,\n    options = [],\n}) => {\n    const [isOpen, setIsOpen] = useState(false);\n    const [focusedIndex, setFocusedIndex] = useState(0);\n    const menuRef = useRef(null);\n    const menuItemRefs = useRef([]);\n\n    const MenuBaseComponent = components.MenuBase;\n\n    const closeMenu = useCallback(() => {\n        setIsOpen(false);\n        setFocusedIndex(0);\n    }, []);\n\n    const toggleMenu = useCallback(() => {\n        
setIsOpen((prev) => {\n            if (!prev) {\n                setFocusedIndex(0);\n            }\n            return !prev;\n        });\n    }, []);\n\n    const handleKeyDown = useCallback(\n        (event) => {\n            if (event.key === 'Enter' || event.key === ' ') {\n                event.preventDefault();\n                toggleMenu();\n            } else if (event.key === 'ArrowDown') {\n                event.preventDefault();\n                if (!isOpen) {\n                    toggleMenu();\n                } else {\n                    setFocusedIndex((prev) => (prev + 1) % options.length);\n                }\n            } else if (event.key === 'ArrowUp') {\n                event.preventDefault();\n                if (isOpen) {\n                    setFocusedIndex((prev) => (prev - 1 + options.length) % options.length);\n                }\n            }\n        },\n        [toggleMenu, isOpen, options.length],\n    );\n\n    const handleOptionSelect = useCallback(\n        (option, event) => {\n            onSelect?.(option, event);\n            closeMenu();\n        },\n        [closeMenu, onSelect],\n    );\n\n    const handleMenuItemKeyDown = useCallback(\n        (event, option, index) => {\n            if (event.key === 'Enter' || event.key === ' ') {\n                event.preventDefault();\n                event.currentTarget.click();\n                return;\n            }\n\n            if (event.key === 'ArrowDown') {\n                event.preventDefault();\n                setFocusedIndex((index + 1) % options.length);\n                return;\n            }\n\n            if (event.key === 'ArrowUp') {\n                event.preventDefault();\n                setFocusedIndex((index - 1 + options.length) % options.length);\n                return;\n            }\n\n            if (event.key === 'Escape') {\n                event.preventDefault();\n                closeMenu();\n            }\n        },\n        [options.length, 
closeMenu],\n    );\n\n    useEffect(() => {\n        onMenuOpen?.(isOpen);\n    }, [isOpen, onMenuOpen]);\n\n    useEffect(() => {\n        if (isOpen && menuItemRefs.current[focusedIndex]) {\n            menuItemRefs.current[focusedIndex].focus();\n        }\n    }, [isOpen, focusedIndex]);\n\n    useEffect(() => {\n        if (!isOpen) {\n            return undefined;\n        }\n\n        const handleClickOutside = (event) => {\n            if (!menuRef.current?.contains(event.target)) {\n                closeMenu();\n            }\n        };\n\n        const handleEscape = (event) => {\n            if (event.key === 'Escape') {\n                closeMenu();\n            }\n        };\n\n        document.addEventListener('mousedown', handleClickOutside);\n        document.addEventListener('keydown', handleEscape);\n\n        return () => {\n            document.removeEventListener('mousedown', handleClickOutside);\n            document.removeEventListener('keydown', handleEscape);\n        };\n    }, [closeMenu, isOpen]);\n\n    return (\n        <div className={clsx(styles.menu, className)} ref={menuRef}>\n            <MenuBaseComponent\n                onClick={toggleMenu}\n                onKeyDown={handleKeyDown}\n                aria-haspopup=\"menu\"\n                aria-expanded={isOpen}\n                aria-controls=\"llm-menu\"\n            />\n            {isOpen && (\n                <div className={styles.menuDropdown} role=\"menu\" id=\"llm-menu\">\n                    {options.map((option, index) => {\n                        const WrapperComponent = option.href ? 
'a' : 'button';\n\n                        return (\n                            <WrapperComponent\n                                key={option.value}\n                                ref={(el) => {\n                                    menuItemRefs.current[index] = el;\n                                }}\n                                className={styles.menuOptionWrapper}\n                                role=\"menuitem\"\n                                tabIndex={0}\n                                href={option.href}\n                                target={option.target}\n                                rel={option.rel}\n                                type={option.href ? undefined : 'button'}\n                                onClick={(event) => {\n                                    if (!option.href) {\n                                        event.preventDefault();\n                                    }\n                                    handleOptionSelect(option, event);\n                                }}\n                                onKeyDown={(e) => handleMenuItemKeyDown(e, option, index)}\n                            >\n                                <Option {...option} />\n                            </WrapperComponent>\n                        );\n                    })}\n                </div>\n            )}\n        </div>\n    );\n};\n\nfunction getButtonText({ status }) {\n    switch (status) {\n        case 'loading':\n            return 'Copying...';\n        case 'copied':\n            return 'Copied';\n        default:\n            return 'Copy for LLM';\n    }\n}\n\nconst onCopyAsMarkdownClick = async ({ setCopyingStatus, currentUrl }) => {\n    const sourceUrl = currentUrl || (typeof window !== 'undefined' ? 
window.location.href : '');\n\n    if (!sourceUrl) {\n        return;\n    }\n\n    trackClick('Copy for LLM', 'llm-buttons.copyForLLM');\n\n    const markdownUrl = getMarkdownUrl(sourceUrl);\n\n    try {\n        setCopyingStatus('loading');\n\n        // Safari requires clipboard writes to be created synchronously inside the user gesture.\n        // We therefore pass a Promise that resolves to a Blob into ClipboardItem instead of\n        // awaiting fetch first — otherwise Safari would reject the clipboard operation.\n        const markdownContent = new ClipboardItem({\n            'text/plain': fetch(markdownUrl)\n                .then((response) => {\n                    if (!response.ok) {\n                        throw new Error(`Failed to fetch markdown: ${response.status}`);\n                    }\n                    return response.text();\n                })\n                .then((content) => new Blob([content], { type: 'text/plain' })),\n        });\n\n        await navigator.clipboard.write([markdownContent]);\n\n        // Show success feedback\n        setCopyingStatus('copied');\n    } catch (error) {\n        console.error('Failed to copy markdown content:', error);\n    } finally {\n        setTimeout(() => setCopyingStatus('idle'), 2000);\n    }\n};\n\nconst COPYING_STATUS_ICON = {\n    loading: <LoaderIcon size={16} />,\n    copied: <CheckIcon size={16} />,\n    idle: <CopyIcon size={16} />,\n}\n\nconst MenuBase = React.forwardRef(({\n    copyingStatus,\n    setCopyingStatus,\n    chevronIconRef,\n    currentUrl,\n    ...buttonProps\n}, ref) => {\n    const mergedButtonProps = {\n        ...buttonProps,\n        tabIndex: buttonProps.tabIndex ?? 
0,\n    };\n\n    return (\n        <div className={styles.llmButtonWrapper}>\n            <div\n                ref={ref}\n                className={styles.llmButton}\n                {...mergedButtonProps}\n            >\n                <div\n                    className={styles.copyUpIconWrapper}\n                    onClick={(event) => {\n                        event.stopPropagation();\n                        onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });\n                    }}\n                >\n                    {COPYING_STATUS_ICON[copyingStatus]}\n                </div>\n                <span\n                    onClick={(event) => {\n                        event.stopPropagation();\n                        onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });\n                    }}\n                    className={styles.llmButtonText}\n                >\n                    {getButtonText({ status: copyingStatus })}\n                </span>\n                <div className={styles.chevronIconWrapper}>\n                    <ChevronDownIcon\n                        size=\"16\"\n                        color=\"currentColor\"\n                        className={styles.chevronIcon}\n                        ref={chevronIconRef}\n                    />\n                </div>\n            </div>\n        </div>\n    );\n});\nMenuBase.displayName = 'MenuBase';\n\nconst Option = ({ label, description, showExternalIcon, icon }) => {\n    const Icon = icon ?? 
CopyIcon;\n\n    return (\n        <div className={styles.menuOption}>\n            <Icon size={16} className={styles.menuOptionIcon} />\n            <div className={styles.menuOptionText}>\n                <span className={styles.menuOptionLabel}>{label}</span>\n                <span className={styles.menuOptionDescription}>{description}</span>\n            </div>\n            {showExternalIcon && (\n                <ExternalLinkIcon\n                    size={16}\n                    className={styles.menuOptionExternalIcon}\n                />\n            )}\n        </div>\n    );\n};\n\nexport default function LLMButtons() {\n    const [copyingStatus, setCopyingStatus] = useState('idle');\n    const [isMarkdownAvailable, setIsMarkdownAvailable] = useState(false);\n    const chevronIconRef = useRef(null);\n    const location = useLocation();\n\n    const currentUrl = typeof window !== 'undefined'\n        ? `${window.location.origin}${location.pathname}${location.search}${location.hash}`\n        : '';\n\n    useEffect(() => {\n        if (!currentUrl) {\n            // TODO: Feel free to tell me how to fix this 🤦‍♂️\n            // eslint-disable-next-line react-hooks/set-state-in-effect\n            setIsMarkdownAvailable(false);\n            return undefined;\n        }\n\n        const controller = new AbortController();\n        const markdownUrl = getMarkdownUrl(currentUrl);\n\n        const checkMarkdownAvailability = async () => {\n            try {\n                const response = await fetch(markdownUrl, {\n                    method: 'HEAD',\n                    signal: controller.signal,\n                });\n                setIsMarkdownAvailable(response.ok);\n            } catch (error) {\n                if (error.name === 'AbortError') {\n                    return;\n                }\n                setIsMarkdownAvailable(false);\n            }\n        };\n\n        checkMarkdownAvailability();\n\n        return () => {\n            
controller.abort();\n        };\n    }, [currentUrl]);\n\n    const menuOptions = useMemo(\n        () => DROPDOWN_OPTIONS.map((option) => {\n            const href = getOptionHref(option.value, currentUrl);\n\n            if (option.value === 'viewAsMarkdown') {\n                if (!isMarkdownAvailable) {\n                    return null;\n                }\n            }\n\n            return {\n                ...option,\n                href,\n                target: href ? '_blank' : undefined,\n                rel: href ? 'noopener noreferrer' : undefined,\n            };\n        }).filter(Boolean),\n        [isMarkdownAvailable, currentUrl],\n    );\n\n    const onMenuOptionClick = useCallback(\n        (option, event) => {\n            if (!option) {\n                return;\n            }\n\n            if (option.analytics) {\n                trackClick(option.analytics.buttonText, option.analytics.element);\n            }\n\n            if (option.value === 'copyForLLM') {\n                event?.preventDefault();\n                onCopyAsMarkdownClick({ setCopyingStatus, currentUrl });\n            }\n        },\n        [setCopyingStatus, currentUrl],\n    );\n\n    return (\n        <Menu\n            className={styles.llmMenu}\n            onMenuOpen={(isOpen) => chevronIconRef.current?.classList.toggle(\n                styles.chevronIconOpen,\n                isOpen,\n            )}\n            components={{\n                MenuBase: (props) => (\n                    <MenuBase\n                        copyingStatus={copyingStatus}\n                        setCopyingStatus={setCopyingStatus}\n                        chevronIconRef={chevronIconRef}\n                        currentUrl={currentUrl}\n                        {...props}\n                    />\n                ),\n            }}\n            onSelect={onMenuOptionClick}\n            options={menuOptions}\n        />\n    );\n}\n"
  },
  {
    "path": "website/src/components/LLMButtons.module.css",
    "content": ".llmMenu {\n    display: flex;\n    justify-content: flex-end;\n    flex: 0 0 auto;\n  }\n  \n  @media (max-width: 996px) {\n    .llmMenu {\n      width: 100%;\n      justify-content: flex-start;\n    }\n  }\n  \n  .llmButtonWrapper {\n    display: flex;\n    justify-content: flex-end;\n    width: auto;\n  }\n  \n  .llmButton {\n    display: flex;\n    align-items: center;\n    border-radius: 0.5rem;\n    border: 1px solid var(--color-separator);\n    background-color: var(--color-background-subtle);\n    cursor: pointer;\n    transition: background-color 0.2s ease-in-out, border-color 0.2s ease-in-out;\n  }\n  \n  .copyUpIconWrapper {\n    display: flex;\n    align-items: center;\n    justify-content: center;\n    padding: 0.6rem 0.5rem 0.6rem 0.8rem;\n  }\n  \n  .llmButtonText {\n    display: flex;\n    align-items: center;\n    padding-right: 0.8rem;\n    border-right: 1px solid var(--color-separator);\n    margin: 0;\n    font: 400 0.875rem/1.4 Inter, sans-serif;\n  }\n  \n  .chevronIconWrapper {\n    display: flex;\n    align-items: center;\n    justify-content: center;\n    padding-inline: 0.25rem;\n  }\n  \n  .chevronIcon {\n    transition: transform 0.2s ease-in-out;\n  }\n  \n  .chevronIconOpen {\n    transform: rotate(180deg);\n  }\n  \n  .menu {\n    position: relative;\n    width: fit-content;\n  }\n  \n  .menuDropdown {\n    position: absolute;\n    right: 0;\n    top: calc(100% + 0.5rem); \n    padding: 0.375rem;\n    border-radius: 0.75rem;\n    border: 1px solid var(--color-separator);\n    background-color: var(--color-background);\n    box-shadow: 0 12px 32px rgb(10 11 36 / 20%);\n    min-width: 17rem;\n    max-width: min(20rem, calc(100vw - 1.5rem));\n    z-index: 2;\n    display: flex;\n    flex-direction: column;\n    gap: 0.25rem;\n  }\n  \n  @media (max-width: 996px) {\n    .menuDropdown {\n      left: 0;\n      right: auto;\n      width: min(20rem, calc(100vw - 1.5rem));\n    }\n  }\n  \n  .menuOption {\n    display: flex;\n  
  gap: 0.5rem;\n    padding: 0.25rem 0.5rem;\n    border-radius: 0.5rem;\n    transition: background-color 0.15s ease-in-out;\n  }\n  \n  .menuOption:hover {\n    background: var(--color-hover);\n  }\n  \n  .menuOptionWrapper {\n    border: none;\n    background: transparent;\n    padding: 0;\n    text-align: left;\n    width: 100%;\n    display: block;\n    text-decoration: none;\n    color: inherit;\n    cursor: pointer;\n    outline: none;\n  }\n  \n  .menuOptionWrapper:focus-visible .menuOption {\n    background: var(--color-hover);\n    outline-offset: -2px;\n  }\n  \n  .menuOptionIcon,\n  .menuOptionExternalIcon {\n    flex-shrink: 0;\n  }\n  \n  .menuOptionIcon {\n    margin-top: 0.2rem;\n  }\n  \n  .menuOptionText {\n    flex: 1;\n    display: flex;\n    flex-direction: column;\n    gap: 0.125rem;\n    line-height: 1rem;\n    padding: 4px 0;\n  }\n  \n  .menuOptionLabel {\n    margin: 0;\n    font-size: 0.875rem;\n    line-height: 1rem;\n    font-weight: 400;\n    color: var(--ifm-font-color-base);\n  }\n  \n  .menuOptionDescription {\n    margin: 0;\n    font-size: 0.8rem;\n    color: var(--color-text-subtle);\n  }"
  },
  {
    "path": "website/src/components/RunnableCodeBlock.jsx",
    "content": "import React from 'react';\nimport clsx from 'clsx';\nimport CodeBlock from '@theme/CodeBlock';\nimport Link from '@docusaurus/Link';\nimport styles from './RunnableCodeBlock.module.css';\n\nconst PYTHON_ACTOR_RUNNER = 'HH9rhkFXiZbheuq1V'\n\nconst RunnableCodeBlock = ({ children, actor, hash, ...props }) => {\n    hash = hash ?? children.hash;\n\n    if (!children.code) {\n        throw new Error(`RunnableCodeBlock requires \"code\" and \"hash\" props\nMake sure you are importing the code block contents with the roa-loader.`);\n    }\n\n    if (!hash) {\n        return (\n            <CodeBlock {...props}>\n                { children.code }\n            </CodeBlock>\n        );\n    }\n\n    const href = `https://console.apify.com/actors/${actor ?? PYTHON_ACTOR_RUNNER}?runConfig=${hash}&asrc=run_on_apify`;\n\n    return (\n        <div className={clsx(styles.container, 'runnable-code-block')}>\n            <Link href={href} className={styles.button} rel=\"follow\">\n                Run on\n                <svg width=\"91\" height=\"25\" viewBox=\"0 0 91 25\" fill=\"none\" xmlns=\"http://www.w3.org/2000/svg\" className=\"apify-logo-light alignMiddle_src-theme-Footer-index-module\">\n                    <path fill=\"#246DFF\" d=\"M13.785 0h9.889c.201 0 .364.163.364.363v15.074c0 .361-.47.501-.669.2L13.48.561A.363.363 0 0 1 13.785 0Z\"/><path fill=\"#20A34E\" d=\"M10.253 0H.364A.364.364 0 0 0 0 .363v15.074c0 .361.47.501.669.2L10.558.561A.363.363 0 0 0 10.253 0Z\"/><path fill=\"#F86606\" d=\"M11.85 12.069.616 23.358a.363.363 0 0 0 .259.62h22.298a.363.363 0 0 0 .26-.618L12.37 12.07a.365.365 0 0 0-.52-.001Z\"/><path className=\"apify-logo\" fill=\"#000\" d=\"M77.267 3.298H73.06c-1.317 0-1.881.657-1.881 1.853V6.3h6.13l3.503 8.066L84.315 6.3h3.056l-7.335 16.859h-3.009l2.257-5.206-4.195-9.12h-3.91v9.331H68.17V8.832h-3.268V6.3h3.268V4.565c0-2.298 1.27-3.658 3.973-3.658h5.124v2.391Z\"/><path className=\"apify-logo\" fill=\"#000\" fill-rule=\"evenodd\" 
d=\"M53.32 6.042c3.102 0 5.641 2.321 5.641 6.19 0 3.893-2.538 6.19-5.641 6.19-2.586 0-3.88-1.594-4.114-2.063v6.776h-2.962V6.3h2.985v1.876c.212-.446 1.505-2.134 4.09-2.134Zm-.776 2.626c-2.045 0-3.362 1.524-3.362 3.564 0 2.017 1.316 3.564 3.362 3.564 2.068 0 3.385-1.547 3.385-3.564 0-2.04-1.317-3.564-3.385-3.564ZM38.44 5.995c3.69 0 5.735 1.923 5.735 4.736v4.01c0 .704.259 1.032.94 1.079v2.415h-.94c-1.48-.024-2.445-.587-2.774-1.642-.587.844-1.81 1.83-3.855 1.83-2.797 0-4.913-1.595-4.913-4.01 0-2.392 1.81-3.682 4.748-3.682h3.903c0-1.43-1.105-2.344-2.845-2.344-1.645 0-2.303.89-2.468 1.195h-3.033c.236-1.266 1.764-3.587 5.501-3.587Zm-.565 6.776c-1.387 0-2.28.516-2.28 1.595 0 1.149 1.081 1.829 2.586 1.829 1.692 0 3.103-.844 3.103-2.415V12.77h-3.409Z\" clip-rule=\"evenodd\"/><path className=\"apify-logo\" fill=\"#000\" d=\"M63.47 18.164h-3.009V6.3h3.01v11.864ZM63.518 4.4H60.39V.837h3.127v3.565Z\"/>\n                </svg>\n            </Link>\n            <CodeBlock {...props} className={clsx(styles.codeBlock, 'code-block', props.title != null ? 'has-title' : 'no-title')}>\n                { children.code }\n            </CodeBlock>\n        </div>\n    );\n};\n\nexport default RunnableCodeBlock;\n"
  },
  {
    "path": "website/src/components/RunnableCodeBlock.module.css",
    "content": ".button {\n    display: inline-block;\n    padding: 3px 10px;\n    position: absolute;\n    top: calc(var(--ifm-pre-padding) / 2);\n    right: 9px;\n    z-index: 1;\n    font-size: 16px;\n    line-height: 28px;\n    background: var(--prism-background-color);\n    color: var(--prism-color);\n    border: 1px solid var(--ifm-color-emphasis-300);\n    border-radius: var(--ifm-global-radius);\n    opacity: 0.7;\n    font-weight: 600;\n    width: 155px;\n}\n\n@media screen and (max-width: 768px) {\n    .button {\n        display: none;\n    }\n}\n\n.button svg {\n    height: 20px;\n    position: absolute;\n    top: 7.5px;\n    right: 0;\n}\n\n.button:hover {\n    opacity: 1;\n    color: var(--prism-color);\n}\n\n.container {\n    position: relative;\n}\n"
  },
  {
    "path": "website/src/css/custom.css",
    "content": "@import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;600;700&display=swap');\n\nhtml[data-theme='dark'] {\n    --ifm-navbar-background-color: #1a1b23;\n    --ifm-background-color: #1a1b21;\n    --ifm-background-surface-color: #242736;\n\n    --ifm-font-color-base: #f2f3fb;\n\n    --ifm-pre-background: #242736;\n\n    --ifm-color-primary: #5d9df1;\n    --ifm-link-color: #5d9df1;\n    --ifm-heading-color: #f2f3fb;\n    --ifm-navbar-link-color: #f2f3fb;\n    --ifm-menu-color-active: #b2b8cc;\n\n    --docusaurus-highlighted-code-line-bg: rgba(255, 255, 255, 0.1);\n\n    --docsearch-text-color: #8c93a8;\n    --docsearch-highlight-color: #f3f4fa;\n\n    --color-background: #1a1b21;\n    --color-background-subtle: #2a2d39;\n    --color-background-muted: #252832;\n    --color-field-background: #101114;\n    --color-separator: #343847;\n    --color-border: #414758;\n    --color-card-background: #1e2027;\n    --color-card-background-hover: #252832;\n    --color-text: #f3f4fa;\n    --color-text-subtle: #8c93a8;\n    --color-text-muted: #b2b8cc;\n    --color-text-on-primary: #1a1b21;\n    --color-text-placeholder: #6e758a;\n    --color-black-action: #fff;\n    --color-icon: #b2b8cc;\n    --color-hover: #2d313e;\n    --color-primary-action-hover: #d1d5e4;\n}\n\n:root {\n    /* use default system font based on https://devhints.io/css-system-font-stack */\n    --ifm-font-family-base: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;\n    --ifm-heading-font-family: 'Lota Grotesque', sans-serif;\n    --ifm-font-weight-semibold: 600;\n    --ifm-font-color-base: #242736;\n\n    --ifm-navbar-item-padding-horizontal: 0;\n    --ifm-navbar-item-padding-vertical: 0;\n    --ifm-navbar-sidebar-width: 100%;\n\n    --ifm-navbar-link-color: #41465d;\n    --ifm-navbar-shadow: none;\n\n    --ifm-heading-margin-top: var(--ifm-heading-margin-bottom);\n    
--ifm-hero-background-color: transparent;\n\n    --ifm-code-background: var(--ifm-pre-background) !important;\n    --ifm-code-padding-horizontal: 0.4rem;\n    --ifm-code-padding-vertical: 0.2rem;\n\n    --ifm-color-primary-lightest: #5d9df1;\n    --ifm-color-primary-lighter: #3a87ee;\n    --ifm-color-primary-light: #2e80ed;\n    --ifm-color-primary: #1672eb;\n    --ifm-color-primary-dark: #1266d5;\n    --ifm-color-primary-darker: #1161c9;\n    --ifm-color-primary-darkest: #0e50a6;\n\n    --ifm-link-color: hsl(214, 84%, 50%);\n    --ifm-link-hover-color: hsl(214, 84%, 65%);\n    --ifm-link-hover-decoration: none;\n    --ifm-pre-padding: 1.6rem;\n\n    --ifm-footer-background-color: #272c3d;\n    --ifm-footer-title-color: #f2f3fb;\n    --ifm-footer-link-color: #f2f3fb;\n    --ifm-menu-color-active: #555d76;\n    --max-layout-width: 1680px;\n\n    --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);\n    --docsearch-highlight-color: #242836;\n\n    --ifm-heading-color: #242736;\n\n    --docsearch-text-color: #6c7590;\n    --docsearch-highlight-color: #242836;\n\n    --color-background: #fff;\n    --color-background-subtle: #f3f4fa;\n    --color-background-muted: #f8f9fc;\n    --color-field-background: #f8f9fc;\n    --color-separator: #e0e3f2;\n    --color-border: #d0d5e9;\n    --color-card-background: #fff;\n    --color-card-background-hover: #f8f9fc;\n\n    --color-text: #242836;\n    --color-text-subtle: #6c7590;\n    --color-text-muted: #555d76;\n    --color-text-on-primary: #fff;\n    --color-text-placeholder: #969eb8;\n    --color-black-action: #272d3e;\n    --color-icon: #555d76;\n    --color-hover: #eef0f8;\n    --color-primary-action-hover: #2b3143;\n}\n\nfooter,\nnav {\n    --max-layout-width: 1200px;\n}\n\n@font-face {\n    font-family: 'Lota Grotesque';\n    src: url('/font/lota.woff2') format('woff2'),\n         url('/font/lota.woff') format('woff');\n    font-weight: 600;\n}\n\n.footer__title {\n    font-size: 1.25rem;\n    font-weight: 
600;\n}\n\nhtml .DocSearch-Button {\n    border-radius: 6px !important;\n    font-weight: 400 !important;\n    background: var(--color-field-background) !important;\n    border: 1px solid var(--color-border) !important;\n    width: 256px;\n    height: 40px;\n    padding: 0;\n    padding-inline: 4px;\n\n    /* Annoying, but needed */\n    /* https://stackoverflow.com/questions/26140050/why-is-font-family-not-inherited-in-button-tags-automatically/26140154 */\n    font-family: inherit;\n\n    color: var(--color-text-placeholder);\n\n    &:hover {\n        color: var(--color-text-muted);\n        box-shadow: none !important;\n        background: var(--color-field-background) !important;\n    }\n}\n.DocSearch-Button-Placeholder {\n    display: block !important;\n    font-size: 16px !important;\n}\n\n.DocSearch-Search-Icon {\n    display: none;\n}\n\ndiv[class*=\"navbarSearchContainer\"] {\n    position: static;\n}\n\nhtml[data-theme=\"dark\"] .DocSearch-Button {\n    background: none;\n    border: 1px solid var(--docsearch-muted-color);\n}\n\nhtml[data-theme=\"dark\"] .DocSearch-Button .DocSearch-Search-Icon {\n    color: var(--docsearch-muted-color);\n}\n\nhtml.plugin-pages .main-wrapper {\n    overflow-x: hidden;\n}\n\n.main-wrapper > div {\n    max-width: var(--max-layout-width);\n}\n\naside > div > a {\n    padding-left: 16px;\n}\n\naside > div > a > b {\n    display: none;\n}\n\n@media (max-width: 1200px) {\n    .navbar__toggle {\n        display: inherit;\n    }\n    .navbar__item {\n        display: none;\n    }\n}\n\n@media (max-width: 767px) {\n    .navbar__items--right > div,\n    .navbar__items--right > a {\n        display: none;\n    }\n}\n\n.navbar__toggle {\n    margin: 0;\n    padding: 8px !important;\n\n    svg {\n        color: var(--color-icon);\n        width: 20px;\n        height: 20px;\n    }\n}\n\n.navbar__title {\n    /* Replaced by SVG */\n    display: none;\n}\n\n.navbar__inner {\n    /* .container */\n    max-width: 
var(--max-layout-width);\n    margin: auto;\n    width: 100%;\n}\n\n.navbar__items {\n    height: 28px;\n    @media (min-width: 768px) {\n        height: 40px;\n    }\n}\n\n.navbar__items--right {\n    gap: 16px;\n}\n\n.navbar__item, .navbar__link {\n    font-size: 16px;\n    font-weight: 500;\n    line-height: 24px; /* 150% */\n    padding: 0;\n    color: var(--color-text);\n    border-radius: 12px;\n\n    &:hover,\n    &:focus {\n        color: var(--color-text-muted);\n        background: var(--color-background-muted);\n    }\n}\n\n.navbar__item {\n    padding: 4px 8px;\n}\n\n.navbar__item.dropdown {\n    padding: 4px 16px 4px 8px;\n    a {\n        display: inline-flex;\n    }\n}\n\n.navbar__link--active {\n    color: var(--color-text-muted);\n    background: var(--color-background-muted);\n}\n\n.dropdown > .navbar__link::after {\n    border-color: currentColor;\n    border-style: solid;\n    border-width: 0.1em 0.1em 0 0;\n    content: '';\n    display: inline-block;\n    height: 0.3em;\n    left: 0.3em;\n    position: relative;\n    vertical-align: top;\n    width: 0.3em;\n    top: 8px;\n    transform: rotate(135deg);\n}\n\n.navbar {\n    border-bottom: 1px solid var(--color-separator);\n    height: auto;\n    background: var(--color-background);\n\n    padding: 16px;\n\n    @media (min-width: 768px) {\n        padding: 20px 40px;\n    }\n    @media (min-width: 1024px) {\n        padding: 20px 64px;\n    }\n}\n\nnav[class*='navbarHidden'] {\n    div[class*='navbarLogo'] {\n        display: none;\n    }\n}\n\n.navbar .icon {\n    font-size: 0;\n    padding: 4px;\n    margin-left: 20px;\n    line-height: 0;\n}\n\n.navbar .icon::before {\n    content: '';\n    display: block;\n    width: 24px;\n    height: 24px;\n    background-size: cover;\n}\n\n.navbar svg[class*=\"iconExternalLink\"],\naside svg[class*=\"iconExternalLink\"] {\n    display: none;\n}\n\nheader.hero div[class^=\"heroButtons\"] {\n    justify-content: inherit;\n}\n\narticle .card h2 {\n    
margin-top: 0;\n}\n\n.tsd-kind-icon,\n.menu__link,\n.table-of-contents__link {\n    text-overflow: ellipsis;\n    width: 100%;\n    overflow: hidden;\n    white-space: nowrap;\n}\n\n.tsd-flag {\n    user-select: none;\n}\n\n.menu__caret:before,\n.menu__link--sublist:after {\n    float: right;\n}\n\n.table-of-contents__link {\n    height: 20px;\n}\n\nnav.navbar .dropdown__menu {\n    top: 32px;\n\n    min-width: 6rem;\n    background: var(--color-card-background);\n    border: 1px solid var(--color-border);\n}\n\n.dropdown__menu .dropdown__link {\n    width: 100%;\n    border-radius: 8px;\n}\n\n.dropdown__menu .dropdown__link--active {\n    color: var(--color-text-muted);\n    background: var(--color-background-muted);\n}\n\n.dropdown__menu .dropdown__link:hover,\n.dropdown__menu .dropdown__link--active:hover {\n    background: var(--color-background-muted);\n    color: var(--color-text-muted);\n}\n\n.navbar__logo {\n    height: 2rem;\n}\n\n.navbar__logo_appendix {\n    margin-left: -30px;\n    font-weight: bold;\n}\n\n.navbar__logo_appendix_sidebar {\n    display: block;\n    position: absolute;\n    top: 18px;\n    left: 213px;\n}\n\n.main-wrapper {\n    align-items: safe center;\n}\n\n.main-wrapper > div {\n    width: calc(min(100%, var(--max-layout-width))) !important;\n}\n\n.main-wrapper a[class*=\"sidebarLogo\"] {\n    margin: 0;\n\n    b {\n        display: none;\n    }\n\n    img {\n        height: 28px;\n        margin-top: 4px;\n        margin-bottom: 24px;\n        margin-left: 24px;\n    }\n}\n\ndiv[class*=\"sidebarViewport\"] {\n    top: 22px;\n}\n\nhtml.plugin-pages {\n    font-size: 18px;\n    line-height: 32px;\n}\n\nhtml.plugin-pages h2 {\n    font-size: 36px;\n    line-height: 48px;\n}\n\nhtml.plugin-docs .theme-doc-markdown {\n    font-size: 18px;\n    line-height: 32px;\n}\n\nhtml.plugin-docs .theme-doc-markdown h1 {\n    font-weight: 600;\n    font-size: 48px;\n    line-height: 64px;\n    color: #000;\n}\n\nhtml[data-theme=\"dark\"].plugin-docs 
.theme-doc-markdown h1 {\n    color: #fff;\n}\n\nhtml.plugin-typedoc-api .theme-doc-markdown h1 {\n    color: #000;\n}\n\nhtml[data-theme=\"dark\"].plugin-typedoc-api .theme-doc-markdown h1 {\n    color: #fff;\n}\n\nhtml.plugin-docs .theme-doc-markdown h2 {\n    font-size: 36px;\n    line-height: 48px;\n}\n\nhtml.plugin-docs .theme-doc-markdown h3 {\n    font-size: 28px;\n    line-height: 36px;\n    /*color: #242736;*/\n}\n\n.theme-doc-toc-desktop .table-of-contents {\n    font-size: 16px;\n    line-height: 24px;\n}\n\n.theme-doc-sidebar-menu .menu__link,\n.theme-doc-toc-desktop .table-of-contents .toc-highlight {\n    height: auto;\n    color: #6f7490;\n    background: none;\n}\n\n.theme-doc-sidebar-menu .menu__link:hover {\n    background: inherit;\n}\n\n.theme-doc-sidebar-menu .menu__link {\n    font-weight: 400;\n}\n\n.theme-doc-sidebar-menu .menu__link--active {\n    font-weight: 700;\n    color: var(--color-text-muted);\n}\n\n.theme-doc-sidebar-menu .menu__list-item-collapsible,\n.theme-doc-sidebar-menu .menu__list-item-collapsible--active {\n    background: none;\n}\n\n.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {\n    font-weight: 700;\n}\n\nhtml[data-theme='dark'] .theme-doc-sidebar-menu .menu__link,\nhtml[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .toc-highlight {\n    color: #b3b8d2;\n}\n\nhtml[data-theme='dark'] .theme-doc-sidebar-menu .menu__link--active,\nhtml[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {\n    color: #f2f3fb;\n}\n\n.theme-doc-sidebar-menu .menu__link:hover,\n.theme-doc-sidebar-menu .menu__link--active,\n.theme-doc-toc-desktop .table-of-contents .table-of-contents__link:hover,\n.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active {\n    color: #242736;\n}\n\n.hero {\n    position: relative;\n}\n\n.apiItemContainer .tsd-readme h1:first-child {\n    display: none;\n}\n\nhtml .theme-doc-sidebar-container {\n    border: 
0;\n}\n\nhtml .theme-doc-sidebar-container button {\n    border: 0;\n    border-radius: 10px;\n}\n\nhtml .table-of-contents {\n    border-left: 0;\n}\n\nhtml .table-of-contents ul {\n    border-left: 2px solid #dfe2f5;\n}\n\nhtml.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before,\nhtml.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before {\n    text-transform: uppercase;\n    font-size: 18px;\n    line-height: 28px;\n    color: #6f7490;\n    padding: 20px 12px;\n}\n\n/*\nhtml.plugin-typedoc-api .theme-doc-sidebar-menu > li:first-child::before {\n    display: block;\n    content: 'Core';\n}\n\nhtml.plugin-typedoc-api .theme-doc-sidebar-menu > li:nth-child(6)::before {\n    display: block;\n    content: 'Advanced';\n    padding-top: 60px;\n}\n */\n\n#giscus-comments {\n    display: block;\n    margin-top: 50px;\n}\n\n.video-container {\n    margin: 85px auto 0;\n    max-width: 560px;\n    overflow: hidden;\n    position: relative;\n    width: 100%;\n    border-radius: 10px;\n}\n\n.yt-lite > .lty-playbtn {\n    border: 0;\n    cursor: pointer;\n}\n\n@media screen and (min-width: 768px) {\n    .runnable-code-block .code-block.no-title pre + div {\n        position: absolute;\n        right: 170px;\n        line-height: 28px;\n    }\n}\n\n.runnable-code-block .code-block button {\n    height: 36px;\n    margin-top: 1px;\n}\n\n.runnable-code-block:hover .code-block button {\n    opacity: 0.4;\n}\n\nhtml[data-theme='dark'] .runnable-code-block svg .apify-logo {\n    fill: #fff;\n}\n\n/*\n * Reset the line-number counter for each .prism-code scope\n */\n.prism-code {\n    counter-reset: line-number;\n}\n\n/*\n * Notice the chained .language-ts class name to .prism-code\n * You can chain more languages in order to add line numbers\n */\n.prism-code.language-ts .token-line::before,\n.prism-code.language-typescript .token-line::before,\n.prism-code.language-javascript .token-line::before,\n.prism-code.language-json 
.token-line::before,\n.prism-code.language-json5 .token-line::before,\n.prism-code.language-python .token-line::before,\n.prism-code.language-dockerfile .token-line::before,\n.prism-code.language-XML .token-line::before,\n.prism-code.language-js .token-line::before,\n.prism-code.language-python .token-line::before {\n    counter-increment: line-number;\n    content: counter(line-number);\n    margin-right: calc(var(--ifm-pre-padding) * 0.8);\n    text-align: right;\n    min-width: 1.5rem;\n    display: inline-block;\n    opacity: .3;\n    position: sticky;\n    left: var(--ifm-pre-padding);\n}\n\ndiv[class^=\"announcementBar_\"] {\n    background: #4585b6;\n    color: #fff;\n}\n\ndiv[class^=\"announcementBar_\"] button {\n    color: #fff;\n}\n\n.markdown blockquote {\n    --ifm-alert-background-color: var(--ifm-color-info-contrast-background);\n    --ifm-alert-background-color-highlight: rgba(84,199,236,.15);\n    --ifm-alert-foreground-color: var(--ifm-color-info-contrast-foreground);\n    --ifm-alert-border-color: var(--ifm-color-info-dark);\n    --ifm-code-background: var(--ifm-alert-background-color-highlight);\n    --ifm-link-color: var(--ifm-alert-foreground-color);\n    --ifm-link-hover-color: var(--ifm-alert-foreground-color);\n    --ifm-link-decoration: underline;\n    --ifm-tabs-color: var(--ifm-alert-foreground-color);\n    --ifm-tabs-color-active: var(--ifm-alert-foreground-color);\n    --ifm-tabs-color-active-border: var(--ifm-alert-border-color);\n    background-color: var(--ifm-alert-background-color);\n    border: var(--ifm-alert-border-width) solid var(--ifm-alert-border-color);\n    border-left-width: var(--ifm-alert-border-left-width);\n    border-radius: var(--ifm-alert-border-radius);\n    box-shadow: var(--ifm-alert-shadow);\n    padding: var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal);\n}\n\n.tsd-parameters li {\n    margin-bottom: 16px;\n}\n\n.tsd-parameters-title {\n    font-size: 16px;\n    margin-bottom: 16px 
!important;\n}\n\n.tsd-returns-title {\n    font-size: 16px;\n}\n\n.DocSearch-Button-Key {\n    background: var(--color-background-subtle) !important;\n    box-shadow: none !important;\n    border: 1px solid var(--color-border) !important;\n    padding: 0 !important;\n    color: var(--color-text-muted) !important;\n}\n\n.navbar-sidebar__brand {\n    border-bottom: 1px solid var(--color-separator);\n    flex-direction: column;\n    height: auto;\n    padding: 0;\n}\n\n.menu-primary {\n    padding: 0;\n    .menu__list-item {\n        border-bottom: 1px solid var(--color-separator);\n        margin: 0px 24px !important;\n        a {\n            margin: 8px 0px 4px;\n            padding: 8px;\n        }\n        display: flex;\n    }\n    .menu__link {\n        font-size: 16px;\n        font-weight: 500;\n        line-height: 24px;\n    }\n}\n\n.navbar-sidebar__close {\n    margin-left: 16px;\n    svg {\n        g {\n            stroke: var(--color-icon);\n        }\n        width: 32px;\n        height: 32px;\n        padding: 8px;\n    }\n}\n\n.DocSearch-Modal {\n    font-family: var(--ifm-font-family-base);\n\n    border-radius: 8px !important;\n    border: 1px solid var(--color-border) !important;\n    background: var(--color-card-background) !important;\n    box-shadow: none !important;\n\n    button {\n        font-family: var(--ifm-font-family-base);\n    }\n\n    .DocSearch-Logo {\n        display: none;\n    }\n\n    .DocSearch-Footer {\n        flex-direction: row;\n        border-top: 1px solid var(--color-border);\n        background: var(--color-background);\n        box-shadow: none;\n    }\n\n    .DocSearch-Label {\n        color: var(--color-text-subtle);\n        font-size: 14px;\n        font-weight: 400;\n        line-height: 20px;\n    }\n\n    .DocSearch-Commands-Key {\n        border-radius: 4px;\n        border: 1px solid var(--color-border);\n        background: var(--color-background-subtle);\n        box-shadow: none;\n        g {\n           
 stroke: var(--color-text-subtle);\n        }\n    }\n\n    .DocSearch-Clear {\n        color: var(--color-text-subtle);\n    }\n\n    .DocSearch-Form {\n        border-radius: 6px;\n        border-radius: var(--Radius-6, 6px);\n        border: 1px solid var(--color-border);\n        background: var(--color-background);\n        box-shadow: none;\n        height: 40px;\n        padding: 8px 12px;\n    }\n\n    .DocSearch-Input {\n        color: var(--color-text);\n        font-size: 14px;\n        line-height: 20px;\n        padding: 0;\n    }\n\n    .DocSearch-Input::placeholder {\n        color: var(--color-text-placeholder);\n        font-style: italic;\n    }\n\n    .DocSearch-Search-Icon {\n        width: 16px;\n        height: 16px;\n        path {\n            stroke: var(--color-text-muted);\n        }\n    }\n\n    .DocSearch-Reset {\n        display: none;\n    }\n\n    .DocSearch-Help {\n        color: var(--color-text-subtle);\n    }\n\n    .DocSearch-Hit-source {\n        color: var(--color-text-subtle);\n        font-size: 14px;\n        font-weight: 400;\n        line-height: 20px;\n        padding-bottom: 4px;\n        padding-left: 12px;\n        background: var(--color-card-background);\n    }\n\n    .DocSearch-Hit {\n        background: transparent;\n        a {\n            background: transparent !important;\n            padding: 0;\n            box-shadow: none;\n        }\n        a:hover {\n            background: var(--color-hover) !important;\n        }\n    }\n\n    .DocSearch-Hit[aria-selected='true'] a {\n        background: var(--color-hover) !important;\n    }\n\n    .DocSearch-Hit-Container {\n        background: transparent;\n        height: 50px;\n    }\n\n    .DocSearch-Screen-Icon {\n        display: none;\n    }\n\n    .DocSearch-NoResults {\n        margin: 0;\n        display: flex;\n        flex-direction: column;\n        width: 100%;\n        padding: 16px 8px;\n        gap: 24px;\n\n        .DocSearch-Title {\n            
color: var(--color-text);\n            font-size: 16px;\n            font-weight: 500;\n            line-height: 24px;\n            width: fit-content;\n            margin: 0;\n        }\n    }\n\n    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-title,\n    .DocSearch-Hit-title {\n        color: var(--color-text) !important;\n        font-size: 16px;\n        font-style: normal;\n        font-weight: 500;\n        line-height: 24px; /* 150% */\n    }\n\n    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-path,\n    .DocSearch-Hit-path,\n    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-action,\n    .DocSearch-Hit-action,\n    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-icon,\n    .DocSearch-Hit-icon,\n    .DocSearch-Hit[aria-selected='true'] .DocSearch-Hit-Tree,\n    .DocSearch-Hit-Tree {\n        color: var(--color-text-muted) !important;\n    }\n\n    .DocSearch-Hit[aria-selected='true'] mark,\n    .DocSearch-Hit mark {\n        color: var(--color-text-subtle) !important;\n    }\n\n    .DocSearch-Help {\n        color: var(--color-text-subtle);\n        font-size: 14px;\n        font-weight: 400;\n        line-height: 16px;\n    }\n\n    .DocSearch-NoResults-Prefill-List {\n        padding: 0;\n        li {\n            list-style-type: none;\n            margin-top: 4px;\n        }\n    }\n\n    .DocSearch-Prefill {\n        color: var(--color-text);\n        font-size: 14px;\n        font-weight: 500;\n        line-height: 20px;\n        &:hover {\n            color: var(--color-text-subtle);\n            text-decoration: none;\n        }\n    }\n\n    .DocSearch-HitsFooter {\n        color: var(--color-text-subtle);\n        font-size: 14px;\n        font-weight: 400;\n        line-height: 16px;\n\n        a {\n            border: none;\n        }\n\n        a:hover {\n            color: var(--color-text);\n        }\n    }\n\n    .DocSearch-Hit-icon {\n        margin-left: 8px;\n        width: auto;\n        height: auto;\n        svg 
{\n            width: 16px;\n            height: 16px;\n        }\n    }\n\n    li[id*='recentSearches'] {\n        .DocSearch-Hit-icon {\n            display: none;\n        }\n    }\n\n    .DocSearch-SearchBar {\n        padding: 16px 16px 8px;\n    }\n\n    .DocSearch-Hit-Select-Icon {\n        display: none !important;\n    }\n\n    .DocSearch-Dropdown {\n        padding: 0 8px;\n    }\n\n    .DocSearch-Cancel {\n        color: var(--color-text-subtle);\n        font-size: 14px;\n        font-weight: 500;\n        line-height: 20px;\n        &:hover {\n            color: var(--color-text);\n        }\n    }\n\n    .DocSearch-NoResults-Prefill-List ul {\n        padding: 0;\n    }\n}\n"
  },
  {
    "path": "website/src/pages/home_page_example.py",
    "content": "import asyncio\n\nfrom crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext\n\n\nasync def main() -> None:\n    crawler = PlaywrightCrawler(\n        max_requests_per_crawl=10,  # Limit the max requests per crawl.\n        headless=True,  # Run in headless mode (set to False to see the browser).\n        browser_type='firefox',  # Use Firefox browser.\n    )\n\n    # Define the default request handler, which will be called for every request.\n    @crawler.router.default_handler\n    async def request_handler(context: PlaywrightCrawlingContext) -> None:\n        context.log.info(f'Processing {context.request.url} ...')\n\n        # Extract data from the page using Playwright API.\n        data = {\n            'url': context.request.url,\n            'title': await context.page.title(),\n        }\n\n        # Push the extracted data to the default dataset.\n        await context.push_data(data)\n\n        # Extract all links on the page and enqueue them.\n        await context.enqueue_links()\n\n    # Run the crawler with the initial list of URLs.\n    await crawler.run(['https://crawlee.dev'])\n\n    # Export the entire dataset to a CSV file.\n    await crawler.export_data('results.csv')\n\n    # Or access the data directly.\n    data = await crawler.get_data()\n    crawler.log.info(f'Extracted data: {data.items}')\n\n\nif __name__ == '__main__':\n    asyncio.run(main())\n"
  },
  {
    "path": "website/src/pages/index.js",
    "content": "/* eslint-disable max-len */\nimport Link from '@docusaurus/Link';\nimport useDocusaurusContext from '@docusaurus/useDocusaurusContext';\nimport CodeBlock from '@theme/CodeBlock';\nimport Layout from '@theme/Layout';\nimport ThemedImage from '@theme/ThemedImage';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './index.module.css';\nimport Button from '../components/Button';\nimport HomepageCliExample from '../components/Homepage/HomepageCliExample';\nimport HomepageCtaSection from '../components/Homepage/HomepageCtaSection';\nimport HomepageHeroSection from '../components/Homepage/HomepageHeroSection';\nimport LanguageInfoWidget from '../components/Homepage/LanguageInfoWidget';\nimport RiverSection from '../components/Homepage/RiverSection';\nimport RunnableCodeBlock from '../components/RunnableCodeBlock';\nimport ThreeCardsWithIcon from '../components/Homepage/ThreeCardsWithIcon';\n\nimport HomePageExample from '!!raw-loader!roa-loader!./home_page_example.py';\n\nfunction GetStartedSection() {\n    return (\n        <section className={styles.languageGetStartedSection}>\n            <LanguageInfoWidget\n                language=\"Python\"\n                githubUrl=\"https://github.com/apify/crawlee-python\"\n                to=\"/python/docs/quick-start\"\n            />\n        </section>\n    );\n}\n\nfunction CodeExampleSection() {\n    return (\n        <section className={styles.codeExampleSection}>\n            <div className={styles.decorativeRow} />\n            <div className={styles.codeBlockContainer}>\n                <RunnableCodeBlock className=\"language-python\" language=\"python\">\n                    {HomePageExample}\n                </RunnableCodeBlock>\n            </div>\n            <div className={styles.dashedSeparator} />\n            <div className={styles.decorativeRow} />\n        </section>\n    );\n}\n\nconst benefitsCodeBlockCrawler = `fingerprint_generator = 
DefaultFingerprintGenerator(\n    header_options=HeaderGeneratorOptions(\n        browsers=['chromium', 'firefox'],\n        devices=['mobile'],\n        locales=['en-US']\n    ),\n)`;\n\n// TODO:\nconst benefitsCodeBlockHeadless = `crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()\n\n@crawler.router.default_handler\nasync def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:\n    prices = await context.query_selector_all('span.price')\n    await context.enqueue_links()`;\n\nfunction BenefitsSection() {\n    return (\n        <section className={styles.benefitsSection}>\n            <h2>What are the benefits?</h2>\n            <RiverSection\n                title=\"Unblock websites by default\"\n                description=\"Crawlee crawls stealthily with zero configuration, but you can customize its behavior to overcome any protection. Real-world fingerprints included.\"\n                content={\n                    <CodeBlock className=\"code-block\" language=\"python\">\n                        {benefitsCodeBlockCrawler}\n                    </CodeBlock>\n                }\n                to=\"/docs/guides/avoid-blocking\"\n            />\n            <div className={styles.trianglesSeparator} />\n            <RiverSection\n                title=\"Work with your favorite tools\"\n                description=\"Crawlee integrates BeautifulSoup, Cheerio, Puppeteer, Playwright, and other popular open-source tools. 
No need to learn new syntax.\"\n                content={\n                    <ThemedImage\n                        alt=\"Work with your favorite tools\"\n                        sources={{\n                            light: '/python/img/favorite-tools-light.webp',\n                            dark: '/python/img/favorite-tools-dark.webp',\n                        }}\n                    />\n                }\n                reversed\n                to=\"/docs/quick-start#choose-your-crawler\"\n            />\n            <div className={styles.trianglesSeparator} />\n            <RiverSection\n                title=\"One API for headless and HTTP\"\n                description=\"Switch between HTTP and headless without big rewrites thanks to a shared API. Or even let Adaptive crawler decide if JS rendering is needed.\"\n                content={\n                    <CodeBlock className=\"code-block\" language=\"python\">\n                        {benefitsCodeBlockHeadless}\n                    </CodeBlock>\n                }\n                to=\"/api\"\n            />\n        </section>\n    );\n}\n\nfunction OtherFeaturesSection() {\n    return (\n        <section className={styles.otherFeaturesSection}>\n            <h2>What else is in Crawlee?</h2>\n            <div className={styles.cardsWithContentContainer}>\n                <div className={styles.cardsWithImageContainer}>\n                    <Link className={styles.cardWithImage} to=\"/docs/guides/scaling-crawlers\">\n                        <ThemedImage\n                            sources={{\n                                light: '/python/img/auto-scaling-light.webp',\n                                dark: '/python/img/auto-scaling-dark.webp',\n                            }}\n                            alt=\"\"\n                        />\n                        <div className={styles.cardWithImageText}>\n                            <h3 className={styles.cardWithImageTitle}>\n                    
            Auto scaling\n                            </h3>\n                            <div className={styles.cardWithImageDescription}>\n                                Crawlers automatically adjust concurrency based\n                                on available system resources. Avoid memory\n                                errors in small containers and run faster in\n                                large ones.\n                            </div>\n                        </div>\n                    </Link>\n                    <Link className={styles.cardWithImage} to=\"/docs/guides/proxy-management\">\n                        <ThemedImage\n                            sources={{\n                                light: '/python/img/smart-proxy-light.webp',\n                                dark: '/python/img/smart-proxy-dark.webp',\n                            }}\n                            alt=\"\"\n                        />\n                        <div className={styles.cardWithImageText}>\n                            <h3 className={styles.cardWithImageTitle}>\n                                Smart proxy rotation\n                            </h3>\n                            <div className={styles.cardWithImageDescription}>\n                                Crawlee uses a pool of sessions represented by\n                                different proxies to maintain the proxy\n                                performance and keep IPs healthy. 
Blocked\n                                proxies are removed from the pool automatically.\n                            </div>\n                        </div>\n                    </Link>\n                </div>\n                <ThreeCardsWithIcon\n                    cards={[\n                        {\n                            icon: (\n                                <ThemedImage\n                                    sources={{\n                                        light: '/python/img/queue-light-icon.svg',\n                                        dark: '/python/img/queue-dark-icon.svg',\n                                    }}\n                                    alt=\"\"\n                                />\n                            ),\n                            title: 'Queue and storage',\n                            description:\n                                'Pause and resume crawlers thanks to a persistent queue of URLs and storage for structured data.',\n                            to: '/docs/guides/storages',\n                        },\n                        {\n                            icon: (\n                                <ThemedImage\n                                    sources={{\n                                        light: '/python/img/scraping-utils-light-icon.svg',\n                                        dark: '/python/img/scraping-utils-dark-icon.svg',\n                                    }}\n                                    alt=\"\"\n                                />\n                            ),\n                            title: 'Handy scraping utils',\n                            description:\n                                'Sitemaps, infinite scroll, contact extraction, large asset blocking and many more utils included.',\n                            to: '/docs/guides/avoid-blocking',\n\n                        },\n                        {\n                            icon: (\n                                
<ThemedImage\n                                    sources={{\n                                        light: '/python/img/routing-light-icon.svg',\n                                        dark: '/python/img/routing-dark-icon.svg',\n                                    }}\n                                    alt=\"\"\n                                />\n                            ),\n                            title: 'Routing & middleware',\n                            description:\n                                'Keep your code clean and organized while managing complex crawls with a built-in router that streamlines the process.',\n                            to: '/api/class/Router',\n                        },\n                    ]}\n                />\n            </div>\n        </section>\n    );\n}\n\nfunction DeployToCloudSection() {\n    return (\n        <section className={styles.deployToCloudSection}>\n            <div className={styles.deployToCloudLeftSide}>\n                <h2>Deploy to cloud </h2>\n                <div className={styles.deployToCloudDescription}>\n                    Crawlee, by Apify, works anywhere, but Apify offers the best\n                    experience. 
Easily turn your project into an{' '}\n                    <Link to=\"https://apify.com/actors\" rel=\"dofollow\">\n                        Actor\n                    </Link>\n                    —a serverless micro-app with built-in infra, proxies, and\n                    storage.\n                </div>\n                <Button\n                    withIcon\n                    to=\"https://docs.apify.com/platform/actors/development/deployment\"\n                >\n                    Deploy to Apify\n                </Button>\n            </div>\n            <div className={styles.deployToCloudRightSide}>\n                <div\n                    className={styles.dashedSeparatorVertical}\n                    id={styles.verticalStepLine}\n                />\n                <div className={styles.deployToCloudStep}>\n                    <div className={styles.deployToCloudStepNumber}>\n                        <div>1</div>\n                    </div>\n                    <div className={styles.deployToCloudStepText}>\n                        Install Apify SDK and Apify CLI.\n                    </div>\n                </div>\n                <div className={styles.deployToCloudStep}>\n                    <div className={styles.deployToCloudStepNumber}>\n                        <div>2</div>\n                    </div>\n                    <div className={styles.deployToCloudStepText}>\n                        Add <pre>Actor.init()</pre> to the beginning and{' '}\n                        <pre>Actor.exit()</pre> to the end of your code.\n                    </div>\n                </div>\n                <div className={styles.deployToCloudStep}>\n                    <div className={styles.deployToCloudStepNumber}>\n                        <div>3</div>\n                    </div>\n                    <div className={styles.deployToCloudStepText}>\n                        Use the Apify CLI to push the code to the Apify\n                        platform.\n           
         </div>\n                </div>\n            </div>\n        </section>\n    );\n}\n\nfunction BuildFastScrapersSection() {\n    return (\n        <section className={styles.buildFastScrapersSection}>\n            <div className={styles.dashedDecorativeCircle} />\n            <div className={styles.dashedSeparator} />\n            <h2>Crawlee helps you build scrapers faster</h2>\n            <ThreeCardsWithIcon\n                cards={[\n                    {\n                        icon: (\n                            <ThemedImage\n                                sources={{\n                                    light: '/python/img/zero-setup-light-icon.svg',\n                                    dark: '/python/img/zero-setup-dark-icon.svg',\n                                }}\n                                alt=\"\"\n                            />\n                        ),\n                        title: 'Zero setup required',\n                        description:\n                            'Copy code example, install Crawlee and go. No CLI required, no complex file structure, no boilerplate.',\n                        actionLink: {\n                            text: 'Get started',\n                            href: '/docs/quick-start',\n                        },\n                    },\n                    {\n                        icon: (\n                            <ThemedImage\n                                sources={{\n                                    light: '/python/img/defaults-light-icon.svg',\n                                    dark: '/python/img/defaults-dark-icon.svg',\n                                }}\n                                alt=\"\"\n                            />\n                        ),\n                        title: 'Reasonable defaults',\n                        description:\n                            'Unblocking, proxy rotation and other core features are already turned on. 
But also very configurable.',\n                        actionLink: {\n                            text: 'Learn more',\n                            href: '/docs/examples',\n                        },\n                    },\n                    {\n                        icon: (\n                            <ThemedImage\n                                sources={{\n                                    light: '/python/img/community-light-icon.svg',\n                                    dark: '/python/img/community-dark-icon.svg',\n                                }}\n                                alt=\"\"\n                            />\n                        ),\n                        title: 'Helpful community',\n                        description:\n                            'Join our Discord community of over 10k developers and get fast answers to your web scraping questions.',\n                        actionLink: {\n                            text: 'Join Discord',\n                            href: 'https://discord.gg/jyEM2PRvMU',\n                        },\n                    },\n                ]}\n            />\n        </section>\n    );\n}\n\nexport default function JavascriptHomepage() {\n    const { siteConfig } = useDocusaurusContext();\n    return (\n        <Layout description={siteConfig.description}>\n            <div id={styles.homepageContainer}>\n                <HomepageHeroSection />\n                <GetStartedSection />\n                <div className={clsx(styles.dashedSeparator, styles.codeExampleTopSeparator)} />\n                <CodeExampleSection />\n                <HomepageCliExample />\n                <div className={styles.dashedSeparator}>\n                    <div\n                        className={styles.dashedDecorativeCircle}\n                        id={styles.ctaDecorativeCircle}\n                    />\n                </div>\n                <BenefitsSection />\n                <div className={styles.dashedSeparator} 
/>\n                <OtherFeaturesSection />\n                <div className={styles.dashedSeparator} />\n                <DeployToCloudSection />\n                <div className={styles.dashedSeparator} />\n                <BuildFastScrapersSection />\n                <HomepageCtaSection />\n            </div>\n        </Layout>\n    );\n}\n"
  },
  {
    "path": "website/src/pages/index.module.css",
    "content": "/************* PAGE LAYOUT *************/\n\n#homepageContainer {\n    width: calc(100% - 48px) !important;\n    max-width: 1200px !important;\n    border-left: 1px solid var(--color-separator);\n    border-right: 1px solid var(--color-separator);\n    margin: 0 24px;\n}\n\n.dashedSeparator {\n    position: relative;\n    width: 100%;\n    border-bottom: 1px dashed var(--color-separator);\n}\n\n.dashedSeparatorVertical {\n    position: relative;\n    border-right: 1px dashed var(--color-separator);\n}\n\n.dashedDecorativeCircle {\n    width: 120px;\n    height: 120px;\n    border: 1px dashed var(--color-separator);\n    border-radius: 50%;\n    position: absolute;\n    transform: translate(-50%, -50%);\n}\n\n.fadedOutSeparator {\n    border: none;\n    height: 1px;\n    background-image:\n        linear-gradient(\n            90deg,\n            transparent,\n            transparent 50%,\n            var(--color-background) 50%,\n            var(--color-background) 100%\n        ),\n        linear-gradient(\n            90deg,\n            var(--color-separator) 0%,\n            transparent 50%,\n            var(--color-separator) 100%\n        );\n    background-size:\n        6px 1px,\n        100% 1px;\n}\n\n.fadedOutSeparatorVertical {\n    border: none;\n    width: 1px;\n    background-image:\n        linear-gradient(\n            180deg,\n            transparent,\n            transparent 50%,\n            var(--color-background) 50%,\n            var(--color-background) 100%\n        ),\n        linear-gradient(\n            180deg,\n            var(--color-separator) 0%,\n            transparent 50%,\n            var(--color-separator) 100%\n        );\n    background-size:\n        1px 6px,\n        1px 100%;\n}\n\n.trianglesSeparator {\n    width: 100%;\n    height: 32px;\n    background-position: center;\n    background-repeat: repeat-x;\n    background-image: url(\"../../static/img/triangles_light.svg\");\n\n    html[data-theme=\"dark\"] 
& {\n        background-image: url(\"../../static/img/triangles_dark.svg\");\n    }\n\n    /* TABLET */\n    @media (min-width: 768px) {\n        background-position: unset;\n        background-repeat: repeat;\n        height: 52px;\n    }\n}\n\n/* most separators and decorations are not displayed on mobile */\n.dashedSeparatorVertical,\n.dashedDecorativeCircle,\n.fadedOutSeparator,\n.fadedOutSeparatorVertical {\n    display: none;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .dashedSeparatorVertical,\n    .dashedDecorativeCircle,\n    .fadedOutSeparator,\n    .fadedOutSeparatorVertical {\n        display: block;\n    }\n\n    #homepageContainer {\n        width: calc(100% - 80px) !important;\n        margin: 0 40px;\n    }\n}\n\n/* DESKTOP */\n@media (min-width: 1024px) {\n    .dashedSeparatorVertical,\n    .dashedDecorativeCircle,\n    .fadedOutSeparator,\n    .fadedOutSeparatorVertical {\n        display: block;\n    }\n\n    #homepageContainer {\n        width: calc(100% - 128px) !important;\n        margin: 0 64px;\n    }\n}\n\n/************* LANGUAGE GET STARTED SECTION *************/\n\n.languageGetStartedSection {\n    display: flex;\n    flex-direction: column;\n    gap: 32px;\n    margin: 0 0 32px 0;\n\n    div[class^=\"languageGetStartedContainer\"] {\n        flex: 1;\n    }\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .languageGetStartedSection {\n        flex-direction: row;\n        align-items: stretch;\n        justify-content: space-around;\n        gap: 0;\n        margin: 0;\n    }\n}\n\n/************* CODE EXAMPLE SECTION *************/\n\n.codeExampleTopSeparator {\n    display: none;\n}\n@media (min-width: 768px) {\n    .codeExampleTopSeparator {\n        display: block;\n    }\n}\n\n.languageSwitchContainer {\n    place-self: center;\n    margin: 32px 0 16px 0;\n}\n\n.codeBlockContainer {\n    :global(.theme-code-block) {\n        margin-bottom: 32px;\n        border-radius: 0;\n        box-shadow: none;\n        
border-bottom: 1px dashed var(--color-separator);\n        border-top: 1px dashed var(--color-separator);\n        code {\n            font-size: 14px;\n            background: var(--color-background-muted);\n            padding: 16px 8px 16px 4px;\n\n            span::before {\n                margin-right: 16px !important;\n                left: unset !important;\n                margin-right: 16px !important;\n                color: var(--color-text-subtle) !important;\n                opacity: 1 !important;\n            }\n        }\n    }\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .codeBlockContainer :global(.theme-code-block) {\n        margin-bottom: 0;\n        border-bottom: none;\n        border-top: none;\n    }\n\n    .codeExampleSection {\n        position: relative;\n    }\n\n    .languageSwitchContainer {\n        margin: 0;\n        position: absolute;\n        top: calc(46px - 18px);\n        left: calc(50% - 90px);\n    }\n\n    .decorativeRow {\n        position: relative;\n        height: 46px;\n        border-bottom: 1px dashed var(--color-separator);\n\n        &::before {\n            content: \" \";\n            position: absolute;\n            left: 40px;\n            height: 100%;\n            border-right: 1px dashed var(--color-separator);\n        }\n\n        &::after {\n            content: \" \";\n            position: absolute;\n            right: 40px;\n            height: 100%;\n            border-left: 1px dashed var(--color-separator);\n        }\n    }\n\n    .codeBlockContainer {\n        margin: 0 40px;\n        border-left: 1px dashed var(--color-separator);\n        border-right: 1px dashed var(--color-separator);\n    }\n}\n\n@media (min-width: 1024px) {\n    .decorativeRow {\n        &::before {\n            left: 60px;\n        }\n\n        &::after {\n            right: 60px;\n        }\n    }\n    .codeBlockContainer {\n        margin: 0 60px;\n    }\n}\n\n#ctaDecorativeCircle {\n    width: 120px;\n    
height: 120px;\n}\n\n/************** BENEFITS SECTION ***********/\n\n.benefitsSection {\n    margin-bottom: 60px;\n\n    h2 {\n        margin: 32px 0;\n        text-align: center;\n        padding: 0 12px;\n\n        /* TABLET */\n        @media (min-width: 768px) {\n            margin: 80px 0;\n        }\n    }\n}\n\n/************** OTHER FEATURES SECTION ***********/\n\n.otherFeaturesSection {\n    display: flex;\n    flex-direction: column;\n\n    h2 {\n        padding: 32px 12px;\n\n        text-align: center;\n        color: var(--color-text);\n        font-weight: 400;\n\n        line-height: 46px !important;\n        font-size: 36px !important;\n\n        @media (min-width: 768px) {\n            line-height: 56px !important;\n            font-size: 48px !important;\n            margin: 80px 0 64px;\n            padding: 32px 24px;\n        }\n    }\n    margin-bottom: 40px;\n\n    @media (min-width: 768px) {\n        margin-bottom: 80px;\n    }\n}\n\n.cardsWithContentContainer {\n    display: flex;\n    flex-direction: column;\n    gap: 20px;\n    background-position-x: 5px;\n    background-image: url(\"../../static/img/triangles_light.svg\");\n\n    html[data-theme=\"dark\"] & {\n        background-image: url(\"../../static/img/triangles_dark.svg\");\n    }\n\n    @media (min-width: 768px) {\n        gap: 48px;\n    }\n}\n\n.cardsWithImageContainer {\n    display: flex;\n    flex-direction: column;\n    gap: 20px;\n    width: 100%;\n\n    @media (min-width: 768px) {\n        gap: 32px;\n        flex-direction: row;\n    }\n}\n\n.cardWithImage {\n    flex: 1;\n    display: flex;\n    flex-direction: column;\n    overflow: hidden;\n    background: var(--color-card-background);\n    border-block: 1px solid var(--color-separator);\n    transition: background 0.1s ease;\n\n    @media (min-width: 768px) {\n        border: 1px solid var(--color-separator);\n    }\n\n    &:first-child {\n        border-left: 0;\n    }\n    &:last-child {\n        border-right: 
0;\n    }\n\n    &:hover {\n        background: var(--color-card-background-hover);\n    }\n}\n\n.cardWithImage img {\n    width: 100%;\n    height: 250px;\n    object-fit: cover;\n}\n\n.cardWithImage:last-child img {\n    object-position: left 90%;\n}\n\n.cardWithImageText {\n    padding: 40px 24px;\n    border-top: 1px solid var(--color-separator);\n}\n\n.cardWithImageTitle {\n    margin: 0;\n\n    color: var(--color-text);\n    font-size: 26px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 34px;\n}\n\n.cardWithImageDescription {\n    margin-top: 12px;\n    color: var(--color-text-muted);\n    font-family: var(--ifm-font-family-base);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 24px;\n}\n\n/************** DEPLOY TO CLOUD SECTION ***********/\n\n.deployToCloudSection {\n    padding: 32px 16px;\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n    gap: 48px;\n}\n\n.deployToCloudLeftSide {\n    display: flex;\n    flex-direction: column;\n    flex-basis: 50%;\n    gap: 24px;\n    text-align: center;\n    font-style: normal;\n    font-weight: 400;\n\n    a {\n        width: fit-content;\n        margin: auto;\n    }\n\n    h2 {\n        color: var(--color-text);\n        font-family: \"Lota Grotesque\";\n        font-size: 38px;\n        line-height: 46px;\n    }\n}\n\n.deployToCloudDescription {\n    color: var(--color-text-muted);\n    font-size: 16px;\n    line-height: 24px;\n\n    a {\n        color: inherit;\n        text-decoration: underline;\n    }\n}\n\n.deployToCloudRightSide {\n    display: flex;\n    flex-direction: column;\n    gap: 24px;\n    flex-basis: 50%;\n    position: relative;\n}\n\n.deployToCloudStep {\n    display: flex;\n    flex-direction: row;\n    gap: 16px;\n    align-items: center;\n}\n\n.deployToCloudStepNumber {\n    display: flex;\n    justify-content: center;\n    align-items: center;\n    width: 72px;\n    height: 72px;\n    padding: 16px;\n    
border-radius: 8px;\n    border: 1px solid var(--color-separator);\n    background: var(--color-background);\n    color: var(--color-text-muted);\n    font-size: 16px;\n    font-style: normal;\n    font-weight: 400;\n    line-height: 24px;\n    z-index: 1;\n    div {\n        display: flex;\n        justify-content: center;\n        align-items: center;\n        height: 40px;\n        width: 40px;\n        border-radius: 50%;\n        border: 1px dashed var(--color-separator);\n        flex-shrink: 0;\n    }\n}\n\n.deployToCloudStepText {\n    display: inline-flex;\n    align-items: baseline;\n    flex-wrap: wrap;\n    gap: 4px;\n    color: var(--color-text);\n    font-size: 14px;\n    font-style: normal;\n    font-weight: 500;\n    line-height: 20px;\n\n    pre {\n        margin: 0;\n        padding: 0;\n        background-color: transparent;\n    }\n}\n\n#verticalStepLine {\n    position: absolute;\n    left: 36px;\n    height: 100%;\n    z-index: 0;\n}\n\n/* TABLET */\n@media (min-width: 768px) {\n    .deployToCloudSection {\n        padding: 96px 40px;\n        flex-direction: row;\n    }\n    .deployToCloudLeftSide {\n        text-align: left;\n\n        a {\n            margin: 0;\n        }\n\n        h2 {\n            color: var(--color-text);\n            font-family: \"Lota Grotesque\";\n            font-size: 48px;\n            line-height: 58px;\n        }\n    }\n    .deployToCloudDescription {\n        font-size: 18px;\n        line-height: 28px;\n    }\n}\n\n/************** BUILD SCRAPERS FAST SECTION ***********/\n\n.buildFastScrapersSection {\n    position: relative;\n\n    padding: 40px 0 32px;\n\n    border-bottom: 1px solid var(--color-separator);\n\n    h2 {\n        margin: 0;\n        padding: 32px 0;\n        text-align: center;\n        color: var(--color-text);\n        font-weight: 400;\n        padding-inline: 12px;\n\n        line-height: 46px !important;\n        font-size: 36px !important;\n\n        @media (min-width: 768px) {\n      
      padding-inline: 24px;\n\n            line-height: 56px !important;\n            font-size: 48px !important;\n            padding: 80px 0 64px;\n        }\n    }\n\n    div[class*=\"dashedDecorativeCircle\"] {\n        display: none;\n    }\n\n    @media (min-width: 1024px) {\n        padding: 80px 0 60px;\n        div[class*=\"dashedDecorativeCircle\"] {\n            display: block;\n        }\n    }\n}\n\n.buildFastScrapersContent {\n    border-block: 1px solid var(--color-separator);\n}\n"
  },
  {
    "path": "website/src/plugins/docusaurus-plugin-segment/index.js",
    "content": "const path = require('path');\n\nmodule.exports = function (context, options) {\n    const { writeKey, allowedInDev = false } = options;\n\n    return {\n        name: 'docusaurus-plugin-segment',\n\n        getClientModules() {\n            return [path.resolve(__dirname, './segment')];\n        },\n\n        injectHtmlTags() {\n            if (process.env.NODE_ENV !== 'production' && !allowedInDev) {\n                return {};\n            }\n\n            if (!writeKey) {\n                console.warn('You need to specify a Segment writeKey in the plugin options');\n                return {};\n            }\n\n            return {\n                headTags: [\n                    {\n                        tagName: 'script',\n                        innerHTML: `\n            !function(){var i=\"analytics\",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error(\"Segment snippet included twice.\");else{analytics.invoked=!0;analytics.methods=[\"trackSubmit\",\"trackClick\",\"trackLink\",\"trackForm\",\"pageview\",\"identify\",\"reset\",\"group\",\"track\",\"ready\",\"alias\",\"debug\",\"page\",\"screen\",\"once\",\"off\",\"on\",\"addSourceMiddleware\",\"addIntegrationMiddleware\",\"setAnonymousId\",\"addDestinationMiddleware\",\"register\"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if([\"track\",\"screen\",\"alias\",\"group\",\"page\",\"identify\"].indexOf(e)>-1){var c=document.querySelector(\"link[rel='canonical']\");n.push({__t:\"bpc\",c:c&&c.getAttribute(\"href\")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n<analytics.methods.length;n++){var key=analytics.methods[n];analytics[key]=analytics.factory(key)}analytics.load=function(key,n){var 
t=document.createElement(\"script\");t.type=\"text/javascript\";t.async=!0;t.setAttribute(\"data-global-segment-analytics-key\",i);t.src=\"https://cdn.segment.com/analytics.js/v1/\" + key + \"/analytics.min.js\";var r=document.getElementsByTagName(\"script\")[0];r.parentNode.insertBefore(t,r);analytics._loadOptions=n};analytics._writeKey=\"${writeKey}\";;analytics.SNIPPET_VERSION=\"5.2.0\";\n            analytics.load(\"${writeKey}\", { integrations: { \"Segment.io\": { apiHost: \"analytics.apify.com/v1\" } } });\n            }}();\n            `,\n                    },\n                ],\n            };\n        },\n    };\n};\n"
  },
  {
    "path": "website/src/plugins/docusaurus-plugin-segment/segment.js",
    "content": "import ExecutionEnvironment from '@docusaurus/ExecutionEnvironment';\n\nexport default ExecutionEnvironment.canUseDOM ? {\n    onRouteUpdate() {\n        // this forces deferred execution that ensures `window.location` is in sync\n        setTimeout(() => {\n            // Don't track page views on development\n            if (process.env.NODE_ENV === 'production' && window.analytics) {\n                window.analytics.page({\n                    app: 'crawlee',\n                    path: window.location.pathname,\n                    url: window.location.href,\n                    search: window.location.search,\n                });\n            }\n        }, 0);\n    },\n} : null;\n"
  },
  {
    "path": "website/src/theme/ColorModeToggle/index.js",
    "content": "import { translate } from '@docusaurus/Translate';\nimport useIsBrowser from '@docusaurus/useIsBrowser';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport IconDarkMode from './dark-mode-icon.svg';\nimport IconLightMode from './light-mode-icon.svg';\nimport styles from './styles.module.css';\n\nfunction ColorModeToggle({\n    className,\n    value,\n    onChange,\n}) {\n    const isBrowser = useIsBrowser();\n    const title = translate(\n        {\n            message: 'Switch between dark and light mode (currently {mode})',\n            id: 'theme.colorToggle.ariaLabel',\n            description: 'The ARIA label for the navbar color mode toggle',\n        },\n        {\n            mode:\n                value === 'dark'\n                    ? translate({\n                        message: 'dark mode',\n                        id: 'theme.colorToggle.ariaLabel.mode.dark',\n                        description: 'The name for the dark color mode',\n                    })\n                    : translate({\n                        message: 'light mode',\n                        id: 'theme.colorToggle.ariaLabel.mode.light',\n                        description: 'The name for the light color mode',\n                    }),\n        },\n    );\n    return (\n        <div className={className}>\n            <button\n                className={clsx(\n                    'clean-btn',\n                    styles.toggleButton,\n                    !isBrowser && styles.toggleButtonDisabled,\n                )}\n                type=\"button\"\n                onClick={() => onChange(value === 'dark' ? 
'light' : 'dark')}\n                disabled={!isBrowser}\n                title={title}\n                aria-label={title}>\n                <IconLightMode\n                    className={clsx(styles.toggleIcon, styles.lightToggleIcon)}\n\n                />\n                <IconDarkMode\n                    className={clsx(styles.toggleIcon, styles.darkToggleIcon)}\n                />\n                <span />\n            </button>\n        </div>\n    );\n}\n\nexport default React.memo(ColorModeToggle);\n"
  },
  {
    "path": "website/src/theme/ColorModeToggle/styles.module.css",
    "content": ".toggleButton {\n    padding: 4px;\n    display: flex;\n    gap: 4px;\n    align-items: center;\n    transition: all var(--ifm-transition-fast);\n    position: relative;\n    border-radius: 150px;\n    background-color: var(--color-background-subtle);\n}\n\n.toggleButton span {\n    width: 44px;\n    height: 36px;\n    border-radius: 50%;\n    background: #fff;\n    position: absolute;\n    transition: all var(--ifm-transition-fast);\n    left: 0;\n    margin: 4px;\n\n    border-radius: 150px;\n    background-color: var(--color-background);\n\n    /* Light/L1 */\n    box-shadow:\n        0px 0.5px 1.5px 0px rgba(63, 71, 93, 0.15),\n        0.4px 0.8px 1px -1.2px rgba(63, 71, 93, 0.14),\n        1px 2px 2.5px -2.5px rgba(63, 71, 93, 0.13);\n}\n\n.toggleButton svg {\n    z-index: 1;\n    margin: 8px 12px;\n    width: 20px;\n    height: 20px;\n    path {\n        stroke: var(--color-icon);\n    }\n}\n\n[data-theme='dark'] .toggleButton span {\n    left: 48px;\n}\n\n.toggleButtonDisabled {\n    cursor: not-allowed;\n}\n"
  },
  {
    "path": "website/src/theme/DocItem/Content/index.js",
    "content": "import { useDoc } from '@docusaurus/plugin-content-docs/client';\nimport LLMButtons from '@site/src/components/LLMButtons';\nimport Heading from '@theme/Heading';\nimport MDXContent from '@theme/MDXContent';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './styles.module.css';\n\nfunction useSyntheticTitle() {\n    const { metadata, frontMatter, contentTitle } = useDoc();\n    const shouldRender = !frontMatter.hide_title && typeof contentTitle === 'undefined';\n\n    if (!shouldRender) {\n        return null;\n    }\n\n    return metadata.title;\n}\n\nexport default function DocItemContent({ children }) {\n    const syntheticTitle = useSyntheticTitle();\n\n    return (\n        <div className={clsx('markdown')}>\n            {syntheticTitle && (\n                <div className={styles.docItemContent}>\n                    <Heading as=\"h1\">{syntheticTitle}</Heading>\n                    <LLMButtons />\n                </div>\n            )}\n            <MDXContent>{children}</MDXContent>\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/DocItem/Content/styles.module.css",
    "content": ".docItemContent {\n    display: flex;\n    align-items: center;\n    justify-content: space-between;\n    gap: 1rem;\n    flex-wrap: wrap;\n    padding-bottom: calc(\n        var(--ifm-h1-vertical-rhythm-bottom) * var(--ifm-leading)\n    );\n\n    h1 {\n        margin: 0 !important;\n        flex: 1 1 auto;\n        min-width: 12rem;\n    }\n\n    @media (max-width: 767px) {\n        flex-direction: column;\n        align-items: flex-start;\n        gap: 0.75rem;\n    }\n}\n"
  },
  {
    "path": "website/src/theme/DocItem/Layout/index.js",
    "content": "import { useDoc } from '@docusaurus/plugin-content-docs/client';\nimport { useWindowSize, useColorMode } from '@docusaurus/theme-common';\nimport Giscus from '@giscus/react';\nimport DocBreadcrumbs from '@theme/DocBreadcrumbs';\nimport DocItemContent from '@theme/DocItem/Content';\nimport DocItemFooter from '@theme/DocItem/Footer';\nimport DocItemPaginator from '@theme/DocItem/Paginator';\nimport DocItemTOCDesktop from '@theme/DocItem/TOC/Desktop';\nimport DocItemTOCMobile from '@theme/DocItem/TOC/Mobile';\nimport DocVersionBadge from '@theme/DocVersionBadge';\nimport DocVersionBanner from '@theme/DocVersionBanner';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './styles.module.css';\n\n/**\n * Decide if the toc should be rendered, on mobile or desktop viewports\n */\nfunction useDocTOC() {\n    const {\n        frontMatter,\n        toc,\n    } = useDoc();\n    const windowSize = useWindowSize();\n    const hidden = frontMatter.hide_table_of_contents;\n    const canRender = !hidden && toc.length > 0;\n    const mobile = canRender ? <DocItemTOCMobile/> : undefined;\n    const desktop = canRender && (windowSize === 'desktop' || windowSize === 'ssr') ? 
(\n        <DocItemTOCDesktop/>\n    ) : undefined;\n    return {\n        hidden,\n        mobile,\n        desktop,\n    };\n}\n\nexport default function DocItemLayout({ children }) {\n    const docTOC = useDocTOC();\n    const { colorMode } = useColorMode();\n    return (\n        <div className=\"row\">\n            <div className={clsx('col', !docTOC.hidden && styles.docItemCol)}>\n                <DocVersionBanner/>\n                <div className={styles.docItemContainer}>\n                    <article>\n                        <DocBreadcrumbs/>\n                        <DocVersionBadge/>\n                        {docTOC.mobile}\n                        <DocItemContent>{children}</DocItemContent>\n                        <DocItemFooter/>\n                    </article>\n                    <DocItemPaginator/>\n\n                    <Giscus\n                        id=\"giscus-comments\"\n                        repo=\"apify/crawlee-python\"\n                        repoId=\"R_kgDOLDBXgA\"\n                        category=\"Comments\"\n                        categoryId=\"DIC_kwDOLDBXgM4CgQI1\"\n                        mapping=\"pathname\"\n                        reactionsEnabled=\"1\"\n                        emitMetadata=\"0\"\n                        inputPosition=\"top\"\n                        theme={colorMode}\n                        lang=\"en\"\n                        strict=\"1\"\n                    />\n                </div>\n            </div>\n\n            {docTOC.desktop && <div className=\"col col--3\">{docTOC.desktop}</div>}\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/DocItem/Layout/styles.module.css",
    "content": ".docItemContainer {\n    margin-bottom: 50px;\n}\n\n.docItemContainer header + *,\n.docItemContainer article > *:first-child {\n  margin-top: 0;\n}\n\n@media (min-width: 997px) {\n  .docItemCol {\n    max-width: 75% !important;\n  }\n}\n"
  },
  {
    "path": "website/src/theme/Footer/LinkItem/index.js",
    "content": "import isInternalUrl from '@docusaurus/isInternalUrl';\nimport Link from '@docusaurus/Link';\nimport useBaseUrl from '@docusaurus/useBaseUrl';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './index.module.css';\n\nexport default function FooterLinkItem({ item }) {\n    const ExternalLinkIcon = require('../../../../static/img/external-link.svg').default;\n\n    const { to, href, label, prependBaseUrlToHref, className, ...props } = item;\n    const toUrl = useBaseUrl(to);\n    const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true });\n\n    return (\n        <Link\n            className={clsx('footer__link-item', className, styles.footerLink)}\n            {...(href\n                ? {\n                    href: prependBaseUrlToHref ? normalizedHref : href,\n                }\n                : {\n                    to: toUrl,\n                })}\n            {...props}>\n            {label}\n            {href && !isInternalUrl(href) && <ExternalLinkIcon className={styles.externalLinkIcon} />}\n        </Link>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Footer/LinkItem/index.module.css",
    "content": ".footerLink {\n    color: var(--color-text);\n    cursor: pointer;\n    font-size: 14px;\n    line-height: 20px;\n    &:hover {\n        color: var(--color-text-subtle);\n        path {\n            fill: var(--color-text-subtle);\n        }\n    }\n}\n\n.externalLinkIcon {\n    margin-left: 5px;\n    path {\n        fill: var(--color-text);\n    }\n}\n"
  },
  {
    "path": "website/src/theme/Footer/index.js",
    "content": "import Link from '@docusaurus/Link';\nimport { useThemeConfig } from '@docusaurus/theme-common';\nimport useBaseUrl from '@docusaurus/useBaseUrl';\nimport LinkItem from '@theme/Footer/LinkItem';\nimport NavbarColorModeToggle from '@theme/Navbar/ColorModeToggle';\nimport ThemedImage from '@theme/ThemedImage';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './index.module.css';\n\nfunction FooterLinksColumn({ column }) {\n    return (\n        <div>\n            <div className={styles.footerTitle}>{column.title}</div>\n            <ul className={clsx(styles.footerList, 'clean-list')}>\n                {column.items.map((item, i) => (\n                    <li key={i}>\n                        <LinkItem item={item} />\n                    </li>\n                ))}\n            </ul>\n        </div>\n    );\n}\n\nfunction Footer() {\n    const { footer } = useThemeConfig();\n\n    const HearthIcon = require('../../../static/img/hearth.svg').default;\n    const logoSources = {\n        light: useBaseUrl('/img/crawlee-light.svg'),\n        dark: useBaseUrl('/img/crawlee-dark.svg'),\n    };\n\n    // Hooks above run unconditionally; bail out here, before `footer` is destructured,\n    // so a theme config without a footer renders nothing instead of throwing.\n    if (!footer) {\n        return null;\n    }\n\n    const { links, style } = footer;\n\n    return (\n        <footer className={clsx(styles.footer, style)\n        } >\n            <div className={styles.footerTop}>\n                <div className={styles.footerTopRow}>\n                    <div className={styles.footerTopRowLeft}>\n                        <Link href=\"https://crawlee.dev\" width=\"120\" className={styles.footerLogo} target=\"_self\" rel=\"dofollow\">\n                            <ThemedImage\n                                width=\"120\"\n                                alt=\"Docusaurus themed image\"\n                                sources={logoSources}\n                            />\n                        </Link>\n                        <NavbarColorModeToggle />\n                    </div>\n                    <div className={styles.footerTopRowRight}>\n                        <FooterLinksColumn column={links[0]} />\n                        <FooterLinksColumn column={links[1]} />\n                        <FooterLinksColumn column={links[2]} />\n                    </div>\n                </div>\n            </div>\n\n            <div className={styles.footerBottom}>\n                <div className={styles.footerBottomRow}>\n                    <div>\n                        <HearthIcon className={styles.hearthIcon} />\n                        Crawlee is forever free and open source\n                    </div>\n                    <div>© {new Date().getFullYear()} Apify</div>\n                </div>\n            </div>\n        </footer >\n    );\n}\n\nexport default React.memo(Footer);\n"
  },
  {
    "path": "website/src/theme/Footer/index.module.css",
    "content": ".footer {\n    background: var(--color-background);\n    color: var(--color-text);\n}\n\n.footerBottom,\n.footerTop {\n    border-top: 1px solid var(--color-separator);\n\n    @media (min-width: 768px) {\n        padding: 40px 40px;\n    }\n    @media (min-width: 1024px) {\n        padding: 40px 64px;\n    }\n}\n\n.footerTopRow {\n    max-width: var(--max-layout-width);\n    margin: auto;\n\n    display: flex;\n    flex-direction: column;\n    @media (min-width: 768px) {\n        flex-direction: row;\n    }\n}\n\n.footerTopRowRight {\n    flex-direction: column;\n    display: flex;\n    flex: 3;\n    gap: 32px;\n    padding: 16px 40px 40px;\n\n    @media (min-width: 768px) {\n        gap: 0;\n        flex-direction: row;\n        padding: 0;\n        justify-content: space-between;\n    }\n}\n\n.footerTopRowLeft {\n    display: flex;\n    flex-direction: column;\n    justify-content: space-between;\n    flex: 2;\n    padding: 32px 40px 24px;\n    gap: 32px;\n\n    border-bottom: 1px solid var(--color-separator);\n\n    img {\n        display: block !important;\n    }\n\n    @media (min-width: 768px) {\n        padding: 0;\n        border: 0;\n        gap: 0;\n    }\n}\n\n.footerBottomRow {\n    max-width: var(--max-layout-width);\n    margin: auto;\n\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n    gap: 24px;\n    padding: 24px 40px;\n\n    font-size: 14px;\n    line-height: 20px;\n    text-align: center;\n\n    @media (min-width: 768px) {\n        gap: 0;\n        padding: 0;\n        flex-direction: row;\n        justify-content: space-between;\n    }\n}\n\n.hearthIcon {\n    margin-right: 8px;\n    path {\n        fill: var(--color-text-muted);\n    }\n}\n\n.footerTitle {\n    font-size: 16px;\n    font-weight: 700;\n    line-height: 24px;\n}\n\n.footerList {\n    margin: 0;\n    li {\n        margin-top: 16px;\n        height: 28px;\n    }\n}\n\n.footerLogo {\n    width: fit-content;\n}\n"
  },
  {
    "path": "website/src/theme/MDXComponents/A.js",
    "content": "/* eslint-disable react/prop-types */\nimport Link from '@docusaurus/Link';\nimport useDocusaurusContext from '@docusaurus/useDocusaurusContext';\nimport React from 'react';\n\nexport default function MDXA(props) {\n    const { siteConfig } = useDocusaurusContext();\n    if (props.href?.startsWith(siteConfig.url)) {\n        const { href, ...rest } = props;\n        rest.to = props.href.replace(siteConfig.url + siteConfig.baseUrl, '/');\n        props = rest;\n    }\n\n    return <Link {...props} />;\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/Content/index.js",
    "content": "import Link from '@docusaurus/Link';\nimport { useLocation } from '@docusaurus/router';\nimport { useThemeConfig } from '@docusaurus/theme-common';\nimport {\n    splitNavbarItems,\n    useNavbarMobileSidebar,\n} from '@docusaurus/theme-common/internal';\nimport NavbarLogo from '@theme/Navbar/Logo';\nimport NavbarMobileSidebarToggle from '@theme/Navbar/MobileSidebar/Toggle';\nimport NavbarSearch from '@theme/Navbar/Search';\nimport NavbarItem from '@theme/NavbarItem';\nimport SearchBar from '@theme/SearchBar';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './styles.module.css';\n\nfunction useNavbarItems() {\n    return useThemeConfig().navbar.items;\n}\n\nfunction NavbarItems({ items, className }) {\n    return (\n        <div className={clsx(styles.navbarItems, className)}>\n            {items.map((item, i) => (\n                <NavbarItem {...item} key={i} />\n            ))}\n        </div>\n    );\n}\n\nfunction NavbarContentLayout({ left, right }) {\n    return (\n        <div className=\"navbar__inner\">\n            <div className=\"navbar__items\">{left}</div>\n            <div className=\"navbar__items navbar__items--right\">{right}</div>\n        </div>\n    );\n}\n\nconst VERSIONS_ITEM = {\n    type: 'docsVersionDropdown',\n    position: 'left',\n    label: 'Versions',\n    dropdownItemsAfter: [\n        {\n            href: 'https://sdk.apify.com/docs/guides/getting-started',\n            label: '2.2',\n        },\n        {\n            href: 'https://sdk.apify.com/docs/1.3.1/guides/getting-started',\n            label: '1.3',\n        },\n    ],\n    dropdownItemsBefore: [],\n};\n\nexport default function NavbarContent() {\n    const location = useLocation();\n    const mobileSidebar = useNavbarMobileSidebar();\n    const items = useNavbarItems();\n    const effectiveItems = location.pathname?.endsWith('/python/')\n        || location.pathname?.endsWith('/python')\n        ? 
items\n        : [...items, VERSIONS_ITEM];\n    const [leftItems, rightItems] = splitNavbarItems(effectiveItems);\n    const searchBarItem = items.find((item) => item.type === 'search');\n    return (\n        <NavbarContentLayout\n            left={\n                <>\n                    <NavbarLogo />\n                    <NavbarItems items={leftItems} />\n                </>\n            }\n            right={\n                <>\n                    {rightItems?.length > 0 && (\n                        <NavbarItems items={rightItems} />\n                    )}\n                    {!searchBarItem && (\n                        <NavbarSearch>\n                            <SearchBar />\n                        </NavbarSearch>\n                    )}\n                    <Link\n                        className={styles.getStartedButton}\n                        to=\"/docs/quick-start\"\n                    >\n                        Get started\n                    </Link>\n                    {!mobileSidebar.disabled && <NavbarMobileSidebarToggle />}\n                </>\n            }\n        />\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/Content/styles.module.css",
    "content": ".navbarItems {\n    display: flex;\n    align-items: center;\n    margin-inline: auto;\n    gap: 16px;\n}\n\n.navbarItems__leftMargin {\n    margin-left: 40px;\n}\n\n.getStartedButton {\n    color: var(--color-text-on-primary);\n    background: var(--color-black-action);\n    border-radius: 8px;\n    font-size: 16px;\n    font-weight: 500;\n    line-height: 24px;\n    padding: 8px 16px !important;\n    border: none;\n    transition: background-color 0.2s;\n\n    &:hover {\n        color: var(--color-text-on-primary);\n        background-color: var(--color-primary-action-hover);\n    }\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/Logo/index.js",
    "content": "import Link from '@docusaurus/Link';\nimport { useThemeConfig } from '@docusaurus/theme-common';\nimport useBaseUrl from '@docusaurus/useBaseUrl';\nimport Logo from '@theme/Logo';\nimport ThemedImage from '@theme/ThemedImage';\nimport React from 'react';\n\nimport styles from './index.module.css';\n\nexport default function LogoWrapper(props) {\n    const ArrowsIcon = require('../../../../static/img/menu-arrows.svg').default;\n    const CheckIcon = require('../../../../static/img/check.svg').default;\n    const { navbar: { logo } } = useThemeConfig();\n    const javascriptLogo = {\n        light: useBaseUrl('img/crawlee-javascript-light.svg'),\n        dark: useBaseUrl('img/crawlee-javascript-dark.svg'),\n    };\n    const languageAgnosticLogo = {\n        light: useBaseUrl('img/crawlee-light.svg'),\n        dark: useBaseUrl('img/crawlee-dark.svg'),\n    };\n    const pythonLogo = {\n        light: useBaseUrl(logo.src),\n        dark: useBaseUrl(logo.srcDark || logo.src),\n    };\n    return (\n        <div className={styles.navbarLogo}>\n            <div className={styles.logoWithArrows}>\n                <Logo titleClassName=\"navbar__title\" />\n                <ArrowsIcon />\n            </div>\n            <div className={styles.menuWrapper}>\n                <div className={styles.menu}>\n                    <Link className={styles.menuItem} href=\"https://crawlee.dev/js\" target=\"_self\" rel=\"dofollow\">\n                        <ThemedImage sources={javascriptLogo} alt=\"Crawlee JavaScript\" />\n                    </Link>\n                    <Link className={styles.menuItem} to=\"/\" >\n                        <ThemedImage sources={pythonLogo} alt=\"Crawlee Python\" />\n                        <CheckIcon />\n                    </Link>\n                    <Link className={styles.menuItem} href=\"https://crawlee.dev\" target=\"_self\" rel=\"dofollow\">\n                        <ThemedImage sources={languageAgnosticLogo} alt=\"Crawlee\" 
/>\n                    </Link>\n                </div>\n            </div>\n        </div >\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/Logo/index.module.css",
    "content": ".navbarLogo {\n    position: relative;\n    cursor: pointer;\n\n    /* do not display the other theme logo when loading */\n    a {\n        img:nth-child(2) {\n            display: none !important;\n        }\n    }\n}\n\n.logoWithArrows {\n    display: flex;\n    align-items: center;\n    width: 220px;\n\n    svg {\n        margin: 0 2px;\n        g {\n            stroke: var(--color-icon);\n        }\n    }\n\n    img {\n        display: block !important;\n        height: 28px;\n    }\n}\n\n.menuWrapper {\n    position: absolute;\n    left: 0;\n    top: 100%;\n\n    z-index: 100;\n    padding-top: 6px;\n}\n\n.menu {\n    width: 230px;\n    border-radius: 8px;\n    border: 1px solid var(--color-border);\n    box-shadow:\n        0px 4px 8px 0px rgba(36, 39, 54, 0.12),\n        0px 2px 4px 0px rgba(36, 39, 54, 0.08),\n        0px 0px 1px 0px rgba(36, 39, 54, 0.24);\n\n    background: var(--color-card-background);\n    padding: 8px 0;\n    overflow: hidden;\n    transition: all 0.3s;\n\n    flex-direction: column;\n    align-items: flex-start;\n\n    padding: 8px;\n\n    display: none;\n\n    img {\n        height: 24px;\n        width: auto;\n        display: block !important;\n    }\n}\n\n.navbarLogo:hover {\n    .menu {\n        display: flex;\n    }\n}\n\n.menuItem {\n    padding: 8px;\n    width: 100%;\n    border-radius: 12px;\n    display: flex;\n    justify-content: space-between;\n    align-items: center;\n    path {\n        fill: var(--color-icon);\n    }\n    &:hover {\n        background: var(--color-hover);\n    }\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/MobileSidebar/Header/index.js",
    "content": "import Link from '@docusaurus/Link';\nimport { useLocation } from '@docusaurus/router';\nimport { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal';\nimport { translate } from '@docusaurus/Translate';\nimport IconClose from '@theme/Icon/Close';\nimport NavbarLogo from '@theme/Navbar/Logo';\nimport SearchBar from '@theme/SearchBar';\nimport clsx from 'clsx';\nimport React from 'react';\n\nimport styles from './index.module.css';\n\nfunction CloseButton() {\n    const mobileSidebar = useNavbarMobileSidebar();\n    return (\n        <button\n            type=\"button\"\n            aria-label={translate({\n                id: 'theme.docs.sidebar.closeSidebarButtonAriaLabel',\n                message: 'Close navigation bar',\n                description: 'The ARIA label for close button of mobile sidebar',\n            })}\n            className=\"clean-btn navbar-sidebar__close\"\n            onClick={() => mobileSidebar.toggle()}>\n            <IconClose color=\"var(--ifm-color-emphasis-600)\" />\n        </button>\n    );\n}\nexport default function NavbarMobileSidebarHeader() {\n    const { toggle, shown } = useNavbarMobileSidebar();\n    const closeSidebar = () => shown && toggle();\n\n    return (\n        <div className=\"navbar-sidebar__brand\">\n            <div className={styles.navbarHeader}>\n                <NavbarLogo />\n                <div className={clsx(styles.navbarButtonsWrapper, styles.navbarButtonsWrapperDesktop)} >\n                    <div onClick={closeSidebar} >\n                        <SearchBar />\n                    </div>\n                    <Link className={styles.getStartedButton} to=\"/docs/quick-start\" onClick={closeSidebar} >\n                        Get started\n                    </Link>\n                </div>\n                <CloseButton />\n            </div>\n            <div className={clsx(styles.navbarButtonsWrapper, styles.navbarButtonsWrapperMobile)} >\n                <Link 
className={styles.getStartedButton} to=\"/docs/quick-start\" onClick={closeSidebar}>\n                    Get started\n                </Link>\n                <div onClick={closeSidebar} >\n                    <SearchBar />\n                </div>\n            </div>\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/MobileSidebar/Header/index.module.css",
    "content": ".getStartedButton {\n    color: var(--color-text-on-primary);\n    background: var(--color-black-action);\n    border-radius: 8px;\n    font-size: 16px;\n    font-weight: 500;\n    line-height: 24px;\n    padding: 8px 16px !important;\n    border: none;\n    &:hover {\n        color: var(--color-text-on-primary);\n    }\n    text-align: center;\n}\n\n.navbarHeader {\n    display: flex;\n    width: 100%;\n    align-items: center;\n    justify-content: space-between;\n    padding: 16px;\n\n    @media (min-width: 768px) {\n        padding: 20px 40px;\n    }\n    @media (min-width: 1024px) {\n        padding: 20px 64px;\n    }\n}\n\n.navbarButtonsWrapper {\n    display: flex;\n    gap: 16px;\n    margin-left: auto;\n}\n\n.navbarButtonsWrapperDesktop {\n    display: flex;\n    @media (max-width: 767px) {\n        display: none;\n    }\n}\n.navbarButtonsWrapperMobile {\n    border-top: 1px solid var(--color-separator);\n    display: none;\n    @media (max-width: 767px) {\n        display: flex;\n    }\n    width: 100%;\n    margin: 0;\n    flex-direction: column;\n    gap: 16px;\n    button {\n        width: 100%;\n    }\n    padding: 16px 24px;\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/MobileSidebar/Layout/index.js",
    "content": "import { useNavbarSecondaryMenu } from '@docusaurus/theme-common/internal';\nimport clsx from 'clsx';\nimport React from 'react';\n\nexport default function NavbarMobileSidebarLayout({\n    header,\n    primaryMenu,\n    secondaryMenu,\n}) {\n    const { shown: secondaryMenuShown } = useNavbarSecondaryMenu();\n    return (\n        <div className=\"navbar-sidebar\">\n            {header}\n            <div\n                className={clsx('navbar-sidebar__items', {\n                    'navbar-sidebar__items--show-secondary': secondaryMenuShown,\n                })}>\n                <div className=\"navbar-sidebar__item menu menu-primary\">{primaryMenu}</div>\n                <div className=\"navbar-sidebar__item menu menu-secondary\">{secondaryMenu}</div>\n            </div>\n        </div>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js",
    "content": "import { useThemeConfig } from '@docusaurus/theme-common';\nimport { useNavbarMobileSidebar } from '@docusaurus/theme-common/internal';\nimport NavbarItem from '@theme/NavbarItem';\nimport React from 'react';\n\nfunction useNavbarItems() {\n    return useThemeConfig().navbar.items;\n}\n// The primary menu displays the navbar items\nexport default function NavbarMobilePrimaryMenu() {\n    const mobileSidebar = useNavbarMobileSidebar();\n    const items = useNavbarItems();\n\n    return (\n        <ul className=\"menu__list\">\n            {items.map((item, i) => (\n                <NavbarItem\n                    mobile\n                    {...item}\n                    onClick={() => mobileSidebar.toggle()}\n                    key={i}\n                />\n            ))}\n        </ul>\n    );\n}\n"
  },
  {
    "path": "website/src/theme/Navbar/MobileSidebar/index.js",
    "content": "import {\n    useLockBodyScroll,\n    useNavbarMobileSidebar,\n    useWindowSize,\n} from '@docusaurus/theme-common/internal';\nimport NavbarMobileSidebarHeader from '@theme/Navbar/MobileSidebar/Header';\nimport NavbarMobileSidebarLayout from '@theme/Navbar/MobileSidebar/Layout';\nimport NavbarMobileSidebarPrimaryMenu from '@theme/Navbar/MobileSidebar/PrimaryMenu';\nimport NavbarMobileSidebarSecondaryMenu from '@theme/Navbar/MobileSidebar/SecondaryMenu';\nimport React from 'react';\n\nexport default function NavbarMobileSidebar() {\n    const mobileSidebar = useNavbarMobileSidebar();\n    const windowSize = useWindowSize({\n        desktopBreakpoint: 1200,\n    });\n\n    useLockBodyScroll(mobileSidebar.shown);\n    const shouldRender = !mobileSidebar.disabled && windowSize === 'mobile';\n    if (!shouldRender) {\n        return null;\n    }\n    return (\n        <NavbarMobileSidebarLayout\n            header={<NavbarMobileSidebarHeader />}\n            primaryMenu={<NavbarMobileSidebarPrimaryMenu />}\n            secondaryMenu={<NavbarMobileSidebarSecondaryMenu />}\n        />\n    );\n}\n"
  },
  {
    "path": "website/src/theme/NavbarItem/ComponentTypes.js",
    "content": "import { useActiveDocContext, useLayoutDoc } from '@docusaurus/plugin-content-docs/client';\nimport DefaultNavbarItem from '@theme/NavbarItem/DefaultNavbarItem';\nimport DocSidebarNavbarItem from '@theme/NavbarItem/DocSidebarNavbarItem';\nimport DocsVersionDropdownNavbarItem from '@theme/NavbarItem/DocsVersionDropdownNavbarItem';\nimport DocsVersionNavbarItem from '@theme/NavbarItem/DocsVersionNavbarItem';\nimport DropdownNavbarItem from '@theme/NavbarItem/DropdownNavbarItem';\nimport HtmlNavbarItem from '@theme/NavbarItem/HtmlNavbarItem';\nimport LocaleDropdownNavbarItem from '@theme/NavbarItem/LocaleDropdownNavbarItem';\nimport SearchNavbarItem from '@theme/NavbarItem/SearchNavbarItem';\nimport React from 'react';\n\n// const versions = require('../../../versions.json');\n// const stable = versions[0];\n\nfunction DocNavbarItem({\n    docId,\n    label: staticLabel,\n    docsPluginId,\n    ...props\n}) {\n    const { activeDoc } = useActiveDocContext(docsPluginId);\n    const doc = useLayoutDoc(docId, docsPluginId);\n    // Draft items are not displayed in the navbar.\n    if (doc === null) {\n        return null;\n    }\n    return (\n        <DefaultNavbarItem\n            exact\n            {...props}\n            isActive={() => activeDoc?.path.startsWith(doc.path)}\n            label={staticLabel ?? 
doc.id}\n            to={doc.path}\n        />\n    );\n}\n\nfunction ApiNavbarItem(ctx) {\n    return (\n        <DefaultNavbarItem\n            exact\n            {...ctx}\n            label={ctx.label}\n            to={`api/${ctx.to}`}\n        />\n    );\n\n    // let version = {};\n    //\n    // try {\n    //     // eslint-disable-next-line react-hooks/rules-of-hooks\n    //     version = useDocsVersion();\n    // } catch {\n    //     version.version = stable;\n    // }\n    //\n    // const { siteConfig } = useDocusaurusContext();\n    //\n    // if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) {\n    //     return (\n    //         <DefaultNavbarItem\n    //             exact\n    //             {...ctx}\n    //             label={ctx.label}\n    //             to={`api/${ctx.to}`}\n    //         />\n    //     );\n    // }\n    //\n    // return (\n    //     <DefaultNavbarItem\n    //         exact\n    //         {...ctx}\n    //         label={ctx.label}\n    //         to={`api/${version.version === 'current' ? 'next' : version.version}/${ctx.to}`}\n    //     />\n    // );\n}\n\nconst ComponentTypes = {\n    'default': DefaultNavbarItem,\n    'localeDropdown': LocaleDropdownNavbarItem,\n    'search': SearchNavbarItem,\n    'dropdown': DropdownNavbarItem,\n    'html': HtmlNavbarItem,\n    'custom-api': ApiNavbarItem,\n    'doc': DocNavbarItem,\n    'docSidebar': DocSidebarNavbarItem,\n    'docsVersion': DocsVersionNavbarItem,\n    'docsVersionDropdown': DocsVersionDropdownNavbarItem,\n};\nexport default ComponentTypes;\n"
  },
  {
    "path": "website/static/.nojekyll",
    "content": ""
  },
  {
    "path": "website/static/js/custom.js",
    "content": "function load() {\n    const versions = document.querySelectorAll('.navbar .dropdown ul a');\n    const basePath = '';\n    const types = [`${basePath}/docs/next`, `${basePath}/docs`];\n    let i = 0;\n\n    for (const el of versions) {\n        const match = el.href.match(/\\/docs\\/(\\d+\\.\\d+(\\.\\d+)?)$/) || el.href.match(/\\/docs\\/(\\d+\\.\\d+(\\.\\d+)?)/);\n\n        if (!types[i++] && !match) {\n            continue;\n        }\n\n        const version = (types[i++] || match[0]).replace('/docs', '/api');\n\n        if (el.classList.contains('api-version-bound')) {\n            continue;\n        }\n\n        el.addEventListener('click', (e) => {\n            if (version && window.location.pathname.startsWith(`${basePath}/api`)) {\n                window.location.href = version;\n                e.preventDefault();\n            }\n        });\n        el.classList.add('api-version-bound');\n    }\n}\n\nsetInterval(() => {\n    if (document.querySelectorAll('.navbar .dropdown ul a').length > 0) {\n        load();\n    }\n}, 500);\n\nif (window.location.href.startsWith('https://apify.github.io/crawlee-python')) {\n    window.location.href = window.location.href.replace('https://apify.github.io/crawlee-python', 'https://crawlee.dev/python');\n}\n\nif (window.location.href.startsWith('https://crawlee.dev/crawlee-python')) {\n    window.location.href = window.location.href.replace('https://crawlee.dev/crawlee-python', 'https://crawlee.dev/python');\n}\n"
  },
  {
    "path": "website/static/robots.txt",
    "content": "User-agent: *\nSitemap: https://crawlee.dev/python/sitemap.xml\n"
  },
  {
    "path": "website/tools/docs-prettier.config.js",
    "content": "/**\n * @type {import('prettier').Options}\n */\nmodule.exports = {\n    parser: 'markdown',\n    arrowParens: 'avoid',\n    trailingComma: 'all',\n    singleQuote: true,\n    tabWidth: 4,\n    printWidth: 150,\n    proseWrap: 'always',\n};\n"
  },
  {
    "path": "website/tools/utils/externalLink.js",
    "content": "const { parse } = require('url');\n\nconst visit = import('unist-util-visit').then((m) => m.visit);\n\nconst internalUrls = ['crawlee.dev'];\n\n/**\n * @param {import('url').UrlWithStringQuery} href\n */\nfunction isInternal(href) {\n    return internalUrls.some(\n        (internalUrl) => href.host === internalUrl\n            || (!href.protocol && !href.host && (href.pathname || href.hash)),\n    );\n}\n\n/**\n * @type {import('unified').Plugin}\n */\nexports.externalLinkProcessor = () => {\n    return async (tree) => {\n        (await visit)(tree, 'element', (node) => {\n            if (\n                node.tagName === 'a'\n                && node.properties\n                && typeof node.properties.href === 'string'\n            ) {\n                const href = parse(node.properties.href);\n\n                if (!isInternal(href)) {\n                    node.properties.target = '_blank';\n                    node.properties.rel = 'noopener';\n                } else {\n                    node.properties.target = null;\n                    node.properties.rel = null;\n                }\n            }\n        });\n    };\n};\n"
  },
  {
    "path": "website/tools/website_gif/website_gif.mjs",
    "content": "/**\n * How to generate the gifs:\n *\n * 1. Set a breakpoint on the marked line\n * 2. Run the crawler with the debugger\n * 3. Setup your chrome and recording\n * 4. Resume, record, ???, profit!\n */\n\nimport { PuppeteerCrawler, sleep } from 'crawlee';\n\nconst crawler = new PuppeteerCrawler({\n    headless: false,\n    maxConcurrency: 1,\n    navigationTimeoutSecs: 100000,\n    requestHandlerTimeoutSecs: 10000,\n    browserPoolOptions: {\n        closeInactiveBrowserAfterSecs: 100000,\n        operationTimeoutSecs: 100000,\n    },\n    async requestHandler({ request }) {\n        if (request.userData.label === 'start') {\n            console.log('Waiting 5s, prepare recording!');\n            await sleep(5000); // <--- Set breakpoint here\n        } else {\n            await sleep(250);\n        }\n    },\n});\n\nawait crawler.run([\n    {\n        url: 'https://crawlee.dev',\n        userData: { label: 'start' },\n        uniqueKey: 'dark-start'\n    },\n    {\n        url: 'https://crawlee.dev/docs/quick-start',\n        uniqueKey: 'dark-1'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/setting-up',\n        uniqueKey: 'dark-2'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/first-crawler',\n        uniqueKey: 'dark-3'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/adding-urls',\n        uniqueKey: 'dark-4'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/real-world-project',\n        uniqueKey: 'dark-5'\n    },\n\n    // Light theme\n    {\n        url: 'https://crawlee.dev',\n        userData: { label: 'start' },\n        uniqueKey: 'light th-start'\n    },\n    {\n        url: 'https://crawlee.dev/docs/quick-start',\n        uniqueKey: 'light th-1'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/setting-up',\n        uniqueKey: 'light th-2'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/first-crawler',\n        
uniqueKey: 'light th-3'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/adding-urls',\n        uniqueKey: 'light th-4'\n    },\n    {\n        url: 'https://crawlee.dev/docs/introduction/real-world-project',\n        uniqueKey: 'light th-5'\n    }\n]);\n"
  },
  {
    "path": "website/tsconfig.eslint.json",
    "content": "{\n\t\"extends\": \"@apify/tsconfig\",\n\t\"compilerOptions\": {\n\t\t\"jsx\": \"preserve\"\n\t},\n\t\"include\": [\n\t\t\"src/**/*.js\",\n\t\t\"src/**/*.ts\",\n\t\t\"src/**/*.jsx\",\n\t\t\"src/**/*.tsx\"\n\t]\n}\n"
  }
]