Repository: D4Vinci/Scrapling Branch: main Commit: 3ed59c2a8495 Files: 187 Total size: 1.4 MB Directory structure: gitextract_lhg67gwc/ ├── .bandit.yml ├── .dockerignore ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── 01-bug_report.yml │ │ ├── 02-feature_request.yml │ │ ├── 03-other.yml │ │ ├── 04-docs_issue.yml │ │ └── config.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── code-quality.yml │ ├── docker-build.yml │ ├── release-and-publish.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── ROADMAP.md ├── agent-skill/ │ ├── README.md │ └── Scrapling-Skill/ │ ├── LICENSE.txt │ ├── SKILL.md │ ├── examples/ │ │ ├── 01_fetcher_session.py │ │ ├── 02_dynamic_session.py │ │ ├── 03_stealthy_session.py │ │ ├── 04_spider.py │ │ └── README.md │ └── references/ │ ├── fetching/ │ │ ├── choosing.md │ │ ├── dynamic.md │ │ ├── static.md │ │ └── stealthy.md │ ├── mcp-server.md │ ├── migrating_from_beautifulsoup.md │ ├── parsing/ │ │ ├── adaptive.md │ │ ├── main_classes.md │ │ └── selection.md │ └── spiders/ │ ├── advanced.md │ ├── architecture.md │ ├── getting-started.md │ ├── proxy-blocking.md │ ├── requests-responses.md │ └── sessions.md ├── benchmarks.py ├── cleanup.py ├── docs/ │ ├── README_AR.md │ ├── README_CN.md │ ├── README_DE.md │ ├── README_ES.md │ ├── README_FR.md │ ├── README_JP.md │ ├── README_KR.md │ ├── README_RU.md │ ├── ai/ │ │ └── mcp-server.md │ ├── api-reference/ │ │ ├── custom-types.md │ │ ├── fetchers.md │ │ ├── mcp-server.md │ │ ├── proxy-rotation.md │ │ ├── response.md │ │ ├── selector.md │ │ └── spiders.md │ ├── benchmarks.md │ ├── cli/ │ │ ├── extract-commands.md │ │ ├── interactive-shell.md │ │ └── overview.md │ ├── development/ │ │ ├── adaptive_storage_system.md │ │ └── scrapling_custom_types.md │ ├── donate.md │ ├── fetching/ │ │ ├── choosing.md │ │ ├── dynamic.md │ │ ├── static.md │ │ └── stealthy.md │ ├── index.md │ ├── overrides/ │ │ └── main.html │ ├── overview.md │ ├── parsing/ │ │ ├── adaptive.md │ │ ├── main_classes.md │ │ └── selection.md │ ├── requirements.txt │ ├── spiders/ │ │ ├── advanced.md │ │ ├── architecture.md │ │ ├── getting-started.md │ │ ├── proxy-blocking.md │ │ ├── requests-responses.md │ │ └── sessions.md │ ├── stylesheets/ │ │ └── extra.css │ └── tutorials/ │ ├── migrating_from_beautifulsoup.md │ └── replacing_ai.md ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── scrapling/ │ ├── __init__.py │ ├── cli.py │ ├── core/ │ │ ├── __init__.py │ │ ├── _shell_signatures.py │ │ ├── _types.py │ │ ├── ai.py │ │ ├── custom_types.py │ │ ├── mixins.py │ │ ├── shell.py │ │ ├── storage.py │ │ ├── translator.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── _shell.py │ │ └── _utils.py │ ├── engines/ │ │ ├── __init__.py │ │ ├── _browsers/ │ │ │ ├── __init__.py │ │ │ ├── _base.py │ │ │ ├── _config_tools.py │ │ │ ├── _controllers.py │ │ │ ├── _page.py │ │ │ ├── _stealth.py │ │ │ ├── _types.py │ │ │ └── _validators.py │ │ ├── constants.py │ │ ├── static.py │ │ └── toolbelt/ │ │ ├── __init__.py │ │ ├── convertor.py │ │ ├── custom.py │ │ ├── fingerprints.py │ │ ├── navigation.py │ │ └── proxy_rotation.py │ ├── fetchers/ │ │ ├── __init__.py │ │ ├── chrome.py │ │ ├── requests.py │ │ └── stealth_chrome.py │ ├── parser.py │ ├── py.typed │ └── spiders/ │ ├── __init__.py │ ├── checkpoint.py │ ├── engine.py │ ├── request.py │ ├── result.py │ ├── scheduler.py │ ├── session.py │ └── spider.py ├── server.json ├── setup.cfg ├── 
tests/ │ ├── __init__.py │ ├── ai/ │ │ ├── __init__.py │ │ └── test_ai_mcp.py │ ├── cli/ │ │ ├── __init__.py │ │ ├── test_cli.py │ │ └── test_shell_functionality.py │ ├── core/ │ │ ├── __init__.py │ │ ├── test_shell_core.py │ │ └── test_storage_core.py │ ├── fetchers/ │ │ ├── __init__.py │ │ ├── async/ │ │ │ ├── __init__.py │ │ │ ├── test_dynamic.py │ │ │ ├── test_dynamic_session.py │ │ │ ├── test_requests.py │ │ │ ├── test_requests_session.py │ │ │ ├── test_stealth.py │ │ │ └── test_stealth_session.py │ │ ├── sync/ │ │ │ ├── __init__.py │ │ │ ├── test_dynamic.py │ │ │ ├── test_requests.py │ │ │ ├── test_requests_session.py │ │ │ └── test_stealth_session.py │ │ ├── test_base.py │ │ ├── test_constants.py │ │ ├── test_impersonate_list.py │ │ ├── test_pages.py │ │ ├── test_proxy_rotation.py │ │ ├── test_response_handling.py │ │ ├── test_utils.py │ │ └── test_validator.py │ ├── parser/ │ │ ├── __init__.py │ │ ├── test_adaptive.py │ │ ├── test_attributes_handler.py │ │ ├── test_general.py │ │ └── test_parser_advanced.py │ ├── requirements.txt │ └── spiders/ │ ├── __init__.py │ ├── test_checkpoint.py │ ├── test_engine.py │ ├── test_request.py │ ├── test_result.py │ ├── test_scheduler.py │ ├── test_session.py │ └── test_spider.py ├── tox.ini └── zensical.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .bandit.yml ================================================ skips: - B101 - B311 - B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only - B403 # We are using pickle for tests only - B404 # Using subprocess library - B602 # subprocess call with shell=True identified - B110 # Try, Except, Pass detected. - B104 # Possible binding to all interfaces. - B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue. - B108 # Probable insecure usage of temp file/directory. 
================================================ FILE: .dockerignore ================================================ # Github .github/ # docs docs/ images/ .cache/ .claude/ # cached files __pycache__/ *.py[cod] .cache .DS_Store *~ .*.sw[po] .build .ve .env .pytest .benchmarks .bootstrap .appveyor.token *.bak *.db *.db-* # installation package *.egg-info/ dist/ build/ # environments .venv env/ venv/ ENV/ env.bak/ venv.bak/ # C extensions *.so # pycharm .idea/ # vscode *.code-workspace # Packages *.egg *.egg-info dist build eggs .eggs parts bin var sdist wheelhouse develop-eggs .installed.cfg lib lib64 venv*/ .venv*/ pyvenv*/ pip-wheel-metadata/ poetry.lock # Installer logs pip-log.txt # mypy .mypy_cache/ .dmypy.json dmypy.json mypy.ini # test caches .tox/ .pytest_cache/ .coverage htmlcov report.xml nosetests.xml coverage.xml # Translations *.mo # Buildout .mr.developer.cfg # IDE project files .project .pydevproject .idea *.iml *.komodoproject # Complexity output/*.html output/*/index.html # Sphinx docs/_build public/ web/ ================================================ FILE: .github/FUNDING.yml ================================================ github: D4Vinci buy_me_a_coffee: d4vinci ko_fi: d4vinci ================================================ FILE: .github/ISSUE_TEMPLATE/01-bug_report.yml ================================================ name: Bug report description: Create a bug report to help us address errors in the repository labels: [bug] body: - type: checkboxes attributes: label: Have you searched if there is an existing issue for this? description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug). options: - label: I have searched the existing issues required: true - type: input attributes: label: "Python version (python --version)" placeholder: "Python 3.8" validations: required: true - type: input attributes: label: "Scrapling version (scrapling.__version__)" placeholder: "0.1" validations: required: true - type: textarea attributes: label: "Dependencies version (pip3 freeze)" description: > This is the output of the command `pip3 freeze --all`. Note that your actual output may differ from the placeholder text. placeholder: | cssselect==1.2.0 lxml==5.3.0 orjson==3.10.7 ... validations: required: true - type: input attributes: label: "What's your operating system?" placeholder: "Windows 10" validations: required: true - type: dropdown attributes: label: 'Are you using a separate virtual environment?' description: "Please pay attention to this question" options: - 'No' - 'Yes' default: 0 validations: required: true - type: textarea attributes: label: "Expected behavior" description: "Describe the behavior you expect. May include images or videos." validations: required: true - type: textarea attributes: label: "Actual behavior" validations: required: true - type: textarea attributes: label: Steps To Reproduce description: Steps to reproduce the behavior. placeholder: | 1. In this environment... 2. With this config... 3. Run '...' 4. See error... validations: required: false ================================================ FILE: .github/ISSUE_TEMPLATE/02-feature_request.yml ================================================ name: Feature request description: Suggest features, propose improvements, discuss new ideas. labels: [enhancement] body: - type: checkboxes attributes: label: Have you searched if there is an existing feature request for this?
description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement). options: - label: I have searched the existing requests required: true - type: textarea attributes: label: "Feature description" description: > This could include new topics or improving any existing features/implementations. validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/03-other.yml ================================================ name: Other description: Use this for any other issues. PLEASE provide as much information as possible. labels: ["awaiting triage"] body: - type: textarea id: issuedescription attributes: label: What would you like to share? description: Provide a clear and concise explanation of your issue. validations: required: true - type: textarea id: extrainfo attributes: label: Additional information description: Is there anything else we should know about this issue? validations: required: false ================================================ FILE: .github/ISSUE_TEMPLATE/04-docs_issue.yml ================================================ name: Documentation issue description: Report incorrect, unclear, or missing documentation. labels: [documentation] body: - type: checkboxes attributes: label: Have you searched if there is an existing issue for this? description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation). options: - label: I have searched the existing issues required: true - type: input attributes: label: "Page URL" description: "Link to the documentation page with the issue." placeholder: "https://scrapling.readthedocs.io/en/latest/..." validations: required: true - type: dropdown attributes: label: "Type of issue" options: - Incorrect information - Unclear or confusing - Missing information - Typo or formatting - Broken link - Other default: 0 validations: required: true - type: textarea attributes: label: "Description" description: "Describe what's wrong and what you expected to find." validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: Discussions url: https://github.com/D4Vinci/Scrapling/discussions about: > The "Discussions" forum is where you want to start. 💖 - name: Ask on our Discord server url: https://discord.gg/EMgGbDceNQ about: > Our community chat forum. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Proposed change ### Type of change: - [ ] Dependency upgrade - [ ] Bugfix (non-breaking change which fixes an issue) - [ ] New integration (thank you!) - [ ] New feature (which adds functionality to an existing integration) - [ ] Deprecation (breaking change to happen in the future) - [ ] Breaking change (fix/feature causing existing functionality to break) - [ ] Code quality improvements to existing code or addition of tests - [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request. - [ ] Documentation change? ### Additional information - This PR fixes or closes an issue: fixes # - This PR is related to an issue: # - Link to documentation pull request: ** ### Checklist: * [ ] I have read [CONTRIBUTING.md](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md). * [ ] This pull request is all my own work -- I have not plagiarized.
* [ ] I know that pull requests will not be merged if they fail the automated tests. * [ ] All new Python files are placed inside an existing directory. * [ ] All filenames are in all lowercase characters with no spaces or dashes. * [ ] All functions and variable names follow Python naming conventions. * [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html). * [ ] All functions have doc-strings. ================================================ FILE: .github/workflows/code-quality.yml ================================================ name: Code Quality on: push: branches: - main - dev paths-ignore: - '*.md' - '**/*.md' - 'docs/**' - 'images/**' - '.github/**' - 'agent-skill/**' - '!.github/workflows/code-quality.yml' # Always run when this workflow changes pull_request: branches: - main - dev paths-ignore: - '*.md' - '**/*.md' - 'docs/**' - 'images/**' - '.github/**' - 'agent-skill/**' - '*.yml' - '*.yaml' - 'ruff.toml' workflow_dispatch: # Allow manual triggering concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: code-quality: name: Code Quality Checks runs-on: ubuntu-latest permissions: contents: read pull-requests: write # For PR annotations steps: - name: Checkout code uses: actions/checkout@v6 with: fetch-depth: 0 # Full history for better analysis - name: Set up Python uses: actions/setup-python@v6 with: python-version: '3.10' cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip pip install bandit[toml] ruff vermin mypy pyright pip install -e ".[all]" pip install lxml-stubs - name: Run Bandit (Security Linter) id: bandit continue-on-error: true run: | echo "::group::Bandit - Security Linter" bandit -r -c .bandit.yml scrapling/ -f json -o bandit-report.json bandit -r -c .bandit.yml scrapling/ echo "::endgroup::" - name: Run Ruff Linter id: ruff-lint continue-on-error: true run: | echo "::group::Ruff - Linter" ruff check scrapling/ --output-format=github echo "::endgroup::" - name: Run Ruff Formatter Check id: ruff-format continue-on-error: true run: | echo "::group::Ruff - Formatter Check" ruff format --check scrapling/ --diff echo "::endgroup::" - name: Run Vermin (Python Version Compatibility) id: vermin continue-on-error: true run: | echo "::group::Vermin - Python 3.10+ Compatibility Check" vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/ echo "::endgroup::" - name: Run Mypy (Static Type Checker) id: mypy continue-on-error: true run: | echo "::group::Mypy - Static Type Checker" mypy scrapling/ echo "::endgroup::" - name: Run Pyright (Static Type Checker) id: pyright continue-on-error: true run: | echo "::group::Pyright - Static Type Checker" pyright scrapling/ echo "::endgroup::" - name: Check results and create summary if: always() run: | echo "# Code Quality Check Results" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY # Initialize status all_passed=true # Check Bandit if [ "${{ steps.bandit.outcome }}" == "success" ]; then echo "✅ **Bandit (Security)**: Passed" >> $GITHUB_STEP_SUMMARY else echo "❌ **Bandit (Security)**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi # Check Ruff Linter if [ "${{ steps.ruff-lint.outcome }}" == "success" ]; then echo "✅ **Ruff Linter**: Passed" >> $GITHUB_STEP_SUMMARY else echo "❌ **Ruff Linter**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi # Check Ruff Formatter if [ "${{ steps.ruff-format.outcome }}" == "success" ]; then echo "✅ **Ruff Formatter**: Passed" 
>> $GITHUB_STEP_SUMMARY else echo "❌ **Ruff Formatter**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi # Check Vermin if [ "${{ steps.vermin.outcome }}" == "success" ]; then echo "✅ **Vermin (Python 3.10+)**: Passed" >> $GITHUB_STEP_SUMMARY else echo "❌ **Vermin (Python 3.10+)**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi # Check Mypy if [ "${{ steps.mypy.outcome }}" == "success" ]; then echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY else echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi # Check Pyright if [ "${{ steps.pyright.outcome }}" == "success" ]; then echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY else echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY all_passed=false fi echo "" >> $GITHUB_STEP_SUMMARY if [ "$all_passed" == "true" ]; then echo "### 🎉 All checks passed!" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "Your code meets all quality standards." >> $GITHUB_STEP_SUMMARY else echo "### ⚠️ Some checks failed" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "Please review the errors above and fix them." >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "**Tip**: Run \`pre-commit run --all-files\` locally to catch these issues before pushing." >> $GITHUB_STEP_SUMMARY exit 1 fi - name: Upload Bandit report if: always() && steps.bandit.outcome != 'skipped' uses: actions/upload-artifact@v6 with: name: bandit-security-report path: bandit-report.json retention-days: 30 ================================================ FILE: .github/workflows/docker-build.yml ================================================ name: Build and Push Docker Image on: pull_request: types: [closed] branches: - main workflow_dispatch: inputs: tag: description: 'Docker image tag' required: true default: 'latest' env: DOCKERHUB_IMAGE: pyd4vinci/scrapling GHCR_IMAGE: ghcr.io/${{ github.repository_owner }}/scrapling jobs: build-and-push: runs-on: ubuntu-latest permissions: contents: read packages: write steps: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: platforms: linux/amd64,linux/arm64 - name: Log in to Docker Hub uses: docker/login-action@v3 with: registry: docker.io username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.CONTAINER_TOKEN }} - name: Extract metadata id: meta uses: docker/metadata-action@v5 with: images: | ${{ env.DOCKERHUB_IMAGE }} ${{ env.GHCR_IMAGE }} tags: | type=ref,event=branch type=ref,event=pr type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} type=semver,pattern={{major}} type=raw,value=latest,enable={{is_default_branch}} labels: | org.opencontainers.image.title=Scrapling org.opencontainers.image.description=An undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be! org.opencontainers.image.vendor=D4Vinci org.opencontainers.image.licenses=BSD org.opencontainers.image.url=https://scrapling.readthedocs.io/en/latest/ org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} org.opencontainers.image.documentation=https://scrapling.readthedocs.io/en/latest/ - name: Build and push Docker image uses: docker/build-push-action@v6 with: context: . 
platforms: linux/amd64,linux/arm64 push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha cache-to: type=gha,mode=max build-args: | BUILDKIT_INLINE_CACHE=1 - name: Image digest run: echo ${{ steps.build.outputs.digest }} ================================================ FILE: .github/workflows/release-and-publish.yml ================================================ name: Create Release and Publish to PyPI # Creates a GitHub release when a PR is merged to main (using PR title as version and body as release notes), then publishes to PyPI. on: pull_request: types: [closed] branches: - main jobs: create-release-and-publish: if: github.event.pull_request.merged == true runs-on: ubuntu-latest environment: name: PyPI url: https://pypi.org/p/scrapling permissions: contents: write id-token: write steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Get PR title id: pr_title run: echo "title=${{ github.event.pull_request.title }}" >> $GITHUB_OUTPUT - name: Save PR body to file uses: actions/github-script@v8 with: script: | const fs = require('fs'); fs.writeFileSync('pr_body.md', context.payload.pull_request.body || ''); - name: Extract version id: extract_version run: | PR_TITLE="${{ steps.pr_title.outputs.title }}" if [[ $PR_TITLE =~ ^v ]]; then echo "version=$PR_TITLE" >> $GITHUB_OUTPUT echo "Valid version format found in PR title: $PR_TITLE" else echo "Error: PR title '$PR_TITLE' must start with 'v' (e.g., 'v1.0.0') to create a release." exit 1 fi - name: Create Release uses: softprops/action-gh-release@v2 with: tag_name: ${{ steps.extract_version.outputs.version }} name: Release ${{ steps.extract_version.outputs.version }} body_path: pr_body.md draft: false prerelease: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Set up Python uses: actions/setup-python@v6 with: python-version: 3.12 - name: Upgrade pip run: python3 -m pip install --upgrade pip - name: Install build run: python3 -m pip install --upgrade build twine setuptools - name: Build a binary wheel and a source tarball run: python3 -m build --sdist --wheel --outdir dist/ - name: Publish distribution 📦 to PyPI uses: pypa/gh-action-pypi-publish@release/v1 ================================================ FILE: .github/workflows/tests.yml ================================================ name: Tests on: push: branches: - main - dev paths-ignore: - '*.md' - '**/*.md' - 'docs/**' - 'images/**' - '.github/**' - 'agent-skill/**' - '*.yml' - '*.yaml' - 'ruff.toml' pull_request: branches: - main - dev paths-ignore: - '*.md' - '**/*.md' - 'docs/**' - 'images/**' - '.github/**' - 'agent-skill/**' - '*.yml' - '*.yaml' - 'ruff.toml' concurrency: group: ${{github.workflow}}-${{ github.ref }} cancel-in-progress: true jobs: tests: timeout-minutes: 60 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - python-version: "3.10" os: macos-latest env: TOXENV: py310 - python-version: "3.11" os: macos-latest env: TOXENV: py311 - python-version: "3.12" os: macos-latest env: TOXENV: py312 - python-version: "3.13" os: macos-latest env: TOXENV: py313 steps: - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: 'pip' cache-dependency-path: | pyproject.toml tox.ini - name: Install all browsers dependencies run: | python3 -m pip install --upgrade pip python3 -m pip install playwright==1.58.0 patchright==1.58.2 - name: Get Playwright version id: playwright-version 
run: | PLAYWRIGHT_VERSION=$(python3 -c "import importlib.metadata; print(importlib.metadata.version('playwright'))") echo "version=$PLAYWRIGHT_VERSION" >> $GITHUB_OUTPUT echo "Playwright version: $PLAYWRIGHT_VERSION" - name: Retrieve Playwright browsers from cache if any id: playwright-cache uses: actions/cache@v5 with: path: | ~/.cache/ms-playwright ~/Library/Caches/ms-playwright ~/.ms-playwright key: ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-v1 restore-keys: | ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}- ${{ runner.os }}-playwright- - name: Install Playwright browsers run: | echo "Cache hit: ${{ steps.playwright-cache.outputs.cache-hit }}" if [ "${{ steps.playwright-cache.outputs.cache-hit }}" != "true" ]; then python3 -m playwright install chromium else echo "Skipping install - using cached Playwright browsers" fi python3 -m playwright install-deps chromium # Cache tox environments - name: Cache tox environments uses: actions/cache@v5 with: path: .tox # Include python version and os in the cache key key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('/Users/runner/work/Scrapling/pyproject.toml') }} restore-keys: | tox-v1-${{ runner.os }}-py${{ matrix.python-version }}- tox-v1-${{ runner.os }}- - name: Install tox run: pip install -U tox - name: Run tests env: ${{ matrix.env }} run: tox ================================================ FILE: .gitignore ================================================ # local files site/* local_tests/* .mcpregistry_* # AI related files .claude/* CLAUDE.md # cached files __pycache__/ *.py[cod] .cache .DS_Store *~ .*.sw[po] .build .ve .env .pytest .benchmarks .bootstrap .appveyor.token *.bak *.db *.db-* # installation package *.egg-info/ dist/ build/ # environments .venv env/ venv/ ENV/ env.bak/ venv.bak/ # C extensions *.so # pycharm .idea/ # vscode *.code-workspace # Packages *.egg *.egg-info dist build eggs .eggs parts bin var sdist wheelhouse develop-eggs .installed.cfg lib lib64 venv*/ .venv*/ pyvenv*/ pip-wheel-metadata/ poetry.lock # Installer logs pip-log.txt # mypy .mypy_cache/ .dmypy.json dmypy.json mypy.ini # test caches .tox/ .pytest_cache/ .coverage htmlcov report.xml nosetests.xml coverage.xml # Translations *.mo # Buildout .mr.developer.cfg # IDE project files .project .pydevproject .idea *.iml *.komodoproject # Complexity output/*.html output/*/index.html # Sphinx docs/_build public/ web/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/PyCQA/bandit rev: 1.9.0 hooks: - id: bandit args: [-r, -c, .bandit.yml] - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.14.5 hooks: # Run the linter. - id: ruff args: [ --fix ] # Run the formatter. 
- id: ruff-format - repo: https://github.com/netromdk/vermin rev: v1.7.0 hooks: - id: vermin args: ['-t=3.10-', '--violations', '--eval-annotations', '--no-tips'] ================================================ FILE: .readthedocs.yaml ================================================ # See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details # Example: https://github.com/readthedocs/test-builds/tree/zensical version: 2 build: os: ubuntu-24.04 apt_packages: - pngquant tools: python: "3.13" jobs: install: - pip install -r docs/requirements.txt - pip install ".[all]" build: html: - zensical build post_build: - mkdir -p $READTHEDOCS_OUTPUT/html/ - cp --recursive site/* $READTHEDOCS_OUTPUT/html/ ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at karim.shoair@pm.me. All complaints will be reviewed and investigated promptly and fairly. 
All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Scrapling Thank you for your interest in contributing to Scrapling! Everybody is invited and welcome to contribute to Scrapling. Minor changes are more likely to be included promptly. Adding unit tests for new features or test cases for bugs you've fixed helps us ensure that the Pull Request (PR) is acceptable. There are many ways to contribute to Scrapling. Here are some of them: - Report bugs and request features using the [GitHub issues](https://github.com/D4Vinci/Scrapling/issues). Please follow the issue template to help us resolve your issue quickly. - Blog about Scrapling. Tell the world how you’re using Scrapling. This will help newcomers with more examples and increase the Scrapling project's visibility. - Join the [Discord community](https://discord.gg/EMgGbDceNQ) and share your ideas on how to improve Scrapling. We’re always open to suggestions. 
- If you are not a developer, perhaps you would like to help with translating the [documentation](https://github.com/D4Vinci/Scrapling/tree/docs)? ## Making a Pull Request To ensure that your PR gets accepted, please make sure that your PR is based on the latest changes from the dev branch and that it satisfies the following requirements: - **The PR must be made against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling. Any PR made against the main branch will be rejected.** - **The code should be passing all available tests. We use tox with GitHub's CI to run the current tests on all supported Python versions for every code-related commit.** - **The code should be passing all code quality checks like `mypy` and `pyright`. We are using GitHub's CI to enforce code style checks as well.** - **Make your changes, keep the code clean with an explanation of any part that might be vague, and remember to create a separate virtual environment for this project.** - If you are adding a new feature, please add tests for it. - If you are fixing a bug, please add code with the PR that reproduces the bug. - Please follow the rules and coding style rules we explain below. ## Finding work If you have decided to make a contribution to Scrapling, but you do not know what to contribute, here are some ways to find pending work: - Check out the [contribution](https://github.com/D4Vinci/Scrapling/contribute) GitHub page, which lists open issues tagged as `good first issue`. These issues provide a good starting point. - There are also the [help wanted](https://github.com/D4Vinci/Scrapling/issues?q=is%3Aissue%20label%3A%22help%20wanted%22%20state%3Aopen) issues, but know that some may require familiarity with the Scrapling code base first. You can also target any other issue, provided it is not tagged as `invalid`, `wontfix`, or similar tags. - If you enjoy writing automated tests, you can work on increasing our test coverage. Currently, the test coverage is around 90–92%. - Join the [Discord community](https://discord.gg/EMgGbDceNQ) and ask questions in the `#help` channel. ## Coding style Please follow these coding conventions as we do when writing code for Scrapling: - We use [pre-commit](https://pre-commit.com/) to automatically address simple code issues before every commit, so please install it and run `pre-commit install` to set it up. This will install hooks to run [ruff](https://docs.astral.sh/ruff/), [bandit](https://github.com/PyCQA/bandit), and [vermin](https://github.com/netromdk/vermin) on every commit. We are currently using a workflow to automatically run these tools on every PR, so if your code doesn't pass these checks, the PR will be rejected. - We use type hints for better code clarity and [pyright](https://github.com/microsoft/pyright)/[mypy](https://github.com/python/mypy) for static type checking. If your code isn't acceptable by those tools, your PR won't pass the code quality rule. - We use the conventional commit messages format as [here](https://gist.github.com/qoomon/5dfcdf8eec66a051ecd85625518cfd13#types), so for example, we use the following prefixes for commit messages: | Prefix | When to use it | |-------------|--------------------------| | `feat:` | New feature added | | `fix:` | Bug fix | | `docs:` | Documentation change/add | | `test:` | Tests | | `refactor:` | Code refactoring | | `chore:` | Maintenance tasks | Then include the details of the change in the commit message body/description. 
Example: ``` feat: add `adaptive` for similar elements - Added find_similar() method - Implemented pattern matching - Added tests and documentation ``` > Please don’t put your name in the code you contribute; git provides enough metadata to identify the author of the code. ## Development ### Getting started 1. Fork the repository and clone your fork: ```bash git clone https://github.com//Scrapling.git cd Scrapling git checkout dev ``` 2. Create a virtual environment and install dependencies: ```bash python -m venv .venv source .venv/bin/activate # On Windows: .venv\Scripts\activate pip install -e ".[all]" pip install -r tests/requirements.txt ``` 3. Install browser dependencies: ```bash scrapling install ``` 4. Set up pre-commit hooks: ```bash pip install pre-commit pre-commit install ``` ### Tips Setting the scrapling logging level to `debug` makes it easier to know what's happening in the background. ```python import logging logging.getLogger("scrapling").setLevel(logging.DEBUG) ``` Bonus: You can install the beta of the upcoming update from the dev branch as follows ```commandline pip3 install git+https://github.com/D4Vinci/Scrapling.git@dev ``` ## Tests Scrapling includes a comprehensive test suite that can be executed with pytest. However, first, you need to install all libraries and `pytest-plugins` listed in `tests/requirements.txt`. Then, running the tests will result in an output like this: ```bash $ pytest tests -n auto =============================== test session starts =============================== platform darwin -- Python 3.13.8, pytest-8.4.2, pluggy-1.6.0 -- /Users//.venv/bin/python3.13 cachedir: .pytest_cache rootdir: /Users//scrapling configfile: pytest.ini plugins: asyncio-1.2.0, anyio-4.11.0, xdist-3.8.0, httpbin-2.1.0, cov-7.0.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function 10 workers [515 items] scheduling tests via LoadScheduling ...... =============================== 271 passed in 52.68s ============================== ``` Here, `-n auto` runs tests in parallel across multiple processes to increase speed. **Note:** You may need to run browser tests sequentially (`DynamicFetcher`/`StealthyFetcher`) to avoid conflicts. To run non-browser tests in parallel and browser tests separately: ```bash # Non-browser tests (parallel) pytest tests/ -k "not (DynamicFetcher or StealthyFetcher)" -n auto # Browser tests (sequential) pytest tests/ -k "DynamicFetcher or StealthyFetcher" ``` Bonus: You can also see the test coverage with the `pytest` plugin below ```bash pytest --cov=scrapling tests/ ``` ## Building Documentation Documentation is built using [Zensical](https://zensical.org/). You can build it locally using the following commands: ```bash pip install zensical pip install -r docs/requirements.txt zensical build --clean # Build the static site zensical serve # Local preview ``` ================================================ FILE: Dockerfile ================================================ FROM python:3.12-slim-trixie LABEL io.modelcontextprotocol.server.name="io.github.D4Vinci/Scrapling" COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ # Set environment variables ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 WORKDIR /app # Copy dependency file first for better layer caching COPY pyproject.toml ./ # Install dependencies only RUN --mount=type=cache,target=/root/.cache/uv \ uv sync --no-install-project --all-extras --compile-bytecode # Copy source code COPY . . 
# Install browsers and project in one optimized layer RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/var/cache/apt \ --mount=type=cache,target=/var/lib/apt \ apt-get update && \ uv run playwright install-deps chromium && \ uv run playwright install chromium && \ uv sync --all-extras --compile-bytecode && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # Expose port for MCP server HTTP transport EXPOSE 8000 # Set entrypoint to run scrapling ENTRYPOINT ["uv", "run", "scrapling"] # Default command (can be overridden) CMD ["--help"] ================================================ FILE: LICENSE ================================================ BSD 3-Clause License Copyright (c) 2024, Karim shoair Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: MANIFEST.in ================================================ include LICENSE include *.db include *.js include scrapling/*.db include scrapling/*.db* include scrapling/*.db-* include scrapling/py.typed include scrapling/.scrapling_dependencies_installed include .scrapling_dependencies_installed recursive-exclude * __pycache__ recursive-exclude * *.py[co] ================================================ FILE: README.md ================================================

Scrapling Poster
Effortless Web Scraping for the Modern Web

العربيه | Español | Français | Deutsch | 简体中文 | 日本語 | Русский | 한국어

Selection methods · Fetchers · Spiders · Proxy Rotation · CLI · MCP

Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl. Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises: blazing-fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users alike, it has something for everyone. ```python from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher StealthyFetcher.adaptive = True p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar! products = p.css('.product', auto_save=True) # Scrape data that survives website design changes! products = p.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them! ``` Or scale up to full crawls ```python from scrapling.spiders import Spider, Response class MySpider(Spider): name = "demo" start_urls = ["https://example.com/"] async def parse(self, response: Response): for item in response.css('.product'): yield {"title": item.css('h2::text').get()} MySpider().start() ```
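Items can also be consumed as they are scraped, instead of waiting for the crawl to finish, through the spiders' streaming mode (described under Key Features below). A minimal sketch, assuming only the `async for item in spider.stream()` API mentioned there:

```python
import asyncio
from scrapling.spiders import Spider, Response


class StreamingSpider(Spider):
    name = "streaming-demo"
    start_urls = ["https://example.com/"]

    async def parse(self, response: Response):
        for item in response.css('.product'):
            yield {"title": item.css('h2::text').get()}


async def main():
    # Items arrive one by one as the crawl progresses, with stats available in real time
    async for item in StreamingSpider().stream():
        print(item)


asyncio.run(main())
```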

At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.

# Platinum Sponsors
Scrapling handles Cloudflare Turnstile. For enterprise-grade protection, Hyper Solutions provides API endpoints that generate valid antibot tokens for Akamai, DataDome, Kasada, and Incapsula. Simple API calls, no browser automation required.
Hey, we built BirdProxies because proxies shouldn't be complicated or overpriced. Fast residential and ISP proxies in 195+ locations, fair pricing, and real support.
Try our FlappyBird game on the landing page for free data!
Evomi: residential proxies from $0.49/GB. Scraping browser with fully spoofed Chromium, residential IPs, auto CAPTCHA solving, and anti-bot bypass.
Scraper API for hassle-free results. MCP and N8N integrations are available.
TikHub.io provides 900+ stable APIs across 16+ platforms including TikTok, X, YouTube & Instagram, with 40M+ datasets.
Also offers DISCOUNTED AI models — Claude, GPT, GEMINI & more, up to 71% off.
Nsocks provides fast Residential and ISP proxies for developers and scrapers. Global IP coverage, high anonymity, smart rotation, and reliable performance for automation and data extraction. Use Xcrawl to simplify large-scale web crawling.
Close your laptop. Your scrapers keep running.
PetroSky VPS - cloud servers built for nonstop automation. Windows and Linux machines with full control. From €6.99/mo.
Read a full review of Scrapling on The Web Scraping Club (Nov 2025), the #1 newsletter dedicated to Web Scraping.
Proxy-Seller provides reliable proxy infrastructure for web scraping, offering IPv4, IPv6, ISP, Residential, and Mobile proxies with stable performance, broad geo coverage, and flexible plans for business-scale data collection.
Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646) # Sponsors Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you! --- ## Key Features ### Spiders — A Full Crawling Framework - 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects. - ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays. - 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider — route requests to different sessions by ID. - 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off. - 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls. - 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic. - 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively. ### Advanced Website Fetching with Session Support - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprints and headers, and can use HTTP/3. - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome. - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation. - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests. - **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides. - **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers. - **Async Support**: Complete async support across all fetchers and dedicated async session classes. ### Adaptive Scraping & AI Integration - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms. - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more. - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements. - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc.), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) ### High-Performance & Battle-Tested Architecture - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries. - 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint. - ⚡ **Fast JSON Serialization**: 10x faster than the standard library.
- 🏗️ **Battle tested**: Not only does Scrapling have 92% test coverage and full type hints coverage, but it has been used daily by hundreds of Web Scrapers over the past year. ### Developer/Web Scraper Friendly Experience - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping script development, like converting curl requests to Scrapling requests and viewing request results in your browser. - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code! - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods. - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations. - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element. - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel. - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change. - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed. ## Getting Started Let's give you a quick glimpse of what Scrapling can do without deep diving. ### Basic Usage HTTP requests with session support ```python from scrapling.fetchers import Fetcher, FetcherSession with FetcherSession(impersonate='chrome') as session: # Use the latest version of Chrome's TLS fingerprint page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) quotes = page.css('.quote .text::text').getall() # Or use one-off requests page = Fetcher.get('https://quotes.toscrape.com/') quotes = page.css('.quote .text::text').getall() ``` Advanced stealth mode ```python from scrapling.fetchers import StealthyFetcher, StealthySession with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) data = page.css('#padded_content a').getall() # Or use the one-off request style; it opens the browser for this request, then closes it after finishing page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') data = page.css('#padded_content a').getall() ``` Full browser automation ```python from scrapling.fetchers import DynamicFetcher, DynamicSession with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish page = session.fetch('https://quotes.toscrape.com/', load_dom=False) data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it # Or use the one-off request style; it opens the browser for this request, then closes it after finishing page = DynamicFetcher.fetch('https://quotes.toscrape.com/') data = page.css('.quote .text::text').getall() ``` ### Spiders Build full crawlers with concurrent requests, multiple session types, and pause/resume: ```python from scrapling.spiders import Spider, Request, Response class QuotesSpider(Spider): name = "quotes" start_urls = ["https://quotes.toscrape.com/"] concurrent_requests = 10 async def parse(self, response: Response): for quote in response.css('.quote'): yield { "text": quote.css('.text::text').get(), "author": quote.css('.author::text').get(), } next_page = response.css('.next a') if
next_page: yield response.follow(next_page[0].attrib['href']) result = QuotesSpider().start() print(f"Scraped {len(result.items)} quotes") result.items.to_json("quotes.json") ``` Use multiple session types in a single spider: ```python from scrapling.spiders import Spider, Request, Response from scrapling.fetchers import FetcherSession, AsyncStealthySession class MultiSessionSpider(Spider): name = "multi" start_urls = ["https://example.com/"] def configure_sessions(self, manager): manager.add("fast", FetcherSession(impersonate="chrome")) manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) async def parse(self, response: Response): for link in response.css('a::attr(href)').getall(): # Route protected pages through the stealth session if "protected" in link: yield Request(link, sid="stealth") else: yield Request(link, sid="fast", callback=self.parse) # explicit callback ``` Pause and resume long crawls with checkpoints by running the spider like this: ```python QuotesSpider(crawldir="./crawl_data").start() ``` Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped. ### Advanced Parsing & Navigation ```python from scrapling.fetchers import Fetcher # Rich element selection and navigation page = Fetcher.get('https://quotes.toscrape.com/') # Get quotes with multiple selection methods quotes = page.css('.quote') # CSS selector quotes = page.xpath('//div[@class="quote"]') # XPath quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style # Same as quotes = page.find_all('div', class_='quote') quotes = page.find_all(['div'], class_='quote') quotes = page.find_all(class_='quote') # and so on... # Find element by text content quotes = page.find_by_text('quote', tag='div') # Advanced navigation quote_text = page.css('.quote')[0].css('.text::text').get() quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors first_quote = page.css('.quote')[0] author = first_quote.next_sibling.css('.author::text') parent_container = first_quote.parent # Element relationships and similarity similar_elements = first_quote.find_similar() below_elements = first_quote.below_elements() ``` You can use the parser right away if you don't want to fetch websites like below: ```python from scrapling.parser import Selector page = Selector("...") ``` And it works precisely the same way! 
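For example, here is a minimal, self-contained sketch; the HTML string is made up for illustration, and the selection calls are the same ones shown above:

```python
from scrapling.parser import Selector

# A made-up HTML snippet standing in for markup you already have on hand
html = '<div class="quote"><span class="text">To be, or not to be.</span><small class="author">Shakespeare</small></div>'
page = Selector(html)

print(page.css('.quote .text::text').get())              # "To be, or not to be."
print(page.css('.quote')[0].css('.author::text').get())  # "Shakespeare"
```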
### Async Session Management Examples ```python import asyncio from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can work in both sync/async patterns page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') # Async session usage async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] for url in urls: task = session.fetch(url) tasks.append(task) print(session.get_pool_stats()) # Optional - The status of the browser tabs pool (busy/free/error) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) ``` ## CLI & Interactive Shell Scrapling includes a powerful command-line interface: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) Launch the interactive Web Scraping shell ```bash scrapling shell ``` Extract pages to a file directly without programming (extracts the content inside the `body` tag by default). If the output file ends with `.txt`, then the text content of the target will be extracted. If it ends in `.md`, it will be a Markdown representation of the HTML content; if it ends in `.html`, it will be the HTML content itself. ```bash scrapling extract get 'https://example.com' content.md scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # All elements matching the CSS selector '#fromSkipToProducts' scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare ``` > [!NOTE] > There are many additional features, including the MCP server and the interactive Web Scraping shell, but we want to keep this page concise. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/). ## Performance Benchmarks Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries. ### Text Extraction Speed Test (5000 nested elements) | # | Library | Time (ms) | vs Scrapling | |---|:-----------------:|:---------:|:------------:| | 1 | Scrapling | 2.02 | 1.0x | | 2 | Parsel/Scrapy | 2.04 | 1.01x | | 3 | Raw Lxml | 2.54 | 1.257x | | 4 | PyQuery | 24.17 | ~12x | | 5 | Selectolax | 82.63 | ~41x | | 6 | MechanicalSoup | 1549.71 | ~767.1x | | 7 | BS4 with Lxml | 1584.31 | ~784.3x | | 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### Element Similarity & Text Search Performance Scrapling's adaptive element finding capabilities significantly outperform alternatives: | Library | Time (ms) | vs Scrapling | |-------------|:---------:|:------------:| | Scrapling | 2.39 | 1.0x | | AutoScraper | 12.45 | 5.209x | > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology. ## Installation Scrapling requires Python 3.10 or higher: ```bash pip install scrapling ``` This installation only includes the parser engine and its dependencies, without any fetchers or command-line dependencies. ### Optional Dependencies 1.
If you are going to use any of the extra features below, the fetchers, or their classes, you will need to install the fetchers' dependencies and their browser dependencies as follows: ```bash pip install "scrapling[fetchers]" scrapling install # normal install scrapling install --force # force reinstall ``` This downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies. Or you can install them from code instead of running a command, like this: ```python from scrapling.cli import install install([], standalone_mode=False) # normal install install(["--force"], standalone_mode=False) # force reinstall ``` 2. Extra features: - Install the MCP server feature: ```bash pip install "scrapling[ai]" ``` - Install shell features (Web Scraping shell and the `extract` command): ```bash pip install "scrapling[shell]" ``` - Install everything: ```bash pip install "scrapling[all]" ``` Remember that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already). ### Docker You can also pull a Docker image with all extras and browsers from Docker Hub with the following command: ```bash docker pull pyd4vinci/scrapling ``` Or download it from the GitHub registry: ```bash docker pull ghcr.io/d4vinci/scrapling:latest ``` This image is automatically built and pushed using GitHub Actions and the repository's main branch. ## Contributing We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started. ## Disclaimer > [!CAUTION] > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files. ## 🎓 Citations If you have used our library for research purposes, please cite it with the following reference: ```text @misc{scrapling, author = {Karim Shoair}, title = {Scrapling}, year = {2024}, url = {https://github.com/D4Vinci/Scrapling}, note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!} } ``` ## License This work is licensed under the BSD-3-Clause License. ## Acknowledgments This project includes code adapted from: - Parsel (BSD License)—Used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule ---
Designed & crafted with ❤️ by Karim Shoair.

================================================ FILE: ROADMAP.md ================================================ ## TODOs - [x] Add more tests and increase the code coverage. - [x] Structure the tests folder in a better way. - [x] Add more documentation. - [x] Add the browsing ability. - [x] Create detailed documentation for the 'readthedocs' website, preferably with a GitHub Action for deploying it. - [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed. - [x] Add more functionality to `AttributesHandler` and more navigation functions to the `Selector` object (ex: functions similar to map, filter, and reduce, but executed on children, siblings, next elements, etc.). - [x] Add a `.filter` method to the `Selectors` object and other similar methods. - [ ] Add functionality to automatically detect pagination URLs. - [ ] Add the ability to auto-detect schemas in pages and manipulate them. - [ ] Add an `analyzer` ability that tries to learn about the page through meta-elements and returns what it learned. - [ ] Add the ability to generate a regex from a group of elements (like for all href attributes). ================================================ FILE: agent-skill/README.md ================================================ # Scrapling Agent Skill The skill aligns with the [AgentSkill](https://agentskills.io/specification) specification, so it will be readable by [OpenClaw](https://github.com/openclaw/openclaw), [Claude Code](https://claude.com/product/claude-code), and other agentic tools. It encapsulates almost all of the documentation website's content in Markdown, so the agent doesn't have to guess anything. It can be used to answer roughly 90% of the questions you might have about Scrapling. We tested it on [OpenClaw](https://github.com/openclaw/openclaw) and [Claude Code](https://claude.com/product/claude-code), but please open a [ticket](https://github.com/D4Vinci/Scrapling/issues/new/choose) if you face any issues, or use our [Discord server](https://discord.gg/EMgGbDceNQ). ## Installation You can use this [direct URL](https://github.com/D4Vinci/Scrapling/raw/refs/heads/main/agent-skill/Scrapling-Skill.zip) to download the skill as a ZIP file. We will try to update this page with all available methods. ### Clawhub If you are an [OpenClaw](https://github.com/openclaw/openclaw) or [Claude Code](https://claude.com/product/claude-code) user, you can install the skill using [Clawhub](https://docs.openclaw.ai/tools/clawhub) directly: ```bash clawhub install scrapling-official ``` Or go to the skill's [Clawhub](https://docs.openclaw.ai/tools/clawhub) page from [here](https://clawhub.ai/D4Vinci/scrapling-official). ================================================ FILE: agent-skill/Scrapling-Skill/LICENSE.txt ================================================ BSD 3-Clause License Copyright (c) 2024, Karim shoair Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3.
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: agent-skill/Scrapling-Skill/SKILL.md ================================================ --- name: scrapling-official description: Scrape web pages using Scrapling with anti-bot bypass (like Cloudflare Turnstile), stealth headless browsing, spiders framework, adaptive scraping, and JavaScript rendering. Use when asked to scrape, crawl, or extract data from websites; web_fetch fails; the site has anti-bot protections; write Python code to scrape/crawl; or write spiders. version: 0.4.2 license: Complete terms in LICENSE.txt --- # Scrapling Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl. Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises. Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone. **Requires: Python 3.10+** **This is the official skill for the scrapling library by the library author.** ## Setup (once) Create a virtual Python environment any way you like (e.g., with `venv`), then inside the environment run: `pip install "scrapling[all]>=0.4.2"` Then run this to download all the browser dependencies: ```bash scrapling install --force ``` Make a note of the `scrapling` binary path and use it instead of `scrapling` from now on with all commands (if `scrapling` is not on `$PATH`). ### Docker If the user doesn't have Python or doesn't want to use it, another option is the Docker image. Note that it works only with the CLI commands, so you can't write Python code for Scrapling this way: ```bash docker pull pyd4vinci/scrapling ``` or ```bash docker pull ghcr.io/d4vinci/scrapling:latest ``` ## CLI Usage The `scrapling extract` command group lets you download and extract content from websites directly without writing any code. ```bash Usage: scrapling extract [OPTIONS] COMMAND [ARGS]... Commands: get Perform a GET request and save the content to a file. post Perform a POST request and save the content to a file. put Perform a PUT request and save the content to a file. delete Perform a DELETE request and save the content to a file. fetch Use a browser to fetch content with browser automation and flexible options.
stealthy-fetch Use a stealthy browser to fetch content with advanced stealth features. ``` ### Usage pattern - Choose your output format by changing the file extension. Here are some examples for the `scrapling extract get` command: - Convert the HTML content to Markdown, then save it to the file (great for documentation): `scrapling extract get "https://blog.example.com" article.md` - Save the HTML content as it is to the file: `scrapling extract get "https://example.com" page.html` - Save a clean version of the text content of the webpage to the file: `scrapling extract get "https://example.com" content.txt` - Output to a temp file, read it back, then clean up. - All commands can use CSS selectors to extract specific parts of the page through `--css-selector` or `-s`. Which command to use generally: - Use **`get`** with simple websites, blogs, or news articles. - Use **`fetch`** with modern web apps, or sites with dynamic content. - Use **`stealthy-fetch`** with protected sites, Cloudflare, or anti-bot systems. > When unsure, start with `get`. If it fails or returns empty content, escalate to `fetch`, then `stealthy-fetch`. The speed of `fetch` and `stealthy-fetch` is nearly the same, so you are not sacrificing anything. #### Key options (requests) Those options are shared between the 4 HTTP request commands: | Option | Input type | Description | |:-------------------------------------------|:----------:|:-----------------------------------------------------------------------------------------------------------------------------------------------| | -H, --headers | TEXT | HTTP headers in format "Key: Value" (can be used multiple times) | | --cookies | TEXT | Cookies string in format "name1=value1; name2=value2" | | --timeout | INTEGER | Request timeout in seconds (default: 30) | | --proxy | TEXT | Proxy URL in format "http://username:password@host:port" | | -s, --css-selector | TEXT | CSS selector to extract specific content from the page. It returns all matches. | | -p, --params | TEXT | Query parameters in format "key=value" (can be used multiple times) | | --follow-redirects / --no-follow-redirects | None | Whether to follow redirects (default: True) | | --verify / --no-verify | None | Whether to verify SSL certificates (default: True) | | --impersonate | TEXT | Browser to impersonate. Can be a single browser (e.g., Chrome) or a comma-separated list for random selection (e.g., Chrome, Firefox, Safari). 
| | --stealthy-headers / --no-stealthy-headers | None | Use stealthy browser headers (default: True) | Options shared between `post` and `put` only: | Option | Input type | Description | |:-----------|:----------:|:------------------------------------------------------------------------------------------| | -d, --data | TEXT | Form data to include in the request body (as string, ex: "param1=value1&param2=value2") | | -j, --json | TEXT | JSON data to include in the request body (as string) | Examples: ```bash # Basic download scrapling extract get "https://news.site.com" news.md # Download with custom timeout scrapling extract get "https://example.com" content.txt --timeout 60 # Extract only specific content using CSS selectors scrapling extract get "https://blog.example.com" articles.md --css-selector "article" # Send a request with cookies scrapling extract get "https://scrapling.requestcatcher.com" content.md --cookies "session=abc123; user=john" # Add user agent scrapling extract get "https://api.site.com" data.json -H "User-Agent: MyBot 1.0" # Add multiple headers scrapling extract get "https://site.com" page.html -H "Accept: text/html" -H "Accept-Language: en-US" ``` #### Key options (browsers) Both `fetch` and `stealthy-fetch` share these options: | Option | Input type | Description | |:------------------------------------------|:----------:|:---------------------------------------------------------------------------------------------------------------------------------------------| | --headless / --no-headless | None | Run browser in headless mode (default: True) | | --disable-resources / --enable-resources | None | Drop unnecessary resources for a speed boost (default: False) | | --network-idle / --no-network-idle | None | Wait for network idle (default: False) | | --real-chrome / --no-real-chrome | None | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) | | --timeout | INTEGER | Timeout in milliseconds (default: 30000) | | --wait | INTEGER | Additional wait time in milliseconds after page load (default: 0) | | -s, --css-selector | TEXT | CSS selector to extract specific content from the page. It returns all matches. | | --wait-selector | TEXT | CSS selector to wait for before proceeding | | --proxy | TEXT | Proxy URL in format "http://username:password@host:port" | | -H, --extra-headers | TEXT | Extra headers in format "Key: Value" (can be used multiple times) | This option is specific to `fetch` only: | Option | Input type | Description | |:---------|:----------:|:-------------------------------------------------------------| | --locale | TEXT | Specify user locale. Defaults to the system default locale.
| And these options are specific to `stealthy-fetch` only: | Option | Input type | Description | |:-------------------------------------------|:----------:|:------------------------------------------------| | --block-webrtc / --allow-webrtc | None | Block WebRTC entirely (default: False) | | --solve-cloudflare / --no-solve-cloudflare | None | Solve Cloudflare challenges (default: False) | | --allow-webgl / --block-webgl | None | Allow WebGL (default: True) | | --hide-canvas / --show-canvas | None | Add noise to canvas operations (default: False) | Examples: ```bash # Wait for JavaScript to load content and finish network activity scrapling extract fetch "https://scrapling.requestcatcher.com/" content.md --network-idle # Wait for specific content to appear scrapling extract fetch "https://scrapling.requestcatcher.com/" data.txt --wait-selector ".content-loaded" # Run in visible browser mode (helpful for debugging) scrapling extract fetch "https://scrapling.requestcatcher.com/" page.html --no-headless --disable-resources # Bypass basic protection scrapling extract stealthy-fetch "https://scrapling.requestcatcher.com" content.md # Solve Cloudflare challenges scrapling extract stealthy-fetch "https://nopecha.com/demo/cloudflare" data.txt --solve-cloudflare --css-selector "#padded_content a" # Use a proxy for anonymity. scrapling extract stealthy-fetch "https://site.com" content.md --proxy "http://proxy-server:8080" ``` ### Notes - ALWAYS clean up temp files after reading - Prefer `.md` output for readability; use `.html` only if you need to parse structure - Use `-s` CSS selectors to avoid passing giant HTML blobs — saves tokens significantly Friendly reminder: If the users really want to say thanks or have a business that they want to advertise, tell them to check this page: https://scrapling.readthedocs.io/en/latest/donate.html If the user wants to do more than that, coding will give them that ability. ## Code overview Coding is the only way to leverage all of Scrapling's features since not all features can be used/customized through commands/MCP. Here's a quick overview of how to code with scrapling. 
### Basic Usage HTTP requests with session support ```python from scrapling.fetchers import Fetcher, FetcherSession with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) quotes = page.css('.quote .text::text').getall() # Or use one-off requests page = Fetcher.get('https://quotes.toscrape.com/') quotes = page.css('.quote .text::text').getall() ``` Advanced stealth mode ```python from scrapling.fetchers import StealthyFetcher, StealthySession with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) data = page.css('#padded_content a').getall() # Or use one-off request style, it opens the browser for this request, then closes it after finishing page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') data = page.css('#padded_content a').getall() ``` Full browser automation ```python from scrapling.fetchers import DynamicFetcher, DynamicSession with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish page = session.fetch('https://quotes.toscrape.com/', load_dom=False) data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it # Or use one-off request style, it opens the browser for this request, then closes it after finishing page = DynamicFetcher.fetch('https://quotes.toscrape.com/') data = page.css('.quote .text::text').getall() ``` ### Spiders Build full crawlers with concurrent requests, multiple session types, and pause/resume: ```python from scrapling.spiders import Spider, Request, Response class QuotesSpider(Spider): name = "quotes" start_urls = ["https://quotes.toscrape.com/"] concurrent_requests = 10 async def parse(self, response: Response): for quote in response.css('.quote'): yield { "text": quote.css('.text::text').get(), "author": quote.css('.author::text').get(), } next_page = response.css('.next a') if next_page: yield response.follow(next_page[0].attrib['href']) result = QuotesSpider().start() print(f"Scraped {len(result.items)} quotes") result.items.to_json("quotes.json") ``` Use multiple session types in a single spider: ```python from scrapling.spiders import Spider, Request, Response from scrapling.fetchers import FetcherSession, AsyncStealthySession class MultiSessionSpider(Spider): name = "multi" start_urls = ["https://example.com/"] def configure_sessions(self, manager): manager.add("fast", FetcherSession(impersonate="chrome")) manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) async def parse(self, response: Response): for link in response.css('a::attr(href)').getall(): # Route protected pages through the stealth session if "protected" in link: yield Request(link, sid="stealth") else: yield Request(link, sid="fast", callback=self.parse) # explicit callback ``` Pause and resume long crawls with checkpoints by running the spider like this: ```python QuotesSpider(crawldir="./crawl_data").start() ``` Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped. 
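A minimal sketch of that flow, reusing the `QuotesSpider` class defined above:

```python
# First run: crawls until it finishes, or until you press Ctrl+C to pause
result = QuotesSpider(crawldir="./crawl_data").start()

# A later run with the same crawldir resumes from the saved checkpoint
result = QuotesSpider(crawldir="./crawl_data").start()
print(f"Scraped {len(result.items)} quotes in total")
```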
### Advanced Parsing & Navigation ```python from scrapling.fetchers import Fetcher # Rich element selection and navigation page = Fetcher.get('https://quotes.toscrape.com/') # Get quotes with multiple selection methods quotes = page.css('.quote') # CSS selector quotes = page.xpath('//div[@class="quote"]') # XPath quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style # Same as quotes = page.find_all('div', class_='quote') quotes = page.find_all(['div'], class_='quote') quotes = page.find_all(class_='quote') # and so on... # Find element by text content quotes = page.find_by_text('quote', tag='div') # Advanced navigation quote_text = page.css('.quote')[0].css('.text::text').get() quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors first_quote = page.css('.quote')[0] author = first_quote.next_sibling.css('.author::text') parent_container = first_quote.parent # Element relationships and similarity similar_elements = first_quote.find_similar() below_elements = first_quote.below_elements() ``` If you don't need to fetch websites, you can use the parser directly on HTML you already have, like below: ```python from scrapling.parser import Selector page = Selector("...") ``` And it works precisely the same way! ### Async Session Management Examples ```python import asyncio from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can work in both sync/async patterns page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') # Async session usage async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] for url in urls: task = session.fetch(url) tasks.append(task) print(session.get_pool_stats()) # Optional - The status of the browser tabs pool (busy/free/error) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) ``` ## References You've already had a good glimpse of what the library can do. Use the references below to dig deeper when needed: - `references/mcp-server.md` — MCP server tools and capabilities - `references/parsing` — Everything you need for parsing HTML - `references/fetching` — Everything you need to fetch websites and session persistence - `references/spiders` — Everything you need to write spiders, proxy rotation, and advanced features. It follows a Scrapy-like format - `references/migrating_from_beautifulsoup.md` — A quick API comparison between Scrapling and BeautifulSoup - `https://github.com/D4Vinci/Scrapling/tree/main/docs` — Full official docs in Markdown for quick access (use only if current references do not look up-to-date). This skill encapsulates almost all the published documentation in Markdown, so don't check external sources or search online without the user's permission. ## Guardrails (Always) - Only scrape content you're authorized to access. - Respect robots.txt and ToS. - Add delays (download_delay) for large crawls. - Don't bypass paywalls or authentication without permission. - Never scrape personal/sensitive data. ================================================ FILE: agent-skill/Scrapling-Skill/examples/01_fetcher_session.py ================================================ """ Example 1: Python - FetcherSession (persistent HTTP session with Chrome TLS fingerprint) Scrapes all 10 pages of quotes.toscrape.com using a single HTTP session.
No browser launched — fast and lightweight. Best for: static or semi-static sites, APIs, pages that don't require JavaScript. """ from scrapling.fetchers import FetcherSession all_quotes = [] with FetcherSession(impersonate="chrome") as session: for i in range(1, 11): page = session.get( f"https://quotes.toscrape.com/page/{i}/", stealthy_headers=True, ) quotes = page.css(".quote .text::text").getall() all_quotes.extend(quotes) print(f"Page {i}: {len(quotes)} quotes (status {page.status})") print(f"\nTotal: {len(all_quotes)} quotes\n") for i, quote in enumerate(all_quotes, 1): print(f"{i:>3}. {quote}") ================================================ FILE: agent-skill/Scrapling-Skill/examples/02_dynamic_session.py ================================================ """ Example 2: Python - DynamicSession (Playwright browser automation, visible) Scrapes all 10 pages of quotes.toscrape.com using a persistent browser session. The browser window stays open across all page requests for efficiency. Best for: JavaScript-heavy pages, SPAs, sites with dynamic content loading. Set headless=True to run the browser hidden. Set disable_resources=True to skip loading images/fonts for a speed boost. """ from scrapling.fetchers import DynamicSession all_quotes = [] with DynamicSession(headless=False, disable_resources=True) as session: for i in range(1, 11): page = session.fetch(f"https://quotes.toscrape.com/page/{i}/") quotes = page.css(".quote .text::text").getall() all_quotes.extend(quotes) print(f"Page {i}: {len(quotes)} quotes (status {page.status})") print(f"\nTotal: {len(all_quotes)} quotes\n") for i, quote in enumerate(all_quotes, 1): print(f"{i:>3}. {quote}") ================================================ FILE: agent-skill/Scrapling-Skill/examples/03_stealthy_session.py ================================================ """ Example 3: Python - StealthySession (Patchright stealth browser, visible) Scrapes all 10 pages of quotes.toscrape.com using a persistent stealth browser session. Bypasses anti-bot protections automatically (Cloudflare Turnstile, fingerprinting, etc.). Best for: well-protected sites, Cloudflare-gated pages, sites that detect Playwright. Set headless=True to run the browser hidden. Add solve_cloudflare=True to auto-solve Cloudflare challenges. """ from scrapling.fetchers import StealthySession all_quotes = [] with StealthySession(headless=False) as session: for i in range(1, 11): page = session.fetch(f"https://quotes.toscrape.com/page/{i}/") quotes = page.css(".quote .text::text").getall() all_quotes.extend(quotes) print(f"Page {i}: {len(quotes)} quotes (status {page.status})") print(f"\nTotal: {len(all_quotes)} quotes\n") for i, quote in enumerate(all_quotes, 1): print(f"{i:>3}. {quote}") ================================================ FILE: agent-skill/Scrapling-Skill/examples/04_spider.py ================================================ """ Example 4: Python - Spider (auto-crawling framework) Scrapes ALL pages of quotes.toscrape.com by following "Next" pagination links automatically. No manual page looping needed. The spider yields structured items (text + author + tags) and exports them to JSON. Best for: multi-page crawls, full-site scraping, anything needing pagination or link following across many pages. 
Outputs: - Live stats to terminal during crawl - Final crawl stats at the end - quotes.json in the current directory """ from scrapling.spiders import Spider, Response class QuotesSpider(Spider): name = "quotes" start_urls = ["https://quotes.toscrape.com/"] concurrent_requests = 5 # Fetch up to 5 pages at once async def parse(self, response: Response): # Extract all quotes on the current page for quote in response.css(".quote"): yield { "text": quote.css(".text::text").get(), "author": quote.css(".author::text").get(), "tags": quote.css(".tags .tag::text").getall(), } # Follow the "Next" button to the next page (if it exists) next_page = response.css(".next a") if next_page: yield response.follow(next_page[0].attrib["href"]) if __name__ == "__main__": result = QuotesSpider().start() print(f"\n{'=' * 50}") print(f"Scraped : {result.stats.items_scraped} quotes") print(f"Requests: {result.stats.requests_count}") print(f"Time : {result.stats.elapsed_seconds:.2f}s") print(f"Speed : {result.stats.requests_per_second:.2f} req/s") print(f"{'=' * 50}\n") for i, item in enumerate(result.items, 1): print(f"{i:>3}. [{item['author']}] {item['text']}") if item["tags"]: print(f" Tags: {', '.join(item['tags'])}") # Export to JSON result.items.to_json("quotes.json", indent=True) print("\nExported to quotes.json") ================================================ FILE: agent-skill/Scrapling-Skill/examples/README.md ================================================ # Scrapling Examples These examples scrape [quotes.toscrape.com](https://quotes.toscrape.com) — a safe, purpose-built scraping sandbox — and demonstrate every tool available in Scrapling, from plain HTTP to full browser automation and spiders. All examples collect **all 100 quotes across 10 pages**. ## Quick Start Make sure Scrapling is installed: ```bash pip install "scrapling[all]>=0.4.2" scrapling install --force ``` ## Examples | File | Tool | Type | Best For | |--------------------------|-------------------|-----------------------------|---------------------------------------| | `01_fetcher_session.py` | `FetcherSession` | Python — persistent HTTP | APIs, fast multi-page scraping | | `02_dynamic_session.py` | `DynamicSession` | Python — browser automation | Dynamic/SPA pages | | `03_stealthy_session.py` | `StealthySession` | Python — stealth browser | Cloudflare, fingerprint bypass | | `04_spider.py` | `Spider` | Python — auto-crawling | Multi-page crawls, full-site scraping | ## Running **Python scripts:** ```bash python examples/01_fetcher_session.py python examples/02_dynamic_session.py # Opens a visible browser python examples/03_stealthy_session.py # Opens a visible stealth browser python examples/04_spider.py # Auto-crawls all pages, exports quotes.json ``` ## Escalation Guide Start with the fastest, lightest option and escalate only if needed: ``` get / FetcherSession └─ If JS required → fetch / DynamicSession └─ If blocked → stealthy-fetch / StealthySession └─ If multi-page → Spider ``` ================================================ FILE: agent-skill/Scrapling-Skill/references/fetching/choosing.md ================================================ # Fetchers basics ## Introduction Fetchers are classes that do requests or fetch pages in a single-line fashion with many features and return a [Response](#response-object) object. All fetchers have separate session classes to keep the session running (e.g., a browser fetcher keeps the browser open until you finish all requests). Fetchers are not wrappers built on top of other libraries. 
They use these libraries as an engine to request/fetch pages but add features the underlying engines don't have, while still fully leveraging and optimizing them for web scraping. ## Fetchers Overview Scrapling provides three different fetcher classes with their session classes; each fetcher is designed for a specific use case. The following table compares them and can be used for quick guidance. | Feature | Fetcher | DynamicFetcher | StealthyFetcher | |--------------------|:-------------------------------------------------:|:-------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------:| | Relative speed | 🐇🐇🐇🐇🐇 | 🐇🐇🐇 | 🐇🐇🐇 | | Stealth | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | | Anti-Bot options | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | | JavaScript loading | ❌ | ✅ | ✅ | | Memory Usage | ⭐ | ⭐⭐⭐ | ⭐⭐⭐ | | Best used for | Basic scraping when HTTP requests alone can do it | - Dynamically loaded websites<br>- Small automation<br>- Small-Mid protections | - Dynamically loaded websites<br>- Small automation<br>- Small-Complicated protections | | Browser(s) | ❌ | Chromium and Google Chrome | Chromium and Google Chrome | | Browser API used | ❌ | Playwright | Playwright | | Setup Complexity | Simple | Simple | Simple | ## Parser configuration in all fetchers All fetchers share the same import method, as you will see in the upcoming pages: ```python >>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ``` Then you can use it right away without initializing, and it will use the default parser settings: ```python >>> page = StealthyFetcher.fetch('https://example.com') ``` If you want to configure the parser (the [Selector class](parsing/main_classes.md#selector)) that will be applied to the response before it's returned to you, do this first: ```python >>> from scrapling.fetchers import Fetcher >>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False) # and the rest ``` or ```python >>> from scrapling.fetchers import Fetcher >>> Fetcher.adaptive=True >>> Fetcher.keep_comments=False >>> Fetcher.keep_cdata=False # and the rest ``` Then, continue your code as usual. The available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `.display_config()`. **Info:** The `adaptive` argument is disabled by default; you must enable it to use that feature. ### Set parser config per request As you probably understand, the logic above for setting the parser config applies globally to all requests/fetches made through that class, and it's intended for simplicity. If your use case requires a different configuration for each request/fetch, you can pass a dictionary to an argument named `selector_config` on the request method (`fetch`/`get`/`post`/...). ## Response Object The `Response` object is the same as the [Selector](parsing/main_classes.md#selector) class, but it has additional details about the response, like response headers, status, cookies, etc., as shown below: ```python >>> from scrapling.fetchers import Fetcher >>> page = Fetcher.get('https://example.com') >>> page.status # HTTP status code >>> page.reason # Status message >>> page.cookies # Response cookies as a dictionary >>> page.headers # Response headers >>> page.request_headers # Request headers >>> page.history # Response history of redirections, if any >>> page.body # Raw response body as bytes >>> page.encoding # Response encoding >>> page.meta # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system. ``` All fetchers return the `Response` object. **Note:** Unlike the [Selector](parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4. ================================================ FILE: agent-skill/Scrapling-Skill/references/fetching/dynamic.md ================================================ # Fetching dynamic websites `DynamicFetcher` (formerly `PlayWrightFetcher`) provides flexible browser automation with multiple configuration options and built-in stealth improvements. As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page). ## Basic Usage You have one primary way to import this Fetcher, which is the same for all fetchers.
```python >>> from scrapling.fetchers import DynamicFetcher ``` Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers). **Note:** The async version of the `fetch` method is `async_fetch`. This fetcher provides three main run options that can be combined as desired: ### 1. Vanilla Playwright ```python DynamicFetcher.fetch('https://example.com') ``` Using it in that manner will open a Chromium browser and load the page. There are speed optimizations, and some stealth is applied automatically under the hood, but other than that, there are no tricks or extra features unless you enable some; it's just the plain Playwright API. ### 2. Real Chrome ```python DynamicFetcher.fetch('https://example.com', real_chrome=True) ``` If you have a Google Chrome browser installed, use this option. It's the same as the first option, but it will use the Google Chrome browser installed on your device instead of Chromium. This makes your requests look more authentic and less detectable, for better results. If you don't have Google Chrome installed and want to use this option, you can use the command below in the terminal to install it for the library instead of installing it manually: ```commandline playwright install chrome ``` ### 3. CDP Connection ```python DynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222') ``` Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/). **Notes:** * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13. * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](stealthy.md). ## Full list of arguments All arguments for `DynamicFetcher` and its session classes: | Argument | Description | Optional | |:-------------------:|-------------------------------------------------------------------------------------------------------------------------------------|:--------:| | url | Target URL | ❌ | | headless | Pass `True` to run the browser in headless/hidden mode (**default**) or `False` for headful/visible mode. | ✔️ | | disable_resources | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. | ✔️ | | cookies | Set cookies for the next request. | ✔️ | | useragent | Pass a useragent string to be used. **Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.** | ✔️ | | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ | | load_dom | Enabled by default; waits for all JavaScript on the page(s) to fully load and execute (waits for the `domcontentloaded` state). | ✔️ | | timeout | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds). | ✔️ | | wait | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object. | ✔️ | | page_action | Added for automation. Pass a function that takes the `page` object and does the necessary automation.
| ✔️ | | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ | | init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ | | wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ | | google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ | | extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ | | proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ | | real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ | | locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ | | timezone_id | Changes the timezone of the browser. Defaults to the system timezone. | ✔️ | | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ | | user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions** | ✔️ | | extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ | | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ | | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ | | blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ | | proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ | | retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ | | retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ | In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`. **Notes:** 1. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading. 2. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there. 3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`. 4. 
If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which matches standard browsers in the latest versions. ## Examples ### Resource Control ```python # Disable unnecessary resources page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc. ``` ### Domain Blocking ```python # Block requests to specific domains (and their subdomains) page = DynamicFetcher.fetch('https://example.com', blocked_domains={"ads.example.com", "tracker.net"}) ``` ### Network Control ```python # Wait for network idle (Consider fetch to be finished when there are no network connections for at least 500 ms) page = DynamicFetcher.fetch('https://example.com', network_idle=True) # Custom timeout (in milliseconds) page = DynamicFetcher.fetch('https://example.com', timeout=30000) # 30 seconds # Proxy support (It can also be a dictionary with only the keys 'server', 'username', and 'password'.) page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port') ``` ### Proxy Rotation ```python from scrapling.fetchers import DynamicSession, ProxyRotator # Set up proxy rotation rotator = ProxyRotator([ "http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080", ]) # Use with session - rotates proxy automatically with each request with DynamicSession(proxy_rotator=rotator, headless=True) as session: page1 = session.fetch('https://example1.com') page2 = session.fetch('https://example2.com') # Override rotator for a specific request page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080') ``` **Warning:** By default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed. ### Downloading Files ```python page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png') with open(file='main_cover.png', mode='wb') as f: f.write(page.body) ``` The `body` attribute of the `Response` object always returns `bytes`. ### Browser Automation This is where your knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues. This function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. You can alter the page as you want. In the example below, I used the page's [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse. ```python from playwright.sync_api import Page def scroll_page(page: Page): page.mouse.wheel(10, 0) page.mouse.move(100, 400) page.mouse.up() page = DynamicFetcher.fetch('https://example.com', page_action=scroll_page) ``` Of course, if you use the async fetch version, the function must also be async.
```python from playwright.async_api import Page async def scroll_page(page: Page): await page.mouse.wheel(10, 0) await page.mouse.move(100, 400) await page.mouse.up() page = await DynamicFetcher.async_fetch('https://example.com', page_action=scroll_page) ``` ### Wait Conditions ```python # Wait for the selector page = DynamicFetcher.fetch( 'https://example.com', wait_selector='h1', wait_selector_state='visible' ) ``` This is the last wait the fetcher will do before returning the response (if enabled). You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM. After that, if `load_dom` is enabled (the default), the fetcher will check again to see if all JavaScript files are loaded and executed (in the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above. The states the fetcher can wait for can be any of the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)): - `attached`: Wait for an element to be present in the DOM. - `detached`: Wait for an element to not be present in the DOM. - `visible`: wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible. - `hidden`: wait for an element to be either detached from the DOM, or have an empty bounding box, or `visibility:hidden`. This is opposite to the `'visible'` option. ### Some Stealth Features ```python page = DynamicFetcher.fetch( 'https://example.com', google_search=True, useragent='Mozilla/5.0...', # Custom user agent locale='en-US', # Set browser locale ) ``` ### General example ```python from scrapling.fetchers import DynamicFetcher def scrape_dynamic_content(): # Use Playwright for JavaScript content page = DynamicFetcher.fetch( 'https://example.com/dynamic', network_idle=True, wait_selector='.content' ) # Extract dynamic content content = page.css('.content') return { 'title': content.css('h1::text').get(), 'items': [ item.text for item in content.css('.item') ] } ``` ## Session Management To keep the browser open until you make multiple requests with the same configuration, use `DynamicSession`/`AsyncDynamicSession` classes. Those classes can accept all the arguments that the `fetch` function can take, which enables you to specify a config for the entire session. 
```python from scrapling.fetchers import DynamicSession # Create a session with default configuration with DynamicSession( headless=True, disable_resources=True, real_chrome=True ) as session: # Make multiple requests with the same browser instance page1 = session.fetch('https://example1.com') page2 = session.fetch('https://example2.com') page3 = session.fetch('https://dynamic-site.com') # All requests reuse the same tab on the same browser instance ``` ### Async Session Usage ```python import asyncio from scrapling.fetchers import AsyncDynamicSession async def scrape_multiple_sites(): async with AsyncDynamicSession( network_idle=True, timeout=30000, max_pages=3 ) as session: # Make async requests with shared browser configuration pages = await asyncio.gather( session.fetch('https://spa-app1.com'), session.fetch('https://spa-app2.com'), session.fetch('https://dynamic-content.com') ) return pages ``` You may have noticed the `max_pages` argument. This is a new argument that enables the fetcher to create a **rotating pool of browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of pages that can be open at once. With each request, the library will close all tabs that have finished their task and check whether the number of current tabs is lower than the maximum allowed number of pages/tabs, then: 1. If you are within the allowed range, the fetcher will create a new tab for you, and then all is as normal. 2. Otherwise, it will keep checking, several times per second for up to 60 seconds, whether creating a new tab is allowed, then raise `TimeoutError`. This can happen when the website you are fetching becomes unresponsive. This logic allows multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources, but most importantly, is so fast :) In versions 0.3 and 0.3.1, the pool reused finished tabs to save more resources/time. That logic proved flawed, as it's nearly impossible to protect pages/tabs from contamination by the configuration used in the previous request. ### Session Benefits - **Browser reuse**: Much faster subsequent requests by reusing the same browser instance. - **Cookie persistence**: Automatic cookie and session state handling, as any browser does. - **Consistent fingerprint**: Same browser fingerprint across all requests. - **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch. ## When to Use Use `DynamicFetcher` when you: - Need browser automation - Want multiple browser options - Are using a real Chrome browser - Need custom browser config - Want a few stealth options If you want more stealth and control without much config, check out the [StealthyFetcher](stealthy.md). ================================================ FILE: agent-skill/Scrapling-Skill/references/fetching/static.md ================================================ # HTTP requests The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library, with many stealth capabilities. ## Basic Usage Import the Fetcher (same import pattern for all fetchers): ```python >>> from scrapling.fetchers import Fetcher ``` Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers). ### Shared arguments All methods for making requests here share some arguments, so let's discuss them first.
- **url**: The targeted URL. - **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header. - **follow_redirects**: As the name implies, tells the fetcher to follow redirections. **Enabled by default** - **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**. - **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**. - **retry_delay**: Number of seconds to wait between retry attempts. **Defaults to 1 second**. - **impersonate**: Impersonate specific browsers' TLS fingerprints. Accepts browser strings or a list of them like `"chrome110"`, `"firefox102"`, `"safari15_5"` to use specific versions or `"chrome"`, `"firefox"`, `"safari"`, `"edge"` to automatically use the latest version available. This makes your requests appear to come from real browsers at the TLS level. If you pass it a list of strings, it will choose a random one with each request. **Defaults to the latest available Chrome version.** - **http3**: Use HTTP/3 protocol for requests. **Defaults to False**. It might be problematic if used with `impersonate`. - **cookies**: Cookies to use in the request. Can be a dictionary of `name→value` or a list of dictionaries. - **proxy**: As the name implies, the proxy used for this request to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`. - **proxy_auth**: HTTP basic auth for the proxy, a tuple of (username, password). - **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`. - **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`. - **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument. - **max_redirects**: Maximum number of redirects. **Defaults to 30**; use -1 for unlimited. - **verify**: Whether to verify HTTPS certificates. **Defaults to True**. - **cert**: Tuple of (cert, key) filenames for the client certificate. - **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. **Notes:** 1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`) 2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update. 3. If either `impersonate` or `stealthy_headers` is enabled, the fetchers will automatically generate real browser headers that match the browser version used. Other than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method, if that method doesn't already support them. ### HTTP Methods Each method accepts additional arguments depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests. Examples are the best way to explain this: > Note: `OPTIONS` and `HEAD` methods are not supported.
#### GET ```python >>> from scrapling.fetchers import Fetcher >>> # Basic GET >>> page = Fetcher.get('https://example.com') >>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True) >>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030') >>> # With parameters >>> page = Fetcher.get('https://example.com/search', params={'q': 'query'}) >>> >>> # With headers >>> page = Fetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'}) >>> # Basic HTTP authentication >>> page = Fetcher.get("https://example.com", auth=("my_user", "password123")) >>> # Browser impersonation >>> page = Fetcher.get('https://example.com', impersonate='chrome') >>> # HTTP/3 support >>> page = Fetcher.get('https://example.com', http3=True) ``` And for asynchronous requests, it's a small adjustment ```python >>> from scrapling.fetchers import AsyncFetcher >>> # Basic GET >>> page = await AsyncFetcher.get('https://example.com') >>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True) >>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030') >>> # With parameters >>> page = await AsyncFetcher.get('https://example.com/search', params={'q': 'query'}) >>> >>> # With headers >>> page = await AsyncFetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'}) >>> # Basic HTTP authentication >>> page = await AsyncFetcher.get("https://example.com", auth=("my_user", "password123")) >>> # Browser impersonation >>> page = await AsyncFetcher.get('https://example.com', impersonate='chrome110') >>> # HTTP/3 support >>> page = await AsyncFetcher.get('https://example.com', http3=True) ``` The `page` object in all cases is a [Response](choosing.md#response-object) object, which is a [Selector](parsing/main_classes.md#selector), so you can use it directly ```python >>> page.css('.something.something') >>> page = Fetcher.get('https://api.github.com/events') >>> page.json() [{'id': '', 'type': 'PushEvent', 'actor': {'id': '', 'login': '', 'display_login': '', 'gravatar_id': '', 'url': 'https://api.github.com/users/', 'avatar_url': 'https://avatars.githubusercontent.com/u/'}, 'repo': {'id': '', ... 
```

#### POST

```python
>>> from scrapling.fetchers import Fetcher

>>> # Basic POST
>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, params={'q': 'query'})
>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)
>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030', impersonate="chrome")

>>> # Another example of form-encoded data
>>> page = Fetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)

>>> # JSON data
>>> page = Fetcher.post('https://example.com/api', json={'key': 'value'})
```

And for asynchronous requests, it's a small adjustment

```python
>>> from scrapling.fetchers import AsyncFetcher

>>> # Basic POST
>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)
>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030', impersonate="chrome")

>>> # Another example of form-encoded data
>>> page = await AsyncFetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)

>>> # JSON data
>>> page = await AsyncFetcher.post('https://example.com/api', json={'key': 'value'})
```

#### PUT

```python
>>> from scrapling.fetchers import Fetcher

>>> # Basic PUT
>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'})
>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate="chrome")
>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')

>>> # Another example of form-encoded data
>>> page = Fetcher.put("https://scrapling.requestcatcher.com/put", data={'key': ['value1', 'value2']})
```

And for asynchronous requests, it's a small adjustment

```python
>>> from scrapling.fetchers import AsyncFetcher

>>> # Basic PUT
>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'})
>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate="chrome")
>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')

>>> # Another example of form-encoded data
>>> page = await AsyncFetcher.put("https://scrapling.requestcatcher.com/put", data={'key': ['value1', 'value2']})
```

#### DELETE

```python
>>> from scrapling.fetchers import Fetcher

>>> page = Fetcher.delete('https://example.com/resource/123')
>>> page = Fetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate="chrome")
>>> page = Fetcher.delete('https://example.com/resource/123', proxy='http://username:password@localhost:8030')
```

And for asynchronous requests, it's a small adjustment

```python
>>> from scrapling.fetchers import AsyncFetcher

>>> page = await AsyncFetcher.delete('https://example.com/resource/123')
>>> page = await AsyncFetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate="chrome")
>>> page = await AsyncFetcher.delete('https://example.com/resource/123', proxy='http://username:password@localhost:8030')
```

## Session Management

For making multiple requests with the same configuration, use the `FetcherSession` class. It can be used in both synchronous and asynchronous code without issue; the class automatically detects and switches the session type, without requiring a different import.

The `FetcherSession` class can accept nearly all the arguments that the methods can take, which enables you to specify a config for the entire session and later choose a different config for any single request effortlessly, as you will see in the following examples.

```python
from scrapling.fetchers import FetcherSession

# Create a session with default configuration
with FetcherSession(
    impersonate='chrome',
    http3=True,
    stealthy_headers=True,
    timeout=30,
    retries=3
) as session:
    # Make multiple requests with the same settings and the same cookies
    page1 = session.get('https://scrapling.requestcatcher.com/get')
    page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
    page3 = session.get('https://api.github.com/events')
    # All requests share the same session and connection pool
```

You can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests:

```python
from scrapling.fetchers import FetcherSession, ProxyRotator

rotator = ProxyRotator([
    'http://proxy1:8080',
    'http://proxy2:8080',
    'http://proxy3:8080',
])

with FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session:
    # Each request automatically uses the next proxy in rotation
    page1 = session.get('https://example.com/page1')
    page2 = session.get('https://example.com/page2')
    # You can check which proxy was used via the response metadata
    print(page1.meta['proxy'])
```

You can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method:

```python
with FetcherSession(proxy='http://default-proxy:8080') as session:
    # Uses the session proxy
    page1 = session.get('https://example.com/page1')
    # Override the proxy for this specific request
    page2 = session.get('https://example.com/page2', proxy='http://special-proxy:9090')
```

And here's an async example

```python
async with FetcherSession(impersonate='firefox', http3=True) as session:
    # All standard HTTP methods are available
    response = await session.get('https://example.com')
    response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})
    response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})
    response = await session.delete('https://scrapling.requestcatcher.com/delete')
```

or, better, fire the requests concurrently:

```python
import asyncio
from scrapling.fetchers import FetcherSession

# Async session usage
async with FetcherSession(impersonate="safari") as session:
    urls = ['https://example.com/page1', 'https://example.com/page2']
    tasks = [
        session.get(url) for url in urls
    ]
    pages = await asyncio.gather(*tasks)
```

The `Fetcher` class uses `FetcherSession` to create a temporary session with each request you make.
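One practical consequence of cookie persistence is that authenticated flows work naturally inside a session. A minimal sketch (the endpoints and form fields below are placeholders):

```python
from scrapling.fetchers import FetcherSession

with FetcherSession(impersonate='chrome') as session:
    # The server's session cookie from the login response is stored automatically...
    session.post('https://example.com/login', data={'username': 'user', 'password': 'pass'})
    # ...and sent with every subsequent request in the same session
    dashboard = session.get('https://example.com/dashboard')
```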
### Session Benefits

- **A lot faster**: 10 times faster than creating a new session for each request
- **Cookie persistence**: Automatic cookie handling across requests
- **Resource efficiency**: Better memory and CPU usage for multiple requests
- **Centralized configuration**: Single place to manage request settings

## Examples

Some well-rounded examples to aid newcomers to Web Scraping

### Basic HTTP Request

```python
from scrapling.fetchers import Fetcher

# Make a request
page = Fetcher.get('https://example.com')

# Check the status
if page.status == 200:
    # Extract title
    title = page.css('title::text').get()
    print(f"Page title: {title}")

    # Extract all links
    links = page.css('a::attr(href)').getall()
    print(f"Found {len(links)} links")
```

### Product Scraping

```python
from scrapling.fetchers import Fetcher

def scrape_products():
    page = Fetcher.get('https://example.com/products')

    # Find all product elements
    products = page.css('.product')

    results = []
    for product in products:
        results.append({
            'title': product.css('.title::text').get(),
            'price': product.css('.price::text').re_first(r'\d+\.\d{2}'),
            'description': product.css('.description::text').get(),
            'in_stock': product.has_class('in-stock')
        })

    return results
```

### Downloading Files

```python
from scrapling.fetchers import Fetcher

page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')
with open(file='main_cover.png', mode='wb') as f:
    f.write(page.body)
```

### Pagination Handling

```python
from scrapling.fetchers import Fetcher

def scrape_all_pages():
    base_url = 'https://example.com/products?page={}'
    page_num = 1
    all_products = []

    while True:
        # Get current page
        page = Fetcher.get(base_url.format(page_num))

        # Find products
        products = page.css('.product')
        if not products:
            break

        # Process products
        for product in products:
            all_products.append({
                'name': product.css('.name::text').get(),
                'price': product.css('.price::text').get()
            })

        # Next page
        page_num += 1

    return all_products
```

### Form Submission

```python
from scrapling.fetchers import Fetcher

# Submit login form
response = Fetcher.post(
    'https://example.com/login',
    data={
        'username': 'user@example.com',
        'password': 'password123'
    }
)

# Check login success
if response.status == 200:
    # Extract user info
    user_name = response.css('.user-name::text').get()
    print(f"Logged in as: {user_name}")
```

### Table Extraction

```python
from scrapling.fetchers import Fetcher

def extract_table():
    page = Fetcher.get('https://example.com/data')

    # Find table
    table = page.css('table')[0]

    # Extract headers
    headers = [
        th.text for th in table.css('thead th')
    ]

    # Extract rows
    rows = []
    for row in table.css('tbody tr'):
        cells = [td.text for td in row.css('td')]
        rows.append(dict(zip(headers, cells)))

    return rows
```

### Navigation Menu

```python
from scrapling.fetchers import Fetcher

def extract_menu():
    page = Fetcher.get('https://example.com')

    # Find navigation
    nav = page.css('nav')[0]

    menu = {}
    for item in nav.css('li'):
        links = item.css('a')
        if links:
            link = links[0]
            menu[link.text] = {
                'url': link['href'],
                'has_submenu': bool(item.css('.submenu'))
            }

    return menu
```

## When to Use

Use `Fetcher` when:

- Need rapid HTTP requests.
- Want minimal overhead.
- Don't need JavaScript execution (the website can be scraped through requests).
- Need some stealth features (e.g., the targeted website uses protection but doesn't use JavaScript challenges).

Use `FetcherSession` when:

- Making multiple requests to the same or different sites.
- Need to maintain cookies/authentication between requests.
- Want connection pooling for better performance.
- Require consistent configuration across requests.
- Working with APIs that require a session state.

Use other fetchers when:

- Need browser automation.
- Need advanced anti-bot/stealth capabilities.
- Need JavaScript support or interaction with dynamic content.

================================================
FILE: agent-skill/Scrapling-Skill/references/fetching/stealthy.md
================================================

# StealthyFetcher

`StealthyFetcher` is a stealthy browser-based fetcher similar to [DynamicFetcher](dynamic.md), using [Playwright's API](https://playwright.dev/python/docs/intro). It adds advanced anti-bot protection bypass capabilities, most of which are handled automatically. It shares the same browser automation model as `DynamicFetcher`, using [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) for page interaction.

## Basic Usage

You import this fetcher the same way as all fetchers.

```python
>>> from scrapling.fetchers import StealthyFetcher
```

Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)

**Note:** The async version of the `fetch` method is `async_fetch`.

## What does it do?

The `StealthyFetcher` class is a stealthy version of the [DynamicFetcher](dynamic.md) class, and here are some of the things it does:

1. It automatically bypasses all types of Cloudflare's Turnstile/Interstitial challenges.
2. It bypasses CDP runtime leaks and WebRTC leaks.
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
4. It generates canvas noise to prevent fingerprinting through canvas.
5. It automatically patches known methods used to detect headless mode and provides an option to defeat timezone mismatch attacks.
6. and other anti-protection options...

## Full list of arguments

Scrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments

| Argument | Description | Optional |
|:---:|---|:---:|
| url | Target url | ❌ |
| headless | Pass `True` to run the browser in headless/hidden (**default**) or `False` for headful/visible mode. | ✔️ |
| disable_resources | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. | ✔️ |
| cookies | Set cookies for the next request. | ✔️ |
| useragent | Pass a useragent string to be used. **Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.** | ✔️ |
| network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
| load_dom | Enabled by default, wait for all JavaScript on page(s) to fully load and execute (wait for the `domcontentloaded` state). | ✔️ |
| timeout | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds). | ✔️ |
| wait | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object. | ✔️ |
| page_action | Added for automation. Pass a function that takes the `page` object and does the necessary automation. | ✔️ |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
| google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ |
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect the `navigator.language` value, the `Accept-Language` request header value, and number and date formatting rules. Defaults to the system default locale. | ✔️ |
| timezone_id | Changes the timezone of the browser. Defaults to the system timezone. | ✔️ |
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. | ✔️ |
| user_data_dir | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only works with sessions** | ✔️ |
| extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
| solve_cloudflare | When enabled, the fetcher solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you. | ✔️ |
| block_webrtc | Forces WebRTC to respect proxy settings to prevent local IP address leaks. | ✔️ |
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
| allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
| blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ |
| proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ |
| retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ |
| retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ |

In session classes, all these arguments can be set globally for the session.
Still, you can configure each request individually by passing any of the arguments that can be configured at the browser-tab level: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`.

**Notes:**

1. These are basically the same arguments as the [DynamicFetcher](dynamic.md) class, with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
2. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.
3. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which matches standard browsers in the latest versions.

## Examples

### Cloudflare and stealth options

```python
# Automatic Cloudflare solver
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare', solve_cloudflare=True)

# Works with other stealth options
page = StealthyFetcher.fetch(
    'https://protected-site.com',
    solve_cloudflare=True,
    block_webrtc=True,
    real_chrome=True,
    hide_canvas=True,
    google_search=True,
    proxy='http://username:password@host:port',  # It can also be a dictionary with only the keys 'server', 'username', and 'password'.
)
```

The `solve_cloudflare` parameter enables automatic detection and solving of all types of Cloudflare's Turnstile/Interstitial challenges:

- JavaScript challenges (managed)
- Interactive challenges (clicking verification boxes)
- Invisible challenges (automatic background verification)

It even solves custom pages with embedded captchas.

**Important notes:**

1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to load after solving the captcha. Some websites can be the real definition of an edge case while we try to make the solver as generic as possible.
2. The timeout should be at least 60 seconds when using the Cloudflare solver, to allow sufficient challenge-solving time.
3. This feature works seamlessly with proxies and other stealth options.

### Browser Automation

This is where your knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired actions, and then the fetcher continues. This function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. You can alter the page as you want. In the example below, I used the page's [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse.
```python
from playwright.sync_api import Page

def scroll_page(page: Page):
    page.mouse.wheel(10, 0)
    page.mouse.move(100, 400)
    page.mouse.up()

page = StealthyFetcher.fetch('https://example.com', page_action=scroll_page)
```

Of course, if you use the async fetch version, the function must also be async.

```python
from playwright.async_api import Page

async def scroll_page(page: Page):
    await page.mouse.wheel(10, 0)
    await page.mouse.move(100, 400)
    await page.mouse.up()

page = await StealthyFetcher.async_fetch('https://example.com', page_action=scroll_page)
```

### Wait Conditions

```python
# Wait for the selector
page = StealthyFetcher.fetch(
    'https://example.com',
    wait_selector='h1',
    wait_selector_state='visible'
)
```

This is the last wait the fetcher performs before returning the response (if enabled). You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM. After that, if `load_dom` is enabled (the default), the fetcher will check again whether all JavaScript files are loaded and executed (the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above.

The states the fetcher can wait for can be any of the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)):

- `attached`: Wait for an element to be present in the DOM.
- `detached`: Wait for an element to not be present in the DOM.
- `visible`: Wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.
- `hidden`: Wait for an element to be either detached from the DOM, have an empty bounding box, or have `visibility:hidden`. This is the opposite of the `visible` option.

### Real-world example (Amazon)

This is for educational purposes only; this example was generated by AI, which also shows how easy it is to work with Scrapling through AI

```python
def scrape_amazon_product(url):
    # Use StealthyFetcher to bypass protection
    page = StealthyFetcher.fetch(url)

    # Extract product details
    return {
        'title': page.css('#productTitle::text').get().clean(),
        'price': page.css('.a-price .a-offscreen::text').get(),
        'rating': page.css('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text').get(),
        'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'),
        'features': [
            li.get().clean() for li in page.css('#feature-bullets li span::text')
        ],
        'availability': page.css('#availability')[0].get_all_text(strip=True),
        'images': [
            img.attrib['src'] for img in page.css('#altImages img')
        ]
    }
```

## Session Management

To keep the browser open while you make multiple requests with the same configuration, use the `StealthySession`/`AsyncStealthySession` classes. These classes can accept all the arguments that the `fetch` method can take, which enables you to specify a config for the entire session.
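For example, here is a quick sketch of overriding the session defaults for a single request, using the per-tab arguments listed earlier (the URLs are placeholders):

```python
from scrapling.fetchers import StealthySession

with StealthySession(headless=True, solve_cloudflare=True) as session:
    # Uses the session-wide defaults
    page1 = session.fetch('https://example.com')
    # Overrides `timeout` and `wait_selector` for this request only
    page2 = session.fetch('https://example.com/slow', timeout=90000, wait_selector='.content')
```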
```python
from scrapling.fetchers import StealthySession

# Create a session with default configuration
with StealthySession(
    headless=True,
    real_chrome=True,
    block_webrtc=True,
    solve_cloudflare=True
) as session:
    # Make multiple requests with the same browser instance
    page1 = session.fetch('https://example1.com')
    page2 = session.fetch('https://example2.com')
    page3 = session.fetch('https://nopecha.com/demo/cloudflare')
    # All requests reuse the same tab on the same browser instance
```

### Async Session Usage

```python
import asyncio
from scrapling.fetchers import AsyncStealthySession

async def scrape_multiple_sites():
    async with AsyncStealthySession(
        real_chrome=True,
        block_webrtc=True,
        solve_cloudflare=True,
        timeout=60000,  # 60 seconds for Cloudflare challenges
        max_pages=3
    ) as session:
        # Make async requests with shared browser configuration
        pages = await asyncio.gather(
            session.fetch('https://site1.com'),
            session.fetch('https://site2.com'),
            session.fetch('https://protected-site.com')
        )
        return pages
```

You may have noticed the `max_pages` argument. This argument enables the fetcher to create a **rotating pool of browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of tabs that can be open at once. With each request, the library closes all tabs that have finished their task and checks whether the number of current tabs is lower than the maximum allowed number of pages/tabs, then:

1. If you are within the allowed range, the fetcher creates a new tab for you, and all proceeds as normal.
2. Otherwise, it keeps checking at sub-second intervals, for up to 60 seconds, whether creating a new tab is allowed, then raises a `TimeoutError`. This can happen when the website you are fetching becomes unresponsive.

This logic allows multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources, but most importantly, is so fast :)

In versions 0.3 and 0.3.1, the pool reused finished tabs to save more resources/time. That logic proved flawed, as it's nearly impossible to protect pages/tabs from contamination by the configuration used in the previous request.

### Session Benefits

- **Browser reuse**: Much faster subsequent requests by reusing the same browser instance.
- **Cookie persistence**: Automatic cookie and session state handling, as any browser does automatically.
- **Consistent fingerprint**: Same browser fingerprint across all requests.
- **Memory efficiency**: Better resource usage compared to launching a new browser with each fetch.

## When to Use

Use StealthyFetcher when:

- Bypassing anti-bot protection
- Need a reliable browser fingerprint
- Full JavaScript support needed
- Want automatic stealth features
- Need browser automation
- Dealing with Cloudflare protection

================================================
FILE: agent-skill/Scrapling-Skill/references/mcp-server.md
================================================

# Scrapling MCP Server

The Scrapling MCP server exposes six web scraping tools over the MCP protocol. It supports CSS-selector-based content narrowing (reducing tokens by extracting only relevant elements before returning results) and three levels of scraping capability: plain HTTP, browser-rendered, and stealth (anti-bot bypass).

All tools return a `ResponseModel` with fields: `status` (int), `content` (list of strings), `url` (str).

## Tools

### `get` -- HTTP request (single URL)

Fast HTTP GET with browser fingerprint impersonation (TLS, headers).
Suitable for static pages with no/low bot protection.

**Key parameters:**

| Parameter | Type | Default | Description |
|---|---|---|---|
| `url` | str | required | URL to fetch |
| `extraction_type` | `"markdown"` / `"html"` / `"text"` | `"markdown"` | Output format |
| `css_selector` | str or null | null | CSS selector to narrow content (applied after `main_content_only`) |
| `main_content_only` | bool | true | Restrict to `<body>` content |
| `impersonate` | str | `"chrome"` | Browser fingerprint to impersonate |
| `proxy` | str or null | null | Proxy URL, e.g. `"http://user:pass@host:port"` |
| `proxy_auth` | dict or null | null | `{"username": "...", "password": "..."}` |
| `auth` | dict or null | null | HTTP basic auth, same format as proxy_auth |
| `timeout` | number | 30 | Seconds before timeout |
| `retries` | int | 3 | Retry attempts on failure |
| `retry_delay` | int | 1 | Seconds between retries |
| `stealthy_headers` | bool | true | Generate realistic browser headers and Google referer |
| `http3` | bool | false | Use HTTP/3 (may conflict with `impersonate`) |
| `follow_redirects` | bool | true | Follow HTTP redirects |
| `max_redirects` | int | 30 | Max redirects (-1 for unlimited) |
| `headers` | dict or null | null | Custom request headers |
| `cookies` | dict or null | null | Request cookies |
| `params` | dict or null | null | Query string parameters |
| `verify` | bool | true | Verify HTTPS certificates |

### `bulk_get` -- HTTP request (multiple URLs)

Async concurrent version of `get`. Same parameters except `url` is replaced by `urls` (list of strings). All URLs are fetched in parallel. Returns a list of `ResponseModel`.

### `fetch` -- Browser fetch (single URL)

Opens a Chromium browser via Playwright to render JavaScript. Suitable for dynamic/SPA pages with no/low bot protection.

**Key parameters (beyond shared ones):**

| Parameter | Type | Default | Description |
|---|---|---|---|
| `url` | str | required | URL to fetch |
| `extraction_type` | str | `"markdown"` | `"markdown"` / `"html"` / `"text"` |
| `css_selector` | str or null | null | Narrow content before extraction |
| `main_content_only` | bool | true | Restrict to `<body>` |
| `headless` | bool | true | Run browser hidden (true) or visible (false) |
| `proxy` | str or dict or null | null | String URL or `{"server": "...", "username": "...", "password": "..."}` |
| `timeout` | number | 30000 | Timeout in **milliseconds** |
| `wait` | number | 0 | Extra wait (ms) after page load before extraction |
| `wait_selector` | str or null | null | CSS selector to wait for before extraction |
| `wait_selector_state` | str | `"attached"` | State for wait_selector: `"attached"` / `"visible"` / `"hidden"` / `"detached"` |
| `network_idle` | bool | false | Wait until no network activity for 500ms |
| `disable_resources` | bool | false | Block fonts, images, media, stylesheets, etc. for speed |
| `google_search` | bool | true | Set a Google referer header |
| `real_chrome` | bool | false | Use locally installed Chrome instead of bundled Chromium |
| `cdp_url` | str or null | null | Connect to existing browser via CDP URL |
| `extra_headers` | dict or null | null | Additional request headers |
| `useragent` | str or null | null | Custom user-agent (auto-generated if null) |
| `cookies` | list or null | null | Playwright-format cookies |
| `timezone_id` | str or null | null | Browser timezone, e.g. `"America/New_York"` |
| `locale` | str or null | null | Browser locale, e.g. `"en-GB"` |

### `bulk_fetch` -- Browser fetch (multiple URLs)

Concurrent browser version of `fetch`. Same parameters except `url` is replaced by `urls` (list of strings). Each URL opens in a separate browser tab. Returns a list of `ResponseModel`.

### `stealthy_fetch` -- Stealth browser fetch (single URL)

Anti-bot bypass fetcher with fingerprint spoofing. Use this for sites with Cloudflare Turnstile/Interstitial or other strong protections.

**Additional parameters (beyond those in `fetch`):**

| Parameter | Type | Default | Description |
|---|---|---|---|
| `solve_cloudflare` | bool | false | Automatically solve Cloudflare Turnstile/Interstitial challenges |
| `hide_canvas` | bool | false | Add noise to canvas operations to prevent fingerprinting |
| `block_webrtc` | bool | false | Force WebRTC to respect proxy settings (prevents IP leak) |
| `allow_webgl` | bool | true | Keep WebGL enabled (disabling is detectable by WAFs) |
| `additional_args` | dict or null | null | Extra Playwright context args (overrides Scrapling defaults) |

All parameters from `fetch` are also accepted.

### `bulk_stealthy_fetch` -- Stealth browser fetch (multiple URLs)

Concurrent stealth version. Same parameters as `stealthy_fetch` except `url` is replaced by `urls` (list of strings). Returns a list of `ResponseModel`.

## Tool selection guide

| Scenario | Tool |
|---|---|
| Static page, no bot protection | `get` |
| Multiple static pages | `bulk_get` |
| JavaScript-rendered / SPA page | `fetch` |
| Multiple JS-rendered pages | `bulk_fetch` |
| Cloudflare or strong anti-bot protection | `stealthy_fetch` (with `solve_cloudflare=true` for Turnstile) |
| Multiple protected pages | `bulk_stealthy_fetch` |

Start with `get` (fastest, lowest resource cost). Escalate to `fetch` if content requires JS rendering. Escalate to `stealthy_fetch` only if blocked.

## Content extraction tips

- Use `css_selector` to narrow results before they reach the model -- this saves significant tokens.
- `main_content_only=true` (default) strips nav/footer by restricting to `<body>`.
- `extraction_type="markdown"` (default) is best for readability. Use `"text"` for minimal output, `"html"` when structure matters.
- If a `css_selector` matches multiple elements, all are returned in the `content` list.

## Setup

Start the server (stdio transport, used by most MCP clients):

```bash
scrapling mcp
```

Or with Streamable HTTP transport:

```bash
scrapling mcp --http
scrapling mcp --http --host 127.0.0.1 --port 8000
```

Docker alternative:

```bash
docker pull pyd4vinci/scrapling
docker run -i --rm scrapling mcp
```

The MCP server name when registering with a client is `ScraplingServer`. The command is the path to the `scrapling` binary and the argument is `mcp`.
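Based on that, a client registration entry would look like the sketch below; the exact config file name and schema depend on your MCP client (this follows the common `mcpServers` format and is an assumption, not taken from Scrapling's docs):

```json
{
  "mcpServers": {
    "ScraplingServer": {
      "command": "/path/to/scrapling",
      "args": ["mcp"]
    }
  }
}
```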
================================================
FILE: agent-skill/Scrapling-Skill/references/migrating_from_beautifulsoup.md
================================================

# Migrating from BeautifulSoup to Scrapling

API comparison between BeautifulSoup and Scrapling. Scrapling is faster, provides equivalent parsing capabilities, and adds features for fetching and handling modern web pages. Some BeautifulSoup shortcuts have no direct Scrapling equivalent. Scrapling avoids those shortcuts to preserve performance.

| Task | BeautifulSoup Code | Scrapling Code |
|---|---|---|
| Parser import | `from bs4 import BeautifulSoup` | `from scrapling.parser import Selector` |
| Parsing HTML from string | `soup = BeautifulSoup(html, 'html.parser')` | `page = Selector(html)` |
| Finding a single element | `element = soup.find('div', class_='example')` | `element = page.find('div', class_='example')` |
| Finding multiple elements | `elements = soup.find_all('div', class_='example')` | `elements = page.find_all('div', class_='example')` |
| Finding a single element (Example 2) | `element = soup.find('div', attrs={"class": "example"})` | `element = page.find('div', {"class": "example"})` |
| Finding a single element (Example 3) | `element = soup.find(re.compile("^b"))` | `element = page.find(re.compile("^b"))`<br>`element = page.find_by_regex(r"^b")` |
| Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` |
| Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` |
| Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` |
| Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css('div.example').first` |
| Using CSS selectors to find all matching elements | `elements = soup.select('div.example')` | `elements = page.css('div.example')` |
| Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` |
| Get a non-pretty version of the page/element source | `source = str(soup)` | `source = page.html_content` |
| Get tag name of an element | `name = element.name` | `name = element.tag` |
| Extracting text content of an element | `string = element.string` | `string = element.text` |
| Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` |
| Access the dictionary of attributes | `attrs = element.attrs` | `attrs = element.attrib` |
| Extracting attributes | `attr = element['href']` | `attr = element['href']` |
| Navigating to parent | `parent = element.parent` | `parent = element.parent` |
| Get all parents of an element | `parents = list(element.parents)` | `parents = list(element.iterancestors())` |
| Searching for an element in the parents of an element | `target_parent = element.find_parent("a")` | `target_parent = element.find_ancestor(lambda p: p.tag == 'a')` |
| Get all siblings of an element | N/A | `siblings = element.siblings` |
| Get next sibling of an element | `next_element = element.next_sibling` | `next_element = element.next` |
| Searching for an element in the siblings of an element | `target_sibling = element.find_next_sibling("a")`<br>`target_sibling = element.find_previous_sibling("a")` | `target_sibling = element.siblings.search(lambda s: s.tag == 'a')` |
| Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`<br>`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` |
| Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` |
| Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` |
| Searching for an element in the ancestors of an element | `target_parent = element.find_previous("a")` ¹ | `target_parent = element.path.search(lambda p: p.tag == 'a')` |
| Searching for elements in the ancestors of an element | `target_parent = element.find_all_previous("a")` ¹ | `target_parent = element.path.filter(lambda p: p.tag == 'a')` |
| Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` |
| Navigating to children | `children = list(element.children)` | `children = element.children` |
| Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` |
| Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` |

¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). These are not exact equivalents, but ancestor search covers the most common use case.

BeautifulSoup supports modifying/manipulating the parsed DOM. Scrapling does not; it is read-only and optimized for extraction.

### Full Example: Extracting Links

**With BeautifulSoup:**

```python
import requests
from bs4 import BeautifulSoup

url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')

for link in links:
    print(link['href'])
```

**With Scrapling:**

```python
from scrapling import Fetcher

url = 'https://example.com'
page = Fetcher.get(url)
links = page.css('a::attr(href)')

for link in links:
    print(link)
```

Scrapling combines fetching and parsing into a single step.

**Notes:**

- **Parsers**: BeautifulSoup supports multiple parser engines. Scrapling always uses `lxml` for performance.
- **Element Types**: BeautifulSoup elements are `Tag` objects; Scrapling elements are `Selector` objects. Both provide similar navigation and extraction methods.
- **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). `page.css()` returns an empty `Selectors` list when no elements match. Use `page.css('.foo').first` to safely get the first match or `None`.
- **Text Extraction**: Scrapling's `TextHandler` provides additional text processing methods such as `clean()` for removing extra whitespace, consecutive spaces, or unwanted characters.

================================================
FILE: agent-skill/Scrapling-Skill/references/parsing/adaptive.md
================================================

# Adaptive scraping

Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.

Consider a page with a structure like this:

```html
<div class="container">
    <section class="products">
        <article class="product" id="p1">
            <h3>Product 1</h3>
            <p>Description 1</p>
        </article>
        <article class="product" id="p2">
            <h3>Product 2</h3>
            <p>Description 2</p>
        </article>
    </section>
</div>
```

To scrape the first product (the one with the `p1` ID), a selector like this would be used:

```python
page.css('#p1')
```

When website owners implement structural changes like

```html
<div class="new-container">
    <section class="products-list">
        <article class="product-item" data-id="p1">
            <h3>Product 1</h3>
            <p class="description">Description 1</p>
        </article>
        <article class="product-item" data-id="p2">
            <h3>Product 2</h3>
            <p class="description">Description 2</p>
        </article>
    </section>
</div>
```

The selector will no longer function, and your code needs maintenance. That's where Scrapling's `adaptive` feature comes into play.

With Scrapling, you enable the `adaptive` feature and save the element's unique properties the first time you select it. The next time you select that element after it no longer exists, Scrapling searches the website for the element with the highest similarity to the saved one.

```python
from scrapling import Selector, Fetcher

# Before the change
page = Selector(page_source, adaptive=True, url='example.com')
# or
Fetcher.adaptive = True
page = Fetcher.get('https://example.com')
# then
element = page.css('#p1', auto_save=True)

if not element:  # One day website changes?
    element = page.css('#p1', adaptive=True)  # Scrapling still finds it!
# the rest of your code...
```

It works with all selection methods, not just CSS/XPath selection.

## Real-World Scenario

This example uses [The Web Archive](https://archive.org/)'s [Wayback Machine](https://web.archive.org/) to demonstrate adaptive scraping across different versions of a website. A copy of [StackOverflow's website in 2010](https://web.archive.org/web/20100102003420/http://stackoverflow.com/) is compared against the current design to show that the adaptive feature can extract the same button using the same selector.

To extract the Questions button from the old design, a selector like `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` can be used (this specific selector was generated by Chrome). Testing the same selector in both versions:

```python
>>> from scrapling import Fetcher
>>> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
>>> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
>>> new_url = "https://stackoverflow.com/"
>>> Fetcher.configure(adaptive=True, adaptive_domain='stackoverflow.com')
>>>
>>> page = Fetcher.get(old_url, timeout=30)
>>> element1 = page.css(selector, auto_save=True)[0]
>>>
>>> # Same selector but used in the updated website
>>> page = Fetcher.get(new_url)
>>> element2 = page.css(selector, adaptive=True)[0]
>>>
>>> if element1.text == element2.text:
...    print('Scrapling found the same element in the old and new designs!')
'Scrapling found the same element in the old and new designs!'
```

The `adaptive_domain` argument is used here because Scrapling sees `archive.org` and `stackoverflow.com` as two different domains and would isolate their `adaptive` data. Passing `adaptive_domain` tells Scrapling to treat them as the same website for adaptive data storage. In a typical scenario with the same URL for both requests, the `adaptive_domain` argument is not needed. The adaptive logic works the same way with both the `Selector` and `Fetcher` classes.

**Note:** The main reason for creating the `adaptive_domain` argument was to handle cases where a website changes its URL along with its design/structure. In such cases, it can be used to keep using the previously stored adaptive data with the new URL. Otherwise, Scrapling will consider it a new website and discard the old data.

## How the adaptive scraping feature works

Adaptive scraping works in two phases:

1. **Save Phase**: Store unique properties of elements
2. **Match Phase**: Find elements with similar properties later

After selecting an element through any method, the library can find it the next time the website is scraped, even if it undergoes structural/design changes. The general logic is as follows:

1. Scrapling saves that element's unique properties (shown below).
2. Scrapling uses its configured database (SQLite by default) to save each element's unique properties.
3. Because everything about the element can be changed or removed by the website's owner(s), nothing from the element itself can be used as a unique identifier in the database. The storage system relies on two things:
    1. The domain of the current website. When using the `Selector` class, pass it when initializing; when using a fetcher, the domain is automatically taken from the URL.
    2. An `identifier` to query that element's properties from the database. The identifier does not always need to be set manually (see below).

    Together, they will later be used to retrieve the element's unique properties from the database.
4. Later, when the website's structure changes, enabling `adaptive` causes Scrapling to retrieve the element's unique properties and match all elements on the page against them. A score is calculated based on their similarity to the desired element. Everything is taken into consideration in that comparison.
5. The element(s) with the highest similarity score to the wanted element are returned.

### The unique properties

The unique properties Scrapling relies on are:

- Element tag name, text, attributes (names and values), siblings (tag names only), and path (tag names only).
- Element's parent tag name, attributes (names and values), and text.

The comparison between elements is not exact; it is based on how similar these values are. Everything is considered, including the values' order (e.g., the order in which class names are written).

## How to use adaptive feature

The adaptive feature can be applied to any found element and is added as arguments to CSS/XPath selection methods.

First, enable the `adaptive` feature by passing `adaptive=True` to the [Selector](main_classes.md#selector) class when initializing it, or enable it on the fetcher being used. Examples:

```python
>>> from scrapling import Selector, Fetcher
>>> page = Selector(html_doc, adaptive=True)
# OR
>>> Fetcher.adaptive = True
>>> page = Fetcher.get('https://example.com')
```

When using the [Selector](main_classes.md#selector) class, pass the URL of the website with the `url` argument so Scrapling can separate the properties saved for each element by domain. If no URL is passed, the word `default` will be used in place of the URL field while saving the element's unique properties. This is only an issue when using the same identifier for a different website without passing the URL parameter. The save process overwrites previous data, and the `adaptive` feature uses only the latest saved properties.

The `storage` and `storage_args` arguments control the database connection; by default, the SQLite class provided by the library is used.

There are two main ways to use the `adaptive` feature:

### The CSS/XPath Selection way

First, use the `auto_save` argument while selecting an element that exists on the page:

```python
element = page.css('#p1', auto_save=True)
```

When the element no longer exists, use the same selector with the `adaptive` argument to have the library find it:

```python
element = page.css('#p1', adaptive=True)
```

With the `css`/`xpath` methods, the identifier is set automatically to the selector string passed to the method. Additionally, for all these methods, you can pass the `identifier` argument to set it yourself (see the sketch below). This is useful in some instances, or you can use it to save properties with the `auto_save` argument.
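A minimal sketch of using an explicit identifier (the selector and identifier here are placeholders):

```python
# Save the element's properties under a stable, human-chosen identifier
products = page.css('div.product-card', auto_save=True, identifier='product-cards')

# Later, after the site changes its markup, match using the same identifier
products = page.css('div.product-card', adaptive=True, identifier='product-cards')
```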
### The manual way

Elements can be manually saved, retrieved, and relocated with the `adaptive` feature. This allows relocating any element found by any method.

Example of getting an element by text:

```python
>>> element = page.find_by_text('Tipping the Velvet', first_match=True)
```

Save its unique properties using the `save` method. The identifier must be set manually (use a meaningful identifier):

```python
>>> page.save(element, 'my_special_element')
```

Later, retrieve and relocate the element inside the page with `adaptive`:

```python
>>> element_dict = page.retrieve('my_special_element')
>>> page.relocate(element_dict, selector_type=True)
[<data='<a title="Tipping the Velvet">Tipping the ...'>]
>>> page.relocate(element_dict, selector_type=True).css('::text').getall()
['Tipping the Velvet']
```

The `retrieve` and `relocate` methods are used here. To keep it as a `lxml.etree` object, omit the `selector_type` argument:

```python
>>> page.relocate(element_dict)
[<Element a at 0x...>]
```

## Troubleshooting

### No Matches Found

```python
# 1. Check if data was saved
element_data = page.retrieve('identifier')
if not element_data:
    print("No data saved for this identifier")

# 2. Try with a different identifier
products = page.css('.product', adaptive=True, identifier='old_selector')

# 3. Save again with a new identifier
products = page.css('.new-product', auto_save=True, identifier='new_identifier')
```

### Wrong Elements Matched

```python
# Use more specific selectors
products = page.css('.product-list .product', auto_save=True)

# Or save with more context
product = page.find_by_text('Product Name').parent
page.save(product, 'specific_product')
```

## Known Issues

In the `adaptive` save process, only the unique properties of the first element in the selection results are saved. So if your selector matches different elements in other locations on the page, `adaptive` will return only that first element when you relocate it later. This doesn't apply to combined CSS selectors (using commas to combine more than one selector, for example), as these selectors are separated and each one is executed alone.

================================================
FILE: agent-skill/Scrapling-Skill/references/parsing/main_classes.md
================================================

# Parsing main classes

The [Selector](#selector) class is the core parsing engine in Scrapling, providing HTML parsing and element selection capabilities. You can always import it with any of the following imports:

```python
from scrapling import Selector
from scrapling.parser import Selector
```

Usage:

```python
page = Selector(
    '...',
    url='https://example.com'
)
# Then select elements as you like
elements = page.css('.product')
```

In Scrapling, the main object you deal with after passing an HTML source or fetching a website is, of course, a [Selector](#selector) object. Any operation you do, like selection, navigation, etc., will return either a [Selector](#selector) object or a [Selectors](#selectors) object, given that the result is element/elements from the page, not text or similar. The main page is a [Selector](#selector) object, and the elements within are [Selector](#selector) objects. Any text (text content inside elements or attribute values) is a [TextHandler](#texthandler) object, and element attributes are stored as [AttributesHandler](#attributeshandler).

## Selector

### Arguments explained

The most important argument is `content`; it's used to pass the HTML code you want to parse, and it accepts the HTML content as `str` or `bytes`, as shown below.
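A quick sketch of both input types (assuming bytes input is decoded using the `encoding` argument described below):

```python
from scrapling.parser import Selector

page = Selector('<html><body><h1>Hello</h1></body></html>')   # str input
page = Selector(b'<html><body><h1>Hello</h1></body></html>')  # bytes input
```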
The arguments `url`, `adaptive`, `storage`, and `storage_args` are settings used with the `adaptive` feature. They are explained on the [adaptive](adaptive.md) feature page.

Arguments for parsing adjustments:

- **encoding**: This is the encoding that will be used while parsing the HTML. The default is `UTF-8`.
- **keep_comments**: This tells the library whether to keep HTML comments while parsing the page. It's disabled by default because it can cause issues with your scraping in various ways.
- **keep_cdata**: Same logic as the HTML comments. [cdata](https://stackoverflow.com/questions/7092236/what-is-cdata-in-html) is removed by default for cleaner HTML.

The arguments `huge_tree` and `root` are advanced features not covered here.

Most properties on the main page and its elements are lazily loaded (not initialized until accessed), which contributes to Scrapling's speed.

### Properties

Properties for traversal are separated in the [traversal](#traversal) section below.

Parsing this HTML page as an example:

```html
<html>
  <head>
    <title>Some page</title>
    <script type="application/json">{"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}</script>
  </head>
  <body>
    <div>
      <article class="product" data-id="1">
        <h3>Product 1</h3>
        <p>This is product 1</p>
        <span class="price">$10.99</span>
        <span class="stock">In stock: 5</span>
      </article>
      <article class="product" data-id="2">
        <h3>Product 2</h3>
        <p>This is product 2</p>
        <span class="price">$20.99</span>
        <span class="stock">In stock: 3</span>
      </article>
      <article class="product" data-id="3">
        <h3>Product 3</h3>
        <p>This is product 3</p>
        <span class="price">$15.99</span>
        <span class="stock">Out of stock</span>
      </article>
    </div>
  </body>
</html>
```

Load the page directly as shown before:

```python
from scrapling import Selector
page = Selector(html_doc)
```

Get all text content on the page recursively

```python
>>> page.get_all_text()
'Some page\n\n \n\n \nProduct 1\nThis is product 1\n$10.99\nIn stock: 5\nProduct 2\nThis is product 2\n$20.99\nIn stock: 3\nProduct 3\nThis is product 3\n$15.99\nOut of stock'
```

Get the first article (used as an example throughout):

```python
article = page.find('article')
```

With the same logic, get all text content on the element recursively

```python
>>> article.get_all_text()
'Product 1\nThis is product 1\n$10.99\nIn stock: 5'
```

But if you try to get the direct text content, it will be empty because the element doesn't have direct text in the HTML code above

```python
>>> article.text
''
```

The `get_all_text` method has the following optional arguments:

1. **separator**: All strings collected will be concatenated using this separator. The default is '\n'.
2. **strip**: If enabled, strings will be stripped before concatenation. Disabled by default.
3. **ignore_tags**: A tuple of all tag names you want to ignore in the final results, including any elements nested within them. The default is `('script', 'style',)`.
4. **valid_values**: If enabled, the method will only collect elements with real values, so all elements with empty text content or only whitespace will be ignored. It's enabled by default.

The text returned is a [TextHandler](#texthandler), not a standard string. If the text content can be serialized to JSON, use `.json()` on it:

```python
>>> script = page.find('script')
>>> script.json()
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
```

Let's continue with getting the element's tag

```python
>>> article.tag
'article'
```

Using it on the page directly operates on the root `html` element:

```python
>>> page.tag
'html'
```

Getting the attributes of the element

```python
>>> print(article.attrib)
{'class': 'product', 'data-id': '1'}
```

Access a specific attribute with any of the following

```python
>>> article.attrib['class']
>>> article.attrib.get('class')
>>> article['class']  # new in v0.3
```

Check if the attributes contain a specific attribute with any of the methods below

```python
>>> 'class' in article.attrib
>>> 'class' in article  # new in v0.3
```

Get the HTML content of the element

```python
>>> article.html_content
'<article class="product" data-id="1">\n <h3>Product 1</h3>\n <p>This is product 1</p>\n <span class="price">$10.99</span>\n <span class="stock">In stock: 5</span>\n </article>
'
```

Get the prettified version of the element's HTML content

```python
print(article.prettify())
```

```html
<article class="product" data-id="1">
  <h3>Product 1</h3>
  <p>This is product 1</p>
  <span class="price">$10.99</span>
  <span class="stock">In stock: 5</span>
</article>
```

Use the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`.

```python
>>> page.body
'<html>\n <head>\n <title>Some page</title>\n ...'
```

To get all the ancestors in the DOM tree of this element

```python
>>> article.path
[<data='<div><article class="product" data-id="1">...' parent='<body><div><article class="product" da...'>,
 <data='<body><div><article class="product" data-...' parent='<html><head><title>Some page</title><...'>,
 <data='<html><head><title>Some page</title></hea...'>]
```

Generate a CSS shortened selector if possible, or generate the full selector

```python
>>> article.generate_css_selector
'body > div > article'
>>> article.generate_full_css_selector
'body > div > article'
```

Same case with XPath

```python
>>> article.generate_xpath_selector
"//body/div/article"
>>> article.generate_full_xpath_selector
"//body/div/article"
```

### Traversal

Properties and methods for navigating elements on the page. The `html` element is the root of the website's tree. Elements like `head` and `body` are "children" of `html`, and `html` is their "parent". The element `body` is a "sibling" of `head` and vice versa.

Accessing the parent of an element

```python
>>> article.parent
<data='<div><article class="product" data-id="1">...' parent='<body><div><article class="product" da...'>
>>> article.parent.tag
'div'
```

Chaining is supported, as with all similar properties/methods:

```python
>>> article.parent.parent.tag
'body'
```

Get the children of an element

```python
>>> article.children
[<data='<h3>Product 1</h3>' parent='<article class="product" data-id="1">...'>,
 <data='<p>This is product 1</p>' parent='<article class="product" data-id="1">...'>,
 <data='<span class="price">$10.99</span>' parent='<article class="product" data-id="1">...'>
,